1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  27 /*        All Rights Reserved   */
  28 
  29 /*
  30  * Portions of this source code were derived from Berkeley 4.3 BSD
  31  * under license from the Regents of the University of California.
  32  */
  33 
  34 /*
  35  * VM - generic vnode mapping segment.
  36  *
  37  * The segmap driver is used only by the kernel to get faster (than seg_vn)
  38  * mappings [lower routine overhead; more persistent cache] to random
  39  * vnode/offsets.  Note that the kernel may (and does) use seg_vn as well.
  40  */
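/*
 * Illustrative sketch (comment only, not part of this file): the common
 * consumer pattern, e.g. a file system read path.  The locals (base, n,
 * uio, error) are hypothetical; segmap_getmapflt() and segmap_release()
 * are the public entry points declared in <vm/seg_map.h>.
 *
 *	base = segmap_getmapflt(segkmap, vp, off & (offset_t)MAXBMASK,
 *	    MAXBSIZE, 1, S_READ);
 *	error = uiomove(base + (off & MAXBOFFSET), n, UIO_READ, uio);
 *	error = segmap_release(segkmap, base, error ? SM_INVAL : 0);
 */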
  41 
  42 #include <sys/types.h>
  43 #include <sys/t_lock.h>
  44 #include <sys/param.h>
  45 #include <sys/sysmacros.h>
  46 #include <sys/buf.h>
  47 #include <sys/systm.h>
  48 #include <sys/vnode.h>
  49 #include <sys/mman.h>
  50 #include <sys/errno.h>
  51 #include <sys/cred.h>
  52 #include <sys/kmem.h>
  53 #include <sys/vtrace.h>
  54 #include <sys/cmn_err.h>
  55 #include <sys/debug.h>
  56 #include <sys/thread.h>
  57 #include <sys/dumphdr.h>
  58 #include <sys/bitmap.h>
  59 #include <sys/lgrp.h>
  60 
  61 #include <vm/seg_kmem.h>
  62 #include <vm/hat.h>
  63 #include <vm/as.h>
  64 #include <vm/seg.h>
  65 #include <vm/seg_kpm.h>
  66 #include <vm/seg_map.h>
  67 #include <vm/page.h>
  68 #include <vm/pvn.h>
  69 #include <vm/rm.h>
  70 
  71 /*
  72  * Private seg op routines.
  73  */
  74 static void     segmap_free(struct seg *seg);
  75 faultcode_t segmap_fault(struct hat *hat, struct seg *seg, caddr_t addr,
  76                         size_t len, enum fault_type type, enum seg_rw rw);
  77 static faultcode_t segmap_faulta(struct seg *seg, caddr_t addr);
  78 static int      segmap_checkprot(struct seg *seg, caddr_t addr, size_t len,
  79                         uint_t prot);
  80 static int      segmap_kluster(struct seg *seg, caddr_t addr, ssize_t);
  81 static int      segmap_getprot(struct seg *seg, caddr_t addr, size_t len,
  82                         uint_t *protv);
  83 static u_offset_t       segmap_getoffset(struct seg *seg, caddr_t addr);
  84 static int      segmap_gettype(struct seg *seg, caddr_t addr);
  85 static int      segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
  86 static void     segmap_dump(struct seg *seg);
  87 static int      segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
  88                         struct page ***ppp, enum lock_type type,
  89                         enum seg_rw rw);
  90 static void     segmap_badop(void);
  91 static int      segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp);
  92 static lgrp_mem_policy_info_t   *segmap_getpolicy(struct seg *seg,
  93     caddr_t addr);
  94 static int      segmap_capable(struct seg *seg, segcapability_t capability);
  95 
  96 /* segkpm support */
  97 static caddr_t  segmap_pagecreate_kpm(struct seg *, vnode_t *, u_offset_t,
  98                         struct smap *, enum seg_rw);
  99 struct smap     *get_smap_kpm(caddr_t, page_t **);
 100 
 101 #define SEGMAP_BADOP(t) (t(*)())segmap_badop
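/*
 * For example, SEGMAP_BADOP(int) expands to (int (*)())segmap_badop; these
 * casts let segmap_badop() fill the seg_ops slots that must never be called
 * for segkmap, and any such call panics in segmap_badop() below.
 */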
 102 
 103 static struct seg_ops segmap_ops = {
 104         SEGMAP_BADOP(int),      /* dup */
 105         SEGMAP_BADOP(int),      /* unmap */
 106         segmap_free,
 107         segmap_fault,
 108         segmap_faulta,
 109         SEGMAP_BADOP(int),      /* setprot */
 110         segmap_checkprot,
 111         segmap_kluster,
 112         SEGMAP_BADOP(int),      /* sync */
 113         SEGMAP_BADOP(size_t),   /* incore */
 114         SEGMAP_BADOP(int),      /* lockop */
 115         segmap_getprot,
 116         segmap_getoffset,
 117         segmap_gettype,
 118         segmap_getvp,
 119         SEGMAP_BADOP(int),      /* advise */
 120         segmap_dump,
 121         segmap_pagelock,        /* pagelock */
 122         SEGMAP_BADOP(int),      /* setpgsz */
 123         segmap_getmemid,        /* getmemid */
 124         segmap_getpolicy,       /* getpolicy */
 125         segmap_capable,         /* capable */
 126 };
 127 
 128 /*
 129  * Private segmap routines.
 130  */
 131 static void     segmap_unlock(struct hat *hat, struct seg *seg, caddr_t addr,
 132                         size_t len, enum seg_rw rw, struct smap *smp);
 133 static void     segmap_smapadd(struct smap *smp);
 134 static struct smap *segmap_hashin(struct smap *smp, struct vnode *vp,
 135                         u_offset_t off, int hashid);
 136 static void     segmap_hashout(struct smap *smp);
 137 
 138 
 139 /*
 140  * Statistics for segmap operations.
 141  *
 142  * No explicit locking to protect these stats.
 143  */
 144 struct segmapcnt segmapcnt = {
 145         { "fault",              KSTAT_DATA_ULONG },
 146         { "faulta",             KSTAT_DATA_ULONG },
 147         { "getmap",             KSTAT_DATA_ULONG },
 148         { "get_use",            KSTAT_DATA_ULONG },
 149         { "get_reclaim",        KSTAT_DATA_ULONG },
 150         { "get_reuse",          KSTAT_DATA_ULONG },
 151         { "get_unused",         KSTAT_DATA_ULONG },
 152         { "get_nofree",         KSTAT_DATA_ULONG },
 153         { "rel_async",          KSTAT_DATA_ULONG },
 154         { "rel_write",          KSTAT_DATA_ULONG },
 155         { "rel_free",           KSTAT_DATA_ULONG },
 156         { "rel_abort",          KSTAT_DATA_ULONG },
 157         { "rel_dontneed",       KSTAT_DATA_ULONG },
 158         { "release",            KSTAT_DATA_ULONG },
 159         { "pagecreate",         KSTAT_DATA_ULONG },
 160         { "free_notfree",       KSTAT_DATA_ULONG },
 161         { "free_dirty",         KSTAT_DATA_ULONG },
 162         { "free",               KSTAT_DATA_ULONG },
 163         { "stolen",             KSTAT_DATA_ULONG },
 164         { "get_nomtx",          KSTAT_DATA_ULONG }
 165 };
 166 
 167 kstat_named_t *segmapcnt_ptr = (kstat_named_t *)&segmapcnt;
 168 uint_t segmapcnt_ndata = sizeof (segmapcnt) / sizeof (kstat_named_t);
 169 
 170 /*
 171  * Return number of map pages in segment.
 172  */
 173 #define MAP_PAGES(seg)          ((seg)->s_size >> MAXBSHIFT)
 174 
 175 /*
 176  * Translate addr into smap number within segment.
 177  */
 178 #define MAP_PAGE(seg, addr)  (((addr) - (seg)->s_base) >> MAXBSHIFT)
 179 
 180 /*
 181  * Translate addr in seg into struct smap pointer.
 182  */
 183 #define GET_SMAP(seg, addr)     \
 184         &(((struct segmap_data *)((seg)->s_data))->smd_sm[MAP_PAGE(seg, addr)])
 185 
 186 /*
 187  * Bit in map (16 bit bitmap).
 188  */
 189 #define SMAP_BIT_MASK(bitindex) (1 << ((bitindex) & 0xf))
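/*
 * Worked example (hypothetical values, assuming MAXBSIZE == 8192 and
 * PAGESIZE == 4096): an address with (addr - s_base) == 0x6000 maps to
 * MAP_PAGE() == 3, i.e. smd_sm[3].  Within that slot, sm_bitmap keeps one
 * bit per page of the MAXBSIZE window; a page at window offset 0x1000 has
 * bit index 1, so SMAP_BIT_MASK(1) == 0x2 records that this page still has
 * a locked page/translation to be released.
 */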
 190 
 191 static int smd_colormsk = 0;
 192 static int smd_ncolor = 0;
 193 static int smd_nfree = 0;
 194 static int smd_freemsk = 0;
 195 #ifdef DEBUG
 196 static int *colors_used;
 197 #endif
 198 static struct smap *smd_smap;
 199 static struct smaphash *smd_hash;
 200 #ifdef SEGMAP_HASHSTATS
 201 static unsigned int *smd_hash_len;
 202 #endif
 203 static struct smfree *smd_free;
 204 static ulong_t smd_hashmsk = 0;
 205 
 206 #define SEGMAP_MAXCOLOR         2
 207 #define SEGMAP_CACHE_PAD        64
 208 
 209 union segmap_cpu {
 210         struct {
 211                 uint32_t        scpu_free_ndx[SEGMAP_MAXCOLOR];
 212                 struct smap     *scpu_last_smap;
 213                 ulong_t         scpu_getmap;
 214                 ulong_t         scpu_release;
 215                 ulong_t         scpu_get_reclaim;
 216                 ulong_t         scpu_fault;
 217                 ulong_t         scpu_pagecreate;
 218                 ulong_t         scpu_get_reuse;
 219         } scpu;
 220         char    scpu_pad[SEGMAP_CACHE_PAD];
 221 };
 222 static union segmap_cpu *smd_cpu;
 223 
 224 /*
 225  * There are three locks in seg_map:
 226  *      - per freelist mutexes
 227  *      - per hashchain mutexes
 228  *      - per smap mutexes
 229  *
 230  * The lock ordering is to get the smap mutex to lock down the slot
 231  * first then the hash lock (for hash in/out (vp, off) list) or the
 232  * freelist lock to put the slot back on the free list.
 233  *
 234  * The hash search is done while holding only the hashchain lock; when a
 235  * wanted slot is found, we drop the hashchain lock and then lock the slot,
 236  * so the hashchain and smap locks never overlap.  After the slot is
 237  * locked, we verify again that the slot is still what we are looking
 238  * for.
 239  *
 240  * Allocation of a free slot is done by holding the freelist lock,
 241  * then locking the smap slot at the head of the freelist. This is
 242  * in reverse lock order, so mutex_tryenter() is used.
 243  *
 244  * The smap lock protects all fields in smap structure except for
 245  * the link fields for hash/free lists which are protected by
 246  * hashchain and freelist locks.
 247  */
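/*
 * Illustrative sketch (comment only) of the lookup discipline described
 * above; the locals are hypothetical.  If the slot changed identity while
 * it was unlocked, it was reused and the search must be retried:
 *
 *	SMAP_HASHFUNC(vp, off, hashid);
 *	mutex_enter(SHASHMTX(hashid));
 *	for (smp = smd_hash[hashid].sh_hash_list; smp != NULL;
 *	    smp = smp->sm_hash)
 *		if (smp->sm_vp == vp && smp->sm_off == off)
 *			break;
 *	mutex_exit(SHASHMTX(hashid));
 *	if (smp != NULL) {
 *		mutex_enter(SMAPMTX(smp));
 *		if (smp->sm_vp != vp || smp->sm_off != off) {
 *			mutex_exit(SMAPMTX(smp));
 *			goto retry;
 *		}
 *	}
 */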
 248 
 249 #define SHASHMTX(hashid)        (&smd_hash[hashid].sh_mtx)
 250 
 251 #define SMP2SMF(smp)            (&smd_free[(smp - smd_smap) & smd_freemsk])
 252 #define SMP2SMF_NDX(smp)        (ushort_t)((smp - smd_smap) & smd_freemsk)
 253 
 254 #define SMAPMTX(smp) (&smp->sm_mtx)
 255 
 256 #define SMAP_HASHFUNC(vp, off, hashid) \
 257         { \
 258         hashid = ((((uintptr_t)(vp) >> 6) + ((uintptr_t)(vp) >> 3) + \
 259                 ((off) >> MAXBSHIFT)) & smd_hashmsk); \
 260         }
 261 
 262 /*
 263  * The most frequently updated kstat counters are kept in the
 264  * per cpu array to avoid hot cache blocks. The update function
 265  * sums the cpu local counters to update the global counters.
 266  */
 267 
 268 /* ARGSUSED */
 269 int
 270 segmap_kstat_update(kstat_t *ksp, int rw)
 271 {
 272         int i;
 273         ulong_t getmap, release, get_reclaim;
 274         ulong_t fault, pagecreate, get_reuse;
 275 
 276         if (rw == KSTAT_WRITE)
 277                 return (EACCES);
 278         getmap = release = get_reclaim = (ulong_t)0;
 279         fault = pagecreate = get_reuse = (ulong_t)0;
 280         for (i = 0; i < max_ncpus; i++) {
 281                 getmap += smd_cpu[i].scpu.scpu_getmap;
 282                 release  += smd_cpu[i].scpu.scpu_release;
 283                 get_reclaim += smd_cpu[i].scpu.scpu_get_reclaim;
 284                 fault  += smd_cpu[i].scpu.scpu_fault;
 285                 pagecreate  += smd_cpu[i].scpu.scpu_pagecreate;
 286                 get_reuse += smd_cpu[i].scpu.scpu_get_reuse;
 287         }
 288         segmapcnt.smp_getmap.value.ul = getmap;
 289         segmapcnt.smp_release.value.ul = release;
 290         segmapcnt.smp_get_reclaim.value.ul = get_reclaim;
 291         segmapcnt.smp_fault.value.ul = fault;
 292         segmapcnt.smp_pagecreate.value.ul = pagecreate;
 293         segmapcnt.smp_get_reuse.value.ul = get_reuse;
 294         return (0);
 295 }
 296 
 297 int
 298 segmap_create(struct seg *seg, void *argsp)
 299 {
 300         struct segmap_data *smd;
 301         struct smap *smp;
 302         struct smfree *sm;
 303         struct segmap_crargs *a = (struct segmap_crargs *)argsp;
 304         struct smaphash *shashp;
 305         union segmap_cpu *scpu;
 306         long i, npages;
 307         size_t hashsz;
 308         uint_t nfreelist;
 309         extern void prefetch_smap_w(void *);
 310         extern int max_ncpus;
 311 
 312         ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
 313 
 314         if (((uintptr_t)seg->s_base | seg->s_size) & MAXBOFFSET) {
 315                 panic("segkmap not MAXBSIZE aligned");
 316                 /*NOTREACHED*/
 317         }
 318 
 319         smd = kmem_zalloc(sizeof (struct segmap_data), KM_SLEEP);
 320 
 321         seg->s_data = (void *)smd;
 322         seg->s_ops = &segmap_ops;
 323         smd->smd_prot = a->prot;
 324 
 325         /*
 326          * Scale the number of smap freelists to be
 327          * proportional to max_ncpus * number of virtual colors.
 328          * The caller can over-ride this scaling by providing
 329          * a non-zero a->nfreelist argument.
 330          */
 331         nfreelist = a->nfreelist;
 332         if (nfreelist == 0)
 333                 nfreelist = max_ncpus;
 334         else if (nfreelist < 0 || nfreelist > 4 * max_ncpus) {
 335                 cmn_err(CE_WARN, "segmap_create: nfreelist out of range "
 336                 "%d, using %d", nfreelist, max_ncpus);
 337                 nfreelist = max_ncpus;
 338         }
 339         if (nfreelist & (nfreelist - 1)) {
 340                 /* round up nfreelist to the next power of two. */
 341                 nfreelist = 1 << (highbit(nfreelist));
 342         }
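        /*
         * Worked example with hypothetical numbers: max_ncpus == 6 and
         * a->nfreelist == 0 give nfreelist == 6, rounded up to 8 above;
         * with two virtual colors (smd_ncolor == 2) this yields
         * smd_nfree == 16 freelists and smd_freemsk == 0xf below.
         */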
 343 
 344         /*
 345          * Get the number of virtual colors - must be a power of 2.
 346          */
 347         if (a->shmsize)
 348                 smd_ncolor = a->shmsize >> MAXBSHIFT;
 349         else
 350                 smd_ncolor = 1;
 351         ASSERT((smd_ncolor & (smd_ncolor - 1)) == 0);
 352         ASSERT(smd_ncolor <= SEGMAP_MAXCOLOR);
 353         smd_colormsk = smd_ncolor - 1;
 354         smd->smd_nfree = smd_nfree = smd_ncolor * nfreelist;
 355         smd_freemsk = smd_nfree - 1;
 356 
 357         /*
 358          * Allocate and initialize the freelist headers.
 359          * Note that sm_freeq[1] starts out as the release queue. This
 360  * is relied upon when the smap structures are initialized below.
 361          */
 362         smd_free = smd->smd_free =
 363             kmem_zalloc(smd_nfree * sizeof (struct smfree), KM_SLEEP);
 364         for (i = 0; i < smd_nfree; i++) {
 365                 sm = &smd->smd_free[i];
 366                 mutex_init(&sm->sm_freeq[0].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
 367                 mutex_init(&sm->sm_freeq[1].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
 368                 sm->sm_allocq = &sm->sm_freeq[0];
 369                 sm->sm_releq = &sm->sm_freeq[1];
 370         }
 371 
 372         /*
 373          * Allocate and initialize the smap hash chain headers.
 374          * Compute hash size rounding down to the next power of two.
 375          */
 376         npages = MAP_PAGES(seg);
 377         smd->smd_npages = npages;
 378         hashsz = npages / SMAP_HASHAVELEN;
 379         hashsz = 1 << (highbit(hashsz)-1);
 380         smd_hashmsk = hashsz - 1;
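        /*
         * Worked example with hypothetical numbers: a 256MB segkmap with
         * MAXBSIZE == 8192 gives npages == 32768; with SMAP_HASHAVELEN == 4
         * the target is 8192 chains, already a power of two, so
         * hashsz == 8192 and smd_hashmsk == 0x1fff.
         */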
 381         smd_hash = smd->smd_hash =
 382             kmem_alloc(hashsz * sizeof (struct smaphash), KM_SLEEP);
 383 #ifdef SEGMAP_HASHSTATS
 384         smd_hash_len =
 385             kmem_zalloc(hashsz * sizeof (unsigned int), KM_SLEEP);
 386 #endif
 387         for (i = 0, shashp = smd_hash; i < hashsz; i++, shashp++) {
 388                 shashp->sh_hash_list = NULL;
 389                 mutex_init(&shashp->sh_mtx, NULL, MUTEX_DEFAULT, NULL);
 390         }
 391 
 392         /*
 393          * Allocate and initialize the smap structures.
 394          * Link all slots onto the appropriate freelist.
 395          * The smap array is large enough to affect boot time
 396          * on large systems, so use memory prefetching and only
 397  * go through the array one time. Inline an optimized version
 398          * of segmap_smapadd to add structures to freelists with
 399          * knowledge that no locks are needed here.
 400          */
 401         smd_smap = smd->smd_sm =
 402             kmem_alloc(sizeof (struct smap) * npages, KM_SLEEP);
 403 
 404         for (smp = &smd->smd_sm[MAP_PAGES(seg) - 1];
 405             smp >= smd->smd_sm; smp--) {
 406                 struct smap *smpfreelist;
 407                 struct sm_freeq *releq;
 408 
 409                 prefetch_smap_w((char *)smp);
 410 
 411                 smp->sm_vp = NULL;
 412                 smp->sm_hash = NULL;
 413                 smp->sm_off = 0;
 414                 smp->sm_bitmap = 0;
 415                 smp->sm_refcnt = 0;
 416                 mutex_init(&smp->sm_mtx, NULL, MUTEX_DEFAULT, NULL);
 417                 smp->sm_free_ndx = SMP2SMF_NDX(smp);
 418 
 419                 sm = SMP2SMF(smp);
 420                 releq = sm->sm_releq;
 421 
 422                 smpfreelist = releq->smq_free;
 423                 if (smpfreelist == 0) {
 424                         releq->smq_free = smp->sm_next = smp->sm_prev = smp;
 425                 } else {
 426                         smp->sm_next = smpfreelist;
 427                         smp->sm_prev = smpfreelist->sm_prev;
 428                         smpfreelist->sm_prev = smp;
 429                         smp->sm_prev->sm_next = smp;
 430                         releq->smq_free = smp->sm_next;
 431                 }
 432 
 433                 /*
 434                  * sm_flags = 0 (no SM_QNDX_ZERO) implies smap on sm_freeq[1]
 435                  */
 436                 smp->sm_flags = 0;
 437 
 438 #ifdef  SEGKPM_SUPPORT
 439                 /*
 440                  * Due to the fragile prefetch loop no
 441                  * separate function is used here.
 442                  */
 443                 smp->sm_kpme_next = NULL;
 444                 smp->sm_kpme_prev = NULL;
 445                 smp->sm_kpme_page = NULL;
 446 #endif
 447         }
 448 
 449         /*
 450          * Allocate the per color indices that distribute allocation
 451          * requests over the free lists. Each cpu will have a private
 452  * rotor index to spread the allocations evenly across the available
 453          * smap freelists. Init the scpu_last_smap field to the first
 454          * smap element so there is no need to check for NULL.
 455          */
 456         smd_cpu =
 457             kmem_zalloc(sizeof (union segmap_cpu) * max_ncpus, KM_SLEEP);
 458         for (i = 0, scpu = smd_cpu; i < max_ncpus; i++, scpu++) {
 459                 int j;
 460                 for (j = 0; j < smd_ncolor; j++)
 461                         scpu->scpu.scpu_free_ndx[j] = j;
 462                 scpu->scpu.scpu_last_smap = smd_smap;
 463         }
 464 
 465         vpm_init();
 466 
 467 #ifdef DEBUG
 468         /*
 469          * Keep track of which colors are used more often.
 470          */
 471         colors_used = kmem_zalloc(smd_nfree * sizeof (int), KM_SLEEP);
 472 #endif /* DEBUG */
 473 
 474         return (0);
 475 }
 476 
 477 static void
 478 segmap_free(struct seg *seg)
 480 {
 481         ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
 482 }
 483 
 484 /*
 485  * Do a F_SOFTUNLOCK call over the range requested.
 486  * The range must have already been F_SOFTLOCK'ed.
 487  */
 488 static void
 489 segmap_unlock(
 490         struct hat *hat,
 491         struct seg *seg,
 492         caddr_t addr,
 493         size_t len,
 494         enum seg_rw rw,
 495         struct smap *smp)
 496 {
 497         page_t *pp;
 498         caddr_t adr;
 499         u_offset_t off;
 500         struct vnode *vp;
 501         kmutex_t *smtx;
 502 
 503         ASSERT(smp->sm_refcnt > 0);
 504 
 505 #ifdef lint
 506         seg = seg;
 507 #endif
 508 
 509         if (segmap_kpm && IS_KPM_ADDR(addr)) {
 510 
 511                 /*
 512                  * We're called only from segmap_fault and this was a
 513                  * NOP in case of a kpm based smap, so dangerous things
 514                  * must have happened in the meantime. Pages are prefaulted
 515                  * and locked in segmap_getmapflt and they will not be
 516                  * unlocked until segmap_release.
 517                  */
 518                 panic("segmap_unlock: called with kpm addr %p", (void *)addr);
 519                 /*NOTREACHED*/
 520         }
 521 
 522         vp = smp->sm_vp;
 523         off = smp->sm_off + (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
 524 
 525         hat_unlock(hat, addr, P2ROUNDUP(len, PAGESIZE));
 526         for (adr = addr; adr < addr + len; adr += PAGESIZE, off += PAGESIZE) {
 527                 ushort_t bitmask;
 528 
 529                 /*
 530                  * Use page_find() instead of page_lookup() to
 531                  * find the page since we know that it has
 532                  * "shared" lock.
 533                  */
 534                 pp = page_find(vp, off);
 535                 if (pp == NULL) {
 536                         panic("segmap_unlock: page not found");
 537                         /*NOTREACHED*/
 538                 }
 539 
 540                 if (rw == S_WRITE) {
 541                         hat_setrefmod(pp);
 542                 } else if (rw != S_OTHER) {
 543                         TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
 544                         "segmap_fault:pp %p vp %p offset %llx", pp, vp, off);
 545                         hat_setref(pp);
 546                 }
 547 
 548                 /*
 549                  * Clear the bit in sm_bitmap corresponding to "off", if it is set,
 550                  * since the page and translation are being unlocked.
 551                  */
 552                 bitmask = SMAP_BIT_MASK((off - smp->sm_off) >> PAGESHIFT);
 553 
 554                 /*
 555                  * Large Files: The following assertion verifies that (off - smp->sm_off)
 556                  * fits in an int, so the bit index computed above is valid.
 557                  */
 558                 ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
 559                 smtx = SMAPMTX(smp);
 560                 mutex_enter(smtx);
 561                 if (smp->sm_bitmap & bitmask) {
 562                         smp->sm_bitmap &= ~bitmask;
 563                 }
 564                 mutex_exit(smtx);
 565 
 566                 page_unlock(pp);
 567         }
 568 }
 569 
 570 #define MAXPPB  (MAXBSIZE/4096) /* assumes minimum page size of 4k */
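/*
 * With MAXBSIZE of 8192 and the assumed minimum 4K page size, MAXPPB is 2;
 * the pl[] array in segmap_fault() below therefore holds at most MAXPPB
 * pages plus a NULL terminator.
 */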
 571 
 572 /*
 573  * This routine is called via a machine specific fault handling
 574  * routine.  It is also called by software routines wishing to
 575  * lock or unlock a range of addresses.
 576  *
 577  * Note that this routine expects a page-aligned "addr".
 578  */
 579 faultcode_t
 580 segmap_fault(
 581         struct hat *hat,
 582         struct seg *seg,
 583         caddr_t addr,
 584         size_t len,
 585         enum fault_type type,
 586         enum seg_rw rw)
 587 {
 588         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 589         struct smap *smp;
 590         page_t *pp, **ppp;
 591         struct vnode *vp;
 592         u_offset_t off;
 593         page_t *pl[MAXPPB + 1];
 594         uint_t prot;
 595         u_offset_t addroff;
 596         caddr_t adr;
 597         int err;
 598         u_offset_t sm_off;
 599         int hat_flag;
 600 
 601         if (segmap_kpm && IS_KPM_ADDR(addr)) {
 602                 int newpage;
 603                 kmutex_t *smtx;
 604 
 605                 /*
 606                  * Pages are successfully prefaulted and locked in
 607                  * segmap_getmapflt and can't be unlocked until
 608                  * segmap_release. No hat mappings have to be locked
 609                  * and they also can't be unlocked as long as the
 610                  * caller owns an active kpm addr.
 611                  */
 612 #ifndef DEBUG
 613                 if (type != F_SOFTUNLOCK)
 614                         return (0);
 615 #endif
 616 
 617                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
 618                         panic("segmap_fault: smap not found "
 619                             "for addr %p", (void *)addr);
 620                         /*NOTREACHED*/
 621                 }
 622 
 623                 smtx = SMAPMTX(smp);
 624 #ifdef  DEBUG
 625                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
 626                 if (newpage) {
 627                         cmn_err(CE_WARN, "segmap_fault: newpage? smp %p",
 628                             (void *)smp);
 629                 }
 630 
 631                 if (type != F_SOFTUNLOCK) {
 632                         mutex_exit(smtx);
 633                         return (0);
 634                 }
 635 #endif
 636                 mutex_exit(smtx);
 637                 vp = smp->sm_vp;
 638                 sm_off = smp->sm_off;
 639 
 640                 if (vp == NULL)
 641                         return (FC_MAKE_ERR(EIO));
 642 
 643                 ASSERT(smp->sm_refcnt > 0);
 644 
 645                 addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
 646                 if (addroff + len > MAXBSIZE)
 647                         panic("segmap_fault: endaddr %p exceeds MAXBSIZE chunk",
 648                             (void *)(addr + len));
 649 
 650                 off = sm_off + addroff;
 651 
 652                 pp = page_find(vp, off);
 653 
 654                 if (pp == NULL)
 655                         panic("segmap_fault: softunlock page not found");
 656 
 657                 /*
 658                  * Set ref bit also here in case of S_OTHER to avoid the
 659                  * overhead of supporting other cases than F_SOFTUNLOCK
 660                  * with segkpm. We can do this because the underlying
 661                  * pages are locked anyway.
 662                  */
 663                 if (rw == S_WRITE) {
 664                         hat_setrefmod(pp);
 665                 } else {
 666                         TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
 667                             "segmap_fault:pp %p vp %p offset %llx",
 668                             pp, vp, off);
 669                         hat_setref(pp);
 670                 }
 671 
 672                 return (0);
 673         }
 674 
 675         smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
 676         smp = GET_SMAP(seg, addr);
 677         vp = smp->sm_vp;
 678         sm_off = smp->sm_off;
 679 
 680         if (vp == NULL)
 681                 return (FC_MAKE_ERR(EIO));
 682 
 683         ASSERT(smp->sm_refcnt > 0);
 684 
 685         addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
 686         if (addroff + len > MAXBSIZE) {
 687                 panic("segmap_fault: endaddr %p "
 688                     "exceeds MAXBSIZE chunk", (void *)(addr + len));
 689                 /*NOTREACHED*/
 690         }
 691         off = sm_off + addroff;
 692 
 693         /*
 694          * First handle the easy stuff
 695          */
 696         if (type == F_SOFTUNLOCK) {
 697                 segmap_unlock(hat, seg, addr, len, rw, smp);
 698                 return (0);
 699         }
 700 
 701         TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
 702             "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
 703         err = VOP_GETPAGE(vp, (offset_t)off, len, &prot, pl, MAXBSIZE,
 704             seg, addr, rw, CRED(), NULL);
 705 
 706         if (err)
 707                 return (FC_MAKE_ERR(err));
 708 
 709         prot &= smd->smd_prot;
 710 
 711         /*
 712          * Handle all pages returned in the pl[] array.
 713          * This loop is coded on the assumption that if
 714          * there was no error from the VOP_GETPAGE routine,
 715          * that the page list returned will contain all the
 716          * needed pages for the vp from [off..off + len].
 717          */
 718         ppp = pl;
 719         while ((pp = *ppp++) != NULL) {
 720                 u_offset_t poff;
 721                 ASSERT(pp->p_vnode == vp);
 722                 hat_flag = HAT_LOAD;
 723 
 724                 /*
 725                  * Verify that the pages returned are within the range
 726                  * of this segmap region.  Note that it is theoretically
 727                  * possible for pages outside this range to be returned,
 728                  * but it is not very likely.  If we cannot use the
 729                  * page here, just release it and go on to the next one.
 730                  */
 731                 if (pp->p_offset < sm_off ||
 732                     pp->p_offset >= sm_off + MAXBSIZE) {
 733                         (void) page_release(pp, 1);
 734                         continue;
 735                 }
 736 
 737                 ASSERT(hat == kas.a_hat);
 738                 poff = pp->p_offset;
 739                 adr = addr + (poff - off);
 740                 if (adr >= addr && adr < addr + len) {
 741                         hat_setref(pp);
 742                         TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
 743                             "segmap_fault:pp %p vp %p offset %llx",
 744                             pp, vp, poff);
 745                         if (type == F_SOFTLOCK)
 746                                 hat_flag = HAT_LOAD_LOCK;
 747                 }
 748 
 749                 /*
 750                  * Deal with VMODSORT pages here. If we know this is a write
 751                  * do the setmod now and keep the mapping writable. Otherwise,
 752                  * unless the page is already modified or the access is S_OTHER,
 753                  * remove PROT_WRITE. With S_OTHER it's up to the FS to deal with this.
 754                  */
 755                 if (IS_VMODSORT(vp)) {
 756                         if (rw == S_WRITE)
 757                                 hat_setmod(pp);
 758                         else if (rw != S_OTHER && !hat_ismod(pp))
 759                                 prot &= ~PROT_WRITE;
 760                 }
 761 
 762                 hat_memload(hat, adr, pp, prot, hat_flag);
 763                 if (hat_flag != HAT_LOAD_LOCK)
 764                         page_unlock(pp);
 765         }
 766         return (0);
 767 }
 768 
 769 /*
 770  * This routine is used to start I/O on pages asynchronously.
 771  */
 772 static faultcode_t
 773 segmap_faulta(struct seg *seg, caddr_t addr)
 774 {
 775         struct smap *smp;
 776         struct vnode *vp;
 777         u_offset_t off;
 778         int err;
 779 
 780         if (segmap_kpm && IS_KPM_ADDR(addr)) {
 781                 int     newpage;
 782                 kmutex_t *smtx;
 783 
 784                 /*
 785                  * Pages are successfully prefaulted and locked in
 786                  * segmap_getmapflt and can't be unlocked until
 787                  * segmap_release. No hat mappings have to be locked
 788                  * and they also can't be unlocked as long as the
 789                  * caller owns an active kpm addr.
 790                  */
 791 #ifdef  DEBUG
 792                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
 793                         panic("segmap_faulta: smap not found "
 794                             "for addr %p", (void *)addr);
 795                         /*NOTREACHED*/
 796                 }
 797 
 798                 smtx = SMAPMTX(smp);
 799                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
 800                 mutex_exit(smtx);
 801                 if (newpage)
 802                         cmn_err(CE_WARN, "segmap_faulta: newpage? smp %p",
 803                             (void *)smp);
 804 #endif
 805                 return (0);
 806         }
 807 
 808         segmapcnt.smp_faulta.value.ul++;
 809         smp = GET_SMAP(seg, addr);
 810 
 811         ASSERT(smp->sm_refcnt > 0);
 812 
 813         vp = smp->sm_vp;
 814         off = smp->sm_off;
 815 
 816         if (vp == NULL) {
 817                 cmn_err(CE_WARN, "segmap_faulta - no vp");
 818                 return (FC_MAKE_ERR(EIO));
 819         }
 820 
 821         TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
 822             "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
 823 
 824         err = VOP_GETPAGE(vp, (offset_t)(off + ((offset_t)((uintptr_t)addr
 825             & MAXBOFFSET))), PAGESIZE, (uint_t *)NULL, (page_t **)NULL, 0,
 826             seg, addr, S_READ, CRED(), NULL);
 827 
 828         if (err)
 829                 return (FC_MAKE_ERR(err));
 830         return (0);
 831 }
 832 
 833 /*ARGSUSED*/
 834 static int
 835 segmap_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
 836 {
 837         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 838 
 839         ASSERT(seg->s_as && RW_LOCK_HELD(&seg->s_as->a_lock));
 840 
 841         /*
 842          * Need not acquire the segment lock since
 843          * "smd_prot" is a read-only field.
 844          */
 845         return (((smd->smd_prot & prot) != prot) ? EACCES : 0);
 846 }
 847 
 848 static int
 849 segmap_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
 850 {
 851         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 852         size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;
 853 
 854         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
 855 
 856         if (pgno != 0) {
 857                 do {
 858                         protv[--pgno] = smd->smd_prot;
 859                 } while (pgno != 0);
 860         }
 861         return (0);
 862 }
 863 
 864 static u_offset_t
 865 segmap_getoffset(struct seg *seg, caddr_t addr)
 866 {
 867         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 868 
 869         ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
 870 
 871         return ((u_offset_t)smd->smd_sm->sm_off + (addr - seg->s_base));
 872 }
 873 
 874 /*ARGSUSED*/
 875 static int
 876 segmap_gettype(struct seg *seg, caddr_t addr)
 877 {
 878         ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
 879 
 880         return (MAP_SHARED);
 881 }
 882 
 883 /*ARGSUSED*/
 884 static int
 885 segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
 886 {
 887         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 888 
 889         ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
 890 
 891         /* XXX - This doesn't make any sense */
 892         *vpp = smd->smd_sm->sm_vp;
 893         return (0);
 894 }
 895 
 896 /*
 897  * Check to see if it makes sense to do kluster/read ahead to
 898  * addr + delta relative to the mapping at addr.  We assume here
 899  * that delta is a signed PAGESIZE'd multiple (which can be negative).
 900  *
 901  * For segmap we always "approve" of this action from our standpoint.
 902  */
 903 /*ARGSUSED*/
 904 static int
 905 segmap_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
 906 {
 907         return (0);
 908 }
 909 
 910 static void
 911 segmap_badop(void)
 912 {
 913         panic("segmap_badop");
 914         /*NOTREACHED*/
 915 }
 916 
 917 /*
 918  * Special private segmap operations
 919  */
 920 
 921 /*
 922  * Add smap to the appropriate free list.
 923  */
 924 static void
 925 segmap_smapadd(struct smap *smp)
 926 {
 927         struct smfree *sm;
 928         struct smap *smpfreelist;
 929         struct sm_freeq *releq;
 930 
 931         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
 932 
 933         if (smp->sm_refcnt != 0) {
 934                 panic("segmap_smapadd");
 935                 /*NOTREACHED*/
 936         }
 937 
 938         sm = &smd_free[smp->sm_free_ndx];
 939         /*
 940          * Add to the tail of the release queue
 941          * Note that sm_releq and sm_allocq could toggle
 942          * before we get the lock. This does not affect
 943          * correctness as the 2 queues are only maintained
 944          * to reduce lock pressure.
 945          */
 946         releq = sm->sm_releq;
 947         if (releq == &sm->sm_freeq[0])
 948                 smp->sm_flags |= SM_QNDX_ZERO;
 949         else
 950                 smp->sm_flags &= ~SM_QNDX_ZERO;
 951         mutex_enter(&releq->smq_mtx);
 952         smpfreelist = releq->smq_free;
 953         if (smpfreelist == 0) {
 954                 int want;
 955 
 956                 releq->smq_free = smp->sm_next = smp->sm_prev = smp;
 957                 /*
 958                  * Both queue mutexes held to set sm_want;
 959                  * snapshot the value before dropping releq mutex.
 960                  * If sm_want appears after the releq mutex is dropped,
 961                  * then the smap just freed is already gone.
 962                  */
 963                 want = sm->sm_want;
 964                 mutex_exit(&releq->smq_mtx);
 965                 /*
 966                  * See if there was a waiter before dropping the releq mutex,
 967                  * then recheck after obtaining the sm_freeq[0] mutex, as
 968                  * another thread may have already signaled.
 969                  */
 970                 if (want) {
 971                         mutex_enter(&sm->sm_freeq[0].smq_mtx);
 972                         if (sm->sm_want)
 973                                 cv_signal(&sm->sm_free_cv);
 974                         mutex_exit(&sm->sm_freeq[0].smq_mtx);
 975                 }
 976         } else {
 977                 smp->sm_next = smpfreelist;
 978                 smp->sm_prev = smpfreelist->sm_prev;
 979                 smpfreelist->sm_prev = smp;
 980                 smp->sm_prev->sm_next = smp;
 981                 mutex_exit(&releq->smq_mtx);
 982         }
 983 }
 984 
 985 
 986 static struct smap *
 987 segmap_hashin(struct smap *smp, struct vnode *vp, u_offset_t off, int hashid)
 988 {
 989         struct smap **hpp;
 990         struct smap *tmp;
 991         kmutex_t *hmtx;
 992 
 993         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
 994         ASSERT(smp->sm_vp == NULL);
 995         ASSERT(smp->sm_hash == NULL);
 996         ASSERT(smp->sm_prev == NULL);
 997         ASSERT(smp->sm_next == NULL);
 998         ASSERT(hashid >= 0 && hashid <= smd_hashmsk);
 999 
1000         hmtx = SHASHMTX(hashid);
1001 
1002         mutex_enter(hmtx);
1003         /*
1004          * First we need to verify that no one has created a smp
1005          * with (vp,off) as its tag before us.
1006          */
1007         for (tmp = smd_hash[hashid].sh_hash_list;
1008             tmp != NULL; tmp = tmp->sm_hash)
1009                 if (tmp->sm_vp == vp && tmp->sm_off == off)
1010                         break;
1011 
1012         if (tmp == NULL) {
1013                 /*
1014                  * No one created one yet.
1015                  *
1016                  * Funniness here - we don't increment the ref count on the
1017                  * vnode * even though we have another pointer to it here.
1018                  * The reason for this is that we don't want the fact that
1019                  * a seg_map entry somewhere refers to a vnode to prevent the
1020                  * vnode * itself from going away.  This is because this
1021                  * reference to the vnode is a "soft one".  In the case where
1022                  * a mapping is being used by a rdwr [or directory routine?]
1023                  * there already has to be a non-zero ref count on the vnode.
1024          * In the case where the vp has been freed and the smap
1025                  * structure is on the free list, there are no pages in memory
1026                  * that can refer to the vnode.  Thus even if we reuse the same
1027                  * vnode/smap structure for a vnode which has the same
1028                  * address but represents a different object, we are ok.
1029                  */
1030                 smp->sm_vp = vp;
1031                 smp->sm_off = off;
1032 
1033                 hpp = &smd_hash[hashid].sh_hash_list;
1034                 smp->sm_hash = *hpp;
1035                 *hpp = smp;
1036 #ifdef SEGMAP_HASHSTATS
1037                 smd_hash_len[hashid]++;
1038 #endif
1039         }
1040         mutex_exit(hmtx);
1041 
1042         return (tmp);
1043 }
1044 
1045 static void
1046 segmap_hashout(struct smap *smp)
1047 {
1048         struct smap **hpp, *hp;
1049         struct vnode *vp;
1050         kmutex_t *mtx;
1051         int hashid;
1052         u_offset_t off;
1053 
1054         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
1055 
1056         vp = smp->sm_vp;
1057         off = smp->sm_off;
1058 
1059         SMAP_HASHFUNC(vp, off, hashid); /* macro assigns hashid */
1060         mtx = SHASHMTX(hashid);
1061         mutex_enter(mtx);
1062 
1063         hpp = &smd_hash[hashid].sh_hash_list;
1064         for (;;) {
1065                 hp = *hpp;
1066                 if (hp == NULL) {
1067                         panic("segmap_hashout");
1068                         /*NOTREACHED*/
1069                 }
1070                 if (hp == smp)
1071                         break;
1072                 hpp = &hp->sm_hash;
1073         }
1074 
1075         *hpp = smp->sm_hash;
1076         smp->sm_hash = NULL;
1077 #ifdef SEGMAP_HASHSTATS
1078         smd_hash_len[hashid]--;
1079 #endif
1080         mutex_exit(mtx);
1081 
1082         smp->sm_vp = NULL;
1083         smp->sm_off = (u_offset_t)0;
1084 
1085 }
1086 
1087 /*
1088  * Attempt to free unmodified, unmapped, and unlocked segmap
1089  * pages.
1090  */
1091 void
1092 segmap_pagefree(struct vnode *vp, u_offset_t off)
1093 {
1094         u_offset_t pgoff;
1095         page_t  *pp;
1096 
1097         for (pgoff = off; pgoff < off + MAXBSIZE; pgoff += PAGESIZE) {
1098 
1099                 if ((pp = page_lookup_nowait(vp, pgoff, SE_EXCL)) == NULL)
1100                         continue;
1101 
1102                 switch (page_release(pp, 1)) {
1103                 case PGREL_NOTREL:
1104                         segmapcnt.smp_free_notfree.value.ul++;
1105                         break;
1106                 case PGREL_MOD:
1107                         segmapcnt.smp_free_dirty.value.ul++;
1108                         break;
1109                 case PGREL_CLEAN:
1110                         segmapcnt.smp_free.value.ul++;
1111                         break;
1112                 }
1113         }
1114 }
1115 
1116 /*
1117  * Locks held on entry: smap lock
1118  * Locks held on exit: smap lock.
1119  */
1120 
1121 static void
1122 grab_smp(struct smap *smp, page_t *pp)
1123 {
1124         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
1125         ASSERT(smp->sm_refcnt == 0);
1126 
1127         if (smp->sm_vp != (struct vnode *)NULL) {
1128                 struct vnode    *vp = smp->sm_vp;
1129                 u_offset_t      off = smp->sm_off;
1130                 /*
1131                  * Destroy old vnode association and
1132                  * unload any hardware translations to
1133                  * the old object.
1134                  */
1135                 smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reuse++;
1136                 segmap_hashout(smp);
1137 
1138                 /*
1139                  * This node is off freelist and hashlist,
1140                  * so there is no reason to drop/reacquire sm_mtx
1141                  * across calls to hat_unload.
1142                  */
1143                 if (segmap_kpm) {
1144                         caddr_t vaddr;
1145                         int hat_unload_needed = 0;
1146 
1147                         /*
1148                          * unload kpm mapping
1149                          */
1150                         if (pp != NULL) {
1151                                 vaddr = hat_kpm_page2va(pp, 1);
1152                                 hat_kpm_mapout(pp, GET_KPME(smp), vaddr);
1153                                 page_unlock(pp);
1154                         }
1155 
1156                         /*
1157                          * Check if we have (also) the rare case of a
1158                          * non kpm mapping.
1159                          */
1160                         if (smp->sm_flags & SM_NOTKPM_RELEASED) {
1161                                 hat_unload_needed = 1;
1162                                 smp->sm_flags &= ~SM_NOTKPM_RELEASED;
1163                         }
1164 
1165                         if (hat_unload_needed) {
1166                                 hat_unload(kas.a_hat, segkmap->s_base +
1167                                     ((smp - smd_smap) * MAXBSIZE),
1168                                     MAXBSIZE, HAT_UNLOAD);
1169                         }
1170 
1171                 } else {
1172                         ASSERT(smp->sm_flags & SM_NOTKPM_RELEASED);
1173                         smp->sm_flags &= ~SM_NOTKPM_RELEASED;
1174                         hat_unload(kas.a_hat, segkmap->s_base +
1175                             ((smp - smd_smap) * MAXBSIZE),
1176                             MAXBSIZE, HAT_UNLOAD);
1177                 }
1178                 segmap_pagefree(vp, off);
1179         }
1180 }
1181 
1182 static struct smap *
1183 get_free_smp(int free_ndx)
1184 {
1185         struct smfree *sm;
1186         kmutex_t *smtx;
1187         struct smap *smp, *first;
1188         struct sm_freeq *allocq, *releq;
1189         struct kpme *kpme;
1190         page_t *pp = NULL;
1191         int end_ndx, page_locked = 0;
1192 
1193         end_ndx = free_ndx;
1194         sm = &smd_free[free_ndx];
1195 
1196 retry_queue:
1197         allocq = sm->sm_allocq;
1198         mutex_enter(&allocq->smq_mtx);
1199 
1200         if ((smp = allocq->smq_free) == NULL) {
1201 
1202 skip_queue:
1203                 /*
1204                  * The alloc list is empty or this queue is being skipped;
1205                  * first see if the allocq toggled.
1206                  */
1207                 if (sm->sm_allocq != allocq) {
1208                         /* queue changed */
1209                         mutex_exit(&allocq->smq_mtx);
1210                         goto retry_queue;
1211                 }
1212                 releq = sm->sm_releq;
1213                 if (!mutex_tryenter(&releq->smq_mtx)) {
1214                         /* cannot get releq; a free smp may be there now */
1215                         mutex_exit(&allocq->smq_mtx);
1216 
1217                         /*
1218                          * This loop could spin forever if this thread has
1219                          * higher priority than the thread that is holding
1220                          * releq->smq_mtx. In order to force the other thread
1221                          * to run, we'll lock/unlock the mutex which is safe
1222                          * since we just unlocked the allocq mutex.
1223                          */
1224                         mutex_enter(&releq->smq_mtx);
1225                         mutex_exit(&releq->smq_mtx);
1226                         goto retry_queue;
1227                 }
1228                 if (releq->smq_free == NULL) {
1229                         /*
1230                          * This freelist is empty.
1231                          * This should not happen unless clients
1232                          * are failing to release the segmap
1233                          * window after accessing the data.
1234                          * Before resorting to sleeping, try
1235                          * the next list of the same color.
1236                          */
1237                         free_ndx = (free_ndx + smd_ncolor) & smd_freemsk;
1238                         if (free_ndx != end_ndx) {
1239                                 mutex_exit(&releq->smq_mtx);
1240                                 mutex_exit(&allocq->smq_mtx);
1241                                 sm = &smd_free[free_ndx];
1242                                 goto retry_queue;
1243                         }
1244                         /*
1245                          * Tried all freelists of the same color once,
1246                          * wait on this list and hope something gets freed.
1247                          */
1248                         segmapcnt.smp_get_nofree.value.ul++;
1249                         sm->sm_want++;
1250                         mutex_exit(&sm->sm_freeq[1].smq_mtx);
1251                         cv_wait(&sm->sm_free_cv,
1252                             &sm->sm_freeq[0].smq_mtx);
1253                         sm->sm_want--;
1254                         mutex_exit(&sm->sm_freeq[0].smq_mtx);
1255                         sm = &smd_free[free_ndx];
1256                         goto retry_queue;
1257                 } else {
1258                         /*
1259                          * Something on the rele queue; flip the alloc
1260                          * and rele queues and retry.
1261                          */
1262                         sm->sm_allocq = releq;
1263                         sm->sm_releq = allocq;
1264                         mutex_exit(&allocq->smq_mtx);
1265                         mutex_exit(&releq->smq_mtx);
1266                         if (page_locked) {
1267                                 delay(hz >> 2);
1268                                 page_locked = 0;
1269                         }
1270                         goto retry_queue;
1271                 }
1272         } else {
1273                 /*
1274                  * Fastpath the case we get the smap mutex
1275                  * on the first try.
1276                  */
1277                 first = smp;
1278 next_smap:
1279                 smtx = SMAPMTX(smp);
1280                 if (!mutex_tryenter(smtx)) {
1281                         /*
1282                          * Another thread is trying to reclaim this slot.
1283                          * Skip to the next queue or smap.
1284                          */
1285                         if ((smp = smp->sm_next) == first) {
1286                                 goto skip_queue;
1287                         } else {
1288                                 goto next_smap;
1289                         }
1290                 } else {
1291                         /*
1292                          * if kpme exists, get shared lock on the page
1293                          */
1294                         if (segmap_kpm && smp->sm_vp != NULL) {
1295 
1296                                 kpme = GET_KPME(smp);
1297                                 pp = kpme->kpe_page;
1298 
1299                                 if (pp != NULL) {
1300                                         if (!page_trylock(pp, SE_SHARED)) {
1301                                                 smp = smp->sm_next;
1302                                                 mutex_exit(smtx);
1303                                                 page_locked = 1;
1304 
1305                                                 pp = NULL;
1306 
1307                                                 if (smp == first) {
1308                                                         goto skip_queue;
1309                                                 } else {
1310                                                         goto next_smap;
1311                                                 }
1312                                         } else {
1313                                                 if (kpme->kpe_page == NULL) {
1314                                                         page_unlock(pp);
1315                                                         pp = NULL;
1316                                                 }
1317                                         }
1318                                 }
1319                         }
1320 
1321                         /*
1322                          * At this point, we've selected smp.  Remove smp
1323                          * from its freelist.  If smp is the first one in
1324                          * the freelist, update the head of the freelist.
1325                          */
1326                         if (first == smp) {
1327                                 ASSERT(first == allocq->smq_free);
1328                                 allocq->smq_free = smp->sm_next;
1329                         }
1330 
1331                         /*
1332                          * if the head of the freelist still points to smp,
1333                          * then there are no more free smaps in that list.
1334                          */
1335                         if (allocq->smq_free == smp)
1336                                 /*
1337                                  * Took the last one
1338                                  */
1339                                 allocq->smq_free = NULL;
1340                         else {
1341                                 smp->sm_prev->sm_next = smp->sm_next;
1342                                 smp->sm_next->sm_prev = smp->sm_prev;
1343                         }
1344                         mutex_exit(&allocq->smq_mtx);
1345                         smp->sm_prev = smp->sm_next = NULL;
1346 
1347                         /*
1348                          * if pp != NULL, pp must have been locked;
1349                          * grab_smp() unlocks pp.
1350                          */
1351                         ASSERT((pp == NULL) || PAGE_LOCKED(pp));
1352                         grab_smp(smp, pp);
1353                         /* return smp locked. */
1354                         ASSERT(SMAPMTX(smp) == smtx);
1355                         ASSERT(MUTEX_HELD(smtx));
1356                         return (smp);
1357                 }
1358         }
1359 }
1360 
1361 /*
1362  * Special public segmap operations
1363  */
1364 
1365 /*
1366  * Create pages (without using VOP_GETPAGE) and load up translations to them.
1367  * If softlock is TRUE, then set things up so that it looks like a call
1368  * to segmap_fault with F_SOFTLOCK.
1369  *
1370  * Returns 1 if a page is created by calling page_create_va(), or 0 otherwise.
1371  *
1372  * All fields in the generic segment (struct seg) are considered to be
1373  * read-only for "segmap" even though the kernel address space (kas) may
1374  * not be locked, hence no lock is needed to access them.
1375  */
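/*
 * Illustrative sketch (comment only) of the caller protocol for a full-block
 * write, mirroring typical file system write paths; the locals and control
 * flow are hypothetical.  The "newpage" return tells the caller whether it
 * must zero any part of the block it does not overwrite:
 *
 *	base = segmap_getmapflt(segkmap, vp, off & (offset_t)MAXBMASK,
 *	    MAXBSIZE, 0, S_WRITE);
 *	newpage = segmap_pagecreate(segkmap, base, MAXBSIZE, 0);
 *	error = uiomove(base, MAXBSIZE, UIO_WRITE, uio);
 *	segmap_pageunlock(segkmap, base, MAXBSIZE, S_WRITE);
 *	error = segmap_release(segkmap, base, SM_WRITE);
 */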
1376 int
1377 segmap_pagecreate(struct seg *seg, caddr_t addr, size_t len, int softlock)
1378 {
1379         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
1380         page_t *pp;
1381         u_offset_t off;
1382         struct smap *smp;
1383         struct vnode *vp;
1384         caddr_t eaddr;
1385         int newpage = 0;
1386         uint_t prot;
1387         kmutex_t *smtx;
1388         int hat_flag;
1389 
1390         ASSERT(seg->s_as == &kas);
1391 
1392         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1393                 /*
1394                  * Pages are successfully prefaulted and locked in
1395                  * segmap_getmapflt and can't be unlocked until
1396                  * segmap_release. The SM_KPM_NEWPAGE flag is set
1397                  * in segmap_pagecreate_kpm when new pages are created,
1398                  * and it is returned as the "newpage" indication here.
1399                  */
1400                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
1401                         panic("segmap_pagecreate: smap not found "
1402                             "for addr %p", (void *)addr);
1403                         /*NOTREACHED*/
1404                 }
1405 
1406                 smtx = SMAPMTX(smp);
1407                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
1408                 smp->sm_flags &= ~SM_KPM_NEWPAGE;
1409                 mutex_exit(smtx);
1410 
1411                 return (newpage);
1412         }
1413 
1414         smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
1415 
1416         eaddr = addr + len;
1417         addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1418 
1419         smp = GET_SMAP(seg, addr);
1420 
1421         /*
1422          * We don't grab smp mutex here since we assume the smp
1423          * has a refcnt set already which prevents the slot from
1424          * changing its id.
1425          */
1426         ASSERT(smp->sm_refcnt > 0);
1427 
1428         vp = smp->sm_vp;
1429         off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
1430         prot = smd->smd_prot;
1431 
1432         for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
1433                 hat_flag = HAT_LOAD;
1434                 pp = page_lookup(vp, off, SE_SHARED);
1435                 if (pp == NULL) {
1436                         ushort_t bitindex;
1437 
1438                         if ((pp = page_create_va(vp, off,
1439                             PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
1440                                 panic("segmap_pagecreate: page_create failed");
1441                                 /*NOTREACHED*/
1442                         }
1443                         newpage = 1;
1444                         page_io_unlock(pp);
1445 
1446                         /*
1447                          * Since pages created here do not contain valid
1448                          * data until the caller writes into them, the
1449                          * "exclusive" lock is not dropped; this prevents
1450                          * other users from accessing the page.  We also
1451                          * have to lock the translation to prevent a fault
1452                          * from occurring when the virtual address mapped by
1453                          * this page is written into.  This is necessary to
1454                          * avoid a deadlock since we haven't dropped the
1455                          * "exclusive" lock.
1456                          */
1457                         bitindex = (ushort_t)((off - smp->sm_off) >> PAGESHIFT);
1458 
1459                         /*
1460                          * Large Files: The following assertion is to
1461                          * verify the cast above.
1462                          */
1463                         ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
1464                         smtx = SMAPMTX(smp);
1465                         mutex_enter(smtx);
1466                         smp->sm_bitmap |= SMAP_BIT_MASK(bitindex);
1467                         mutex_exit(smtx);
1468 
1469                         hat_flag = HAT_LOAD_LOCK;
1470                 } else if (softlock) {
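                             /*
                              * Caller asked for F_SOFTLOCK semantics: keep the
                              * translation locked as segmap_fault would.
                              */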
1471                         hat_flag = HAT_LOAD_LOCK;
1472                 }
1473 
1474                 if (IS_VMODSORT(pp->p_vnode) && (prot & PROT_WRITE))
1475                         hat_setmod(pp);
1476 
1477                 hat_memload(kas.a_hat, addr, pp, prot, hat_flag);
1478 
1479                 if (hat_flag != HAT_LOAD_LOCK)
1480                         page_unlock(pp);
1481 
1482                 TRACE_5(TR_FAC_VM, TR_SEGMAP_PAGECREATE,
1483                     "segmap_pagecreate:seg %p addr %p pp %p vp %p offset %llx",
1484                     seg, addr, pp, vp, off);
1485         }
1486 
1487         return (newpage);
1488 }
1489 
1490 void
1491 segmap_pageunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
1492 {
1493         struct smap     *smp;
1494         ushort_t        bitmask;
1495         page_t          *pp;
1496         struct  vnode   *vp;
1497         u_offset_t      off;
1498         caddr_t         eaddr;
1499         kmutex_t        *smtx;
1500 
1501         ASSERT(seg->s_as == &kas);
1502 
1503         eaddr = addr + len;
1504         addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1505 
1506         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1507                 /*
1508                  * Pages are successfully prefaulted and locked in
1509                  * segmap_getmapflt and can't be unlocked until
1510                  * segmap_release, so no pages or hat mappings have
1511                  * to be unlocked at this point.
1512                  */
1513 #ifdef DEBUG
1514                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
1515                         panic("segmap_pageunlock: smap not found "
1516                             "for addr %p", (void *)addr);
1517                         /*NOTREACHED*/
1518                 }
1519 
1520                 ASSERT(smp->sm_refcnt > 0);
1521                 mutex_exit(SMAPMTX(smp));
1522 #endif
1523                 return;
1524         }
1525 
1526         smp = GET_SMAP(seg, addr);
1527         smtx = SMAPMTX(smp);
1528 
1529         ASSERT(smp->sm_refcnt > 0);
1530 
1531         vp = smp->sm_vp;
1532         off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
1533 
1534         for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
1535                 bitmask = SMAP_BIT_MASK((int)(off - smp->sm_off) >> PAGESHIFT);
1536 
1537                 /*
1538                  * Large Files: The following assertion verifies the
1539                  * correctness of the cast to (int) above.
1540                  */
1541                 ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
1542 
1543                 /*
1544                  * If the bit corresponding to "off" is set,
1545                  * clear this bit in the bitmap, unlock translations,
1546                  * and release the "exclusive" lock on the page.
1547                  */
1548                 if (smp->sm_bitmap & bitmask) {
1549                         mutex_enter(smtx);
1550                         smp->sm_bitmap &= ~bitmask;
1551                         mutex_exit(smtx);
1552 
1553                         hat_unlock(kas.a_hat, addr, PAGESIZE);
1554 
1555                         /*
1556                          * Use page_find() instead of page_lookup() to
1557                          * find the page since we know that it has
1558                          * "exclusive" lock.
1559                          */
1560                         pp = page_find(vp, off);
1561                         if (pp == NULL) {
1562                                 panic("segmap_pageunlock: page not found");
1563                                 /*NOTREACHED*/
1564                         }
1565                         if (rw == S_WRITE) {
1566                                 hat_setrefmod(pp);
1567                         } else if (rw != S_OTHER) {
1568                                 hat_setref(pp);
1569                         }
1570 
1571                         page_unlock(pp);
1572                 }
1573         }
1574 }
1575 
1576 caddr_t
1577 segmap_getmap(struct seg *seg, struct vnode *vp, u_offset_t off)
1578 {
1579         return (segmap_getmapflt(seg, vp, off, MAXBSIZE, 0, S_OTHER));
1580 }
1581 
1582 /*
1583  * This is the magic virtual address that offset 0 of an ELF
1584  * file gets mapped to in user space. This is used to pick
1585  * the vac color on the freelist.
1586  */
1587 #define ELF_OFFZERO_VA  (0x10000)
1588 /*
1589  * segmap_getmapflt allocates a MAXBSIZE-sized slot to map the vnode vp
1590  * in the range <off, off + len). off doesn't need to be MAXBSIZE aligned.
1591  * The return address is always MAXBSIZE aligned.
1592  *
1593  * If forcefault is nonzero and the MMU translations haven't yet been created,
1594  * segmap_getmapflt will call segmap_fault(..., F_INVAL, rw) to create them.
1595  */
1596 caddr_t
1597 segmap_getmapflt(
1598         struct seg *seg,
1599         struct vnode *vp,
1600         u_offset_t off,
1601         size_t len,
1602         int forcefault,
1603         enum seg_rw rw)
1604 {
1605         struct smap *smp, *nsmp;
1606         extern struct vnode *common_specvp();
1607         caddr_t baseaddr;                       /* MAXBSIZE aligned */
1608         u_offset_t baseoff;
1609         int newslot;
1610         caddr_t vaddr;
1611         int color, hashid;
1612         kmutex_t *hashmtx, *smapmtx;
1613         struct smfree *sm;
1614         page_t  *pp;
1615         struct kpme *kpme;
1616         uint_t  prot;
1617         caddr_t base;
1618         page_t  *pl[MAXPPB + 1];
1619         int     error;
1620         int     is_kpm = 1;
1621 
1622         ASSERT(seg->s_as == &kas);
1623         ASSERT(seg == segkmap);
1624 
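             /*
              * The slot always maps the MAXBSIZE-aligned window of the file
              * that contains off.
              */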
1625         baseoff = off & (offset_t)MAXBMASK;
1626         if (off + len > baseoff + MAXBSIZE) {
1627                 panic("segmap_getmap bad len");
1628                 /*NOTREACHED*/
1629         }
1630 
1631         /*
1632          * If this is a block device we have to be sure to use the
1633          * "common" block device vnode for the mapping.
1634          */
1635         if (vp->v_type == VBLK)
1636                 vp = common_specvp(vp);
1637 
1638         smd_cpu[CPU->cpu_seqid].scpu.scpu_getmap++;
1639 
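             /*
              * segkpm can't be used when it is disabled; SM_PAGECREATE
              * requests that are not writes also take the regular segmap
              * virtual range (the use_segmap_range path below).
              */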
1640         if (segmap_kpm == 0 ||
1641             (forcefault == SM_PAGECREATE && rw != S_WRITE)) {
1642                 is_kpm = 0;
1643         }
1644 
1645         SMAP_HASHFUNC(vp, off, hashid); /* macro assigns hashid */
1646         hashmtx = SHASHMTX(hashid);
1647 
1648 retry_hash:
1649         mutex_enter(hashmtx);
1650         for (smp = smd_hash[hashid].sh_hash_list;
1651             smp != NULL; smp = smp->sm_hash)
1652                 if (smp->sm_vp == vp && smp->sm_off == baseoff)
1653                         break;
1654         mutex_exit(hashmtx);
1655 
1656 vrfy_smp:
1657         if (smp != NULL) {
1658 
1659                 ASSERT(vp->v_count != 0);
1660 
1661                 /*
1662                  * Get smap lock and recheck its tag. The hash lock
1663                  * is dropped since the hash is based on (vp, off)
1664                  * and (vp, off) won't change when we have smap mtx.
1665                  */
1666                 smapmtx = SMAPMTX(smp);
1667                 mutex_enter(smapmtx);
1668                 if (smp->sm_vp != vp || smp->sm_off != baseoff) {
1669                         mutex_exit(smapmtx);
1670                         goto retry_hash;
1671                 }
1672 
1673                 if (smp->sm_refcnt == 0) {
1674 
1675                         smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reclaim++;
1676 
1677                         /*
1678                          * Could still be on the free list. However, this
1679                          * could also be an smp that is transitioning from
1680                          * the free list when we have too much contention
1681                          * for the smapmtx's. In this case, we have an
1682                          * unlocked smp that is not on the free list any
1683                          * longer, but still has a 0 refcnt.  The only way
1684                          * to be sure is to check the freelist pointers.
1685                          * Since we now have the smapmtx, we are guaranteed
1686                          * that the (vp, off) won't change, so we are safe
1687                          * to reclaim it.  get_free_smp() knows that this
1688                          * can happen, and it will check the refcnt.
1689                          */
1690 
1691                         if (smp->sm_next != NULL) {
1692                                 struct sm_freeq *freeq;
1693 
1694                                 ASSERT(smp->sm_prev != NULL);
1695                                 sm = &smd_free[smp->sm_free_ndx];
1696 
1697                                 if (smp->sm_flags & SM_QNDX_ZERO)
1698                                         freeq = &sm->sm_freeq[0];
1699                                 else
1700                                         freeq = &sm->sm_freeq[1];
1701 
1702                                 mutex_enter(&freeq->smq_mtx);
1703                                 if (freeq->smq_free != smp) {
1704                                         /*
1705                                          * fastpath normal case
1706                                          */
1707                                         smp->sm_prev->sm_next = smp->sm_next;
1708                                         smp->sm_next->sm_prev = smp->sm_prev;
1709                                 } else if (smp == smp->sm_next) {
1710                                         /*
1711                                          * Taking the last smap on freelist
1712                                          */
1713                                         freeq->smq_free = NULL;
1714                                 } else {
1715                                         /*
1716                                          * Reclaiming 1st smap on list
1717                                          */
1718                                         freeq->smq_free = smp->sm_next;
1719                                         smp->sm_prev->sm_next = smp->sm_next;
1720                                         smp->sm_next->sm_prev = smp->sm_prev;
1721                                 }
1722                                 mutex_exit(&freeq->smq_mtx);
1723                                 smp->sm_prev = smp->sm_next = NULL;
1724                         } else {
1725                                 ASSERT(smp->sm_prev == NULL);
1726                                 segmapcnt.smp_stolen.value.ul++;
1727                         }
1728 
1729                 } else {
1730                         segmapcnt.smp_get_use.value.ul++;
1731                 }
1732                 smp->sm_refcnt++;            /* another user */
1733 
1734                 /*
1735                  * We don't invoke segmap_fault via TLB miss, so we set ref
1736                  * and mod bits in advance. For S_OTHER we set them in
1737                  * segmap_fault F_SOFTUNLOCK.
1738                  */
1739                 if (is_kpm) {
1740                         if (rw == S_WRITE) {
1741                                 smp->sm_flags |= SM_WRITE_DATA;
1742                         } else if (rw == S_READ) {
1743                                 smp->sm_flags |= SM_READ_DATA;
1744                         }
1745                 }
1746                 mutex_exit(smapmtx);
1747 
1748                 newslot = 0;
1749         } else {
1750 
1751                 uint32_t free_ndx, *free_ndxp;
1752                 union segmap_cpu *scpu;
1753 
1754                 /*
1755                  * On a PAC machine or a machine with anti-alias
1756                  * hardware, smd_colormsk will be zero.
1757                  *
1758                  * On a VAC machine, pick color by offset in the file
1759                  * so we won't get VAC conflicts on elf files.
1760                  * On data files, color does not matter but we
1761                  * don't know what kind of file it is so we always
1762                  * pick color by offset. This causes color
1763                  * corresponding to file offset zero to be used more
1764                  * heavily.
1765                  */
1766                 color = (baseoff >> MAXBSHIFT) & smd_colormsk;
1767                 scpu = smd_cpu+CPU->cpu_seqid;
1768                 free_ndxp = &scpu->scpu.scpu_free_ndx[color];
1769                 free_ndx = (*free_ndxp += smd_ncolor) & smd_freemsk;
1770 #ifdef DEBUG
1771                 colors_used[free_ndx]++;
1772 #endif /* DEBUG */
1773 
1774                 /*
1775                  * Get a locked smp slot from the free list.
1776                  */
1777                 smp = get_free_smp(free_ndx);
1778                 smapmtx = SMAPMTX(smp);
1779 
1780                 ASSERT(smp->sm_vp == NULL);
1781 
1782                 if ((nsmp = segmap_hashin(smp, vp, baseoff, hashid)) != NULL) {
1783                         /*
1784                          * Failed to hashin, there exists one now.
1785                          * Return the smp we just allocated.
1786                          */
1787                         segmap_smapadd(smp);
1788                         mutex_exit(smapmtx);
1789 
1790                         smp = nsmp;
1791                         goto vrfy_smp;
1792                 }
1793                 smp->sm_refcnt++;            /* another user */
1794 
1795                 /*
1796                  * We don't invoke segmap_fault via TLB miss, so we set ref
1797                  * and mod bits in advance. For S_OTHER we set them in
1798                  * segmap_fault F_SOFTUNLOCK.
1799                  */
1800                 if (is_kpm) {
1801                         if (rw == S_WRITE) {
1802                                 smp->sm_flags |= SM_WRITE_DATA;
1803                         } else if (rw == S_READ) {
1804                                 smp->sm_flags |= SM_READ_DATA;
1805                         }
1806                 }
1807                 mutex_exit(smapmtx);
1808 
1809                 newslot = 1;
1810         }
1811 
1812         if (!is_kpm)
1813                 goto use_segmap_range;
1814 
1815         /*
1816          * Use segkpm
1817          */
1818         /* Lint directive required until 6746211 is fixed */
1819         /*CONSTCOND*/
1820         ASSERT(PAGESIZE == MAXBSIZE);
1821 
1822         /*
1823          * remember the last smp faulted on this cpu.
1824          */
1825         (smd_cpu+CPU->cpu_seqid)->scpu.scpu_last_smap = smp;
1826 
1827         if (forcefault == SM_PAGECREATE) {
1828                 baseaddr = segmap_pagecreate_kpm(seg, vp, baseoff, smp, rw);
1829                 return (baseaddr);
1830         }
1831 
1832         if (newslot == 0 &&
1833             (pp = GET_KPME(smp)->kpe_page) != NULL) {
1834 
1835                 /* fastpath */
1836                 switch (rw) {
1837                 case S_READ:
1838                 case S_WRITE:
1839                         if (page_trylock(pp, SE_SHARED)) {
1840                                 if (PP_ISFREE(pp) ||
1841                                     !(pp->p_vnode == vp &&
1842                                     pp->p_offset == baseoff)) {
1843                                         page_unlock(pp);
1844                                         pp = page_lookup(vp, baseoff,
1845                                             SE_SHARED);
1846                                 }
1847                         } else {
1848                                 pp = page_lookup(vp, baseoff, SE_SHARED);
1849                         }
1850 
1851                         if (pp == NULL) {
1852                                 ASSERT(GET_KPME(smp)->kpe_page == NULL);
1853                                 break;
1854                         }
1855 
1856                         if (rw == S_WRITE &&
1857                             hat_page_getattr(pp, P_MOD | P_REF) !=
1858                             (P_MOD | P_REF)) {
1859                                 page_unlock(pp);
1860                                 break;
1861                         }
1862 
1863                         /*
1864                          * We have the p_selock as reader, grab_smp
1865                          * can't hit us, we have bumped the smap
1866                          * refcnt and hat_pageunload needs the
1867                          * p_selock exclusive.
1868                          */
1869                         kpme = GET_KPME(smp);
1870                         if (kpme->kpe_page == pp) {
1871                                 baseaddr = hat_kpm_page2va(pp, 0);
1872                         } else if (kpme->kpe_page == NULL) {
1873                                 baseaddr = hat_kpm_mapin(pp, kpme);
1874                         } else {
1875                                 panic("segmap_getmapflt: stale "
1876                                     "kpme page, kpme %p", (void *)kpme);
1877                                 /*NOTREACHED*/
1878                         }
1879 
1880                         /*
1881                          * We don't invoke segmap_fault via TLB miss,
1882                          * so we set ref and mod bits in advance.
1883                          * For S_OTHER we set them in segmap_fault
1884                          * F_SOFTUNLOCK.
1885                          */
1886                         if (rw == S_READ && !hat_isref(pp))
1887                                 hat_setref(pp);
1888 
1889                         return (baseaddr);
1890                 default:
1891                         break;
1892                 }
1893         }
1894 
1895         base = segkpm_create_va(baseoff);
1896         error = VOP_GETPAGE(vp, (offset_t)baseoff, len, &prot, pl, MAXBSIZE,
1897             seg, base, rw, CRED(), NULL);
1898 
1899         pp = pl[0];
1900         if (error || pp == NULL) {
1901                 /*
1902                  * Use segmap address slot and let segmap_fault deal
1903                  * with the error cases. There is no error return
1904                  * possible here.
1905                  */
1906                 goto use_segmap_range;
1907         }
1908 
1909         ASSERT(pl[1] == NULL);
1910 
1911         /*
1912          * When prot is not returned with PROT_ALL the returned pages
1913          * are not backed by fs blocks. For most of the segmap users
1914          * this is no problem: they don't write to the pages in the
1915          * same request and therefore don't rely on a following
1916          * trap-driven segmap_fault. With SM_LOCKPROTO users it is
1917          * safer to use segkmap addresses to allow protection
1918          * segmap_faults.
1919          */
1920         if (prot != PROT_ALL && forcefault == SM_LOCKPROTO) {
1921                 /*
1922                  * Use segmap address slot and let segmap_fault
1923                  * do the error return.
1924                  */
1925                 ASSERT(rw != S_WRITE);
1926                 ASSERT(PAGE_LOCKED(pp));
1927                 page_unlock(pp);
1928                 forcefault = 0;
1929                 goto use_segmap_range;
1930         }
1931 
1932         /*
1933          * We have the p_selock as reader, grab_smp can't hit us, we
1934          * have bumped the smap refcnt and hat_pageunload needs the
1935          * p_selock exclusive.
1936          */
1937         kpme = GET_KPME(smp);
1938         if (kpme->kpe_page == pp) {
1939                 baseaddr = hat_kpm_page2va(pp, 0);
1940         } else if (kpme->kpe_page == NULL) {
1941                 baseaddr = hat_kpm_mapin(pp, kpme);
1942         } else {
1943                 panic("segmap_getmapflt: stale kpme page after "
1944                     "VOP_GETPAGE, kpme %p", (void *)kpme);
1945                 /*NOTREACHED*/
1946         }
1947 
1948         smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
1949 
1950         return (baseaddr);
1951 
1952 
1953 use_segmap_range:
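             /*
              * Each smap slot maps a fixed MAXBSIZE window of the segment's
              * virtual range; derive the address from the slot index.
              */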
1954         baseaddr = seg->s_base + ((smp - smd_smap) * MAXBSIZE);
1955         TRACE_4(TR_FAC_VM, TR_SEGMAP_GETMAP,
1956             "segmap_getmap:seg %p addr %p vp %p offset %llx",
1957             seg, baseaddr, vp, baseoff);
1958 
1959         /*
1960          * Prefault the translations
1961          */
1962         vaddr = baseaddr + (off - baseoff);
1963         if (forcefault && (newslot || !hat_probe(kas.a_hat, vaddr))) {
1964 
1965                 caddr_t pgaddr = (caddr_t)((uintptr_t)vaddr &
1966                     (uintptr_t)PAGEMASK);
1967 
1968                 (void) segmap_fault(kas.a_hat, seg, pgaddr,
1969                     (vaddr + len - pgaddr + PAGESIZE - 1) & (uintptr_t)PAGEMASK,
1970                     F_INVAL, rw);
1971         }
1972 
1973         return (baseaddr);
1974 }
1975 
1976 int
1977 segmap_release(struct seg *seg, caddr_t addr, uint_t flags)
1978 {
1979         struct smap     *smp;
1980         int             error;
1981         int             bflags = 0;
1982         struct vnode    *vp;
1983         u_offset_t      offset;
1984         kmutex_t        *smtx;
1985         int             is_kpm = 0;
1986         page_t          *pp;
1987 
1988         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1989 
1990                 if (((uintptr_t)addr & MAXBOFFSET) != 0) {
1991                         panic("segmap_release: addr %p not "
1992                             "MAXBSIZE aligned", (void *)addr);
1993                         /*NOTREACHED*/
1994                 }
1995 
1996                 if ((smp = get_smap_kpm(addr, &pp)) == NULL) {
1997                         panic("segmap_release: smap not found "
1998                             "for addr %p", (void *)addr);
1999                         /*NOTREACHED*/
2000                 }
2001 
2002                 TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
2003                     "segmap_relmap:seg %p addr %p smp %p",
2004                     seg, addr, smp);
2005 
2006                 smtx = SMAPMTX(smp);
2007 
2008                 /*
2009                  * For compatibility reasons segmap_pagecreate_kpm sets this
2010                  * flag so that a following segmap_pagecreate can return
2011                  * it as the "newpage" indication. If segmap_pagecreate is
2012                  * never called at all, we clear the flag here.
2013                  */
2014                 smp->sm_flags &= ~SM_KPM_NEWPAGE;
2015                 is_kpm = 1;
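                     /*
                      * Apply the ref/mod bits recorded when the slot was
                      * handed out by segmap_getmapflt.
                      */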
2016                 if (smp->sm_flags & SM_WRITE_DATA) {
2017                         hat_setrefmod(pp);
2018                 } else if (smp->sm_flags & SM_READ_DATA) {
2019                         hat_setref(pp);
2020                 }
2021         } else {
2022                 if (addr < seg->s_base || addr >= seg->s_base + seg->s_size ||
2023                     ((uintptr_t)addr & MAXBOFFSET) != 0) {
2024                         panic("segmap_release: bad addr %p", (void *)addr);
2025                         /*NOTREACHED*/
2026                 }
2027                 smp = GET_SMAP(seg, addr);
2028 
2029                 TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
2030                     "segmap_relmap:seg %p addr %p smp %p",
2031                     seg, addr, smp);
2032 
2033                 smtx = SMAPMTX(smp);
2034                 mutex_enter(smtx);
2035                 smp->sm_flags |= SM_NOTKPM_RELEASED;
2036         }
2037 
2038         ASSERT(smp->sm_refcnt > 0);
2039 
2040         /*
2041          * Need to call VOP_PUTPAGE() if any flags (except SM_DONTNEED)
2042          * are set.
2043          */
2044         if ((flags & ~SM_DONTNEED) != 0) {
2045                 if (flags & SM_WRITE)
2046                         segmapcnt.smp_rel_write.value.ul++;
2047                 if (flags & SM_ASYNC) {
2048                         bflags |= B_ASYNC;
2049                         segmapcnt.smp_rel_async.value.ul++;
2050                 }
2051                 if (flags & SM_INVAL) {
2052                         bflags |= B_INVAL;
2053                         segmapcnt.smp_rel_abort.value.ul++;
2054                 }
2055                 if (flags & SM_DESTROY) {
2056                         bflags |= (B_INVAL|B_TRUNC);
2057                         segmapcnt.smp_rel_abort.value.ul++;
2058                 }
2059                 if (smp->sm_refcnt == 1) {
2060                         /*
2061                          * We only bother doing the FREE and DONTNEED flags
2062                          * if no one else is still referencing this mapping.
2063                          */
2064                         if (flags & SM_FREE) {
2065                                 bflags |= B_FREE;
2066                                 segmapcnt.smp_rel_free.value.ul++;
2067                         }
2068                         if (flags & SM_DONTNEED) {
2069                                 bflags |= B_DONTNEED;
2070                                 segmapcnt.smp_rel_dontneed.value.ul++;
2071                         }
2072                 }
2073         } else {
2074                 smd_cpu[CPU->cpu_seqid].scpu.scpu_release++;
2075         }
2076 
2077         vp = smp->sm_vp;
2078         offset = smp->sm_off;
2079 
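             /*
              * Drop our reference; the last release returns the slot to the
              * free list (after tearing down mappings for SM_INVAL/SM_DESTROY).
              */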
2080         if (--smp->sm_refcnt == 0) {
2081 
2082                 smp->sm_flags &= ~(SM_WRITE_DATA | SM_READ_DATA);
2083 
2084                 if (flags & (SM_INVAL|SM_DESTROY)) {
2085                         segmap_hashout(smp);    /* remove map info */
2086                         if (is_kpm) {
2087                                 hat_kpm_mapout(pp, GET_KPME(smp), addr);
2088                                 if (smp->sm_flags & SM_NOTKPM_RELEASED) {
2089                                         smp->sm_flags &= ~SM_NOTKPM_RELEASED;
2090                                         hat_unload(kas.a_hat, segkmap->s_base +
2091                                             ((smp - smd_smap) * MAXBSIZE),
2092                                             MAXBSIZE, HAT_UNLOAD);
2093                                 }
2094 
2095                         } else {
2096                                 if (segmap_kpm)
2097                                         segkpm_mapout_validkpme(GET_KPME(smp));
2098 
2099                                 smp->sm_flags &= ~SM_NOTKPM_RELEASED;
2100                                 hat_unload(kas.a_hat, addr, MAXBSIZE,
2101                                     HAT_UNLOAD);
2102                         }
2103                 }
2104                 segmap_smapadd(smp);    /* add to free list */
2105         }
2106 
2107         mutex_exit(smtx);
2108 
2109         if (is_kpm)
2110                 page_unlock(pp);
2111         /*
2112          * Now invoke VOP_PUTPAGE() if any flags (except SM_DONTNEED)
2113          * are set.
2114          */
2115         if ((flags & ~SM_DONTNEED) != 0) {
2116                 error = VOP_PUTPAGE(vp, offset, MAXBSIZE,
2117                     bflags, CRED(), NULL);
2118         } else {
2119                 error = 0;
2120         }
2121 
2122         return (error);
2123 }
2124 
2125 /*
2126  * Dump the pages belonging to this segmap segment.
2127  */
2128 static void
2129 segmap_dump(struct seg *seg)
2130 {
2131         struct segmap_data *smd;
2132         struct smap *smp, *smp_end;
2133         page_t *pp;
2134         pfn_t pfn;
2135         u_offset_t off;
2136         caddr_t addr;
2137 
2138         smd = (struct segmap_data *)seg->s_data;
2139         addr = seg->s_base;
2140         for (smp = smd->smd_sm, smp_end = smp + smd->smd_npages;
2141             smp < smp_end; smp++) {
2142 
2143                 if (smp->sm_refcnt) {
2144                         for (off = 0; off < MAXBSIZE; off += PAGESIZE) {
2145                                 int we_own_it = 0;
2146 
2147                                 /*
2148                                  * If page_lookup_nowait() returns NULL,
2149                                  * the page either does not exist or is
2150                                  * exclusively locked, so fall back to
2151                                  * page_exists() to see whether it exists.
2152                                  */
2153                                 if ((pp = page_lookup_nowait(smp->sm_vp,
2154                                     smp->sm_off + off, SE_SHARED)))
2155                                         we_own_it = 1;
2156                                 else
2157                                         pp = page_exists(smp->sm_vp,
2158                                             smp->sm_off + off);
2159 
2160                                 if (pp) {
2161                                         pfn = page_pptonum(pp);
2162                                         dump_addpage(seg->s_as,
2163                                             addr + off, pfn);
2164                                         if (we_own_it)
2165                                                 page_unlock(pp);
2166                                 }
2167                                 dump_timeleft = dump_timeout;
2168                         }
2169                 }
2170                 addr += MAXBSIZE;
2171         }
2172 }
2173 
2174 /*ARGSUSED*/
2175 static int
2176 segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
2177     struct page ***ppp, enum lock_type type, enum seg_rw rw)
2178 {
2179         return (ENOTSUP);
2180 }
2181 
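     /*
      * getmemid segment operation: return a memory id for the given address.
      */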
2182 static int
2183 segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
2184 {
2185         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
2186 
2187         memidp->val[0] = (uintptr_t)smd->smd_sm->sm_vp;
2188         memidp->val[1] = smd->smd_sm->sm_off + (uintptr_t)(addr - seg->s_base);
2189         return (0);
2190 }
2191 
2192 /*ARGSUSED*/
2193 static lgrp_mem_policy_info_t *
2194 segmap_getpolicy(struct seg *seg, caddr_t addr)
2195 {
2196         return (NULL);
2197 }
2198 
2199 /*ARGSUSED*/
2200 static int
2201 segmap_capable(struct seg *seg, segcapability_t capability)
2202 {
2203         return (0);
2204 }
2205 
2206 
2207 #ifdef  SEGKPM_SUPPORT
2208 
2209 /*
2210  * segkpm support routines
2211  */
2212 
2213 static caddr_t
2214 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
2215         struct smap *smp, enum seg_rw rw)
2216 {
2217         caddr_t base;
2218         page_t  *pp;
2219         int     newpage = 0;
2220         struct kpme     *kpme;
2221 
2222         ASSERT(smp->sm_refcnt > 0);
2223 
2224         if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
2225                 kmutex_t *smtx;
2226 
2227                 base = segkpm_create_va(off);
2228 
2229                 if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
2230                     seg, base)) == NULL) {
2231                         panic("segmap_pagecreate_kpm: "
2232                             "page_create failed");
2233                         /*NOTREACHED*/
2234                 }
2235 
2236                 newpage = 1;
2237                 page_io_unlock(pp);
2238                 ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
2239 
2240                 /*
2241                  * Mark this here until the following segmap_pagecreate
2242                  * or segmap_release.
2243                  */
2244                 smtx = SMAPMTX(smp);
2245                 mutex_enter(smtx);
2246                 smp->sm_flags |= SM_KPM_NEWPAGE;
2247                 mutex_exit(smtx);
2248         }
2249 
2250         kpme = GET_KPME(smp);
2251         if (!newpage && kpme->kpe_page == pp)
2252                 base = hat_kpm_page2va(pp, 0);
2253         else
2254                 base = hat_kpm_mapin(pp, kpme);
2255 
2256         /*
2257          * FS code may decide not to call segmap_pagecreate and we
2258          * don't invoke segmap_fault via TLB miss, so we have to set
2259          * ref and mod bits in advance.
2260          */
2261         if (rw == S_WRITE) {
2262                 hat_setrefmod(pp);
2263         } else {
2264                 ASSERT(rw == S_READ);
2265                 hat_setref(pp);
2266         }
2267 
2268         smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
2269 
2270         return (base);
2271 }
2272 
2273 /*
2274  * Find the smap structure corresponding to the
2275  * KPM addr and return it locked.
2276  */
2277 struct smap *
2278 get_smap_kpm(caddr_t addr, page_t **ppp)
2279 {
2280         struct smap     *smp;
2281         struct vnode    *vp;
2282         u_offset_t      offset;
2283         caddr_t         baseaddr = (caddr_t)((uintptr_t)addr & MAXBMASK);
2284         int             hashid;
2285         kmutex_t        *hashmtx;
2286         page_t          *pp;
2287         union segmap_cpu *scpu;
2288 
2289         pp = hat_kpm_vaddr2page(baseaddr);
2290 
2291         ASSERT(pp && !PP_ISFREE(pp));
2292         ASSERT(PAGE_LOCKED(pp));
2293         ASSERT(((uintptr_t)pp->p_offset & MAXBOFFSET) == 0);
2294 
2295         vp = pp->p_vnode;
2296         offset = pp->p_offset;
2297         ASSERT(vp != NULL);
2298 
2299         /*
2300          * Assume the last smap used on this cpu is the one needed.
2301          */
2302         scpu = smd_cpu+CPU->cpu_seqid;
2303         smp = scpu->scpu.scpu_last_smap;
2304         mutex_enter(&smp->sm_mtx);
2305         if (smp->sm_vp == vp && smp->sm_off == offset) {
2306                 ASSERT(smp->sm_refcnt > 0);
2307         } else {
2308                 /*
2309                  * Assumption wrong, find the smap on the hash chain.
2310                  */
2311                 mutex_exit(&smp->sm_mtx);
2312                 SMAP_HASHFUNC(vp, offset, hashid); /* macro assigns hashid */
2313                 hashmtx = SHASHMTX(hashid);
2314 
2315                 mutex_enter(hashmtx);
2316                 smp = smd_hash[hashid].sh_hash_list;
2317                 for (; smp != NULL; smp = smp->sm_hash) {
2318                         if (smp->sm_vp == vp && smp->sm_off == offset)
2319                                 break;
2320                 }
2321                 mutex_exit(hashmtx);
2322                 if (smp) {
2323                         mutex_enter(&smp->sm_mtx);
2324                         ASSERT(smp->sm_vp == vp && smp->sm_off == offset);
2325                 }
2326         }
2327 
2328         if (ppp)
2329                 *ppp = smp ? pp : NULL;
2330 
2331         return (smp);
2332 }
2333 
2334 #else   /* SEGKPM_SUPPORT */
2335 
2336 /* segkpm stubs */
2337 
2338 /*ARGSUSED*/
2339 static caddr_t
2340 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
2341         struct smap *smp, enum seg_rw rw)
2342 {
2343         return (NULL);
2344 }
2345 
2346 /*ARGSUSED*/
2347 struct smap *
2348 get_smap_kpm(caddr_t addr, page_t **ppp)
2349 {
2350         return (NULL);
2351 }
2352 
2353 #endif  /* SEGKPM_SUPPORT */