/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/*        All Rights Reserved   */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

/*
 * VM - generic vnode mapping segment.
 *
 * The segmap driver is used only by the kernel to get faster (than seg_vn)
 * mappings [lower routine overhead; more persistent cache] to random
 * vnode/offsets.  Note that the kernel may (and does) use seg_vn as well.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/mman.h>
#include <sys/errno.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/thread.h>
#include <sys/dumphdr.h>
#include <sys/bitmap.h>
#include <sys/lgrp.h>

#include <vm/seg_kmem.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kpm.h>
#include <vm/seg_map.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/rm.h>

/*
 * Private seg op routines.
 */
static void     segmap_free(struct seg *seg);
faultcode_t segmap_fault(struct hat *hat, struct seg *seg, caddr_t addr,
                        size_t len, enum fault_type type, enum seg_rw rw);
static faultcode_t segmap_faulta(struct seg *seg, caddr_t addr);
static int      segmap_checkprot(struct seg *seg, caddr_t addr, size_t len,
                        uint_t prot);
static int      segmap_kluster(struct seg *seg, caddr_t addr, ssize_t);
static int      segmap_getprot(struct seg *seg, caddr_t addr, size_t len,
                        uint_t *protv);
static u_offset_t       segmap_getoffset(struct seg *seg, caddr_t addr);
static int      segmap_gettype(struct seg *seg, caddr_t addr);
static int      segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
static void     segmap_dump(struct seg *seg);
static int      segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
                        struct page ***ppp, enum lock_type type,
                        enum seg_rw rw);
static void     segmap_badop(void);
static int      segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp);
static int      segmap_capable(struct seg *seg, segcapability_t capability);

/* segkpm support */
static caddr_t  segmap_pagecreate_kpm(struct seg *, vnode_t *, u_offset_t,
                        struct smap *, enum seg_rw);
struct smap     *get_smap_kpm(caddr_t, page_t **);

#define SEGMAP_BADOP(t) (t(*)())segmap_badop

static struct seg_ops segmap_ops = {
        .dup            = SEGMAP_BADOP(int),
        .unmap          = SEGMAP_BADOP(int),
        .free           = segmap_free,
        .fault          = segmap_fault,
        .faulta         = segmap_faulta,
        .setprot        = SEGMAP_BADOP(int),
        .checkprot      = segmap_checkprot,
        .kluster        = segmap_kluster,
        .swapout        = SEGMAP_BADOP(size_t),
        .sync           = SEGMAP_BADOP(int),
        .incore         = SEGMAP_BADOP(size_t),
        .lockop         = SEGMAP_BADOP(int),
        .getprot        = segmap_getprot,
        .getoffset      = segmap_getoffset,
        .gettype        = segmap_gettype,
        .getvp          = segmap_getvp,
        .advise         = SEGMAP_BADOP(int),
        .dump           = segmap_dump,
        .pagelock       = segmap_pagelock,
        .setpagesize    = SEGMAP_BADOP(int),
        .getmemid       = segmap_getmemid,
        .capable        = segmap_capable,
};

/*
 * Private segmap routines.
 */
static void     segmap_unlock(struct hat *hat, struct seg *seg, caddr_t addr,
                        size_t len, enum seg_rw rw, struct smap *smp);
static void     segmap_smapadd(struct smap *smp);
static struct smap *segmap_hashin(struct smap *smp, struct vnode *vp,
                        u_offset_t off, int hashid);
static void     segmap_hashout(struct smap *smp);


/*
 * Statistics for segmap operations.
 *
 * No explicit locking to protect these stats.
 */
struct segmapcnt segmapcnt = {
        { "fault",              KSTAT_DATA_ULONG },
        { "faulta",             KSTAT_DATA_ULONG },
        { "getmap",             KSTAT_DATA_ULONG },
        { "get_use",            KSTAT_DATA_ULONG },
        { "get_reclaim",        KSTAT_DATA_ULONG },
        { "get_reuse",          KSTAT_DATA_ULONG },
        { "get_unused",         KSTAT_DATA_ULONG },
        { "get_nofree",         KSTAT_DATA_ULONG },
        { "rel_async",          KSTAT_DATA_ULONG },
        { "rel_write",          KSTAT_DATA_ULONG },
        { "rel_free",           KSTAT_DATA_ULONG },
        { "rel_abort",          KSTAT_DATA_ULONG },
        { "rel_dontneed",       KSTAT_DATA_ULONG },
        { "release",            KSTAT_DATA_ULONG },
        { "pagecreate",         KSTAT_DATA_ULONG },
        { "free_notfree",       KSTAT_DATA_ULONG },
        { "free_dirty",         KSTAT_DATA_ULONG },
        { "free",               KSTAT_DATA_ULONG },
        { "stolen",             KSTAT_DATA_ULONG },
        { "get_nomtx",          KSTAT_DATA_ULONG }
};

kstat_named_t *segmapcnt_ptr = (kstat_named_t *)&segmapcnt;
uint_t segmapcnt_ndata = sizeof (segmapcnt) / sizeof (kstat_named_t);

/*
 * Return number of map pages in segment.
 */
#define MAP_PAGES(seg)          ((seg)->s_size >> MAXBSHIFT)

/*
 * Translate addr into smap number within segment.
 */
#define MAP_PAGE(seg, addr)  (((addr) - (seg)->s_base) >> MAXBSHIFT)

/*
 * Translate addr in seg into struct smap pointer.
 */
#define GET_SMAP(seg, addr)     \
        &(((struct segmap_data *)((seg)->s_data))->smd_sm[MAP_PAGE(seg, addr)])

/*
 * Bit in map (16 bit bitmap).
 */
#define SMAP_BIT_MASK(bitindex) (1 << ((bitindex) & 0xf))
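/*
 * Illustrative example: the bit for page index 2 within a MAXBSIZE
 * window is SMAP_BIT_MASK(2) == 0x4. Only the low four bits of the
 * index are used, since the bitmap is 16 bits wide.
 */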

static int smd_colormsk = 0;
static int smd_ncolor = 0;
static int smd_nfree = 0;
static int smd_freemsk = 0;
#ifdef DEBUG
static int *colors_used;
#endif
static struct smap *smd_smap;
static struct smaphash *smd_hash;
#ifdef SEGMAP_HASHSTATS
static unsigned int *smd_hash_len;
#endif
static struct smfree *smd_free;
static ulong_t smd_hashmsk = 0;

#define SEGMAP_MAXCOLOR         2
#define SEGMAP_CACHE_PAD        64

union segmap_cpu {
        struct {
                uint32_t        scpu_free_ndx[SEGMAP_MAXCOLOR];
                struct smap     *scpu_last_smap;
                ulong_t         scpu_getmap;
                ulong_t         scpu_release;
                ulong_t         scpu_get_reclaim;
                ulong_t         scpu_fault;
                ulong_t         scpu_pagecreate;
                ulong_t         scpu_get_reuse;
        } scpu;
        char    scpu_pad[SEGMAP_CACHE_PAD];
};
static union segmap_cpu *smd_cpu;
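/*
 * The pad sizes each entry to SEGMAP_CACHE_PAD bytes so that the hot
 * counters of different cpus do not share a cache line (64 bytes is
 * assumed here to cover the common coherence granule). Counters are
 * bumped locklessly on the local cpu, as in segmap_fault() below:
 *
 *      smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
 */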

/*
 * There are three locks in seg_map:
 *      - per freelist mutexes
 *      - per hashchain mutexes
 *      - per smap mutexes
 *
 * The lock ordering is to get the smap mutex to lock down the slot
 * first, then the hash lock (for hash in/out (vp, off) list) or the
 * freelist lock to put the slot back on the free list.
 *
 * The hash search is done while holding only the hashchain lock.
 * When a wanted slot is found, we drop the hashchain lock and then
 * lock the slot, so there is no overlapping of hashchain and smap
 * locks. After the slot is locked, we verify again that it is still
 * what we are looking for.
 *
 * Allocation of a free slot is done by holding the freelist lock,
 * then locking the smap slot at the head of the freelist. This is
 * in reversed lock order, so mutex_tryenter() is used.
 *
 * The smap lock protects all fields in the smap structure except for
 * the link fields for the hash/free lists, which are protected by
 * the hashchain and freelist locks.
 */
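/*
 * Illustrative sketch (not compiled) of a reclaim under this
 * ordering, matching the helpers below:
 *
 *      mutex_enter(SMAPMTX(smp));      smap lock taken first
 *      segmap_hashout(smp);            takes/drops the hashchain lock
 *      segmap_smapadd(smp);            takes/drops the freelist lock
 *      mutex_exit(SMAPMTX(smp));
 */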

#define SHASHMTX(hashid)        (&smd_hash[hashid].sh_mtx)

#define SMP2SMF(smp)            (&smd_free[(smp - smd_smap) & smd_freemsk])
#define SMP2SMF_NDX(smp)        (ushort_t)((smp - smd_smap) & smd_freemsk)

#define SMAPMTX(smp) (&smp->sm_mtx)

#define SMAP_HASHFUNC(vp, off, hashid) \
        { \
        hashid = ((((uintptr_t)(vp) >> 6) + ((uintptr_t)(vp) >> 3) + \
                ((off) >> MAXBSHIFT)) & smd_hashmsk); \
        }
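/*
 * Typical lookup pattern (sketch, mirroring segmap_hashin/hashout
 * below):
 *
 *      int hashid;
 *      SMAP_HASHFUNC(vp, off, hashid);         macro assigns hashid
 *      mutex_enter(SHASHMTX(hashid));
 *      ...walk smd_hash[hashid].sh_hash_list via sm_hash links...
 *      mutex_exit(SHASHMTX(hashid));
 */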

/*
 * The most frequently updated kstat counters are kept in the
 * per cpu array to avoid hot cache blocks. The update function
 * sums the cpu local counters to update the global counters.
 */

/* ARGSUSED */
int
segmap_kstat_update(kstat_t *ksp, int rw)
{
        int i;
        ulong_t getmap, release, get_reclaim;
        ulong_t fault, pagecreate, get_reuse;

        if (rw == KSTAT_WRITE)
                return (EACCES);
        getmap = release = get_reclaim = (ulong_t)0;
        fault = pagecreate = get_reuse = (ulong_t)0;
        for (i = 0; i < max_ncpus; i++) {
                getmap += smd_cpu[i].scpu.scpu_getmap;
                release += smd_cpu[i].scpu.scpu_release;
                get_reclaim += smd_cpu[i].scpu.scpu_get_reclaim;
                fault += smd_cpu[i].scpu.scpu_fault;
                pagecreate += smd_cpu[i].scpu.scpu_pagecreate;
                get_reuse += smd_cpu[i].scpu.scpu_get_reuse;
        }
        segmapcnt.smp_getmap.value.ul = getmap;
        segmapcnt.smp_release.value.ul = release;
        segmapcnt.smp_get_reclaim.value.ul = get_reclaim;
        segmapcnt.smp_fault.value.ul = fault;
        segmapcnt.smp_pagecreate.value.ul = pagecreate;
        segmapcnt.smp_get_reuse.value.ul = get_reuse;
        return (0);
}
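/*
 * A minimal sketch of how this update function would be hooked up;
 * the actual kstat registration lives elsewhere, and the exact
 * module/class strings here are illustrative assumptions:
 *
 *      kstat_t *ksp = kstat_create("unix", 0, "segmap", "vm",
 *          KSTAT_TYPE_NAMED, segmapcnt_ndata, KSTAT_FLAG_VIRTUAL);
 *      if (ksp != NULL) {
 *              ksp->ks_data = (void *)segmapcnt_ptr;
 *              ksp->ks_update = segmap_kstat_update;
 *              kstat_install(ksp);
 *      }
 */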

int
segmap_create(struct seg *seg, void *argsp)
{
        struct segmap_data *smd;
        struct smap *smp;
        struct smfree *sm;
        struct segmap_crargs *a = (struct segmap_crargs *)argsp;
        struct smaphash *shashp;
        union segmap_cpu *scpu;
        long i, npages;
        size_t hashsz;
        uint_t nfreelist;
        extern void prefetch_smap_w(void *);
        extern int max_ncpus;

        ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));

        if (((uintptr_t)seg->s_base | seg->s_size) & MAXBOFFSET) {
                panic("segkmap not MAXBSIZE aligned");
                /*NOTREACHED*/
        }

        smd = kmem_zalloc(sizeof (struct segmap_data), KM_SLEEP);

        seg->s_data = (void *)smd;
        seg->s_ops = &segmap_ops;
        smd->smd_prot = a->prot;

        /*
         * Scale the number of smap freelists to be
         * proportional to max_ncpus * number of virtual colors.
         * The caller can over-ride this scaling by providing
         * a non-zero a->nfreelist argument.
         */
        nfreelist = a->nfreelist;
        if (nfreelist == 0)
                nfreelist = max_ncpus;
        else if (nfreelist > 4 * max_ncpus) {
                /* nfreelist is unsigned, so only the upper bound is checked */
                cmn_err(CE_WARN, "segmap_create: nfreelist out of range "
                    "%d, using %d", nfreelist, max_ncpus);
                nfreelist = max_ncpus;
        }
        if (!ISP2(nfreelist)) {
                /* round up nfreelist to the next power of two. */
                nfreelist = 1 << (highbit(nfreelist));
        }

        /*
         * Get the number of virtual colors - must be a power of 2.
         */
        if (a->shmsize)
                smd_ncolor = a->shmsize >> MAXBSHIFT;
        else
                smd_ncolor = 1;
        ASSERT((smd_ncolor & (smd_ncolor - 1)) == 0);
        ASSERT(smd_ncolor <= SEGMAP_MAXCOLOR);
        smd_colormsk = smd_ncolor - 1;
        smd->smd_nfree = smd_nfree = smd_ncolor * nfreelist;
        smd_freemsk = smd_nfree - 1;

        /*
         * Allocate and initialize the freelist headers.
         * Note that sm_freeq[1] starts out as the release queue. This
         * is known when the smap structures are initialized below.
         */
        smd_free = smd->smd_free =
            kmem_zalloc(smd_nfree * sizeof (struct smfree), KM_SLEEP);
        for (i = 0; i < smd_nfree; i++) {
                sm = &smd->smd_free[i];
                mutex_init(&sm->sm_freeq[0].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
                mutex_init(&sm->sm_freeq[1].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
                sm->sm_allocq = &sm->sm_freeq[0];
                sm->sm_releq = &sm->sm_freeq[1];
        }

        /*
         * Allocate and initialize the smap hash chain headers.
         * Compute hash size rounding down to the next power of two.
         */
        npages = MAP_PAGES(seg);
        smd->smd_npages = npages;
        hashsz = npages / SMAP_HASHAVELEN;
        hashsz = 1 << (highbit(hashsz) - 1);
        smd_hashmsk = hashsz - 1;
        smd_hash = smd->smd_hash =
            kmem_alloc(hashsz * sizeof (struct smaphash), KM_SLEEP);
#ifdef SEGMAP_HASHSTATS
        smd_hash_len =
            kmem_zalloc(hashsz * sizeof (unsigned int), KM_SLEEP);
#endif
        for (i = 0, shashp = smd_hash; i < hashsz; i++, shashp++) {
                shashp->sh_hash_list = NULL;
                mutex_init(&shashp->sh_mtx, NULL, MUTEX_DEFAULT, NULL);
        }

        /*
         * Allocate and initialize the smap structures.
         * Link all slots onto the appropriate freelist.
         * The smap array is large enough to affect boot time
         * on large systems, so use memory prefetching and only
         * go through the array once. Inline an optimized version
         * of segmap_smapadd to add structures to freelists with
         * knowledge that no locks are needed here.
         */
        smd_smap = smd->smd_sm =
            kmem_alloc(sizeof (struct smap) * npages, KM_SLEEP);

        for (smp = &smd->smd_sm[MAP_PAGES(seg) - 1];
            smp >= smd->smd_sm; smp--) {
                struct smap *smpfreelist;
                struct sm_freeq *releq;

                prefetch_smap_w((char *)smp);

                smp->sm_vp = NULL;
                smp->sm_hash = NULL;
                smp->sm_off = 0;
                smp->sm_bitmap = 0;
                smp->sm_refcnt = 0;
                mutex_init(&smp->sm_mtx, NULL, MUTEX_DEFAULT, NULL);
                smp->sm_free_ndx = SMP2SMF_NDX(smp);

                sm = SMP2SMF(smp);
                releq = sm->sm_releq;

                smpfreelist = releq->smq_free;
                if (smpfreelist == NULL) {
                        releq->smq_free = smp->sm_next = smp->sm_prev = smp;
                } else {
                        smp->sm_next = smpfreelist;
                        smp->sm_prev = smpfreelist->sm_prev;
                        smpfreelist->sm_prev = smp;
                        smp->sm_prev->sm_next = smp;
                        releq->smq_free = smp->sm_next;
                }

                /*
                 * sm_flag = 0 (no SM_QNDX_ZERO) implies smap on sm_freeq[1]
                 */
                smp->sm_flags = 0;

#ifdef  SEGKPM_SUPPORT
                /*
                 * Due to the fragile prefetch loop no
                 * separate function is used here.
                 */
                smp->sm_kpme_next = NULL;
                smp->sm_kpme_prev = NULL;
                smp->sm_kpme_page = NULL;
#endif
        }

        /*
         * Allocate the per color indices that distribute allocation
         * requests over the free lists. Each cpu will have a private
         * rotor index to spread the allocations evenly across the
         * available smap freelists. Init the scpu_last_smap field to
         * the first smap element so there is no need to check for NULL.
         */
        smd_cpu =
            kmem_zalloc(sizeof (union segmap_cpu) * max_ncpus, KM_SLEEP);
        for (i = 0, scpu = smd_cpu; i < max_ncpus; i++, scpu++) {
                int j;
                for (j = 0; j < smd_ncolor; j++)
                        scpu->scpu.scpu_free_ndx[j] = j;
                scpu->scpu.scpu_last_smap = smd_smap;
        }

        vpm_init();

#ifdef DEBUG
        /*
         * Keep track of which colors are used more often.
         */
        colors_used = kmem_zalloc(smd_nfree * sizeof (int), KM_SLEEP);
#endif /* DEBUG */

        return (0);
}

static void
segmap_free(struct seg *seg)
{
        ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
}

/*
 * Do a F_SOFTUNLOCK call over the range requested.
 * The range must have already been F_SOFTLOCK'ed.
 */
static void
segmap_unlock(
        struct hat *hat,
        struct seg *seg,
        caddr_t addr,
        size_t len,
        enum seg_rw rw,
        struct smap *smp)
{
        page_t *pp;
        caddr_t adr;
        u_offset_t off;
        struct vnode *vp;
        kmutex_t *smtx;

        ASSERT(smp->sm_refcnt > 0);

#ifdef lint
        seg = seg;
#endif

        if (segmap_kpm && IS_KPM_ADDR(addr)) {

                /*
                 * We're called only from segmap_fault, and this was a
                 * NOP in case of a kpm based smap, so dangerous things
                 * must have happened in the meantime. Pages are prefaulted
                 * and locked in segmap_getmapflt and they will not be
                 * unlocked until segmap_release.
                 */
                panic("segmap_unlock: called with kpm addr %p", (void *)addr);
                /*NOTREACHED*/
        }

        vp = smp->sm_vp;
        off = smp->sm_off + (u_offset_t)((uintptr_t)addr & MAXBOFFSET);

        hat_unlock(hat, addr, P2ROUNDUP(len, PAGESIZE));
        for (adr = addr; adr < addr + len; adr += PAGESIZE, off += PAGESIZE) {
                ushort_t bitmask;

                /*
                 * Use page_find() instead of page_lookup() to
                 * find the page since we know that it has a
                 * "shared" lock.
                 */
                pp = page_find(vp, off);
                if (pp == NULL) {
                        panic("segmap_unlock: page not found");
                        /*NOTREACHED*/
                }

                if (rw == S_WRITE) {
                        hat_setrefmod(pp);
                } else if (rw != S_OTHER) {
                        TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
                            "segmap_fault:pp %p vp %p offset %llx",
                            pp, vp, off);
                        hat_setref(pp);
                }

                /*
                 * Clear the bit in the bitmap corresponding to "off",
                 * if it is set, since the page and translation are
                 * being unlocked.
                 */
                bitmask = SMAP_BIT_MASK((off - smp->sm_off) >> PAGESHIFT);

                /*
                 * Large Files: The following assertion verifies that
                 * narrowing (off - sm_off) to an int above is safe.
                 */
                ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
                smtx = SMAPMTX(smp);
                mutex_enter(smtx);
                if (smp->sm_bitmap & bitmask) {
                        smp->sm_bitmap &= ~bitmask;
                }
                mutex_exit(smtx);

                page_unlock(pp);
        }
}

#define MAXPPB  (MAXBSIZE/4096) /* assumes minimum page size of 4k */

/*
 * This routine is called via a machine specific fault handling
 * routine.  It is also called by software routines wishing to
 * lock or unlock a range of addresses.
 *
 * Note that this routine expects a page-aligned "addr".
 */
faultcode_t
segmap_fault(
        struct hat *hat,
        struct seg *seg,
        caddr_t addr,
        size_t len,
        enum fault_type type,
        enum seg_rw rw)
{
        struct segmap_data *smd = (struct segmap_data *)seg->s_data;
        struct smap *smp;
        page_t *pp, **ppp;
        struct vnode *vp;
        u_offset_t off;
        page_t *pl[MAXPPB + 1];
        uint_t prot;
        u_offset_t addroff;
        caddr_t adr;
        int err;
        u_offset_t sm_off;
        int hat_flag;

        if (segmap_kpm && IS_KPM_ADDR(addr)) {
                int newpage;
                kmutex_t *smtx;

                /*
                 * Pages are successfully prefaulted and locked in
                 * segmap_getmapflt and can't be unlocked until
                 * segmap_release. No hat mappings have to be locked
                 * and they also can't be unlocked as long as the
                 * caller owns an active kpm addr.
                 */
#ifndef DEBUG
                if (type != F_SOFTUNLOCK)
                        return (0);
#endif

                if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
                        panic("segmap_fault: smap not found "
                            "for addr %p", (void *)addr);
                        /*NOTREACHED*/
                }

                smtx = SMAPMTX(smp);
#ifdef  DEBUG
                newpage = smp->sm_flags & SM_KPM_NEWPAGE;
                if (newpage) {
                        cmn_err(CE_WARN, "segmap_fault: newpage? smp %p",
                            (void *)smp);
                }

                if (type != F_SOFTUNLOCK) {
                        mutex_exit(smtx);
                        return (0);
                }
#endif
                mutex_exit(smtx);
                vp = smp->sm_vp;
                sm_off = smp->sm_off;

                if (vp == NULL)
                        return (FC_MAKE_ERR(EIO));

                ASSERT(smp->sm_refcnt > 0);

                addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
                if (addroff + len > MAXBSIZE)
                        panic("segmap_fault: endaddr %p exceeds MAXBSIZE chunk",
                            (void *)(addr + len));

                off = sm_off + addroff;

                pp = page_find(vp, off);

                if (pp == NULL)
                        panic("segmap_fault: softunlock page not found");

                /*
                 * Set the ref bit also here in case of S_OTHER to avoid the
                 * overhead of supporting other cases than F_SOFTUNLOCK
                 * with segkpm. We can do this because the underlying
                 * pages are locked anyway.
                 */
                if (rw == S_WRITE) {
                        hat_setrefmod(pp);
                } else {
                        TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
                            "segmap_fault:pp %p vp %p offset %llx",
                            pp, vp, off);
                        hat_setref(pp);
                }

                return (0);
        }

        smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
        smp = GET_SMAP(seg, addr);
        vp = smp->sm_vp;
        sm_off = smp->sm_off;

        if (vp == NULL)
                return (FC_MAKE_ERR(EIO));

        ASSERT(smp->sm_refcnt > 0);

        addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
        if (addroff + len > MAXBSIZE) {
                panic("segmap_fault: endaddr %p "
                    "exceeds MAXBSIZE chunk", (void *)(addr + len));
                /*NOTREACHED*/
        }
        off = sm_off + addroff;

        /*
         * First handle the easy stuff
         */
        if (type == F_SOFTUNLOCK) {
                segmap_unlock(hat, seg, addr, len, rw, smp);
                return (0);
        }

        TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
            "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
        err = VOP_GETPAGE(vp, (offset_t)off, len, &prot, pl, MAXBSIZE,
            seg, addr, rw, CRED(), NULL);

        if (err)
                return (FC_MAKE_ERR(err));

        prot &= smd->smd_prot;

        /*
         * Handle all pages returned in the pl[] array.
         * This loop is coded on the assumption that if
         * there was no error from the VOP_GETPAGE routine,
         * the page list returned will contain all the
         * pages needed for the vp from [off..off + len].
         */
        ppp = pl;
        while ((pp = *ppp++) != NULL) {
                u_offset_t poff;
                ASSERT(pp->p_vnode == vp);
                hat_flag = HAT_LOAD;

                /*
                 * Verify that the pages returned are within the range
                 * of this segmap region.  Note that it is theoretically
                 * possible for pages outside this range to be returned,
                 * but it is not very likely.  If we cannot use the
                 * page here, just release it and go on to the next one.
                 */
                if (pp->p_offset < sm_off ||
                    pp->p_offset >= sm_off + MAXBSIZE) {
                        (void) page_release(pp, 1);
                        continue;
                }

                ASSERT(hat == kas.a_hat);
                poff = pp->p_offset;
                adr = addr + (poff - off);
                if (adr >= addr && adr < addr + len) {
                        hat_setref(pp);
                        TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
                            "segmap_fault:pp %p vp %p offset %llx",
                            pp, vp, poff);
                        if (type == F_SOFTLOCK)
                                hat_flag = HAT_LOAD_LOCK;
                }

                /*
                 * Deal with VMODSORT pages here. If we know this is a write
                 * do the setmod now and allow write protection.
                 * As long as it's modified or not S_OTHER, remove write
                 * protection. With S_OTHER it's up to the FS to deal with this.
                 */
                if (IS_VMODSORT(vp)) {
                        if (rw == S_WRITE)
                                hat_setmod(pp);
                        else if (rw != S_OTHER && !hat_ismod(pp))
                                prot &= ~PROT_WRITE;
                }

                hat_memload(hat, adr, pp, prot, hat_flag);
                if (hat_flag != HAT_LOAD_LOCK)
                        page_unlock(pp);
        }
        return (0);
}
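/*
 * Sketch of a typical software client of the non-kpm path above
 * (illustrative, not taken from this file): soft-lock a window,
 * access the data, then soft-unlock it:
 *
 *      if (segmap_fault(kas.a_hat, segkmap, addr, len,
 *          F_SOFTLOCK, S_READ) == 0) {
 *              ...access the data in [addr, addr + len)...
 *              (void) segmap_fault(kas.a_hat, segkmap, addr, len,
 *                  F_SOFTUNLOCK, S_READ);
 *      }
 */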

/*
 * This routine is used to start I/O on pages asynchronously.
 */
static faultcode_t
segmap_faulta(struct seg *seg, caddr_t addr)
{
        struct smap *smp;
        struct vnode *vp;
        u_offset_t off;
        int err;

        if (segmap_kpm && IS_KPM_ADDR(addr)) {
                int     newpage;
                kmutex_t *smtx;

                /*
                 * Pages are successfully prefaulted and locked in
                 * segmap_getmapflt and can't be unlocked until
                 * segmap_release. No hat mappings have to be locked
                 * and they also can't be unlocked as long as the
                 * caller owns an active kpm addr.
                 */
#ifdef  DEBUG
                if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
                        panic("segmap_faulta: smap not found "
                            "for addr %p", (void *)addr);
                        /*NOTREACHED*/
                }

                smtx = SMAPMTX(smp);
                newpage = smp->sm_flags & SM_KPM_NEWPAGE;
                mutex_exit(smtx);
                if (newpage)
                        cmn_err(CE_WARN, "segmap_faulta: newpage? smp %p",
                            (void *)smp);
#endif
                return (0);
        }

        segmapcnt.smp_faulta.value.ul++;
        smp = GET_SMAP(seg, addr);

        ASSERT(smp->sm_refcnt > 0);

        vp = smp->sm_vp;
        off = smp->sm_off;

        if (vp == NULL) {
                cmn_err(CE_WARN, "segmap_faulta - no vp");
                return (FC_MAKE_ERR(EIO));
        }

        TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
            "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);

        err = VOP_GETPAGE(vp, (offset_t)(off + ((offset_t)((uintptr_t)addr
            & MAXBOFFSET))), PAGESIZE, (uint_t *)NULL, (page_t **)NULL, 0,
            seg, addr, S_READ, CRED(), NULL);

        if (err)
                return (FC_MAKE_ERR(err));
        return (0);
}

/*ARGSUSED*/
static int
segmap_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
{
        struct segmap_data *smd = (struct segmap_data *)seg->s_data;

        ASSERT(seg->s_as && RW_LOCK_HELD(&seg->s_as->a_lock));

        /*
         * Need not acquire the segment lock since
         * "smd_prot" is a read-only field.
         */
        return (((smd->smd_prot & prot) != prot) ? EACCES : 0);
}

static int
segmap_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
{
        struct segmap_data *smd = (struct segmap_data *)seg->s_data;
        size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;

        ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

        if (pgno != 0) {
                do {
                        protv[--pgno] = smd->smd_prot;
                } while (pgno != 0);
        }
        return (0);
}

static u_offset_t
segmap_getoffset(struct seg *seg, caddr_t addr)
{
        struct segmap_data *smd = (struct segmap_data *)seg->s_data;

        ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));

        return ((u_offset_t)smd->smd_sm->sm_off + (addr - seg->s_base));
}

/*ARGSUSED*/
static int
segmap_gettype(struct seg *seg, caddr_t addr)
{
        ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));

        return (MAP_SHARED);
}

/*ARGSUSED*/
static int
segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
{
        struct segmap_data *smd = (struct segmap_data *)seg->s_data;

        ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));

        /* XXX - This doesn't make any sense */
        *vpp = smd->smd_sm->sm_vp;
        return (0);
}

/*
 * Check to see if it makes sense to do kluster/read ahead to
 * addr + delta relative to the mapping at addr.  We assume here
 * that delta is a signed PAGESIZE'd multiple (which can be negative).
 *
 * For segmap we always "approve" of this action from our standpoint.
 */
/*ARGSUSED*/
static int
segmap_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
{
        return (0);
}

static void
segmap_badop(void)
{
        panic("segmap_badop");
        /*NOTREACHED*/
}

/*
 * Special private segmap operations
 */

/*
 * Add smap to the appropriate free list.
 */
static void
segmap_smapadd(struct smap *smp)
{
        struct smfree *sm;
        struct smap *smpfreelist;
        struct sm_freeq *releq;

        ASSERT(MUTEX_HELD(SMAPMTX(smp)));

        if (smp->sm_refcnt != 0) {
                panic("segmap_smapadd");
                /*NOTREACHED*/
        }

        sm = &smd_free[smp->sm_free_ndx];
        /*
         * Add to the tail of the release queue.
         * Note that sm_releq and sm_allocq could toggle
         * before we get the lock. This does not affect
         * correctness as the 2 queues are only maintained
         * to reduce lock pressure.
         */
        releq = sm->sm_releq;
        if (releq == &sm->sm_freeq[0])
                smp->sm_flags |= SM_QNDX_ZERO;
        else
                smp->sm_flags &= ~SM_QNDX_ZERO;
        mutex_enter(&releq->smq_mtx);
        smpfreelist = releq->smq_free;
        if (smpfreelist == NULL) {
                int want;

                releq->smq_free = smp->sm_next = smp->sm_prev = smp;
                /*
                 * Both queue mutexes are held to set sm_want;
                 * snapshot the value before dropping the releq mutex.
                 * If sm_want appears after the releq mutex is dropped,
                 * then the smap just freed is already gone.
                 */
                want = sm->sm_want;
                mutex_exit(&releq->smq_mtx);
                /*
                 * See if there was a waiter before dropping the releq
                 * mutex, then recheck after obtaining the sm_freeq[0]
                 * mutex, as another thread may have already signaled.
                 */
                if (want) {
                        mutex_enter(&sm->sm_freeq[0].smq_mtx);
                        if (sm->sm_want)
                                cv_signal(&sm->sm_free_cv);
                        mutex_exit(&sm->sm_freeq[0].smq_mtx);
                }
        } else {
                smp->sm_next = smpfreelist;
                smp->sm_prev = smpfreelist->sm_prev;
                smpfreelist->sm_prev = smp;
                smp->sm_prev->sm_next = smp;
                mutex_exit(&releq->smq_mtx);
        }
}

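/*
 * Insert smp into the hash chain for (vp, off), unless an entry with
 * that tag already exists. Returns NULL when smp was hashed in, or a
 * pointer to the pre-existing smap, in which case smp is left
 * untouched and the caller must handle the collision.
 */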
static struct smap *
segmap_hashin(struct smap *smp, struct vnode *vp, u_offset_t off, int hashid)
{
        struct smap **hpp;
        struct smap *tmp;
        kmutex_t *hmtx;

        ASSERT(MUTEX_HELD(SMAPMTX(smp)));
        ASSERT(smp->sm_vp == NULL);
        ASSERT(smp->sm_hash == NULL);
        ASSERT(smp->sm_prev == NULL);
        ASSERT(smp->sm_next == NULL);
        ASSERT(hashid >= 0 && hashid <= smd_hashmsk);

        hmtx = SHASHMTX(hashid);

        mutex_enter(hmtx);
        /*
         * First we need to verify that no one has created a smp
         * with (vp, off) as its tag before us.
         */
        for (tmp = smd_hash[hashid].sh_hash_list;
            tmp != NULL; tmp = tmp->sm_hash)
                if (tmp->sm_vp == vp && tmp->sm_off == off)
                        break;

        if (tmp == NULL) {
                /*
                 * No one created one yet.
                 *
                 * Funniness here - we don't increment the ref count on the
                 * vnode even though we have another pointer to it here.
                 * The reason for this is that we don't want the fact that
                 * a seg_map entry somewhere refers to a vnode to prevent the
                 * vnode itself from going away.  This is because this
                 * reference to the vnode is a "soft one".  In the case where
                 * a mapping is being used by a rdwr [or directory routine?]
                 * there already has to be a non-zero ref count on the vnode.
                 * In the case where the vp has been freed and the smap
                 * structure is on the free list, there are no pages in memory
                 * that can refer to the vnode.  Thus even if we reuse the same
                 * vnode/smap structure for a vnode which has the same
                 * address but represents a different object, we are ok.
                 */
                smp->sm_vp = vp;
                smp->sm_off = off;

                hpp = &smd_hash[hashid].sh_hash_list;
                smp->sm_hash = *hpp;
                *hpp = smp;
#ifdef SEGMAP_HASHSTATS
                smd_hash_len[hashid]++;
#endif
        }
        mutex_exit(hmtx);

        return (tmp);
}

static void
segmap_hashout(struct smap *smp)
{
        struct smap **hpp, *hp;
        struct vnode *vp;
        kmutex_t *mtx;
        int hashid;
        u_offset_t off;

        ASSERT(MUTEX_HELD(SMAPMTX(smp)));

        vp = smp->sm_vp;
        off = smp->sm_off;

        SMAP_HASHFUNC(vp, off, hashid); /* macro assigns hashid */
        mtx = SHASHMTX(hashid);
        mutex_enter(mtx);

        hpp = &smd_hash[hashid].sh_hash_list;
        for (;;) {
                hp = *hpp;
                if (hp == NULL) {
                        panic("segmap_hashout");
                        /*NOTREACHED*/
                }
                if (hp == smp)
                        break;
                hpp = &hp->sm_hash;
        }

        *hpp = smp->sm_hash;
        smp->sm_hash = NULL;
#ifdef SEGMAP_HASHSTATS
        smd_hash_len[hashid]--;
#endif
        mutex_exit(mtx);

        smp->sm_vp = NULL;
        smp->sm_off = (u_offset_t)0;
}

/*
 * Attempt to free unmodified, unmapped, and unlocked segmap
 * pages.
 */
void
segmap_pagefree(struct vnode *vp, u_offset_t off)
{
        u_offset_t pgoff;
        page_t  *pp;

        for (pgoff = off; pgoff < off + MAXBSIZE; pgoff += PAGESIZE) {

                if ((pp = page_lookup_nowait(vp, pgoff, SE_EXCL)) == NULL)
                        continue;

                switch (page_release(pp, 1)) {
                case PGREL_NOTREL:
                        segmapcnt.smp_free_notfree.value.ul++;
                        break;
                case PGREL_MOD:
                        segmapcnt.smp_free_dirty.value.ul++;
                        break;
                case PGREL_CLEAN:
                        segmapcnt.smp_free.value.ul++;
                        break;
                }
        }
}

/*
 * Locks held on entry: smap lock.
 * Locks held on exit:  smap lock.
 */
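/*
 * grab_smp() tears down the old identity of a slot so that it can be
 * retagged: the caller has already taken smp off its freelist; here
 * the old (vp, off) association is hashed out, any hardware
 * translations (kpm and non-kpm) are unloaded, and the underlying
 * pages are freed if possible via segmap_pagefree().
 */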

static void
grab_smp(struct smap *smp, page_t *pp)
{
        ASSERT(MUTEX_HELD(SMAPMTX(smp)));
        ASSERT(smp->sm_refcnt == 0);

        if (smp->sm_vp != (struct vnode *)NULL) {
                struct vnode    *vp = smp->sm_vp;
                u_offset_t      off = smp->sm_off;
                /*
                 * Destroy old vnode association and
                 * unload any hardware translations to
                 * the old object.
                 */
                smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reuse++;
                segmap_hashout(smp);

                /*
                 * This node is off freelist and hashlist,
                 * so there is no reason to drop/reacquire sm_mtx
                 * across calls to hat_unload.
                 */
                if (segmap_kpm) {
                        caddr_t vaddr;
                        int hat_unload_needed = 0;

                        /*
                         * unload kpm mapping
                         */
                        if (pp != NULL) {
                                vaddr = hat_kpm_page2va(pp, 1);
                                hat_kpm_mapout(pp, GET_KPME(smp), vaddr);
                                page_unlock(pp);
                        }

                        /*
                         * Check if we have (also) the rare case of a
                         * non kpm mapping.
                         */
                        if (smp->sm_flags & SM_NOTKPM_RELEASED) {
                                hat_unload_needed = 1;
                                smp->sm_flags &= ~SM_NOTKPM_RELEASED;
                        }

                        if (hat_unload_needed) {
                                hat_unload(kas.a_hat, segkmap->s_base +
                                    ((smp - smd_smap) * MAXBSIZE),
                                    MAXBSIZE, HAT_UNLOAD);
                        }

                } else {
                        ASSERT(smp->sm_flags & SM_NOTKPM_RELEASED);
                        smp->sm_flags &= ~SM_NOTKPM_RELEASED;
                        hat_unload(kas.a_hat, segkmap->s_base +
                            ((smp - smd_smap) * MAXBSIZE),
                            MAXBSIZE, HAT_UNLOAD);
                }
                segmap_pagefree(vp, off);
        }
}

static struct smap *
get_free_smp(int free_ndx)
{
        struct smfree *sm;
        kmutex_t *smtx;
        struct smap *smp, *first;
        struct sm_freeq *allocq, *releq;
        struct kpme *kpme;
        page_t *pp = NULL;
        int end_ndx, page_locked = 0;

        end_ndx = free_ndx;
        sm = &smd_free[free_ndx];

retry_queue:
        allocq = sm->sm_allocq;
        mutex_enter(&allocq->smq_mtx);

        if ((smp = allocq->smq_free) == NULL) {

skip_queue:
                /*
                 * The alloc list is empty or this queue is being skipped;
                 * first see if the allocq toggled.
                 */
                if (sm->sm_allocq != allocq) {
                        /* queue changed */
                        mutex_exit(&allocq->smq_mtx);
                        goto retry_queue;
                }
                releq = sm->sm_releq;
                if (!mutex_tryenter(&releq->smq_mtx)) {
                        /* cannot get releq; a free smp may be there now */
                        mutex_exit(&allocq->smq_mtx);

                        /*
                         * This loop could spin forever if this thread has
                         * higher priority than the thread that is holding
                         * releq->smq_mtx. In order to force the other thread
                         * to run, we'll lock/unlock the mutex which is safe
                         * since we just unlocked the allocq mutex.
                         */
                        mutex_enter(&releq->smq_mtx);
                        mutex_exit(&releq->smq_mtx);
                        goto retry_queue;
                }
                if (releq->smq_free == NULL) {
                        /*
                         * This freelist is empty.
                         * This should not happen unless clients
                         * are failing to release the segmap
                         * window after accessing the data.
                         * Before resorting to sleeping, try
                         * the next list of the same color.
                         */
                        free_ndx = (free_ndx + smd_ncolor) & smd_freemsk;
                        if (free_ndx != end_ndx) {
                                mutex_exit(&releq->smq_mtx);
                                mutex_exit(&allocq->smq_mtx);
                                sm = &smd_free[free_ndx];
                                goto retry_queue;
                        }
                        /*
                         * Tried all freelists of the same color once,
                         * wait on this list and hope something gets freed.
                         */
                        segmapcnt.smp_get_nofree.value.ul++;
                        sm->sm_want++;
                        mutex_exit(&sm->sm_freeq[1].smq_mtx);
                        cv_wait(&sm->sm_free_cv,
                            &sm->sm_freeq[0].smq_mtx);
                        sm->sm_want--;
                        mutex_exit(&sm->sm_freeq[0].smq_mtx);
                        sm = &smd_free[free_ndx];
                        goto retry_queue;
                } else {
                        /*
                         * Something on the rele queue; flip the alloc
                         * and rele queues and retry.
                         */
                        sm->sm_allocq = releq;
                        sm->sm_releq = allocq;
                        mutex_exit(&allocq->smq_mtx);
                        mutex_exit(&releq->smq_mtx);
                        if (page_locked) {
                                delay(hz >> 2);
                                page_locked = 0;
                        }
                        goto retry_queue;
                }
        } else {
                /*
                 * Fastpath the case we get the smap mutex
                 * on the first try.
                 */
                first = smp;
next_smap:
                smtx = SMAPMTX(smp);
                if (!mutex_tryenter(smtx)) {
                        /*
                         * Another thread is trying to reclaim this slot.
                         * Skip to the next queue or smap.
                         */
                        if ((smp = smp->sm_next) == first) {
                                goto skip_queue;
                        } else {
                                goto next_smap;
                        }
                } else {
                        /*
                         * if kpme exists, get shared lock on the page
                         */
                        if (segmap_kpm && smp->sm_vp != NULL) {

                                kpme = GET_KPME(smp);
                                pp = kpme->kpe_page;

                                if (pp != NULL) {
                                        if (!page_trylock(pp, SE_SHARED)) {
                                                smp = smp->sm_next;
                                                mutex_exit(smtx);
                                                page_locked = 1;

                                                pp = NULL;

                                                if (smp == first) {
                                                        goto skip_queue;
                                                } else {
                                                        goto next_smap;
                                                }
                                        } else {
                                                if (kpme->kpe_page == NULL) {
                                                        page_unlock(pp);
                                                        pp = NULL;
                                                }
                                        }
                                }
                        }

                        /*
                         * At this point, we've selected smp.  Remove smp
                         * from its freelist.  If smp is the first one in
                         * the freelist, update the head of the freelist.
                         */
                        if (first == smp) {
                                ASSERT(first == allocq->smq_free);
                                allocq->smq_free = smp->sm_next;
                        }

                        /*
                         * if the head of the freelist still points to smp,
                         * then there are no more free smaps in that list.
                         */
                        if (allocq->smq_free == smp)
                                /*
                                 * Took the last one
                                 */
                                allocq->smq_free = NULL;
                        else {
                                smp->sm_prev->sm_next = smp->sm_next;
                                smp->sm_next->sm_prev = smp->sm_prev;
                        }
                        mutex_exit(&allocq->smq_mtx);
                        smp->sm_prev = smp->sm_next = NULL;

                        /*
                         * if pp != NULL, pp must have been locked;
                         * grab_smp() unlocks pp.
                         */
                        ASSERT((pp == NULL) || PAGE_LOCKED(pp));
                        grab_smp(smp, pp);
                        /* return smp locked. */
                        ASSERT(SMAPMTX(smp) == smtx);
                        ASSERT(MUTEX_HELD(smtx));
                        return (smp);
                }
        }
}
1358 
1359 /*
1360  * Special public segmap operations
1361  */
1362 
1363 /*
1364  * Create pages (without using VOP_GETPAGE) and load up translations to them.
1365  * If softlock is TRUE, then set things up so that it looks like a call
1366  * to segmap_fault with F_SOFTLOCK.
1367  *
1368  * Returns 1, if a page is created by calling page_create_va(), or 0 otherwise.
1369  *
1370  * All fields in the generic segment (struct seg) are considered to be
1371  * read-only for "segmap" even though the kernel address space (kas) may
1372  * not be locked, hence no lock is needed to access them.
1373  */
1374 int
1375 segmap_pagecreate(struct seg *seg, caddr_t addr, size_t len, int softlock)
1376 {
1377         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
1378         page_t *pp;
1379         u_offset_t off;
1380         struct smap *smp;
1381         struct vnode *vp;
1382         caddr_t eaddr;
1383         int newpage = 0;
1384         uint_t prot;
1385         kmutex_t *smtx;
1386         int hat_flag;
1387 
1388         ASSERT(seg->s_as == &kas);
1389 
1390         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1391                 /*
1392                  * Pages are successfully prefaulted and locked in
1393                  * segmap_getmapflt and can't be unlocked until
1394                  * segmap_release. The SM_KPM_NEWPAGE flag is set
1395                  * in segmap_pagecreate_kpm when new pages are created.
1396                  * and it is returned as "newpage" indication here.
1397                  */
1398                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
1399                         panic("segmap_pagecreate: smap not found "
1400                             "for addr %p", (void *)addr);
1401                         /*NOTREACHED*/
1402                 }
1403 
1404                 smtx = SMAPMTX(smp);
1405                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
1406                 smp->sm_flags &= ~SM_KPM_NEWPAGE;
1407                 mutex_exit(smtx);
1408 
1409                 return (newpage);
1410         }
1411 
1412         smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
1413 
1414         eaddr = addr + len;
1415         addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1416 
1417         smp = GET_SMAP(seg, addr);
1418 
1419         /*
1420          * We don't grab smp mutex here since we assume the smp
1421          * has a refcnt set already which prevents the slot from
1422          * changing its id.
1423          */
1424         ASSERT(smp->sm_refcnt > 0);
1425 
1426         vp = smp->sm_vp;
1427         off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
1428         prot = smd->smd_prot;
1429 
1430         for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
1431                 hat_flag = HAT_LOAD;
1432                 pp = page_lookup(vp, off, SE_SHARED);
1433                 if (pp == NULL) {
1434                         ushort_t bitindex;
1435 
1436                         if ((pp = page_create_va(vp, off,
1437                             PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
1438                                 panic("segmap_pagecreate: page_create failed");
1439                                 /*NOTREACHED*/
1440                         }
1441                         newpage = 1;
1442                         page_io_unlock(pp);
1443 
1444                         /*
1445                          * Pages created here do not contain valid data
1446                          * until the caller writes into them, so the
1447                          * "exclusive" lock is kept to prevent other
1448                          * users from accessing the page.  We also have
1449                          * to lock the translation so that no fault can
1450                          * occur when the virtual address mapped by this
1451                          * page is written into, since taking such a
1452                          * fault while we still hold the "exclusive"
1453                          * lock would deadlock.
1454                          */
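                        /*
                         * Record which page of the slot now holds a
                         * locked translation.  For example (assuming
                         * hypothetical sizes of 8K MAXBSIZE and 4K
                         * pages), the second page of a slot has
                         * off - sm_off == 0x1000 and thus bitindex 1.
                         */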
1455                         bitindex = (ushort_t)((off - smp->sm_off) >> PAGESHIFT);
1456 
1457                         /*
1458                          * Large Files: The following assertion is to
1459                          * verify the cast above.
1460                          */
1461                         ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
1462                         smtx = SMAPMTX(smp);
1463                         mutex_enter(smtx);
1464                         smp->sm_bitmap |= SMAP_BIT_MASK(bitindex);
1465                         mutex_exit(smtx);
1466 
1467                         hat_flag = HAT_LOAD_LOCK;
1468                 } else if (softlock) {
1469                         hat_flag = HAT_LOAD_LOCK;
1470                 }
1471 
1472                 if (IS_VMODSORT(pp->p_vnode) && (prot & PROT_WRITE))
1473                         hat_setmod(pp);
1474 
1475                 hat_memload(kas.a_hat, addr, pp, prot, hat_flag);
1476 
1477                 if (hat_flag != HAT_LOAD_LOCK)
1478                         page_unlock(pp);
1479 
1480                 TRACE_5(TR_FAC_VM, TR_SEGMAP_PAGECREATE,
1481                     "segmap_pagecreate:seg %p addr %p pp %p vp %p offset %llx",
1482                     seg, addr, pp, vp, off);
1483         }
1484 
1485         return (newpage);
1486 }
1487 
1488 void
1489 segmap_pageunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
1490 {
1491         struct smap     *smp;
1492         ushort_t        bitmask;
1493         page_t          *pp;
1494         struct  vnode   *vp;
1495         u_offset_t      off;
1496         caddr_t         eaddr;
1497         kmutex_t        *smtx;
1498 
1499         ASSERT(seg->s_as == &kas);
1500 
1501         eaddr = addr + len;
1502         addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1503 
1504         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1505                 /*
1506                  * Pages are successfully prefaulted and locked in
1507                  * segmap_getmapflt and can't be unlocked until
1508                  * segmap_release, so no pages or hat mappings have
1509                  * to be unlocked at this point.
1510                  */
1511 #ifdef DEBUG
1512                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
1513                         panic("segmap_pageunlock: smap not found "
1514                             "for addr %p", (void *)addr);
1515                         /*NOTREACHED*/
1516                 }
1517 
1518                 ASSERT(smp->sm_refcnt > 0);
1519                 mutex_exit(SMAPMTX(smp));
1520 #endif
1521                 return;
1522         }
1523 
1524         smp = GET_SMAP(seg, addr);
1525         smtx = SMAPMTX(smp);
1526 
1527         ASSERT(smp->sm_refcnt > 0);
1528 
1529         vp = smp->sm_vp;
1530         off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
1531 
1532         for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
1533                 bitmask = SMAP_BIT_MASK((int)(off - smp->sm_off) >> PAGESHIFT);
1534 
1535                 /*
1536                  * Large Files: The following assertion verifies
1537                  * the correctness of the cast to (int) above.
1538                  */
1539                 ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
1540 
1541                 /*
1542                  * If the bit corresponding to "off" is set,
1543                  * clear this bit in the bitmap, unlock translations,
1544                  * and release the "exclusive" lock on the page.
1545                  */
1546                 if (smp->sm_bitmap & bitmask) {
1547                         mutex_enter(smtx);
1548                         smp->sm_bitmap &= ~bitmask;
1549                         mutex_exit(smtx);
1550 
1551                         hat_unlock(kas.a_hat, addr, PAGESIZE);
1552 
1553                         /*
1554                          * Use page_find() instead of page_lookup() to
1555                          * find the page since we know that it has
1556                          * the "exclusive" lock.
1557                          */
1558                         pp = page_find(vp, off);
1559                         if (pp == NULL) {
1560                                 panic("segmap_pageunlock: page not found");
1561                                 /*NOTREACHED*/
1562                         }
1563                         if (rw == S_WRITE) {
1564                                 hat_setrefmod(pp);
1565                         } else if (rw != S_OTHER) {
1566                                 hat_setref(pp);
1567                         }
1568 
1569                         page_unlock(pp);
1570                 }
1571         }
1572 }
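/*
 * Illustrative sketch (not part of the original file): a typical file
 * system write path pairs the two routines above with
 * segmap_getmapflt() and segmap_release().  When whole pages will be
 * overwritten, segmap_pagecreate() avoids the VOP_GETPAGE read, and
 * segmap_pageunlock() drops the "exclusive" page locks once the data
 * has been copied in.  The function name and the SEGMAP_EXAMPLE guard
 * are hypothetical; the interfaces called are the real ones.  It
 * assumes [off, off + n) lies within a single MAXBSIZE block.
 */
#ifdef SEGMAP_EXAMPLE
static int
segmap_write_sketch(struct vnode *vp, struct uio *uio, u_offset_t off,
    size_t n, int pagecreate)
{
        caddr_t base;
        int error;

        /* Map the MAXBSIZE slot covering [off, off + n). */
        base = segmap_getmapflt(segkmap, vp, off, n, !pagecreate, S_WRITE);

        /* Whole pages will be overwritten: create them without reading. */
        if (pagecreate)
                (void) segmap_pagecreate(segkmap,
                    base + (off & MAXBOFFSET), n, 0);

        /* Copy the caller's data into the mapping. */
        error = uiomove(base + (off & MAXBOFFSET), n, UIO_WRITE, uio);

        /* Drop the locks segmap_pagecreate left behind. */
        if (pagecreate)
                segmap_pageunlock(segkmap,
                    base + (off & MAXBOFFSET), n, S_WRITE);

        /* Write the block back, or toss it on error. */
        if (error == 0)
                error = segmap_release(segkmap, base, SM_WRITE);
        else
                (void) segmap_release(segkmap, base, SM_INVAL);
        return (error);
}
#endif  /* SEGMAP_EXAMPLE */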
1573 
1574 caddr_t
1575 segmap_getmap(struct seg *seg, struct vnode *vp, u_offset_t off)
1576 {
1577         return (segmap_getmapflt(seg, vp, off, MAXBSIZE, 0, S_OTHER));
1578 }
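/*
 * Illustrative sketch (not part of the original file) of the common
 * read path built on the segmap window: map the block, copy it out,
 * release.  The name segmap_read_sketch and the SEGMAP_EXAMPLE guard
 * are hypothetical; it assumes [off, off + n) lies within a single
 * MAXBSIZE block.
 */
#ifdef SEGMAP_EXAMPLE
static int
segmap_read_sketch(struct vnode *vp, struct uio *uio, u_offset_t off,
    size_t n)
{
        caddr_t base;
        int error;

        /* segmap_getmap() requires a MAXBSIZE aligned offset. */
        base = segmap_getmap(segkmap, vp, off & (offset_t)MAXBMASK);
        error = uiomove(base + (off & MAXBOFFSET), n, UIO_READ, uio);
        (void) segmap_release(segkmap, base, 0);
        return (error);
}
#endif  /* SEGMAP_EXAMPLE */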
1579 
1580 /*
1581  * This is the magic virtual address that offset 0 of an ELF
1582  * file gets mapped to in user space. This is used to pick
1583  * the vac color on the freelist.
1584  */
1585 #define ELF_OFFZERO_VA  (0x10000)
1586 /*
1587  * segmap_getmapflt allocates a MAXBSIZE slot to map the vnode vp
1588  * in the range [off, off + len). off doesn't need to be MAXBSIZE aligned.
1589  * The returned address is always MAXBSIZE aligned.
1590  *
1591  * If forcefault is nonzero and the MMU translations haven't yet been created,
1592  * segmap_getmapflt will call segmap_fault(..., F_INVAL, rw) to create them.
1593  */
1594 caddr_t
1595 segmap_getmapflt(
1596         struct seg *seg,
1597         struct vnode *vp,
1598         u_offset_t off,
1599         size_t len,
1600         int forcefault,
1601         enum seg_rw rw)
1602 {
1603         struct smap *smp, *nsmp;
1604         extern struct vnode *common_specvp();
1605         caddr_t baseaddr;                       /* MAXBSIZE aligned */
1606         u_offset_t baseoff;
1607         int newslot;
1608         caddr_t vaddr;
1609         int color, hashid;
1610         kmutex_t *hashmtx, *smapmtx;
1611         struct smfree *sm;
1612         page_t  *pp;
1613         struct kpme *kpme;
1614         uint_t  prot;
1615         caddr_t base;
1616         page_t  *pl[MAXPPB + 1];
1617         int     error;
1618         int     is_kpm = 1;
1619 
1620         ASSERT(seg->s_as == &kas);
1621         ASSERT(seg == segkmap);
1622 
1623         baseoff = off & (offset_t)MAXBMASK;
1624         if (off + len > baseoff + MAXBSIZE) {
1625                 panic("segmap_getmap bad len");
1626                 /*NOTREACHED*/
1627         }
1628 
1629         /*
1630          * If this is a block device we have to be sure to use the
1631          * "common" block device vnode for the mapping.
1632          */
1633         if (vp->v_type == VBLK)
1634                 vp = common_specvp(vp);
1635 
1636         smd_cpu[CPU->cpu_seqid].scpu.scpu_getmap++;
1637 
1638         if (segmap_kpm == 0 ||
1639             (forcefault == SM_PAGECREATE && rw != S_WRITE)) {
1640                 is_kpm = 0;
1641         }
1642 
1643         SMAP_HASHFUNC(vp, off, hashid); /* macro assigns hashid */
1644         hashmtx = SHASHMTX(hashid);
1645 
1646 retry_hash:
1647         mutex_enter(hashmtx);
1648         for (smp = smd_hash[hashid].sh_hash_list;
1649             smp != NULL; smp = smp->sm_hash)
1650                 if (smp->sm_vp == vp && smp->sm_off == baseoff)
1651                         break;
1652         mutex_exit(hashmtx);
1653 
1654 vrfy_smp:
1655         if (smp != NULL) {
1656 
1657                 ASSERT(vp->v_count != 0);
1658 
1659                 /*
1660                  * Get smap lock and recheck its tag. The hash lock
1661                  * is dropped since the hash is based on (vp, off)
1662                  * and (vp, off) won't change when we have smap mtx.
1663                  */
1664                 smapmtx = SMAPMTX(smp);
1665                 mutex_enter(smapmtx);
1666                 if (smp->sm_vp != vp || smp->sm_off != baseoff) {
1667                         mutex_exit(smapmtx);
1668                         goto retry_hash;
1669                 }
1670 
1671                 if (smp->sm_refcnt == 0) {
1672 
1673                         smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reclaim++;
1674 
1675                         /*
1676                          * Could still be on the free list. However, this
1677                          * could also be an smp that is transitioning from
1678                          * the free list when we have too much contention
1679                          * for the smapmtx's. In this case, we have an
1680                          * unlocked smp that is not on the free list any
1681                          * longer, but still has a 0 refcnt.  The only way
1682                          * to be sure is to check the freelist pointers.
1683                          * Since we now have the smapmtx, we are guaranteed
1684                          * that the (vp, off) won't change, so we are safe
1685                          * to reclaim it.  get_free_smp() knows that this
1686                          * can happen, and it will check the refcnt.
1687                          */
1688 
1689                         if (smp->sm_next != NULL) {
1690                                 struct sm_freeq *freeq;
1691 
1692                                 ASSERT(smp->sm_prev != NULL);
1693                                 sm = &smd_free[smp->sm_free_ndx];
1694 
1695                                 if (smp->sm_flags & SM_QNDX_ZERO)
1696                                         freeq = &sm->sm_freeq[0];
1697                                 else
1698                                         freeq = &sm->sm_freeq[1];
1699 
1700                                 mutex_enter(&freeq->smq_mtx);
1701                                 if (freeq->smq_free != smp) {
1702                                         /*
1703                                          * fastpath normal case
1704                                          */
1705                                         smp->sm_prev->sm_next = smp->sm_next;
1706                                         smp->sm_next->sm_prev = smp->sm_prev;
1707                                 } else if (smp == smp->sm_next) {
1708                                         /*
1709                                          * Taking the last smap on freelist
1710                                          */
1711                                         freeq->smq_free = NULL;
1712                                 } else {
1713                                         /*
1714                                          * Reclaiming 1st smap on list
1715                                          */
1716                                         freeq->smq_free = smp->sm_next;
1717                                         smp->sm_prev->sm_next = smp->sm_next;
1718                                         smp->sm_next->sm_prev = smp->sm_prev;
1719                                 }
1720                                 mutex_exit(&freeq->smq_mtx);
1721                                 smp->sm_prev = smp->sm_next = NULL;
1722                         } else {
1723                                 ASSERT(smp->sm_prev == NULL);
1724                                 segmapcnt.smp_stolen.value.ul++;
1725                         }
1726 
1727                 } else {
1728                         segmapcnt.smp_get_use.value.ul++;
1729                 }
1730                 smp->sm_refcnt++;            /* another user */
1731 
1732                 /*
1733                  * We don't invoke segmap_fault via TLB miss, so we set ref
1734                  * and mod bits in advance. For S_OTHER we set them in
1735                  * segmap_fault F_SOFTUNLOCK.
1736                  */
1737                 if (is_kpm) {
1738                         if (rw == S_WRITE) {
1739                                 smp->sm_flags |= SM_WRITE_DATA;
1740                         } else if (rw == S_READ) {
1741                                 smp->sm_flags |= SM_READ_DATA;
1742                         }
1743                 }
1744                 mutex_exit(smapmtx);
1745 
1746                 newslot = 0;
1747         } else {
1748 
1749                 uint32_t free_ndx, *free_ndxp;
1750                 union segmap_cpu *scpu;
1751 
1752                 /*
1753                  * On a PAC machine or a machine with anti-alias
1754                  * hardware, smd_colormsk will be zero.
1755                  *
1756                  * On a VAC machine, pick the color by offset in the
1757                  * file so we won't get VAC conflicts on elf files.
1758                  * On data files color does not matter, but we don't
1759                  * know what kind of file this is, so we always pick
1760                  * the color by offset. This causes the color
1761                  * corresponding to file offset zero to be used more
1762                  * heavily.
1763                  */
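                /*
                 * For example (hypothetical sizes): with 8K MAXBSIZE
                 * and smd_colormsk == 3, baseoffs of 0x0000, 0x2000,
                 * 0x4000 and 0x6000 map to colors 0, 1, 2 and 3 and
                 * then repeat; on a PAC machine smd_colormsk == 0 and
                 * every offset maps to color 0.
                 */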
1764                 color = (baseoff >> MAXBSHIFT) & smd_colormsk;
1765                 scpu = smd_cpu+CPU->cpu_seqid;
1766                 free_ndxp = &scpu->scpu.scpu_free_ndx[color];
1767                 free_ndx = (*free_ndxp += smd_ncolor) & smd_freemsk;
1768 #ifdef DEBUG
1769                 colors_used[free_ndx]++;
1770 #endif /* DEBUG */
1771 
1772                 /*
1773                  * Get a locked smp slot from the free list.
1774                  */
1775                 smp = get_free_smp(free_ndx);
1776                 smapmtx = SMAPMTX(smp);
1777 
1778                 ASSERT(smp->sm_vp == NULL);
1779 
1780                 if ((nsmp = segmap_hashin(smp, vp, baseoff, hashid)) != NULL) {
1781                         /*
1782                          * Failed to hashin; an smap for (vp, off) already
1783                          * exists.  Put ours back and use the existing one.
1784                          */
1785                         segmap_smapadd(smp);
1786                         mutex_exit(smapmtx);
1787 
1788                         smp = nsmp;
1789                         goto vrfy_smp;
1790                 }
1791                 smp->sm_refcnt++;            /* another user */
1792 
1793                 /*
1794                  * We don't invoke segmap_fault via TLB miss, so we set ref
1795                  * and mod bits in advance. For S_OTHER we set them in
1796                  * segmap_fault F_SOFTUNLOCK.
1797                  */
1798                 if (is_kpm) {
1799                         if (rw == S_WRITE) {
1800                                 smp->sm_flags |= SM_WRITE_DATA;
1801                         } else if (rw == S_READ) {
1802                                 smp->sm_flags |= SM_READ_DATA;
1803                         }
1804                 }
1805                 mutex_exit(smapmtx);
1806 
1807                 newslot = 1;
1808         }
1809 
1810         if (!is_kpm)
1811                 goto use_segmap_range;
1812 
1813         /*
1814          * Use segkpm; this path requires PAGESIZE == MAXBSIZE.
1815          */
1816         /* Lint directive required until 6746211 is fixed */
1817         /*CONSTCOND*/
1818         ASSERT(PAGESIZE == MAXBSIZE);
1819 
1820         /*
1821          * remember the last smp faulted on this cpu.
1822          */
1823         (smd_cpu+CPU->cpu_seqid)->scpu.scpu_last_smap = smp;
1824 
1825         if (forcefault == SM_PAGECREATE) {
1826                 baseaddr = segmap_pagecreate_kpm(seg, vp, baseoff, smp, rw);
1827                 return (baseaddr);
1828         }
1829 
1830         if (newslot == 0 &&
1831             (pp = GET_KPME(smp)->kpe_page) != NULL) {
1832 
1833                 /* fastpath */
1834                 switch (rw) {
1835                 case S_READ:
1836                 case S_WRITE:
1837                         if (page_trylock(pp, SE_SHARED)) {
1838                                 if (PP_ISFREE(pp) ||
1839                                     !(pp->p_vnode == vp &&
1840                                     pp->p_offset == baseoff)) {
1841                                         page_unlock(pp);
1842                                         pp = page_lookup(vp, baseoff,
1843                                             SE_SHARED);
1844                                 }
1845                         } else {
1846                                 pp = page_lookup(vp, baseoff, SE_SHARED);
1847                         }
1848 
1849                         if (pp == NULL) {
1850                                 ASSERT(GET_KPME(smp)->kpe_page == NULL);
1851                                 break;
1852                         }
1853 
1854                         if (rw == S_WRITE &&
1855                             hat_page_getattr(pp, P_MOD | P_REF) !=
1856                             (P_MOD | P_REF)) {
1857                                 page_unlock(pp);
1858                                 break;
1859                         }
1860 
1861                         /*
1862                          * We have the p_selock as reader, grab_smp
1863                          * can't hit us, we have bumped the smap
1864                          * refcnt and hat_pageunload needs the
1865                          * p_selock exclusive.
1866                          */
1867                         kpme = GET_KPME(smp);
1868                         if (kpme->kpe_page == pp) {
1869                                 baseaddr = hat_kpm_page2va(pp, 0);
1870                         } else if (kpme->kpe_page == NULL) {
1871                                 baseaddr = hat_kpm_mapin(pp, kpme);
1872                         } else {
1873                                 panic("segmap_getmapflt: stale "
1874                                     "kpme page, kpme %p", (void *)kpme);
1875                                 /*NOTREACHED*/
1876                         }
1877 
1878                         /*
1879                          * We don't invoke segmap_fault via TLB miss,
1880                          * so we set ref and mod bits in advance.
1881                          * For S_OTHER we set them in segmap_fault
1882                          * F_SOFTUNLOCK.
1883                          */
1884                         if (rw == S_READ && !hat_isref(pp))
1885                                 hat_setref(pp);
1886 
1887                         return (baseaddr);
1888                 default:
1889                         break;
1890                 }
1891         }
1892 
1893         base = segkpm_create_va(baseoff);
1894         error = VOP_GETPAGE(vp, (offset_t)baseoff, len, &prot, pl, MAXBSIZE,
1895             seg, base, rw, CRED(), NULL);
1896 
1897         pp = pl[0];
1898         if (error || pp == NULL) {
1899                 /*
1900                  * Use segmap address slot and let segmap_fault deal
1901                  * with the error cases. There is no error return
1902                  * possible here.
1903                  */
1904                 goto use_segmap_range;
1905         }
1906 
1907         ASSERT(pl[1] == NULL);
1908 
1909         /*
1910          * When prot is not returned with PROT_ALL, the returned
1911          * pages are not backed by fs blocks. For most segmap users
1912          * this is no problem: they don't write to the pages in the
1913          * same request and therefore don't rely on a following
1914          * trap-driven segmap_fault. For SM_LOCKPROTO users it is
1915          * safer to use segkmap addresses so that protection faults
1916          * can be handled by segmap_fault.
1917          */
1918         if (prot != PROT_ALL && forcefault == SM_LOCKPROTO) {
1919                 /*
1920                  * Use segmap address slot and let segmap_fault
1921                  * do the error return.
1922                  */
1923                 ASSERT(rw != S_WRITE);
1924                 ASSERT(PAGE_LOCKED(pp));
1925                 page_unlock(pp);
1926                 forcefault = 0;
1927                 goto use_segmap_range;
1928         }
1929 
1930         /*
1931          * We have the p_selock as reader, grab_smp can't hit us, we
1932          * have bumped the smap refcnt and hat_pageunload needs the
1933          * p_selock exclusive.
1934          */
1935         kpme = GET_KPME(smp);
1936         if (kpme->kpe_page == pp) {
1937                 baseaddr = hat_kpm_page2va(pp, 0);
1938         } else if (kpme->kpe_page == NULL) {
1939                 baseaddr = hat_kpm_mapin(pp, kpme);
1940         } else {
1941                 panic("segmap_getmapflt: stale kpme page after "
1942                     "VOP_GETPAGE, kpme %p", (void *)kpme);
1943                 /*NOTREACHED*/
1944         }
1945 
1946         smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
1947 
1948         return (baseaddr);
1949 
1950 
1951 use_segmap_range:
1952         baseaddr = seg->s_base + ((smp - smd_smap) * MAXBSIZE);
1953         TRACE_4(TR_FAC_VM, TR_SEGMAP_GETMAP,
1954             "segmap_getmap:seg %p addr %p vp %p offset %llx",
1955             seg, baseaddr, vp, baseoff);
1956 
1957         /*
1958          * Prefault the translations
1959          */
1960         vaddr = baseaddr + (off - baseoff);
1961         if (forcefault && (newslot || !hat_probe(kas.a_hat, vaddr))) {
1962 
1963                 caddr_t pgaddr = (caddr_t)((uintptr_t)vaddr &
1964                     (uintptr_t)PAGEMASK);
1965 
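                /*
                 * Fault in whole pages: the length below rounds
                 * [vaddr, vaddr + len) out to a page boundary.  E.g.
                 * (assuming 4K pages) off - baseoff == 0x234 and
                 * len == 0x1000 give pgaddr == baseaddr and a rounded
                 * length of 0x2000, covering both pages touched.
                 */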
1966                 (void) segmap_fault(kas.a_hat, seg, pgaddr,
1967                     (vaddr + len - pgaddr + PAGESIZE - 1) & (uintptr_t)PAGEMASK,
1968                     F_INVAL, rw);
1969         }
1970 
1971         return (baseaddr);
1972 }
1973 
1974 int
1975 segmap_release(struct seg *seg, caddr_t addr, uint_t flags)
1976 {
1977         struct smap     *smp;
1978         int             error;
1979         int             bflags = 0;
1980         struct vnode    *vp;
1981         u_offset_t      offset;
1982         kmutex_t        *smtx;
1983         int             is_kpm = 0;
1984         page_t          *pp;
1985 
1986         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1987 
1988                 if (((uintptr_t)addr & MAXBOFFSET) != 0) {
1989                         panic("segmap_release: addr %p not "
1990                             "MAXBSIZE aligned", (void *)addr);
1991                         /*NOTREACHED*/
1992                 }
1993 
1994                 if ((smp = get_smap_kpm(addr, &pp)) == NULL) {
1995                         panic("segmap_release: smap not found "
1996                             "for addr %p", (void *)addr);
1997                         /*NOTREACHED*/
1998                 }
1999 
2000                 TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
2001                     "segmap_relmap:seg %p addr %p smp %p",
2002                     seg, addr, smp);
2003 
2004                 smtx = SMAPMTX(smp);
2005 
2006                 /*
2007                  * For compatibility, segmap_pagecreate_kpm sets this
2008                  * flag so that a following segmap_pagecreate can return
2009                  * it as its "newpage" indication. When segmap_pagecreate
2010                  * is not called at all, we clear the flag here.
2011                  */
2012                 smp->sm_flags &= ~SM_KPM_NEWPAGE;
2013                 is_kpm = 1;
2014                 if (smp->sm_flags & SM_WRITE_DATA) {
2015                         hat_setrefmod(pp);
2016                 } else if (smp->sm_flags & SM_READ_DATA) {
2017                         hat_setref(pp);
2018                 }
2019         } else {
2020                 if (addr < seg->s_base || addr >= seg->s_base + seg->s_size ||
2021                     ((uintptr_t)addr & MAXBOFFSET) != 0) {
2022                         panic("segmap_release: bad addr %p", (void *)addr);
2023                         /*NOTREACHED*/
2024                 }
2025                 smp = GET_SMAP(seg, addr);
2026 
2027                 TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
2028                     "segmap_relmap:seg %p addr %p smp %p",
2029                     seg, addr, smp);
2030 
2031                 smtx = SMAPMTX(smp);
2032                 mutex_enter(smtx);
2033                 smp->sm_flags |= SM_NOTKPM_RELEASED;
2034         }
2035 
2036         ASSERT(smp->sm_refcnt > 0);
2037 
2038         /*
2039          * Need to call VOP_PUTPAGE() if any flags (except SM_DONTNEED)
2040          * are set.
2041          */
2042         if ((flags & ~SM_DONTNEED) != 0) {
2043                 if (flags & SM_WRITE)
2044                         segmapcnt.smp_rel_write.value.ul++;
2045                 if (flags & SM_ASYNC) {
2046                         bflags |= B_ASYNC;
2047                         segmapcnt.smp_rel_async.value.ul++;
2048                 }
2049                 if (flags & SM_INVAL) {
2050                         bflags |= B_INVAL;
2051                         segmapcnt.smp_rel_abort.value.ul++;
2052                 }
2053                 if (flags & SM_DESTROY) {
2054                         bflags |= (B_INVAL|B_TRUNC);
2055                         segmapcnt.smp_rel_abort.value.ul++;
2056                 }
2057                 if (smp->sm_refcnt == 1) {
2058                         /*
2059                          * We only bother doing the FREE and DONTNEED flags
2060                          * if no one else is still referencing this mapping.
2061                          */
2062                         if (flags & SM_FREE) {
2063                                 bflags |= B_FREE;
2064                                 segmapcnt.smp_rel_free.value.ul++;
2065                         }
2066                         if (flags & SM_DONTNEED) {
2067                                 bflags |= B_DONTNEED;
2068                                 segmapcnt.smp_rel_dontneed.value.ul++;
2069                         }
2070                 }
2071         } else {
2072                 smd_cpu[CPU->cpu_seqid].scpu.scpu_release++;
2073         }
2074 
2075         vp = smp->sm_vp;
2076         offset = smp->sm_off;
2077 
2078         if (--smp->sm_refcnt == 0) {
2079 
2080                 smp->sm_flags &= ~(SM_WRITE_DATA | SM_READ_DATA);
2081 
2082                 if (flags & (SM_INVAL|SM_DESTROY)) {
2083                         segmap_hashout(smp);    /* remove map info */
2084                         if (is_kpm) {
2085                                 hat_kpm_mapout(pp, GET_KPME(smp), addr);
2086                                 if (smp->sm_flags & SM_NOTKPM_RELEASED) {
2087                                         smp->sm_flags &= ~SM_NOTKPM_RELEASED;
2088                                         hat_unload(kas.a_hat, segkmap->s_base +
2089                                             ((smp - smd_smap) * MAXBSIZE),
2090                                             MAXBSIZE, HAT_UNLOAD);
2091                                 }
2092 
2093                         } else {
2094                                 if (segmap_kpm)
2095                                         segkpm_mapout_validkpme(GET_KPME(smp));
2096 
2097                                 smp->sm_flags &= ~SM_NOTKPM_RELEASED;
2098                                 hat_unload(kas.a_hat, addr, MAXBSIZE,
2099                                     HAT_UNLOAD);
2100                         }
2101                 }
2102                 segmap_smapadd(smp);    /* add to free list */
2103         }
2104 
2105         mutex_exit(smtx);
2106 
2107         if (is_kpm)
2108                 page_unlock(pp);
2109         /*
2110          * Now invoke VOP_PUTPAGE() if any flags (except SM_DONTNEED)
2111          * are set.
2112          */
2113         if ((flags & ~SM_DONTNEED) != 0) {
2114                 error = VOP_PUTPAGE(vp, offset, MAXBSIZE,
2115                     bflags, CRED(), NULL);
2116         } else {
2117                 error = 0;
2118         }
2119 
2120         return (error);
2121 }
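/*
 * Illustrative flag combinations (a sketch; the write-back policy is
 * entirely up to the caller, and "base" names a hypothetical address
 * returned by segmap_getmap/segmap_getmapflt):
 *
 *      (void) segmap_release(segkmap, base, SM_WRITE);
 *                                      synchronous write-back
 *      (void) segmap_release(segkmap, base, SM_WRITE | SM_ASYNC);
 *                                      asynchronous write-back
 *      (void) segmap_release(segkmap, base, SM_INVAL);
 *                                      invalidate the pages
 *      (void) segmap_release(segkmap, base, SM_FREE | SM_DONTNEED);
 *                                      honored only for the last reference
 *      (void) segmap_release(segkmap, base, 0);
 *                                      no VOP_PUTPAGE at all
 */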
2122 
2123 /*
2124  * Dump the pages belonging to this segmap segment.
2125  */
2126 static void
2127 segmap_dump(struct seg *seg)
2128 {
2129         struct segmap_data *smd;
2130         struct smap *smp, *smp_end;
2131         page_t *pp;
2132         pfn_t pfn;
2133         u_offset_t off;
2134         caddr_t addr;
2135 
2136         smd = (struct segmap_data *)seg->s_data;
2137         addr = seg->s_base;
2138         for (smp = smd->smd_sm, smp_end = smp + smd->smd_npages;
2139             smp < smp_end; smp++) {
2140 
2141                 if (smp->sm_refcnt) {
2142                         for (off = 0; off < MAXBSIZE; off += PAGESIZE) {
2143                                 int we_own_it = 0;
2144 
2145                                 /*
2146                                  * If page_lookup_nowait() fails, the page
2147                                  * either does not exist or is exclusively
2148                                  * locked, so use page_exists() to find it
2149                                  * without taking the page lock.
2150                                  */
2151                                 if ((pp = page_lookup_nowait(smp->sm_vp,
2152                                     smp->sm_off + off, SE_SHARED)))
2153                                         we_own_it = 1;
2154                                 else
2155                                         pp = page_exists(smp->sm_vp,
2156                                             smp->sm_off + off);
2157 
2158                                 if (pp) {
2159                                         pfn = page_pptonum(pp);
2160                                         dump_addpage(seg->s_as,
2161                                             addr + off, pfn);
2162                                         if (we_own_it)
2163                                                 page_unlock(pp);
2164                                 }
2165                                 dump_timeleft = dump_timeout;
2166                         }
2167                 }
2168                 addr += MAXBSIZE;
2169         }
2170 }
2171 
2172 /*ARGSUSED*/
2173 static int
2174 segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
2175     struct page ***ppp, enum lock_type type, enum seg_rw rw)
2176 {
2177         return (ENOTSUP);
2178 }
2179 
2180 static int
2181 segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
2182 {
2183         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
2184 
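        /*
         * The memid of a segmap address is simply its backing
         * (vnode, file offset) pair.
         */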
2185         memidp->val[0] = (uintptr_t)smd->smd_sm->sm_vp;
2186         memidp->val[1] = smd->smd_sm->sm_off + (uintptr_t)(addr - seg->s_base);
2187         return (0);
2188 }
2189 
2190 /*ARGSUSED*/
2191 static int
2192 segmap_capable(struct seg *seg, segcapability_t capability)
2193 {
2194         return (0);
2195 }
2196 
2197 
2198 #ifdef  SEGKPM_SUPPORT
2199 
2200 /*
2201  * segkpm support routines
2202  */
2203 
2204 static caddr_t
2205 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
2206         struct smap *smp, enum seg_rw rw)
2207 {
2208         caddr_t base;
2209         page_t  *pp;
2210         int     newpage = 0;
2211         struct kpme     *kpme;
2212 
2213         ASSERT(smp->sm_refcnt > 0);
2214 
2215         if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
2216                 kmutex_t *smtx;
2217 
2218                 base = segkpm_create_va(off);
2219 
2220                 if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
2221                     seg, base)) == NULL) {
2222                         panic("segmap_pagecreate_kpm: "
2223                             "page_create failed");
2224                         /*NOTREACHED*/
2225                 }
2226 
2227                 newpage = 1;
2228                 page_io_unlock(pp);
2229                 ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
2230 
2231                 /*
2232                  * Mark the new page here until the following
2233                  * segmap_pagecreate or segmap_release clears the flag.
2234                  */
2235                 smtx = SMAPMTX(smp);
2236                 mutex_enter(smtx);
2237                 smp->sm_flags |= SM_KPM_NEWPAGE;
2238                 mutex_exit(smtx);
2239         }
2240 
2241         kpme = GET_KPME(smp);
2242         if (!newpage && kpme->kpe_page == pp)
2243                 base = hat_kpm_page2va(pp, 0);
2244         else
2245                 base = hat_kpm_mapin(pp, kpme);
2246 
2247         /*
2248          * FS code may decide not to call segmap_pagecreate and we
2249          * don't invoke segmap_fault via TLB miss, so we have to set
2250          * ref and mod bits in advance.
2251          */
2252         if (rw == S_WRITE) {
2253                 hat_setrefmod(pp);
2254         } else {
2255                 ASSERT(rw == S_READ);
2256                 hat_setref(pp);
2257         }
2258 
2259         smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
2260 
2261         return (base);
2262 }
2263 
2264 /*
2265  * Find the smap structure corresponding to the
2266  * KPM addr and return it locked.
2267  */
2268 struct smap *
2269 get_smap_kpm(caddr_t addr, page_t **ppp)
2270 {
2271         struct smap     *smp;
2272         struct vnode    *vp;
2273         u_offset_t      offset;
2274         caddr_t         baseaddr = (caddr_t)((uintptr_t)addr & MAXBMASK);
2275         int             hashid;
2276         kmutex_t        *hashmtx;
2277         page_t          *pp;
2278         union segmap_cpu *scpu;
2279 
2280         pp = hat_kpm_vaddr2page(baseaddr);
2281 
2282         ASSERT(pp && !PP_ISFREE(pp));
2283         ASSERT(PAGE_LOCKED(pp));
2284         ASSERT(((uintptr_t)pp->p_offset & MAXBOFFSET) == 0);
2285 
2286         vp = pp->p_vnode;
2287         offset = pp->p_offset;
2288         ASSERT(vp != NULL);
2289 
2290         /*
2291          * Assume the last smap used on this cpu is the one needed.
2292          */
2293         scpu = smd_cpu+CPU->cpu_seqid;
2294         smp = scpu->scpu.scpu_last_smap;
2295         mutex_enter(&smp->sm_mtx);
2296         if (smp->sm_vp == vp && smp->sm_off == offset) {
2297                 ASSERT(smp->sm_refcnt > 0);
2298         } else {
2299                 /*
2300                  * Assumption wrong, find the smap on the hash chain.
2301                  */
2302                 mutex_exit(&smp->sm_mtx);
2303                 SMAP_HASHFUNC(vp, offset, hashid); /* macro assigns hashid */
2304                 hashmtx = SHASHMTX(hashid);
2305 
2306                 mutex_enter(hashmtx);
2307                 smp = smd_hash[hashid].sh_hash_list;
2308                 for (; smp != NULL; smp = smp->sm_hash) {
2309                         if (smp->sm_vp == vp && smp->sm_off == offset)
2310                                 break;
2311                 }
2312                 mutex_exit(hashmtx);
2313                 if (smp) {
2314                         mutex_enter(&smp->sm_mtx);
2315                         ASSERT(smp->sm_vp == vp && smp->sm_off == offset);
2316                 }
2317         }
2318 
2319         if (ppp)
2320                 *ppp = smp ? pp : NULL;
2321 
2322         return (smp);
2323 }
2324 
2325 #else   /* SEGKPM_SUPPORT */
2326 
2327 /* segkpm stubs */
2328 
2329 /*ARGSUSED*/
2330 static caddr_t
2331 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
2332         struct smap *smp, enum seg_rw rw)
2333 {
2334         return (NULL);
2335 }
2336 
2337 /*ARGSUSED*/
2338 struct smap *
2339 get_smap_kpm(caddr_t addr, page_t **ppp)
2340 {
2341         return (NULL);
2342 }
2343 
2344 #endif  /* SEGKPM_SUPPORT */