1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 /*
  25  * Copyright (c) 2010, Intel Corporation.
  26  * All rights reserved.
  27  */
  28 
  29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  30 /*      All Rights Reserved   */
  31 
  32 /*
  33  * Portions of this source code were derived from Berkeley 4.3 BSD
  34  * under license from the Regents of the University of California.
  35  */
  36 
  37 /*
  38  * UNIX machine dependent virtual memory support.
  39  */
  40 
  41 #include <sys/types.h>
  42 #include <sys/param.h>
  43 #include <sys/systm.h>
  44 #include <sys/user.h>
  45 #include <sys/proc.h>
  46 #include <sys/kmem.h>
  47 #include <sys/vmem.h>
  48 #include <sys/buf.h>
  49 #include <sys/cpuvar.h>
  50 #include <sys/lgrp.h>
  51 #include <sys/disp.h>
  52 #include <sys/vm.h>
  53 #include <sys/mman.h>
  54 #include <sys/vnode.h>
  55 #include <sys/cred.h>
  56 #include <sys/exec.h>
  57 #include <sys/exechdr.h>
  58 #include <sys/debug.h>
  59 #include <sys/vmsystm.h>
  60 #include <sys/swap.h>
  61 #include <sys/dumphdr.h>
  62 
  63 #include <vm/hat.h>
  64 #include <vm/as.h>
  65 #include <vm/seg.h>
  66 #include <vm/seg_kp.h>
  67 #include <vm/seg_vn.h>
  68 #include <vm/page.h>
  69 #include <vm/seg_kmem.h>
  70 #include <vm/seg_kpm.h>
  71 #include <vm/vm_dep.h>
  72 
  73 #include <sys/cpu.h>
  74 #include <sys/vm_machparam.h>
  75 #include <sys/memlist.h>
  76 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
  77 #include <vm/hat_i86.h>
  78 #include <sys/x86_archext.h>
  79 #include <sys/elf_386.h>
  80 #include <sys/cmn_err.h>
  81 #include <sys/archsystm.h>
  82 #include <sys/machsystm.h>
  83 
  84 #include <sys/vtrace.h>
  85 #include <sys/ddidmareq.h>
  86 #include <sys/promif.h>
  87 #include <sys/memnode.h>
  88 #include <sys/stack.h>
  89 #include <util/qsort.h>
  90 #include <sys/taskq.h>
  91 
  92 #ifdef __xpv
  93 
  94 #include <sys/hypervisor.h>
  95 #include <sys/xen_mmu.h>
  96 #include <sys/balloon_impl.h>
  97 
  98 /*
   99  * domain 0 pages usable for DMA are pre-allocated and kept in
 100  * distinct lists, ordered by increasing mfn.
 101  */
 102 static kmutex_t io_pool_lock;
 103 static kmutex_t contig_list_lock;
 104 static page_t *io_pool_4g;      /* pool for 32 bit dma limited devices */
 105 static page_t *io_pool_16m;     /* pool for 24 bit dma limited legacy devices */
 106 static long io_pool_cnt;
 107 static long io_pool_cnt_max = 0;
 108 #define DEFAULT_IO_POOL_MIN     128
 109 static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN;
 110 static long io_pool_cnt_lowater = 0;
 111 static long io_pool_shrink_attempts; /* how many times did we try to shrink */
 112 static long io_pool_shrinks;    /* how many times did we really shrink */
 113 static long io_pool_grows;      /* how many times did we grow */
 114 static mfn_t start_mfn = 1;
  115 static caddr_t io_pool_kva;     /* used to alloc pages when needed */
 116 
 117 static int create_contig_pfnlist(uint_t);
 118 
 119 /*
 120  * percentage of phys mem to hold in the i/o pool
 121  */
 122 #define DEFAULT_IO_POOL_PCT     2
 123 static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT;
 124 static void page_io_pool_sub(page_t **, page_t *, page_t *);
 125 int ioalloc_dbg = 0;
 126 
 127 #endif /* __xpv */
 128 
 129 uint_t vac_colors = 1;
 130 
 131 int largepagesupport = 0;
 132 extern uint_t page_create_new;
 133 extern uint_t page_create_exists;
 134 extern uint_t page_create_putbacks;
 135 /*
 136  * Allow users to disable the kernel's use of SSE.
 137  */
 138 extern int use_sse_pagecopy, use_sse_pagezero;
 139 
 140 /*
  141  * combined memory ranges from mnode and memranges[], used to manage a
  142  * single mnode/mtype dimension in the page lists.
 143  */
 144 typedef struct {
 145         pfn_t   mnr_pfnlo;
 146         pfn_t   mnr_pfnhi;
 147         int     mnr_mnode;
 148         int     mnr_memrange;           /* index into memranges[] */
 149         int     mnr_next;               /* next lower PA mnoderange */
 150         int     mnr_exists;
 151         /* maintain page list stats */
 152         pgcnt_t mnr_mt_clpgcnt;         /* cache list cnt */
 153         pgcnt_t mnr_mt_flpgcnt[MMU_PAGE_SIZES]; /* free list cnt per szc */
 154         pgcnt_t mnr_mt_totcnt;          /* sum of cache and free lists */
 155 #ifdef DEBUG
 156         struct mnr_mts {                /* mnode/mtype szc stats */
 157                 pgcnt_t mnr_mts_pgcnt;
 158                 int     mnr_mts_colors;
 159                 pgcnt_t *mnr_mtsc_pgcnt;
 160         }       *mnr_mts;
 161 #endif
 162 } mnoderange_t;
 163 
 164 #define MEMRANGEHI(mtype)                                               \
 165         ((mtype > 0) ? memranges[mtype - 1] - 1: physmax)
 166 #define MEMRANGELO(mtype)       (memranges[mtype])
 167 
 168 #define MTYPE_FREEMEM(mt)       (mnoderanges[mt].mnr_mt_totcnt)
 169 
 170 /*
  171  * As the PC architecture evolved, memory was clumped into several
  172  * ranges so that various historical I/O devices could do DMA.
 173  * < 16Meg - ISA bus
 174  * < 2Gig - ???
 175  * < 4Gig - PCI bus or drivers that don't understand PAE mode
 176  *
 177  * These are listed in reverse order, so that we can skip over unused
 178  * ranges on machines with small memories.
 179  *
 180  * For now under the Hypervisor, we'll only ever have one memrange.
 181  */
 182 #define PFN_4GIG        0x100000
 183 #define PFN_16MEG       0x1000
 184 /* Indices into the memory range (arch_memranges) array. */
 185 #define MRI_4G          0
 186 #define MRI_2G          1
 187 #define MRI_16M         2
 188 #define MRI_0           3
 189 static pfn_t arch_memranges[NUM_MEM_RANGES] = {
 190     PFN_4GIG,   /* pfn range for 4G and above */
 191     0x80000,    /* pfn range for 2G-4G */
 192     PFN_16MEG,  /* pfn range for 16M-2G */
 193     0x00000,    /* pfn range for 0-16M */
 194 };
 195 pfn_t *memranges = &arch_memranges[0];
 196 int nranges = NUM_MEM_RANGES;
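     /*
      * With the default arch_memranges above and 4K pages, MEMRANGELO() and
      * MEMRANGEHI() give the following pfn ranges per index:
      *    MRI_4G:   [0x100000, physmax]   (4G and up)
      *    MRI_2G:   [0x80000, 0xfffff]    (2G - 4G)
      *    MRI_16M:  [0x1000, 0x7ffff]     (16M - 2G)
      *    MRI_0:    [0, 0xfff]            (0 - 16M)
      */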
 197 
 198 /*
 199  * This combines mem_node_config and memranges into one data
 200  * structure to be used for page list management.
 201  */
 202 mnoderange_t    *mnoderanges;
 203 int             mnoderangecnt;
 204 int             mtype4g;
 205 int             mtype16m;
 206 int             mtypetop;       /* index of highest pfn'ed mnoderange */
 207 
 208 /*
 209  * 4g memory management variables for systems with more than 4g of memory:
 210  *
 211  * physical memory below 4g is required for 32bit dma devices and, currently,
 212  * for kmem memory. On systems with more than 4g of memory, the pool of memory
 213  * below 4g can be depleted without any paging activity given that there is
 214  * likely to be sufficient memory above 4g.
 215  *
 216  * physmax4g is set true if the largest pfn is over 4g. The rest of the
 217  * 4g memory management code is enabled only when physmax4g is true.
 218  *
 219  * maxmem4g is the count of the maximum number of pages on the page lists
  220  * with physical addresses below 4g. It can be a lot less than 4g given that
 221  * BIOS may reserve large chunks of space below 4g for hot plug pci devices,
 222  * agp aperture etc.
 223  *
 224  * freemem4g maintains the count of the number of available pages on the
 225  * page lists with physical addresses below 4g.
 226  *
 227  * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
 228  * 6% (desfree4gshift = 4) of maxmem4g.
 229  *
 230  * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
 231  * and the amount of physical memory above 4g is greater than freemem4g.
 232  * In this case, page_get_* routines will restrict below 4g allocations
 233  * for requests that don't specifically require it.
 234  */
 235 
 236 #define DESFREE4G       (maxmem4g >> desfree4gshift)
 237 
 238 #define RESTRICT4G_ALLOC                                        \
 239         (physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem))
 240 
 241 static pgcnt_t  maxmem4g;
 242 static pgcnt_t  freemem4g;
 243 static int      physmax4g;
 244 static int      desfree4gshift = 4;     /* maxmem4g shift to derive DESFREE4G */
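     /*
      * Example with illustrative numbers: given the default desfree4gshift
      * of 4 and a maxmem4g of 0x100000 pages (4G worth of 4K pages),
      * DESFREE4G is 0x10000 pages (256M).  RESTRICT4G_ALLOC then triggers
      * once freemem4g drops below that and more free memory remains above
      * 4g than below it.
      */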
 245 
 246 /*
 247  * 16m memory management:
 248  *
 249  * reserve some amount of physical memory below 16m for legacy devices.
 250  *
  251  * RESTRICT16M_ALLOC returns true if there are sufficient free pages above
  252  * 16m or if the 16m pool would drop below DESFREE16M.
 253  *
 254  * In this case, general page allocations via page_get_{free,cache}list
 255  * routines will be restricted from allocating from the 16m pool. Allocations
 256  * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
 257  * are not restricted.
 258  */
 259 
 260 #define FREEMEM16M      MTYPE_FREEMEM(mtype16m)
 261 #define DESFREE16M      desfree16m
 262 #define RESTRICT16M_ALLOC(freemem, pgcnt, flags)                \
 263         ((freemem != 0) && ((flags & PG_PANIC) == 0) &&             \
 264             ((freemem >= (FREEMEM16M)) ||                    \
 265             (FREEMEM16M  < (DESFREE16M + pgcnt))))
 266 
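     /*
      * The default of 0x380 (896) pages corresponds to 3.5M of the 16M pool
      * held in reserve when base pages are 4K.
      */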
 267 static pgcnt_t  desfree16m = 0x380;
 268 
 269 /*
 270  * This can be patched via /etc/system to allow old non-PAE aware device
 271  * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 272  */
 273 int restricted_kmemalloc = 0;
 274 
 275 #ifdef VM_STATS
 276 struct {
 277         ulong_t pga_alloc;
 278         ulong_t pga_notfullrange;
 279         ulong_t pga_nulldmaattr;
 280         ulong_t pga_allocok;
 281         ulong_t pga_allocfailed;
 282         ulong_t pgma_alloc;
 283         ulong_t pgma_allocok;
 284         ulong_t pgma_allocfailed;
 285         ulong_t pgma_allocempty;
 286 } pga_vmstats;
 287 #endif
 288 
 289 uint_t mmu_page_sizes;
 290 
 291 /* How many page sizes the users can see */
 292 uint_t mmu_exported_page_sizes;
 293 
 294 /* page sizes that legacy applications can see */
 295 uint_t mmu_legacy_page_sizes;
 296 
 297 /*
 298  * Number of pages in 1 GB.  Don't enable automatic large pages if we have
 299  * fewer than this many pages.
 300  */
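     /*
      * With 4K base pages (MMU_PAGESHIFT == 12) this is 1 << 18, i.e.
      * 262144 pages.
      */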
 301 pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
 302 pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
 303 
 304 /*
 305  * Maximum and default segment size tunables for user private
 306  * and shared anon memory, and user text and initialized data.
 307  * These can be patched via /etc/system to allow large pages
 308  * to be used for mapping application private and shared anon memory.
 309  */
 310 size_t mcntl0_lpsize = MMU_PAGESIZE;
 311 size_t max_uheap_lpsize = MMU_PAGESIZE;
 312 size_t default_uheap_lpsize = MMU_PAGESIZE;
 313 size_t max_ustack_lpsize = MMU_PAGESIZE;
 314 size_t default_ustack_lpsize = MMU_PAGESIZE;
 315 size_t max_privmap_lpsize = MMU_PAGESIZE;
 316 size_t max_uidata_lpsize = MMU_PAGESIZE;
 317 size_t max_utext_lpsize = MMU_PAGESIZE;
 318 size_t max_shm_lpsize = MMU_PAGESIZE;
 319 
 320 
 321 /*
 322  * initialized by page_coloring_init().
 323  */
 324 uint_t  page_colors;
 325 uint_t  page_colors_mask;
 326 uint_t  page_coloring_shift;
 327 int     cpu_page_colors;
 328 static uint_t   l2_colors;
 329 
 330 /*
 331  * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 332  * and page_colors are calculated from the l2 cache n-way set size.  Within a
 333  * mnode range, the page freelist and cachelist are hashed into bins based on
 334  * color. This makes it easier to search for a page within a specific memory
 335  * range.
 336  */
 337 #define PAGE_COLORS_MIN 16
 338 
 339 page_t ****page_freelists;
 340 page_t ***page_cachelists;
 341 
 342 
 343 /*
 344  * Used by page layer to know about page sizes
 345  */
 346 hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];
 347 
 348 kmutex_t        *fpc_mutex[NPC_MUTEX];
 349 kmutex_t        *cpc_mutex[NPC_MUTEX];
 350 
 351 /* Lock to protect mnoderanges array for memory DR operations. */
 352 static kmutex_t mnoderange_lock;
 353 
 354 /*
 355  * Only let one thread at a time try to coalesce large pages, to
 356  * prevent them from working against each other.
 357  */
 358 static kmutex_t contig_lock;
 359 #define CONTIG_LOCK()   mutex_enter(&contig_lock);
 360 #define CONTIG_UNLOCK() mutex_exit(&contig_lock);
 361 
 362 #define PFN_16M         (mmu_btop((uint64_t)0x1000000))
 363 
 364 /*
 365  * Return the optimum page size for a given mapping
 366  */
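     /*
      * For example, on hardware with 4K/2M/1G pages (mmu.umax_page_level of
      * 2) and max_uheap_lpsize tuned to 2M, a 10M MAPPGSZ_HEAP request
      * returns 2M: the 1G level is skipped because it exceeds max_lpsize,
      * and 2M both fits under max_lpsize and is covered by len.
      */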
 367 /*ARGSUSED*/
 368 size_t
 369 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
 370 {
 371         level_t l = 0;
 372         size_t pgsz = MMU_PAGESIZE;
 373         size_t max_lpsize;
 374         uint_t mszc;
 375 
 376         ASSERT(maptype != MAPPGSZ_VA);
 377 
 378         if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
 379                 return (MMU_PAGESIZE);
 380         }
 381 
 382         switch (maptype) {
 383         case MAPPGSZ_HEAP:
 384         case MAPPGSZ_STK:
 385                 max_lpsize = memcntl ? mcntl0_lpsize : (maptype ==
 386                     MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize);
 387                 if (max_lpsize == MMU_PAGESIZE) {
 388                         return (MMU_PAGESIZE);
 389                 }
 390                 if (len == 0) {
 391                         len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase +
 392                             p->p_brksize - p->p_bssbase : p->p_stksize;
 393                 }
 394                 len = (maptype == MAPPGSZ_HEAP) ? MAX(len,
 395                     default_uheap_lpsize) : MAX(len, default_ustack_lpsize);
 396 
 397                 /*
  398                  * use the page size that best fits len
 399                  */
 400                 for (l = mmu.umax_page_level; l > 0; --l) {
 401                         if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) {
 402                                 continue;
 403                         } else {
 404                                 pgsz = LEVEL_SIZE(l);
 405                         }
 406                         break;
 407                 }
 408 
 409                 mszc = (maptype == MAPPGSZ_HEAP ? p->p_brkpageszc :
 410                     p->p_stkpageszc);
 411                 if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) {
 412                         pgsz = hw_page_array[mszc].hp_size;
 413                 }
 414                 return (pgsz);
 415 
 416         case MAPPGSZ_ISM:
 417                 for (l = mmu.umax_page_level; l > 0; --l) {
 418                         if (len >= LEVEL_SIZE(l))
 419                                 return (LEVEL_SIZE(l));
 420                 }
 421                 return (LEVEL_SIZE(0));
 422         }
 423         return (pgsz);
 424 }
 425 
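     /*
      * Common helper for map_pgszcvec(): returns a bit vector of page size
      * codes (excluding the base page size) that can be used to map
      * [addr, addr + size) given the alignment implied by off and the
      * max_lpsize/min_physmem limits.  For instance, if the largest usable
      * size code found is 2, ((1 << (i + 1)) - 1) & ~1 below yields 0x6,
      * i.e. size codes 1 and 2.
      */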
 426 static uint_t
 427 map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize,
 428     size_t min_physmem)
 429 {
 430         caddr_t eaddr = addr + size;
 431         uint_t szcvec = 0;
 432         caddr_t raddr;
 433         caddr_t readdr;
 434         size_t  pgsz;
 435         int i;
 436 
 437         if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
 438                 return (0);
 439         }
 440 
 441         for (i = mmu_exported_page_sizes - 1; i > 0; i--) {
 442                 pgsz = page_get_pagesize(i);
 443                 if (pgsz > max_lpsize) {
 444                         continue;
 445                 }
 446                 raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
 447                 readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
 448                 if (raddr < addr || raddr >= readdr) {
 449                         continue;
 450                 }
 451                 if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
 452                         continue;
 453                 }
 454                 /*
 455                  * Set szcvec to the remaining page sizes.
 456                  */
 457                 szcvec = ((1 << (i + 1)) - 1) & ~1;
 458                 break;
 459         }
 460         return (szcvec);
 461 }
 462 
 463 /*
 464  * Return a bit vector of large page size codes that
  465  * can be used to map the [addr, addr + len) region.
 466  */
 467 /*ARGSUSED*/
 468 uint_t
 469 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
 470     int memcntl)
 471 {
 472         size_t max_lpsize = mcntl0_lpsize;
 473 
 474         if (mmu.max_page_level == 0)
 475                 return (0);
 476 
 477         if (flags & MAP_TEXT) {
 478                 if (!memcntl)
 479                         max_lpsize = max_utext_lpsize;
 480                 return (map_szcvec(addr, size, off, max_lpsize,
 481                     shm_lpg_min_physmem));
 482 
 483         } else if (flags & MAP_INITDATA) {
 484                 if (!memcntl)
 485                         max_lpsize = max_uidata_lpsize;
 486                 return (map_szcvec(addr, size, off, max_lpsize,
 487                     privm_lpg_min_physmem));
 488 
 489         } else if (type == MAPPGSZC_SHM) {
 490                 if (!memcntl)
 491                         max_lpsize = max_shm_lpsize;
 492                 return (map_szcvec(addr, size, off, max_lpsize,
 493                     shm_lpg_min_physmem));
 494 
 495         } else if (type == MAPPGSZC_HEAP) {
 496                 if (!memcntl)
 497                         max_lpsize = max_uheap_lpsize;
 498                 return (map_szcvec(addr, size, off, max_lpsize,
 499                     privm_lpg_min_physmem));
 500 
 501         } else if (type == MAPPGSZC_STACK) {
 502                 if (!memcntl)
 503                         max_lpsize = max_ustack_lpsize;
 504                 return (map_szcvec(addr, size, off, max_lpsize,
 505                     privm_lpg_min_physmem));
 506 
 507         } else {
 508                 if (!memcntl)
 509                         max_lpsize = max_privmap_lpsize;
 510                 return (map_szcvec(addr, size, off, max_lpsize,
 511                     privm_lpg_min_physmem));
 512         }
 513 }
 514 
 515 /*
 516  * Handle a pagefault.
 517  */
 518 faultcode_t
 519 pagefault(
 520         caddr_t addr,
 521         enum fault_type type,
 522         enum seg_rw rw,
 523         int iskernel)
 524 {
 525         struct as *as;
 526         struct hat *hat;
 527         struct proc *p;
 528         kthread_t *t;
 529         faultcode_t res;
 530         caddr_t base;
 531         size_t len;
 532         int err;
 533         int mapped_red;
 534         uintptr_t ea;
 535 
 536         ASSERT_STACK_ALIGNED();
 537 
 538         if (INVALID_VADDR(addr))
 539                 return (FC_NOMAP);
 540 
 541         mapped_red = segkp_map_red();
 542 
 543         if (iskernel) {
 544                 as = &kas;
 545                 hat = as->a_hat;
 546         } else {
 547                 t = curthread;
 548                 p = ttoproc(t);
 549                 as = p->p_as;
 550                 hat = as->a_hat;
 551         }
 552 
 553         /*
 554          * Dispatch pagefault.
 555          */
 556         res = as_fault(hat, as, addr, 1, type, rw);
 557 
 558         /*
 559          * If this isn't a potential unmapped hole in the user's
 560          * UNIX data or stack segments, just return status info.
 561          */
 562         if (res != FC_NOMAP || iskernel)
 563                 goto out;
 564 
 565         /*
  566          * Check to see if we happened to fault on a currently unmapped
 567          * part of the UNIX data or stack segments.  If so, create a zfod
 568          * mapping there and then try calling the fault routine again.
 569          */
 570         base = p->p_brkbase;
 571         len = p->p_brksize;
 572 
 573         if (addr < base || addr >= base + len) {          /* data seg? */
 574                 base = (caddr_t)p->p_usrstack - p->p_stksize;
 575                 len = p->p_stksize;
 576                 if (addr < base || addr >= p->p_usrstack) {    /* stack seg? */
 577                         /* not in either UNIX data or stack segments */
 578                         res = FC_NOMAP;
 579                         goto out;
 580                 }
 581         }
 582 
 583         /*
  584          * The rest of this function implements 3.X/4.X/5.X compatibility.
  585          * This code is probably not needed anymore.
 586          */
 587         if (p->p_model == DATAMODEL_ILP32) {
 588 
 589                 /* expand the gap to the page boundaries on each side */
 590                 ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
 591                 base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
 592                 len = ea - (uintptr_t)base;
 593 
 594                 as_rangelock(as);
 595                 if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
 596                     0) {
 597                         err = as_map(as, base, len, segvn_create, zfod_argsp);
 598                         as_rangeunlock(as);
 599                         if (err) {
 600                                 res = FC_MAKE_ERR(err);
 601                                 goto out;
 602                         }
 603                 } else {
 604                         /*
  605                          * This page was already mapped by another thread
  606                          * after we returned from as_fault() above.  We just
  607                          * fall through to as_fault() below.
 608                          */
 609                         as_rangeunlock(as);
 610                 }
 611 
 612                 res = as_fault(hat, as, addr, 1, F_INVAL, rw);
 613         }
 614 
 615 out:
 616         if (mapped_red)
 617                 segkp_unmap_red();
 618 
 619         return (res);
 620 }
 621 
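     /*
      * Pick a user virtual address for a new mapping on behalf of the
      * current process.  With _MAP_LOW32 the search is clamped to the
      * 32-bit user limit; otherwise the process's own a_userlimit applies.
      * The actual selection is done by map_addr_proc() below.
      */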
 622 void
 623 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
 624 {
 625         struct proc *p = curproc;
 626         caddr_t userlimit = (flags & _MAP_LOW32) ?
 627             (caddr_t)_userlimit32 : p->p_as->a_userlimit;
 628 
 629         map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
 630 }
 631 
 632 /*ARGSUSED*/
 633 int
 634 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
 635 {
 636         return (0);
 637 }
 638 
 639 /*
 640  * map_addr_proc() is the routine called when the system is to
 641  * choose an address for the user.  We will pick an address
 642  * range which is the highest available below userlimit.
 643  *
 644  * Every mapping will have a redzone of a single page on either side of
 645  * the request. This is done to leave one page unmapped between segments.
 646  * This is not required, but it's useful for the user because if their
 647  * program strays across a segment boundary, it will catch a fault
  648  * immediately, making debugging a little easier.  Currently the redzone
 649  * is mandatory.
 650  *
 651  * addrp is a value/result parameter.
 652  *      On input it is a hint from the user to be used in a completely
 653  *      machine dependent fashion.  We decide to completely ignore this hint.
 654  *      If MAP_ALIGN was specified, addrp contains the minimal alignment, which
 655  *      must be some "power of two" multiple of pagesize.
 656  *
 657  *      On output it is NULL if no address can be found in the current
  658  *      process's address space or else an address that is currently
 659  *      not mapped for len bytes with a page of red zone on either side.
 660  *
  661  *      vacalign is not needed on x86 (it's for virtually addressed caches)
 662  */
 663 /*ARGSUSED*/
 664 void
 665 map_addr_proc(
 666         caddr_t *addrp,
 667         size_t len,
 668         offset_t off,
 669         int vacalign,
 670         caddr_t userlimit,
 671         struct proc *p,
 672         uint_t flags)
 673 {
 674         struct as *as = p->p_as;
 675         caddr_t addr;
 676         caddr_t base;
 677         size_t slen;
 678         size_t align_amount;
 679 
 680         ASSERT32(userlimit == as->a_userlimit);
 681 
 682         base = p->p_brkbase;
 683 #if defined(__amd64)
 684         /*
 685          * XX64 Yes, this needs more work.
 686          */
 687         if (p->p_model == DATAMODEL_NATIVE) {
 688                 if (userlimit < as->a_userlimit) {
 689                         /*
 690                          * This happens when a program wants to map
 691                          * something in a range that's accessible to a
 692                          * program in a smaller address space.  For example,
 693                          * a 64-bit program calling mmap32(2) to guarantee
 694                          * that the returned address is below 4Gbytes.
 695                          */
 696                         ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));
 697 
 698                         if (userlimit > base)
 699                                 slen = userlimit - base;
 700                         else {
 701                                 *addrp = NULL;
 702                                 return;
 703                         }
 704                 } else {
 705                         /*
 706                          * XX64 This layout is probably wrong .. but in
 707                          * the event we make the amd64 address space look
 708                          * like sparcv9 i.e. with the stack -above- the
 709                          * heap, this bit of code might even be correct.
 710                          */
 711                         slen = p->p_usrstack - base -
 712                             ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
 713                 }
 714         } else
 715 #endif
 716                 slen = userlimit - base;
 717 
 718         /* Make len be a multiple of PAGESIZE */
 719         len = (len + PAGEOFFSET) & PAGEMASK;
 720 
 721         /*
 722          * figure out what the alignment should be
 723          *
 724          * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
 725          */
 726         if (len <= ELF_386_MAXPGSZ) {
 727                 /*
 728                  * Align virtual addresses to ensure that ELF shared libraries
 729                  * are mapped with the appropriate alignment constraints by
 730                  * the run-time linker.
 731                  */
 732                 align_amount = ELF_386_MAXPGSZ;
 733         } else {
 734                 /*
 735                  * For 32-bit processes, only those which have specified
 736                  * MAP_ALIGN and an addr will be aligned on a larger page size.
 737                  * Not doing so can potentially waste up to 1G of process
 738                  * address space.
 739                  */
 740                 int lvl = (p->p_model == DATAMODEL_ILP32) ? 1 :
 741                     mmu.umax_page_level;
 742 
 743                 while (lvl && len < LEVEL_SIZE(lvl))
 744                         --lvl;
 745 
 746                 align_amount = LEVEL_SIZE(lvl);
 747         }
 748         if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
 749                 align_amount = (uintptr_t)*addrp;
 750 
 751         ASSERT(ISP2(align_amount));
 752         ASSERT(align_amount == 0 || align_amount >= PAGESIZE);
 753 
 754         off = off & (align_amount - 1);
 755         /*
 756          * Look for a large enough hole starting below userlimit.
 757          * After finding it, use the upper part.
 758          */
 759         if (as_gap_aligned(as, len, &base, &slen, AH_HI, NULL, align_amount,
 760             PAGESIZE, off) == 0) {
 761                 caddr_t as_addr;
 762 
 763                 /*
 764                  * addr is the highest possible address to use since we have
 765                  * a PAGESIZE redzone at the beginning and end.
 766                  */
 767                 addr = base + slen - (PAGESIZE + len);
 768                 as_addr = addr;
 769                 /*
 770                  * Round address DOWN to the alignment amount and
 771                  * add the offset in.
 772                  * If addr is greater than as_addr, len would not be large
 773                  * enough to include the redzone, so we must adjust down
 774                  * by the alignment amount.
 775                  */
 776                 addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
 777                 addr += (uintptr_t)off;
 778                 if (addr > as_addr) {
 779                         addr -= align_amount;
 780                 }
 781 
 782                 ASSERT(addr > base);
 783                 ASSERT(addr + len < base + slen);
 784                 ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
 785                     ((uintptr_t)(off)));
 786                 *addrp = addr;
 787         } else {
 788                 *addrp = NULL;  /* no more virtual space */
 789         }
 790 }
 791 
 792 int valid_va_range_aligned_wraparound;
 793 
 794 /*
 795  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 796  * addresses at least "minlen" long, where the base of the range is at "off"
 797  * phase from an "align" boundary and there is space for a "redzone"-sized
 798  * redzone on either side of the range.  On success, 1 is returned and *basep
 799  * and *lenp are adjusted to describe the acceptable range (including
 800  * the redzone).  On failure, 0 is returned.
 801  */
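     /*
      * For example (illustrative values), with minlen of 8K, a 4K redzone,
      * 64K alignment and off of 0, at least 16K of space is required
      * (minlen plus two redzones) and, within it, a 64K-aligned window of
      * at least 8K lying between the redzones.
      */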
 802 /*ARGSUSED3*/
 803 int
 804 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
 805     size_t align, size_t redzone, size_t off)
 806 {
 807         uintptr_t hi, lo;
 808         size_t tot_len;
 809 
 810         ASSERT(align == 0 ? off == 0 : off < align);
 811         ASSERT(ISP2(align));
 812         ASSERT(align == 0 || align >= PAGESIZE);
 813 
 814         lo = (uintptr_t)*basep;
 815         hi = lo + *lenp;
 816         tot_len = minlen + 2 * redzone; /* need at least this much space */
 817 
 818         /*
 819          * If hi rolled over the top, try cutting back.
 820          */
 821         if (hi < lo) {
 822                 *lenp = 0UL - lo - 1UL;
  823                 /* See if this really happens. If so, then we'll figure out why */
 824                 valid_va_range_aligned_wraparound++;
 825                 hi = lo + *lenp;
 826         }
 827         if (*lenp < tot_len) {
 828                 return (0);
 829         }
 830 
 831 #if defined(__amd64)
 832         /*
 833          * Deal with a possible hole in the address range between
 834          * hole_start and hole_end that should never be mapped.
 835          */
 836         if (lo < hole_start) {
 837                 if (hi > hole_start) {
 838                         if (hi < hole_end) {
 839                                 hi = hole_start;
 840                         } else {
 841                                 /* lo < hole_start && hi >= hole_end */
 842                                 if (dir == AH_LO) {
 843                                         /*
 844                                          * prefer lowest range
 845                                          */
 846                                         if (hole_start - lo >= tot_len)
 847                                                 hi = hole_start;
 848                                         else if (hi - hole_end >= tot_len)
 849                                                 lo = hole_end;
 850                                         else
 851                                                 return (0);
 852                                 } else {
 853                                         /*
 854                                          * prefer highest range
 855                                          */
 856                                         if (hi - hole_end >= tot_len)
 857                                                 lo = hole_end;
 858                                         else if (hole_start - lo >= tot_len)
 859                                                 hi = hole_start;
 860                                         else
 861                                                 return (0);
 862                                 }
 863                         }
 864                 }
 865         } else {
 866                 /* lo >= hole_start */
 867                 if (hi < hole_end)
 868                         return (0);
 869                 if (lo < hole_end)
 870                         lo = hole_end;
 871         }
 872 #endif
 873 
 874         if (hi - lo < tot_len)
 875                 return (0);
 876 
 877         if (align > 1) {
 878                 uintptr_t tlo = lo + redzone;
 879                 uintptr_t thi = hi - redzone;
 880                 tlo = (uintptr_t)P2PHASEUP(tlo, align, off);
 881                 if (tlo < lo + redzone) {
 882                         return (0);
 883                 }
 884                 if (thi < tlo || thi - tlo < minlen) {
 885                         return (0);
 886                 }
 887         }
 888 
 889         *basep = (caddr_t)lo;
 890         *lenp = hi - lo;
 891         return (1);
 892 }
 893 
 894 /*
 895  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 896  * addresses at least "minlen" long.  On success, 1 is returned and *basep
 897  * and *lenp are adjusted to describe the acceptable range.  On failure, 0
 898  * is returned.
 899  */
 900 int
 901 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
 902 {
 903         return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
 904 }
 905 
 906 /*
  907  * Determine whether [addr, addr + len) is a valid user address range.
 908  */
 909 /*ARGSUSED*/
 910 int
 911 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
 912     caddr_t userlimit)
 913 {
 914         caddr_t eaddr = addr + len;
 915 
 916         if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
 917                 return (RANGE_BADADDR);
 918 
 919 #if defined(__amd64)
 920         /*
 921          * Check for the VA hole
 922          */
 923         if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
 924                 return (RANGE_BADADDR);
 925 #endif
 926 
 927         return (RANGE_OKAY);
 928 }
 929 
 930 /*
 931  * Return 1 if the page frame is onboard memory, else 0.
 932  */
 933 int
 934 pf_is_memory(pfn_t pf)
 935 {
 936         if (pfn_is_foreign(pf))
 937                 return (0);
 938         return (address_in_memlist(phys_install, pfn_to_pa(pf), 1));
 939 }
 940 
 941 /*
  942  * return the index of the memrange containing pfn
 943  */
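     /*
      * For example, with the default arch_memranges a pfn at or above 4G
      * returns MRI_4G (0) and a pfn below 16M returns MRI_0 (3).
      */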
 944 int
 945 memrange_num(pfn_t pfn)
 946 {
 947         int n;
 948 
 949         for (n = 0; n < nranges - 1; ++n) {
 950                 if (pfn >= memranges[n])
 951                         break;
 952         }
 953         return (n);
 954 }
 955 
 956 /*
  957  * return the mnoderange (mtype) index containing pfn
 958  */
 959 /*ARGSUSED*/
 960 int
 961 pfn_2_mtype(pfn_t pfn)
 962 {
 963 #if defined(__xpv)
 964         return (0);
 965 #else
 966         int     n;
 967 
 968         /* Always start from highest pfn and work our way down */
 969         for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
 970                 if (pfn >= mnoderanges[n].mnr_pfnlo) {
 971                         break;
 972                 }
 973         }
 974         return (n);
 975 #endif
 976 }
 977 
 978 #if !defined(__xpv)
 979 /*
 980  * is_contigpage_free:
 981  *      returns a page list of contiguous pages. It minimally has to return
 982  *      minctg pages. Caller determines minctg based on the scatter-gather
 983  *      list length.
 984  *
 985  *      pfnp is set to the next page frame to search on return.
 986  */
 987 static page_t *
 988 is_contigpage_free(
 989         pfn_t *pfnp,
 990         pgcnt_t *pgcnt,
 991         pgcnt_t minctg,
 992         uint64_t pfnseg,
 993         int iolock)
 994 {
 995         int     i = 0;
 996         pfn_t   pfn = *pfnp;
 997         page_t  *pp;
 998         page_t  *plist = NULL;
 999 
1000         /*
1001          * fail if pfn + minctg crosses a segment boundary.
 1002          * Adjust the next starting pfn to begin at the segment boundary.
1003          */
1004 
1005         if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
1006                 *pfnp = roundup(*pfnp, pfnseg + 1);
1007                 return (NULL);
1008         }
1009 
1010         do {
1011 retry:
1012                 pp = page_numtopp_nolock(pfn + i);
1013                 if ((pp == NULL) || (page_trylock(pp, SE_EXCL) == 0)) {
1014                         (*pfnp)++;
1015                         break;
1016                 }
1017                 if (page_pptonum(pp) != pfn + i) {
1018                         page_unlock(pp);
1019                         goto retry;
1020                 }
1021 
1022                 if (!(PP_ISFREE(pp))) {
1023                         page_unlock(pp);
1024                         (*pfnp)++;
1025                         break;
1026                 }
1027 
1028                 if (!PP_ISAGED(pp)) {
1029                         page_list_sub(pp, PG_CACHE_LIST);
1030                         page_hashout(pp, (kmutex_t *)NULL);
1031                 } else {
1032                         page_list_sub(pp, PG_FREE_LIST);
1033                 }
1034 
1035                 if (iolock)
1036                         page_io_lock(pp);
1037                 page_list_concat(&plist, &pp);
1038 
1039                 /*
1040                  * exit loop when pgcnt satisfied or segment boundary reached.
1041                  */
1042 
1043         } while ((++i < *pgcnt) && ((pfn + i) & pfnseg));
1044 
1045         *pfnp += i;             /* set to next pfn to search */
1046 
1047         if (i >= minctg) {
1048                 *pgcnt -= i;
1049                 return (plist);
1050         }
1051 
1052         /*
1053          * failure: minctg not satisfied.
1054          *
1055          * if next request crosses segment boundary, set next pfn
1056          * to search from the segment boundary.
1057          */
1058         if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
1059                 *pfnp = roundup(*pfnp, pfnseg + 1);
1060 
1061         /* clean up any pages already allocated */
1062 
1063         while (plist) {
1064                 pp = plist;
1065                 page_sub(&plist, pp);
1066                 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
1067                 if (iolock)
1068                         page_io_unlock(pp);
1069                 page_unlock(pp);
1070         }
1071 
1072         return (NULL);
1073 }
1074 #endif  /* !__xpv */
1075 
1076 /*
 1077  * verify that pages returned from the allocator have the correct DMA attributes
1078  */
1079 #ifndef DEBUG
1080 #define check_dma(a, b, c) (void)(0)
1081 #else
1082 static void
1083 check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
1084 {
1085         if (dma_attr == NULL)
1086                 return;
1087 
1088         while (cnt-- > 0) {
1089                 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) <
1090                     dma_attr->dma_attr_addr_lo)
1091                         panic("PFN (pp=%p) below dma_attr_addr_lo", (void *)pp);
1092                 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >=
1093                     dma_attr->dma_attr_addr_hi)
1094                         panic("PFN (pp=%p) above dma_attr_addr_hi", (void *)pp);
1095                 pp = pp->p_next;
1096         }
1097 }
1098 #endif
1099 
1100 #if !defined(__xpv)
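     /*
      * Allocate *pgcnt pages satisfying the DMA attributes in mattr, built
      * from runs of at least minctg (*pgcnt divided by the scatter-gather
      * list length) physically contiguous pages.  The search is bounded by
      * dma_attr_addr_lo/hi and honors the segment and alignment
      * constraints; the static startpfn remembers where the previous
      * search left off.
      */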
1101 static page_t *
1102 page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
1103 {
1104         pfn_t           pfn;
1105         int             sgllen;
1106         uint64_t        pfnseg;
1107         pgcnt_t         minctg;
1108         page_t          *pplist = NULL, *plist;
1109         uint64_t        lo, hi;
1110         pgcnt_t         pfnalign = 0;
1111         static pfn_t    startpfn;
1112         static pgcnt_t  lastctgcnt;
1113         uintptr_t       align;
1114 
1115         CONTIG_LOCK();
1116 
1117         if (mattr) {
1118                 lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
1119                 hi = mmu_btop(mattr->dma_attr_addr_hi);
1120                 if (hi >= physmax)
1121                         hi = physmax - 1;
1122                 sgllen = mattr->dma_attr_sgllen;
1123                 pfnseg = mmu_btop(mattr->dma_attr_seg);
1124 
1125                 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
1126                 if (align > MMU_PAGESIZE)
1127                         pfnalign = mmu_btop(align);
1128 
1129                 /*
1130                  * in order to satisfy the request, must minimally
1131                  * acquire minctg contiguous pages
1132                  */
1133                 minctg = howmany(*pgcnt, sgllen);
1134 
1135                 ASSERT(hi >= lo);
1136 
1137                 /*
 1138                  * start from where we last searched if minctg >= lastctgcnt
1139                  */
1140                 if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
1141                         startpfn = lo;
1142         } else {
1143                 hi = physmax - 1;
1144                 lo = 0;
1145                 sgllen = 1;
1146                 pfnseg = mmu.highest_pfn;
1147                 minctg = *pgcnt;
1148 
1149                 if (minctg < lastctgcnt)
1150                         startpfn = lo;
1151         }
1152         lastctgcnt = minctg;
1153 
1154         ASSERT(pfnseg + 1 >= (uint64_t)minctg);
1155 
1156         /* conserve 16m memory - start search above 16m when possible */
1157         if (hi > PFN_16M && startpfn < PFN_16M)
1158                 startpfn = PFN_16M;
1159 
1160         pfn = startpfn;
1161         if (pfnalign)
1162                 pfn = P2ROUNDUP(pfn, pfnalign);
1163 
1164         while (pfn + minctg - 1 <= hi) {
1165 
1166                 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
1167                 if (plist) {
1168                         page_list_concat(&pplist, &plist);
1169                         sgllen--;
1170                         /*
1171                          * return when contig pages no longer needed
1172                          */
1173                         if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
1174                                 startpfn = pfn;
1175                                 CONTIG_UNLOCK();
1176                                 check_dma(mattr, pplist, *pgcnt);
1177                                 return (pplist);
1178                         }
1179                         minctg = howmany(*pgcnt, sgllen);
1180                 }
1181                 if (pfnalign)
1182                         pfn = P2ROUNDUP(pfn, pfnalign);
1183         }
1184 
1185         /* cannot find contig pages in specified range */
1186         if (startpfn == lo) {
1187                 CONTIG_UNLOCK();
1188                 return (NULL);
1189         }
1190 
1191         /* did not start with lo previously */
1192         pfn = lo;
1193         if (pfnalign)
1194                 pfn = P2ROUNDUP(pfn, pfnalign);
1195 
1196         /* allow search to go above startpfn */
1197         while (pfn < startpfn) {
1198 
1199                 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
1200                 if (plist != NULL) {
1201 
1202                         page_list_concat(&pplist, &plist);
1203                         sgllen--;
1204 
1205                         /*
1206                          * return when contig pages no longer needed
1207                          */
1208                         if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
1209                                 startpfn = pfn;
1210                                 CONTIG_UNLOCK();
1211                                 check_dma(mattr, pplist, *pgcnt);
1212                                 return (pplist);
1213                         }
1214                         minctg = howmany(*pgcnt, sgllen);
1215                 }
1216                 if (pfnalign)
1217                         pfn = P2ROUNDUP(pfn, pfnalign);
1218         }
1219         CONTIG_UNLOCK();
1220         return (NULL);
1221 }
1222 #endif  /* !__xpv */
1223 
1224 /*
1225  * mnode_range_cnt() calculates the number of memory ranges for mnode and
1226  * memranges[]. Used to determine the size of page lists and mnoderanges.
1227  */
1228 int
1229 mnode_range_cnt(int mnode)
1230 {
1231 #if defined(__xpv)
1232         ASSERT(mnode == 0);
1233         return (1);
1234 #else   /* __xpv */
1235         int     mri;
1236         int     mnrcnt = 0;
1237 
1238         if (mem_node_config[mnode].exists != 0) {
1239                 mri = nranges - 1;
1240 
 1241                 /* find the memranges index containing the mnode's base pfn */
1242 
1243                 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1244                         mri--;
1245 
1246                 /*
1247                  * increment mnode range counter when memranges or mnode
1248                  * boundary is reached.
1249                  */
1250                 while (mri >= 0 &&
1251                     mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1252                         mnrcnt++;
1253                         if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1254                                 mri--;
1255                         else
1256                                 break;
1257                 }
1258         }
1259         ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
1260         return (mnrcnt);
1261 #endif  /* __xpv */
1262 }
1263 
1264 /*
1265  * mnode_range_setup() initializes mnoderanges.
1266  */
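     /*
      * On return the entries are linked through mnr_next in order of
      * decreasing mnr_pfnlo, starting at mtypetop and ending at mtype16m
      * (the range containing pfn 0).
      */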
1267 void
1268 mnode_range_setup(mnoderange_t *mnoderanges)
1269 {
1270         mnoderange_t *mp = mnoderanges;
1271         int     mnode, mri;
1272         int     mindex = 0;     /* current index into mnoderanges array */
1273         int     i, j;
1274         pfn_t   hipfn;
1275         int     last, hi;
1276 
1277         for (mnode = 0; mnode < max_mem_nodes; mnode++) {
1278                 if (mem_node_config[mnode].exists == 0)
1279                         continue;
1280 
1281                 mri = nranges - 1;
1282 
1283                 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1284                         mri--;
1285 
1286                 while (mri >= 0 && mem_node_config[mnode].physmax >=
1287                     MEMRANGELO(mri)) {
1288                         mnoderanges->mnr_pfnlo = MAX(MEMRANGELO(mri),
1289                             mem_node_config[mnode].physbase);
1290                         mnoderanges->mnr_pfnhi = MIN(MEMRANGEHI(mri),
1291                             mem_node_config[mnode].physmax);
1292                         mnoderanges->mnr_mnode = mnode;
1293                         mnoderanges->mnr_memrange = mri;
1294                         mnoderanges->mnr_exists = 1;
1295                         mnoderanges++;
1296                         mindex++;
1297                         if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1298                                 mri--;
1299                         else
1300                                 break;
1301                 }
1302         }
1303 
1304         /*
1305          * For now do a simple sort of the mnoderanges array to fill in
1306          * the mnr_next fields.  Since mindex is expected to be relatively
 1307  * small, we use a simple O(N^2) algorithm.
1308          */
1309         for (i = 0; i < mindex; i++) {
1310                 if (mp[i].mnr_pfnlo == 0)       /* find lowest */
1311                         break;
1312         }
1313         ASSERT(i < mindex);
1314         last = i;
1315         mtype16m = last;
1316         mp[last].mnr_next = -1;
1317         for (i = 0; i < mindex - 1; i++) {
1318                 hipfn = (pfn_t)(-1);
1319                 hi = -1;
1320                 /* find next highest mnode range */
1321                 for (j = 0; j < mindex; j++) {
1322                         if (mp[j].mnr_pfnlo > mp[last].mnr_pfnlo &&
1323                             mp[j].mnr_pfnlo < hipfn) {
1324                                 hipfn = mp[j].mnr_pfnlo;
1325                                 hi = j;
1326                         }
1327                 }
1328                 mp[hi].mnr_next = last;
1329                 last = hi;
1330         }
1331         mtypetop = last;
1332 }
1333 
1334 #ifndef __xpv
1335 /*
1336  * Update mnoderanges for memory hot-add DR operations.
1337  */
1338 static void
1339 mnode_range_add(int mnode)
1340 {
1341         int     *prev;
1342         int     n, mri;
1343         pfn_t   start, end;
1344         extern  void membar_sync(void);
1345 
1346         ASSERT(0 <= mnode && mnode < max_mem_nodes);
1347         ASSERT(mem_node_config[mnode].exists);
1348         start = mem_node_config[mnode].physbase;
1349         end = mem_node_config[mnode].physmax;
1350         ASSERT(start <= end);
1351         mutex_enter(&mnoderange_lock);
1352 
1353 #ifdef  DEBUG
1354         /* Check whether it interleaves with other memory nodes. */
1355         for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1356                 ASSERT(mnoderanges[n].mnr_exists);
1357                 if (mnoderanges[n].mnr_mnode == mnode)
1358                         continue;
1359                 ASSERT(start > mnoderanges[n].mnr_pfnhi ||
1360                     end < mnoderanges[n].mnr_pfnlo);
1361         }
1362 #endif  /* DEBUG */
1363 
1364         mri = nranges - 1;
1365         while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1366                 mri--;
1367         while (mri >= 0 && mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1368                 /* Check whether mtype already exists. */
1369                 for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1370                         if (mnoderanges[n].mnr_mnode == mnode &&
1371                             mnoderanges[n].mnr_memrange == mri) {
1372                                 mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri),
1373                                     start);
1374                                 mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri),
1375                                     end);
1376                                 break;
1377                         }
1378                 }
1379 
1380                 /* Add a new entry if it doesn't exist yet. */
1381                 if (n == -1) {
1382                         /* Try to find an unused entry in mnoderanges array. */
1383                         for (n = 0; n < mnoderangecnt; n++) {
1384                                 if (mnoderanges[n].mnr_exists == 0)
1385                                         break;
1386                         }
1387                         ASSERT(n < mnoderangecnt);
1388                         mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri), start);
1389                         mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri), end);
1390                         mnoderanges[n].mnr_mnode = mnode;
1391                         mnoderanges[n].mnr_memrange = mri;
1392                         mnoderanges[n].mnr_exists = 1;
1393                         /* Page 0 should always be present. */
1394                         for (prev = &mtypetop;
1395                             mnoderanges[*prev].mnr_pfnlo > start;
1396                             prev = &mnoderanges[*prev].mnr_next) {
1397                                 ASSERT(mnoderanges[*prev].mnr_next >= 0);
1398                                 ASSERT(mnoderanges[*prev].mnr_pfnlo > end);
1399                         }
1400                         mnoderanges[n].mnr_next = *prev;
1401                         membar_sync();
1402                         *prev = n;
1403                 }
1404 
1405                 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1406                         mri--;
1407                 else
1408                         break;
1409         }
1410 
1411         mutex_exit(&mnoderange_lock);
1412 }
1413 
1414 /*
1415  * Update mnoderanges for memory hot-removal DR operations.
1416  */
1417 static void
1418 mnode_range_del(int mnode)
1419 {
1420         _NOTE(ARGUNUSED(mnode));
1421         ASSERT(0 <= mnode && mnode < max_mem_nodes);
1422         /* TODO: support deletion operation. */
1423         ASSERT(0);
1424 }
1425 
1426 void
1427 plat_slice_add(pfn_t start, pfn_t end)
1428 {
1429         mem_node_add_slice(start, end);
1430         if (plat_dr_enabled()) {
1431                 mnode_range_add(PFN_2_MEM_NODE(start));
1432         }
1433 }
1434 
1435 void
1436 plat_slice_del(pfn_t start, pfn_t end)
1437 {
1438         ASSERT(PFN_2_MEM_NODE(start) == PFN_2_MEM_NODE(end));
1439         ASSERT(plat_dr_enabled());
1440         mnode_range_del(PFN_2_MEM_NODE(start));
1441         mem_node_del_slice(start, end);
1442 }
1443 #endif  /* __xpv */
1444 
1445 /*ARGSUSED*/
1446 int
1447 mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz)
1448 {
1449         int mtype = mtypetop;
1450 
1451 #if !defined(__xpv)
1452 #if defined(__i386)
1453         /*
1454          * set the mtype range
1455          * - kmem requests need to be below 4g if restricted_kmemalloc is set.
1456          * - for non kmem requests, set range to above 4g if memory below 4g
1457          * runs low.
1458          */
1459         if (restricted_kmemalloc && VN_ISKAS(vp) &&
1460             (caddr_t)(vaddr) >= kernelheap &&
1461             (caddr_t)(vaddr) < ekernelheap) {
1462                 ASSERT(physmax4g);
1463                 mtype = mtype4g;
1464                 if (RESTRICT16M_ALLOC(freemem4g - btop(pgsz),
1465                     btop(pgsz), *flags)) {
1466                         *flags |= PGI_MT_RANGE16M;
1467                 } else {
1468                         VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1469                         VM_STAT_COND_ADD((*flags & PG_PANIC),
1470                             vmm_vmstats.pgpanicalloc);
1471                         *flags |= PGI_MT_RANGE0;
1472                 }
1473                 return (mtype);
1474         }
1475 #endif  /* __i386 */
1476 
1477         if (RESTRICT4G_ALLOC) {
1478                 VM_STAT_ADD(vmm_vmstats.restrict4gcnt);
1479                 /* here only for > 4g systems */
1480                 *flags |= PGI_MT_RANGE4G;
1481         } else if (RESTRICT16M_ALLOC(freemem, btop(pgsz), *flags)) {
1482                 *flags |= PGI_MT_RANGE16M;
1483         } else {
1484                 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1485                 VM_STAT_COND_ADD((*flags & PG_PANIC), vmm_vmstats.pgpanicalloc);
1486                 *flags |= PGI_MT_RANGE0;
1487         }
1488 #endif /* !__xpv */
1489         return (mtype);
1490 }
1491 
1492 
1493 /* mtype init for page_get_replacement_page */
1494 /*ARGSUSED*/
1495 int
1496 mtype_pgr_init(int *flags, page_t *pp, int mnode, pgcnt_t pgcnt)
1497 {
1498         int mtype = mtypetop;
1499 #if !defined(__xpv)
1500         if (RESTRICT16M_ALLOC(freemem, pgcnt, *flags)) {
1501                 *flags |= PGI_MT_RANGE16M;
1502         } else {
1503                 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1504                 *flags |= PGI_MT_RANGE0;
1505         }
1506 #endif
1507         return (mtype);
1508 }
1509 
1510 /*
1511  * Determine if the mnode range specified in mtype contains memory belonging
1512  * to memory node mnode.  If flags & PGI_MT_RANGE is set then mtype contains
1513  * the range from high pfn to 0, 16m or 4g.
1514  *
 1515  * Return the first mnode range type index found; otherwise return -1.
1516  */
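/*
 * A typical caller walks every range belonging to a node, e.g. (a sketch of
 * the pattern used by mnode_pgcnt() below):
 *
 *	mtype = mtype_func(mnode, mtypetop, PGI_MT_RANGE0);
 *	while (mtype != -1) {
 *		... use mtype ...
 *		mtype = mtype_func(mnode, mtype, PGI_MT_RANGE0 | PGI_MT_NEXT);
 *	}
 */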
1517 int
1518 mtype_func(int mnode, int mtype, uint_t flags)
1519 {
1520         if (flags & PGI_MT_RANGE) {
1521                 int     mnr_lim = MRI_0;
1522 
1523                 if (flags & PGI_MT_NEXT) {
1524                         mtype = mnoderanges[mtype].mnr_next;
1525                 }
1526                 if (flags & PGI_MT_RANGE4G)
1527                         mnr_lim = MRI_4G;       /* exclude 0-4g range */
1528                 else if (flags & PGI_MT_RANGE16M)
1529                         mnr_lim = MRI_16M;      /* exclude 0-16m range */
1530                 while (mtype != -1 &&
1531                     mnoderanges[mtype].mnr_memrange <= mnr_lim) {
1532                         if (mnoderanges[mtype].mnr_mnode == mnode)
1533                                 return (mtype);
1534                         mtype = mnoderanges[mtype].mnr_next;
1535                 }
1536         } else if (mnoderanges[mtype].mnr_mnode == mnode) {
1537                 return (mtype);
1538         }
1539         return (-1);
1540 }
1541 
/*
 * Update the page list max counts (maxmem4g) for the pfn range specified by
 * the input parameters.  cnt is negative for memory removal.  This is only
 * needed when running with more than 4g of memory (physmax4g).
 */
1546 void
1547 mtype_modify_max(pfn_t startpfn, long cnt)
1548 {
1549         int             mtype;
1550         pgcnt_t         inc;
1551         spgcnt_t        scnt = (spgcnt_t)(cnt);
1552         pgcnt_t         acnt = ABS(scnt);
1553         pfn_t           endpfn = startpfn + acnt;
1554         pfn_t           pfn, lo;
1555 
1556         if (!physmax4g)
1557                 return;
1558 
1559         mtype = mtypetop;
1560         for (pfn = endpfn; pfn > startpfn; ) {
1561                 ASSERT(mtype != -1);
1562                 lo = mnoderanges[mtype].mnr_pfnlo;
1563                 if (pfn > lo) {
1564                         if (startpfn >= lo) {
1565                                 inc = pfn - startpfn;
1566                         } else {
1567                                 inc = pfn - lo;
1568                         }
1569                         if (mnoderanges[mtype].mnr_memrange != MRI_4G) {
1570                                 if (scnt > 0)
1571                                         maxmem4g += inc;
1572                                 else
1573                                         maxmem4g -= inc;
1574                         }
1575                         pfn -= inc;
1576                 }
1577                 mtype = mnoderanges[mtype].mnr_next;
1578         }
1579 }
1580 
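/*
 * Return the memory range index (MRI_*) that the given mtype falls in.
 */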
1581 int
1582 mtype_2_mrange(int mtype)
1583 {
1584         return (mnoderanges[mtype].mnr_memrange);
1585 }
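/*
 * Return the pfn bounds [*pfnlo, *pfnhi] of the mnode range identified by
 * mtype; the range must belong to memory node mnode.
 */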
1586 
1587 void
1588 mnodetype_2_pfn(int mnode, int mtype, pfn_t *pfnlo, pfn_t *pfnhi)
1589 {
1590         _NOTE(ARGUNUSED(mnode));
1591         ASSERT(mnoderanges[mtype].mnr_mnode == mnode);
1592         *pfnlo = mnoderanges[mtype].mnr_pfnlo;
1593         *pfnhi = mnoderanges[mtype].mnr_pfnhi;
1594 }
1595 
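/*
 * Add the space needed for the per-mtype page counters (kept on DEBUG
 * kernels only) to ctrs_sz and return the new size.
 */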
1596 size_t
1597 plcnt_sz(size_t ctrs_sz)
1598 {
1599 #ifdef DEBUG
1600         int     szc, colors;
1601 
1602         ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) * mmu_page_sizes;
1603         for (szc = 0; szc < mmu_page_sizes; szc++) {
1604                 colors = page_get_pagecolors(szc);
1605                 ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * colors;
1606         }
1607 #endif
1608         return (ctrs_sz);
1609 }
1610 
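/*
 * Carve the per-mtype counter structures (DEBUG kernels only) out of the
 * buffer at addr and return the first unused address.
 */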
1611 caddr_t
1612 plcnt_init(caddr_t addr)
1613 {
1614 #ifdef DEBUG
1615         int     mt, szc, colors;
1616 
1617         for (mt = 0; mt < mnoderangecnt; mt++) {
1618                 mnoderanges[mt].mnr_mts = (struct mnr_mts *)addr;
1619                 addr += (sizeof (struct mnr_mts) * mmu_page_sizes);
1620                 for (szc = 0; szc < mmu_page_sizes; szc++) {
1621                         colors = page_get_pagecolors(szc);
1622                         mnoderanges[mt].mnr_mts[szc].mnr_mts_colors = colors;
1623                         mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt =
1624                             (pgcnt_t *)addr;
1625                         addr += (sizeof (pgcnt_t) * colors);
1626                 }
1627         }
1628 #endif
1629         return (addr);
1630 }
1631 
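/*
 * Adjust the page list counters of the given mtype by cnt (which may be
 * negative), keeping freemem4g and the cache/free list totals up to date.
 */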
1632 void
1633 plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags)
1634 {
1635         _NOTE(ARGUNUSED(pp));
1636 #ifdef DEBUG
1637         int     bin = PP_2_BIN(pp);
1638 
1639         atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mts_pgcnt, cnt);
1640         atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mtsc_pgcnt[bin],
1641             cnt);
1642 #endif
1643         ASSERT(mtype == PP_2_MTYPE(pp));
1644         if (physmax4g && mnoderanges[mtype].mnr_memrange != MRI_4G)
1645                 atomic_add_long(&freemem4g, cnt);
1646         if (flags & PG_CACHE_LIST)
1647                 atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, cnt);
1648         else
1649                 atomic_add_long(&mnoderanges[mtype].mnr_mt_flpgcnt[szc], cnt);
1650         atomic_add_long(&mnoderanges[mtype].mnr_mt_totcnt, cnt);
1651 }
1652 
1653 /*
1654  * Returns the free page count for mnode
1655  */
1656 int
1657 mnode_pgcnt(int mnode)
1658 {
1659         int     mtype = mtypetop;
1660         int     flags = PGI_MT_RANGE0;
1661         pgcnt_t pgcnt = 0;
1662 
1663         mtype = mtype_func(mnode, mtype, flags);
1664 
1665         while (mtype != -1) {
1666                 pgcnt += MTYPE_FREEMEM(mtype);
1667                 mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
1668         }
1669         return (pgcnt);
1670 }
1671 
1672 /*
1673  * Initialize page coloring variables based on the l2 cache parameters.
1674  * Calculate and return memory needed for page coloring data structures.
1675  */
1676 size_t
1677 page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
1678 {
1679         _NOTE(ARGUNUSED(l2_linesz));
1680         size_t  colorsz = 0;
1681         int     i;
1682         int     colors;
1683 
1684 #if defined(__xpv)
1685         /*
1686          * Hypervisor domains currently don't have any concept of NUMA.
1687          * Hence we'll act like there is only 1 memrange.
1688          */
1689         i = memrange_num(1);
1690 #else /* !__xpv */
1691         /*
1692          * Reduce the memory ranges lists if we don't have large amounts
1693          * of memory. This avoids searching known empty free lists.
1694          * To support memory DR operations, we need to keep memory ranges
1695          * for possible memory hot-add operations.
1696          */
1697         if (plat_dr_physmax > physmax)
1698                 i = memrange_num(plat_dr_physmax);
1699         else
1700                 i = memrange_num(physmax);
1701 #if defined(__i386)
1702         if (i > MRI_4G)
1703                 restricted_kmemalloc = 0;
1704 #endif
1705         /* physmax greater than 4g */
1706         if (i == MRI_4G)
1707                 physmax4g = 1;
1708 #endif /* !__xpv */
1709         memranges += i;
1710         nranges -= i;
1711 
1712         ASSERT(mmu_page_sizes <= MMU_PAGE_SIZES);
1713 
1714         ASSERT(ISP2(l2_linesz));
1715         ASSERT(l2_sz > MMU_PAGESIZE);
1716 
1717         /* l2_assoc is 0 for fully associative l2 cache */
1718         if (l2_assoc)
1719                 l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
1720         else
1721                 l2_colors = 1;
1722 
1723         ASSERT(ISP2(l2_colors));
1724 
1725         /* for scalability, configure at least PAGE_COLORS_MIN color bins */
1726         page_colors = MAX(l2_colors, PAGE_COLORS_MIN);
1727 
1728         /*
1729          * cpu_page_colors is non-zero when a page color may be spread across
1730          * multiple bins.
1731          */
1732         if (l2_colors < page_colors)
1733                 cpu_page_colors = l2_colors;
1734 
1735         ASSERT(ISP2(page_colors));
1736 
1737         page_colors_mask = page_colors - 1;
1738 
1739         ASSERT(ISP2(CPUSETSIZE()));
1740         page_coloring_shift = lowbit(CPUSETSIZE());
1741 
1742         /* initialize number of colors per page size */
1743         for (i = 0; i <= mmu.max_page_level; i++) {
1744                 hw_page_array[i].hp_size = LEVEL_SIZE(i);
1745                 hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
1746                 hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
1747                 hw_page_array[i].hp_colors = (page_colors_mask >>
1748                     (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
1749                     + 1;
1750                 colorequivszc[i] = 0;
1751         }
1752 
1753         /*
1754          * The value of cpu_page_colors determines if additional color bins
1755          * need to be checked for a particular color in the page_get routines.
1756          */
1757         if (cpu_page_colors != 0) {
1758 
1759                 int a = lowbit(page_colors) - lowbit(cpu_page_colors);
1760                 ASSERT(a > 0);
1761                 ASSERT(a < 16);
1762 
1763                 for (i = 0; i <= mmu.max_page_level; i++) {
1764                         if ((colors = hw_page_array[i].hp_colors) <= 1) {
1765                                 colorequivszc[i] = 0;
1766                                 continue;
1767                         }
1768                         while ((colors >> a) == 0)
1769                                 a--;
1770                         ASSERT(a >= 0);
1771 
1772                         /* higher 4 bits encodes color equiv mask */
1773                         colorequivszc[i] = (a << 4);
1774                 }
1775         }
1776 
1777         /* factor in colorequiv to check additional 'equivalent' bins. */
1778         if (colorequiv > 1) {
1779 
1780                 int a = lowbit(colorequiv) - 1;
1781                 if (a > 15)
1782                         a = 15;
1783 
1784                 for (i = 0; i <= mmu.max_page_level; i++) {
1785                         if ((colors = hw_page_array[i].hp_colors) <= 1) {
1786                                 continue;
1787                         }
1788                         while ((colors >> a) == 0)
1789                                 a--;
1790                         if ((a << 4) > colorequivszc[i]) {
1791                                 colorequivszc[i] = (a << 4);
1792                         }
1793                 }
1794         }
1795 
1796         /* size for mnoderanges */
1797         for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++)
1798                 mnoderangecnt += mnode_range_cnt(i);
1799         if (plat_dr_support_memory()) {
1800                 /*
1801                  * Reserve enough space for memory DR operations.
                 * Two extra mnoderanges for possible fragmentations,
1803                  * one for the 2G boundary and the other for the 4G boundary.
1804                  * We don't expect a memory board crossing the 16M boundary
1805                  * for memory hot-add operations on x86 platforms.
1806                  */
1807                 mnoderangecnt += 2 + max_mem_nodes - lgrp_plat_node_cnt;
1808         }
1809         colorsz = mnoderangecnt * sizeof (mnoderange_t);
1810 
1811         /* size for fpc_mutex and cpc_mutex */
1812         colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);
1813 
1814         /* size of page_freelists */
1815         colorsz += mnoderangecnt * sizeof (page_t ***);
1816         colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);
1817 
1818         for (i = 0; i < mmu_page_sizes; i++) {
1819                 colors = page_get_pagecolors(i);
1820                 colorsz += mnoderangecnt * colors * sizeof (page_t *);
1821         }
1822 
1823         /* size of page_cachelists */
1824         colorsz += mnoderangecnt * sizeof (page_t **);
1825         colorsz += mnoderangecnt * page_colors * sizeof (page_t *);
1826 
1827         return (colorsz);
1828 }
1829 
/*
 * Called once at startup to carve the preallocated page coloring memory into
 * the mnoderange, page list mutex, page_freelists and page_cachelists data
 * structures.
 */
1833  */
1834 void
1835 page_coloring_setup(caddr_t pcmemaddr)
1836 {
1837         int     i;
1838         int     j;
1839         int     k;
1840         caddr_t addr;
1841         int     colors;
1842 
1843         /*
1844          * do page coloring setup
1845          */
1846         addr = pcmemaddr;
1847 
1848         mnoderanges = (mnoderange_t *)addr;
1849         addr += (mnoderangecnt * sizeof (mnoderange_t));
1850 
1851         mnode_range_setup(mnoderanges);
1852 
1853         if (physmax4g)
1854                 mtype4g = pfn_2_mtype(0xfffff);
1855 
1856         for (k = 0; k < NPC_MUTEX; k++) {
1857                 fpc_mutex[k] = (kmutex_t *)addr;
1858                 addr += (max_mem_nodes * sizeof (kmutex_t));
1859         }
1860         for (k = 0; k < NPC_MUTEX; k++) {
1861                 cpc_mutex[k] = (kmutex_t *)addr;
1862                 addr += (max_mem_nodes * sizeof (kmutex_t));
1863         }
1864         page_freelists = (page_t ****)addr;
1865         addr += (mnoderangecnt * sizeof (page_t ***));
1866 
1867         page_cachelists = (page_t ***)addr;
1868         addr += (mnoderangecnt * sizeof (page_t **));
1869 
1870         for (i = 0; i < mnoderangecnt; i++) {
1871                 page_freelists[i] = (page_t ***)addr;
1872                 addr += (mmu_page_sizes * sizeof (page_t **));
1873 
1874                 for (j = 0; j < mmu_page_sizes; j++) {
1875                         colors = page_get_pagecolors(j);
1876                         page_freelists[i][j] = (page_t **)addr;
1877                         addr += (colors * sizeof (page_t *));
1878                 }
1879                 page_cachelists[i] = (page_t **)addr;
1880                 addr += (page_colors * sizeof (page_t *));
1881         }
1882 }
1883 
1884 #if defined(__xpv)
1885 /*
1886  * Give back 10% of the io_pool pages to the free list.
1887  * Don't shrink the pool below some absolute minimum.
1888  */
1889 static void
1890 page_io_pool_shrink()
1891 {
1892         int retcnt;
1893         page_t *pp, *pp_first, *pp_last, **curpool;
1894         mfn_t mfn;
1895         int bothpools = 0;
1896 
1897         mutex_enter(&io_pool_lock);
1898         io_pool_shrink_attempts++;      /* should be a kstat? */
1899         retcnt = io_pool_cnt / 10;
1900         if (io_pool_cnt - retcnt < io_pool_cnt_min)
1901                 retcnt = io_pool_cnt - io_pool_cnt_min;
1902         if (retcnt <= 0)
1903                 goto done;
1904         io_pool_shrinks++;      /* should be a kstat? */
1905         curpool = &io_pool_4g;
1906 domore:
1907         /*
         * Loop through, taking pages from the end of the list
         * (highest mfns) until the amount to return is reached.
1910          */
1911         for (pp = *curpool; pp && retcnt > 0; ) {
1912                 pp_first = pp_last = pp->p_prev;
1913                 if (pp_first == *curpool)
1914                         break;
1915                 retcnt--;
1916                 io_pool_cnt--;
1917                 page_io_pool_sub(curpool, pp_first, pp_last);
1918                 if ((mfn = pfn_to_mfn(pp->p_pagenum)) < start_mfn)
1919                         start_mfn = mfn;
1920                 page_free(pp_first, 1);
1921                 pp = *curpool;
1922         }
1923         if (retcnt != 0 && !bothpools) {
1924                 /*
                 * If not enough pages were found in the less constrained
                 * pool, try the more constrained one.
1927                  */
1928                 curpool = &io_pool_16m;
1929                 bothpools = 1;
1930                 goto domore;
1931         }
1932 done:
1933         mutex_exit(&io_pool_lock);
1934 }
1935 
1936 #endif  /* __xpv */
1937 
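/*
 * Adjust page_create() flags for this platform.  Under the hypervisor,
 * replenish the free lists from the io_pool when an urgent allocation finds
 * memory depleted; otherwise allow relocation of the base page below 4g when
 * running with more than 4g of memory.
 */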
1938 uint_t
1939 page_create_update_flags_x86(uint_t flags)
1940 {
1941 #if defined(__xpv)
1942         /*
         * If this is an urgent allocation and free pages are depleted,
         * give some io_pool pages back to the free list.
1944          */
1945         if (!(flags & PG_WAIT) && freemem < desfree)
1946                 page_io_pool_shrink();
1947 #else /* !__xpv */
1948         /*
1949          * page_create_get_something may call this because 4g memory may be
1950          * depleted. Set flags to allow for relocation of base page below
1951          * 4g if necessary.
1952          */
1953         if (physmax4g)
1954                 flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI);
1955 #endif /* __xpv */
1956         return (flags);
1957 }
1958 
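/*
 * Page color to use for a buf's pages; x86 does not need buf page coloring,
 * so always return color 0.
 */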
1959 /*ARGSUSED*/
1960 int
1961 bp_color(struct buf *bp)
1962 {
1963         return (0);
1964 }
1965 
1966 #if defined(__xpv)
1967 
1968 /*
1969  * Take pages out of an io_pool
1970  */
1971 static void
1972 page_io_pool_sub(page_t **poolp, page_t *pp_first, page_t *pp_last)
1973 {
1974         if (*poolp == pp_first) {
1975                 *poolp = pp_last->p_next;
1976                 if (*poolp == pp_first)
1977                         *poolp = NULL;
1978         }
1979         pp_first->p_prev->p_next = pp_last->p_next;
1980         pp_last->p_next->p_prev = pp_first->p_prev;
1981         pp_first->p_prev = pp_last;
1982         pp_last->p_next = pp_first;
1983 }
1984 
1985 /*
1986  * Put a page on the io_pool list. The list is ordered by increasing MFN.
1987  */
1988 static void
1989 page_io_pool_add(page_t **poolp, page_t *pp)
1990 {
1991         page_t  *look;
1992         mfn_t   mfn = mfn_list[pp->p_pagenum];
1993 
1994         if (*poolp == NULL) {
1995                 *poolp = pp;
1996                 pp->p_next = pp;
1997                 pp->p_prev = pp;
1998                 return;
1999         }
2000 
        /*
         * Since we take pages from the high end of the pool, the pages
         * being added will most likely go at or near the end of the
         * list, so start at the end and work backwards.
         */
2007         look = (*poolp)->p_prev;
2008         while (mfn < mfn_list[look->p_pagenum]) {
2009                 look = look->p_prev;
2010                 if (look == (*poolp)->p_prev)
2011                         break; /* backed all the way to front of list */
2012         }
2013 
2014         /* insert after look */
2015         pp->p_prev = look;
2016         pp->p_next = look->p_next;
2017         pp->p_next->p_prev = pp;
2018         look->p_next = pp;
2019         if (mfn < mfn_list[(*poolp)->p_pagenum]) {
2020                 /*
                 * We inserted a new first list element; adjust the pool
                 * pointer to the newly inserted element.
2023                  */
2024                 *poolp = pp;
2025         }
2026 }
2027 
2028 /*
2029  * Add a page to the io_pool.  Setting the force flag will force the page
2030  * into the io_pool no matter what.
2031  */
2032 static void
2033 add_page_to_pool(page_t *pp, int force)
2034 {
2035         page_t *highest;
2036         page_t *freep = NULL;
2037 
2038         mutex_enter(&io_pool_lock);
2039         /*
2040          * Always keep the scarce low memory pages
2041          */
2042         if (mfn_list[pp->p_pagenum] < PFN_16MEG) {
2043                 ++io_pool_cnt;
2044                 page_io_pool_add(&io_pool_16m, pp);
2045                 goto done;
2046         }
2047         if (io_pool_cnt < io_pool_cnt_max || force || io_pool_4g == NULL) {
2048                 ++io_pool_cnt;
2049                 page_io_pool_add(&io_pool_4g, pp);
2050         } else {
2051                 highest = io_pool_4g->p_prev;
2052                 if (mfn_list[pp->p_pagenum] < mfn_list[highest->p_pagenum]) {
2053                         page_io_pool_sub(&io_pool_4g, highest, highest);
2054                         page_io_pool_add(&io_pool_4g, pp);
2055                         freep = highest;
2056                 } else {
2057                         freep = pp;
2058                 }
2059         }
2060 done:
2061         mutex_exit(&io_pool_lock);
2062         if (freep)
2063                 page_free(freep, 1);
2064 }
2065 
2066 
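/*
 * The contig_pfn_list tracks free pfns sorted by their underlying mfns so
 * that runs of machine-contiguous memory can be found for DMA.  The list is
 * built lazily by create_contig_pfnlist() and maintained by
 * update_contig_pfnlist() as pfn<->mfn mappings change.
 */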
int contig_pfn_cnt;     /* number of pfns in the contig pfn list */
2068 int contig_pfn_max;     /* capacity of the contig pfn list */
2069 int next_alloc_pfn;     /* next position in list to start a contig search */
2070 int contig_pfnlist_updates;     /* pfn list update count */
2071 int contig_pfnlist_builds;      /* how many times have we (re)built list */
2072 int contig_pfnlist_buildfailed; /* how many times has list build failed */
2073 int create_contig_pending;      /* nonzero means taskq creating contig list */
2074 pfn_t *contig_pfn_list = NULL;  /* list of contig pfns in ascending mfn order */
2075 
2076 /*
2077  * Function to use in sorting a list of pfns by their underlying mfns.
2078  */
2079 static int
2080 mfn_compare(const void *pfnp1, const void *pfnp2)
2081 {
2082         mfn_t mfn1 = mfn_list[*(pfn_t *)pfnp1];
2083         mfn_t mfn2 = mfn_list[*(pfn_t *)pfnp2];
2084 
2085         if (mfn1 > mfn2)
2086                 return (1);
2087         if (mfn1 < mfn2)
2088                 return (-1);
2089         return (0);
2090 }
2091 
2092 /*
2093  * Compact the contig_pfn_list by tossing all the non-contiguous
2094  * elements from the list.
2095  */
2096 static void
2097 compact_contig_pfn_list(void)
2098 {
2099         pfn_t pfn, lapfn, prev_lapfn;
2100         mfn_t mfn;
2101         int i, newcnt = 0;
2102 
2103         prev_lapfn = 0;
2104         for (i = 0; i < contig_pfn_cnt - 1; i++) {
2105                 pfn = contig_pfn_list[i];
2106                 lapfn = contig_pfn_list[i + 1];
2107                 mfn = mfn_list[pfn];
2108                 /*
2109                  * See if next pfn is for a contig mfn
2110                  */
2111                 if (mfn_list[lapfn] != mfn + 1)
2112                         continue;
2113                 /*
2114                  * pfn and lookahead are both put in list
2115                  * unless pfn is the previous lookahead.
2116                  */
2117                 if (pfn != prev_lapfn)
2118                         contig_pfn_list[newcnt++] = pfn;
2119                 contig_pfn_list[newcnt++] = lapfn;
2120                 prev_lapfn = lapfn;
2121         }
2122         for (i = newcnt; i < contig_pfn_cnt; i++)
2123                 contig_pfn_list[i] = 0;
2124         contig_pfn_cnt = newcnt;
2125 }
2126 
2127 /*ARGSUSED*/
2128 static void
2129 call_create_contiglist(void *arg)
2130 {
2131         (void) create_contig_pfnlist(PG_WAIT);
2132 }
2133 
2134 /*
2135  * Create list of freelist pfns that have underlying
2136  * contiguous mfns.  The list is kept in ascending mfn order.
 * Returns 1 if the list was created, else 0.
2138  */
2139 static int
2140 create_contig_pfnlist(uint_t flags)
2141 {
2142         pfn_t pfn;
2143         page_t *pp;
2144         int ret = 1;
2145 
2146         mutex_enter(&contig_list_lock);
2147         if (contig_pfn_list != NULL)
2148                 goto out;
2149         contig_pfn_max = freemem + (freemem / 10);
2150         contig_pfn_list = kmem_zalloc(contig_pfn_max * sizeof (pfn_t),
2151             (flags & PG_WAIT) ? KM_SLEEP : KM_NOSLEEP);
2152         if (contig_pfn_list == NULL) {
                /*
                 * We could not create the contig list (because we could not
                 * sleep for memory).  Dispatch a taskq job that can sleep to
                 * get the memory.
                 */
2158                 if (!create_contig_pending) {
2159                         if (taskq_dispatch(system_taskq, call_create_contiglist,
2160                             NULL, TQ_NOSLEEP) != NULL)
2161                                 create_contig_pending = 1;
2162                 }
2163                 contig_pfnlist_buildfailed++;   /* count list build failures */
2164                 ret = 0;
2165                 goto out;
2166         }
2167         create_contig_pending = 0;
2168         ASSERT(contig_pfn_cnt == 0);
2169         for (pfn = 0; pfn < mfn_count; pfn++) {
2170                 pp = page_numtopp_nolock(pfn);
2171                 if (pp == NULL || !PP_ISFREE(pp))
2172                         continue;
2173                 contig_pfn_list[contig_pfn_cnt] = pfn;
2174                 if (++contig_pfn_cnt == contig_pfn_max)
2175                         break;
2176         }
2177         /*
2178          * Sanity check the new list.
2179          */
2180         if (contig_pfn_cnt < 2) { /* no contig pfns */
2181                 contig_pfn_cnt = 0;
2182                 contig_pfnlist_buildfailed++;
2183                 kmem_free(contig_pfn_list, contig_pfn_max * sizeof (pfn_t));
2184                 contig_pfn_list = NULL;
2185                 contig_pfn_max = 0;
2186                 ret = 0;
2187                 goto out;
2188         }
2189         qsort(contig_pfn_list, contig_pfn_cnt, sizeof (pfn_t), mfn_compare);
2190         compact_contig_pfn_list();
2191         /*
2192          * Make sure next search of the newly created contiguous pfn
2193          * list starts at the beginning of the list.
2194          */
2195         next_alloc_pfn = 0;
2196         contig_pfnlist_builds++;        /* count list builds */
2197 out:
2198         mutex_exit(&contig_list_lock);
2199         return (ret);
2200 }
2201 
2202 
/*
 * Toss the current contig pfnlist.  Someone is about to do a massive
 * update to pfn<->mfn mappings.  So we destroy the list and hold the
 * contig_list_lock until they are done with their update.
 */
2208 void
2209 clear_and_lock_contig_pfnlist()
2210 {
2211         pfn_t *listp = NULL;
2212         size_t listsize;
2213 
2214         mutex_enter(&contig_list_lock);
2215         if (contig_pfn_list != NULL) {
2216                 listp = contig_pfn_list;
2217                 listsize = contig_pfn_max * sizeof (pfn_t);
2218                 contig_pfn_list = NULL;
2219                 contig_pfn_max = contig_pfn_cnt = 0;
2220         }
2221         if (listp != NULL)
2222                 kmem_free(listp, listsize);
2223 }
2224 
2225 /*
2226  * Unlock the contig_pfn_list.  The next attempted use of it will cause
2227  * it to be re-created.
2228  */
2229 void
2230 unlock_contig_pfnlist()
2231 {
2232         mutex_exit(&contig_list_lock);
2233 }
2234 
2235 /*
2236  * Update the contiguous pfn list in response to a pfn <-> mfn reassignment
2237  */
2238 void
2239 update_contig_pfnlist(pfn_t pfn, mfn_t oldmfn, mfn_t newmfn)
2240 {
2241         int probe_hi, probe_lo, probe_pos, insert_after, insert_point;
2242         pfn_t probe_pfn;
2243         mfn_t probe_mfn;
2244         int drop_lock = 0;
2245 
2246         if (mutex_owner(&contig_list_lock) != curthread) {
2247                 drop_lock = 1;
2248                 mutex_enter(&contig_list_lock);
2249         }
2250         if (contig_pfn_list == NULL)
2251                 goto done;
2252         contig_pfnlist_updates++;
2253         /*
2254          * Find the pfn in the current list.  Use a binary chop to locate it.
2255          */
2256         probe_hi = contig_pfn_cnt - 1;
2257         probe_lo = 0;
2258         probe_pos = (probe_hi + probe_lo) / 2;
2259         while ((probe_pfn = contig_pfn_list[probe_pos]) != pfn) {
2260                 if (probe_pos == probe_lo) { /* pfn not in list */
2261                         probe_pos = -1;
2262                         break;
2263                 }
2264                 if (pfn_to_mfn(probe_pfn) <= oldmfn)
2265                         probe_lo = probe_pos;
2266                 else
2267                         probe_hi = probe_pos;
2268                 probe_pos = (probe_hi + probe_lo) / 2;
2269         }
2270         if (probe_pos >= 0) {
2271                 /*
2272                  * Remove pfn from list and ensure next alloc
2273                  * position stays in bounds.
2274                  */
2275                 if (--contig_pfn_cnt <= next_alloc_pfn)
2276                         next_alloc_pfn = 0;
2277                 if (contig_pfn_cnt < 2) { /* no contig pfns */
2278                         contig_pfn_cnt = 0;
2279                         kmem_free(contig_pfn_list,
2280                             contig_pfn_max * sizeof (pfn_t));
2281                         contig_pfn_list = NULL;
2282                         contig_pfn_max = 0;
2283                         goto done;
2284                 }
2285                 ovbcopy(&contig_pfn_list[probe_pos + 1],
2286                     &contig_pfn_list[probe_pos],
2287                     (contig_pfn_cnt - probe_pos) * sizeof (pfn_t));
2288         }
2289         if (newmfn == MFN_INVALID)
2290                 goto done;
2291         /*
2292          * Check if new mfn has adjacent mfns in the list
2293          */
2294         probe_hi = contig_pfn_cnt - 1;
2295         probe_lo = 0;
2296         insert_after = -2;
2297         do {
2298                 probe_pos = (probe_hi + probe_lo) / 2;
2299                 probe_mfn = pfn_to_mfn(contig_pfn_list[probe_pos]);
2300                 if (newmfn == probe_mfn + 1)
2301                         insert_after = probe_pos;
2302                 else if (newmfn == probe_mfn - 1)
2303                         insert_after = probe_pos - 1;
2304                 if (probe_pos == probe_lo)
2305                         break;
2306                 if (probe_mfn <= newmfn)
2307                         probe_lo = probe_pos;
2308                 else
2309                         probe_hi = probe_pos;
2310         } while (insert_after == -2);
2311         /*
2312          * If there is space in the list and there are adjacent mfns
         * insert the pfn into its proper place in the list.
2314          */
2315         if (insert_after != -2 && contig_pfn_cnt + 1 <= contig_pfn_max) {
2316                 insert_point = insert_after + 1;
2317                 ovbcopy(&contig_pfn_list[insert_point],
2318                     &contig_pfn_list[insert_point + 1],
2319                     (contig_pfn_cnt - insert_point) * sizeof (pfn_t));
2320                 contig_pfn_list[insert_point] = pfn;
2321                 contig_pfn_cnt++;
2322         }
2323 done:
2324         if (drop_lock)
2325                 mutex_exit(&contig_list_lock);
2326 }
2327 
2328 /*
2329  * Called to (re-)populate the io_pool from the free page lists.
2330  */
2331 long
2332 populate_io_pool(void)
2333 {
2334         pfn_t pfn;
2335         mfn_t mfn, max_mfn;
2336         page_t *pp;
2337 
2338         /*
2339          * Figure out the bounds of the pool on first invocation.
2340          * We use a percentage of memory for the io pool size.
         * We allow that to shrink, but not below a fixed minimum.
2342          */
2343         if (io_pool_cnt_max == 0) {
2344                 io_pool_cnt_max = physmem / (100 / io_pool_physmem_pct);
2345                 io_pool_cnt_lowater = io_pool_cnt_max;
2346                 /*
2347                  * This is the first time in populate_io_pool, grab a va to use
2348                  * when we need to allocate pages.
2349                  */
2350                 io_pool_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
2351         }
2352         /*
2353          * If we are out of pages in the pool, then grow the size of the pool
2354          */
2355         if (io_pool_cnt == 0) {
2356                 /*
2357                  * Grow the max size of the io pool by 5%, but never more than
2358                  * 25% of physical memory.
2359                  */
2360                 if (io_pool_cnt_max < physmem / 4)
2361                         io_pool_cnt_max += io_pool_cnt_max / 20;
2362         }
2363         io_pool_grows++;        /* should be a kstat? */
2364 
2365         /*
2366          * Get highest mfn on this platform, but limit to the 32 bit DMA max.
2367          */
2368         (void) mfn_to_pfn(start_mfn);
2369         max_mfn = MIN(cached_max_mfn, PFN_4GIG);
2370         for (mfn = start_mfn; mfn < max_mfn; start_mfn = ++mfn) {
2371                 pfn = mfn_to_pfn(mfn);
2372                 if (pfn & PFN_IS_FOREIGN_MFN)
2373                         continue;
2374                 /*
2375                  * try to allocate it from free pages
2376                  */
2377                 pp = page_numtopp_alloc(pfn);
2378                 if (pp == NULL)
2379                         continue;
2380                 PP_CLRFREE(pp);
2381                 add_page_to_pool(pp, 1);
2382                 if (io_pool_cnt >= io_pool_cnt_max)
2383                         break;
2384         }
2385 
2386         return (io_pool_cnt);
2387 }
2388 
2389 /*
2390  * Destroy a page that was being used for DMA I/O. It may or
2391  * may not actually go back to the io_pool.
2392  */
2393 void
2394 page_destroy_io(page_t *pp)
2395 {
2396         mfn_t mfn = mfn_list[pp->p_pagenum];
2397 
2398         /*
2399          * When the page was alloc'd a reservation was made, release it now
2400          */
2401         page_unresv(1);
2402         /*
2403          * Unload translations, if any, then hash out the
2404          * page to erase its identity.
2405          */
2406         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
2407         page_hashout(pp, NULL);
2408 
2409         /*
2410          * If the page came from the free lists, just put it back to them.
2411          * DomU pages always go on the free lists as well.
2412          */
2413         if (!DOMAIN_IS_INITDOMAIN(xen_info) || mfn >= PFN_4GIG) {
2414                 page_free(pp, 1);
2415                 return;
2416         }
2417 
2418         add_page_to_pool(pp, 0);
2419 }
2420 
2421 
2422 long contig_searches;           /* count of times contig pages requested */
2423 long contig_search_restarts;    /* count of contig ranges tried */
2424 long contig_search_failed;      /* count of contig alloc failures */
2425 
2426 /*
2427  * Free partial page list
2428  */
2429 static void
2430 free_partial_list(page_t **pplist)
2431 {
2432         page_t *pp;
2433 
2434         while (*pplist != NULL) {
2435                 pp = *pplist;
2436                 page_io_pool_sub(pplist, pp, pp);
2437                 page_free(pp, 1);
2438         }
2439 }
2440 
2441 /*
 * Look through the contiguous pfns that are not part of the io_pool for
2443  * contiguous free pages.  Return a list of the found pages or NULL.
2444  */
2445 page_t *
2446 find_contig_free(uint_t npages, uint_t flags, uint64_t pfnseg,
2447     pgcnt_t pfnalign)
2448 {
2449         page_t *pp, *plist = NULL;
2450         mfn_t mfn, prev_mfn, start_mfn;
2451         pfn_t pfn;
2452         int pages_needed, pages_requested;
2453         int search_start;
2454 
2455         /*
2456          * create the contig pfn list if not already done
2457          */
2458 retry:
2459         mutex_enter(&contig_list_lock);
2460         if (contig_pfn_list == NULL) {
2461                 mutex_exit(&contig_list_lock);
2462                 if (!create_contig_pfnlist(flags)) {
2463                         return (NULL);
2464                 }
2465                 goto retry;
2466         }
2467         contig_searches++;
2468         /*
2469          * Search contiguous pfn list for physically contiguous pages not in
2470          * the io_pool.  Start the search where the last search left off.
2471          */
2472         pages_requested = pages_needed = npages;
2473         search_start = next_alloc_pfn;
2474         start_mfn = prev_mfn = 0;
2475         while (pages_needed) {
2476                 pfn = contig_pfn_list[next_alloc_pfn];
2477                 mfn = pfn_to_mfn(pfn);
                /*
                 * Check that this mfn is either the first one or contiguous
                 * with the previous one, that the page corresponding to the
                 * mfn is free, and that the mfn range does not cross a
                 * segment boundary.
                 */
2483                 if ((prev_mfn == 0 || mfn == prev_mfn + 1) &&
2484                     (pp = page_numtopp_alloc(pfn)) != NULL &&
2485                     !((mfn & pfnseg) < (start_mfn & pfnseg))) {
2486                         PP_CLRFREE(pp);
2487                         page_io_pool_add(&plist, pp);
2488                         pages_needed--;
2489                         if (prev_mfn == 0) {
2490                                 if (pfnalign &&
2491                                     mfn != P2ROUNDUP(mfn, pfnalign)) {
2492                                         /*
2493                                          * not properly aligned
2494                                          */
2495                                         contig_search_restarts++;
2496                                         free_partial_list(&plist);
2497                                         pages_needed = pages_requested;
2498                                         start_mfn = prev_mfn = 0;
2499                                         goto skip;
2500                                 }
2501                                 start_mfn = mfn;
2502                         }
2503                         prev_mfn = mfn;
2504                 } else {
2505                         contig_search_restarts++;
2506                         free_partial_list(&plist);
2507                         pages_needed = pages_requested;
2508                         start_mfn = prev_mfn = 0;
2509                 }
2510 skip:
2511                 if (++next_alloc_pfn == contig_pfn_cnt)
2512                         next_alloc_pfn = 0;
2513                 if (next_alloc_pfn == search_start)
2514                         break; /* all pfns searched */
2515         }
2516         mutex_exit(&contig_list_lock);
2517         if (pages_needed) {
2518                 contig_search_failed++;
2519                 /*
                 * Failed to find enough contig pages; free the
                 * partial page list.
2522                  */
2523                 free_partial_list(&plist);
2524         }
2525         return (plist);
2526 }
2527 
2528 /*
2529  * Search the reserved io pool pages for a page range with the
2530  * desired characteristics.
2531  */
2532 page_t *
2533 page_io_pool_alloc(ddi_dma_attr_t *mattr, int contig, pgcnt_t minctg)
2534 {
2535         page_t *pp_first, *pp_last;
2536         page_t *pp, **poolp;
2537         pgcnt_t nwanted, pfnalign;
2538         uint64_t pfnseg;
2539         mfn_t mfn, tmfn, hi_mfn, lo_mfn;
2540         int align, attempt = 0;
2541 
2542         if (minctg == 1)
2543                 contig = 0;
2544         lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2545         hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2546         pfnseg = mmu_btop(mattr->dma_attr_seg);
2547         align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2548         if (align > MMU_PAGESIZE)
2549                 pfnalign = mmu_btop(align);
2550         else
2551                 pfnalign = 0;
2552 
2553 try_again:
2554         /*
2555          * See if we want pages for a legacy device
2556          */
2557         if (hi_mfn < PFN_16MEG)
2558                 poolp = &io_pool_16m;
2559         else
2560                 poolp = &io_pool_4g;
2561 try_smaller:
2562         /*
2563          * Take pages from I/O pool. We'll use pages from the highest
2564          * MFN range possible.
2565          */
2566         pp_first = pp_last = NULL;
2567         mutex_enter(&io_pool_lock);
2568         nwanted = minctg;
2569         for (pp = *poolp; pp && nwanted > 0; ) {
2570                 pp = pp->p_prev;
2571 
2572                 /*
2573                  * skip pages above allowable range
2574                  */
2575                 mfn = mfn_list[pp->p_pagenum];
2576                 if (hi_mfn < mfn)
2577                         goto skip;
2578 
2579                 /*
2580                  * stop at pages below allowable range
2581                  */
2582                 if (lo_mfn > mfn)
2583                         break;
2584 restart:
2585                 if (pp_last == NULL) {
2586                         /*
2587                          * Check alignment
2588                          */
2589                         tmfn = mfn - (minctg - 1);
2590                         if (pfnalign && tmfn != P2ROUNDUP(tmfn, pfnalign))
2591                                 goto skip; /* not properly aligned */
2592                         /*
2593                          * Check segment
2594                          */
2595                         if ((mfn & pfnseg) < (tmfn & pfnseg))
2596                                 goto skip; /* crosses seg boundary */
2597                         /*
2598                          * Start building page list
2599                          */
2600                         pp_first = pp_last = pp;
2601                         nwanted--;
2602                 } else {
2603                         /*
2604                          * check physical contiguity if required
2605                          */
2606                         if (contig &&
2607                             mfn_list[pp_first->p_pagenum] != mfn + 1) {
2608                                 /*
2609                                  * not a contiguous page, restart list.
2610                                  */
2611                                 pp_last = NULL;
2612                                 nwanted = minctg;
2613                                 goto restart;
2614                         } else { /* add page to list */
2615                                 pp_first = pp;
2616                                 nwanted--;
2617                         }
2618                 }
2619 skip:
2620                 if (pp == *poolp)
2621                         break;
2622         }
2623 
2624         /*
         * If we didn't find enough memory, try the more constrained pool,
         * then sweep free pages into the DMA pool and try again.
2627          */
2628         if (nwanted != 0) {
2629                 mutex_exit(&io_pool_lock);
2630                 /*
2631                  * If we were looking in the less constrained pool and
2632                  * didn't find pages, try the more constrained pool.
2633                  */
2634                 if (poolp == &io_pool_4g) {
2635                         poolp = &io_pool_16m;
2636                         goto try_smaller;
2637                 }
2638                 kmem_reap();
2639                 if (++attempt < 4) {
2640                         /*
2641                          * Grab some more io_pool pages
2642                          */
2643                         (void) populate_io_pool();
2644                         goto try_again; /* go around and retry */
2645                 }
2646                 return (NULL);
2647         }
2648         /*
2649          * Found the pages, now snip them from the list
2650          */
2651         page_io_pool_sub(poolp, pp_first, pp_last);
2652         io_pool_cnt -= minctg;
2653         /*
2654          * reset low water mark
2655          */
2656         if (io_pool_cnt < io_pool_cnt_lowater)
2657                 io_pool_cnt_lowater = io_pool_cnt;
2658         mutex_exit(&io_pool_lock);
2659         return (pp_first);
2660 }
2661 
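/*
 * Allocate minctg pages and then exchange their underlying machine pages
 * with the hypervisor (via the balloon driver) for ones that satisfy the
 * address width and contiguity constraints in mattr.
 */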
2662 page_t *
2663 page_swap_with_hypervisor(struct vnode *vp, u_offset_t off, caddr_t vaddr,
2664     ddi_dma_attr_t *mattr, uint_t flags, pgcnt_t minctg)
2665 {
2666         uint_t kflags;
2667         int order, extra, extpages, i, contig, nbits, extents;
2668         page_t *pp, *expp, *pp_first, **pplist = NULL;
2669         mfn_t *mfnlist = NULL;
2670 
2671         contig = flags & PG_PHYSCONTIG;
2672         if (minctg == 1)
2673                 contig = 0;
2674         flags &= ~PG_PHYSCONTIG;
2675         kflags = flags & PG_WAIT ? KM_SLEEP : KM_NOSLEEP;
2676         /*
         * The hypervisor allocates extents; if we want contig pages, the
         * extent must be >= minctg.
2679          */
2680         if (contig) {
2681                 order = highbit(minctg) - 1;
2682                 if (minctg & ((1 << order) - 1))
2683                         order++;
2684                 extpages = 1 << order;
2685         } else {
2686                 order = 0;
2687                 extpages = minctg;
2688         }
2689         if (extpages > minctg) {
2690                 extra = extpages - minctg;
2691                 if (!page_resv(extra, kflags))
2692                         return (NULL);
2693         }
2694         pp_first = NULL;
2695         pplist = kmem_alloc(extpages * sizeof (page_t *), kflags);
2696         if (pplist == NULL)
2697                 goto balloon_fail;
2698         mfnlist = kmem_alloc(extpages * sizeof (mfn_t), kflags);
2699         if (mfnlist == NULL)
2700                 goto balloon_fail;
2701         pp = page_create_va(vp, off, minctg * PAGESIZE, flags, &kvseg, vaddr);
2702         if (pp == NULL)
2703                 goto balloon_fail;
2704         pp_first = pp;
2705         if (extpages > minctg) {
2706                 /*
2707                  * fill out the rest of extent pages to swap
2708                  * with the hypervisor
2709                  */
2710                 for (i = 0; i < extra; i++) {
2711                         expp = page_create_va(vp,
2712                             (u_offset_t)(uintptr_t)io_pool_kva,
2713                             PAGESIZE, flags, &kvseg, io_pool_kva);
2714                         if (expp == NULL)
2715                                 goto balloon_fail;
2716                         (void) hat_pageunload(expp, HAT_FORCE_PGUNLOAD);
2717                         page_io_unlock(expp);
2718                         page_hashout(expp, NULL);
2719                         page_io_lock(expp);
2720                         /*
2721                          * add page to end of list
2722                          */
2723                         expp->p_prev = pp_first->p_prev;
2724                         expp->p_next = pp_first;
2725                         expp->p_prev->p_next = expp;
2726                         pp_first->p_prev = expp;
2727                 }
2728 
2729         }
2730         for (i = 0; i < extpages; i++) {
2731                 pplist[i] = pp;
2732                 pp = pp->p_next;
2733         }
2734         nbits = highbit(mattr->dma_attr_addr_hi);
2735         extents = contig ? 1 : minctg;
2736         if (balloon_replace_pages(extents, pplist, nbits, order,
2737             mfnlist) != extents) {
2738                 if (ioalloc_dbg)
2739                         cmn_err(CE_NOTE, "request to hypervisor"
2740                             " for %d pages, maxaddr %" PRIx64 " failed",
2741                             extpages, mattr->dma_attr_addr_hi);
2742                 goto balloon_fail;
2743         }
2744 
2745         kmem_free(pplist, extpages * sizeof (page_t *));
2746         kmem_free(mfnlist, extpages * sizeof (mfn_t));
2747         /*
2748          * Return any excess pages to free list
2749          */
2750         if (extpages > minctg) {
2751                 for (i = 0; i < extra; i++) {
2752                         pp = pp_first->p_prev;
2753                         page_sub(&pp_first, pp);
2754                         page_io_unlock(pp);
2755                         page_unresv(1);
2756                         page_free(pp, 1);
2757                 }
2758         }
2759         return (pp_first);
2760 balloon_fail:
2761         /*
2762          * Return pages to free list and return failure
2763          */
2764         while (pp_first != NULL) {
2765                 pp = pp_first;
2766                 page_sub(&pp_first, pp);
2767                 page_io_unlock(pp);
2768                 if (pp->p_vnode != NULL)
2769                         page_hashout(pp, NULL);
2770                 page_free(pp, 1);
2771         }
2772         if (pplist)
2773                 kmem_free(pplist, extpages * sizeof (page_t *));
2774         if (mfnlist)
2775                 kmem_free(mfnlist, extpages * sizeof (mfn_t));
2776         page_unresv(extpages - minctg);
2777         return (NULL);
2778 }
2779 
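/*
 * Undo a partially completed allocation: unlock each page on the list and
 * give it back via page_destroy_io().
 */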
2780 static void
2781 return_partial_alloc(page_t *plist)
2782 {
2783         page_t *pp;
2784 
2785         while (plist != NULL) {
2786                 pp = plist;
2787                 page_sub(&plist, pp);
2788                 page_io_unlock(pp);
2789                 page_destroy_io(pp);
2790         }
2791 }
2792 
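/*
 * Allocate runs of contiguous pages that satisfy the DMA attributes in
 * mattr.  On return *npagesp holds the number of requested pages that still
 * remain to be allocated by the caller.
 */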
2793 static page_t *
2794 page_get_contigpages(
2795         struct vnode    *vp,
2796         u_offset_t      off,
2797         int             *npagesp,
2798         uint_t          flags,
2799         caddr_t         vaddr,
2800         ddi_dma_attr_t  *mattr)
2801 {
2802         mfn_t   max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
2803         page_t  *plist; /* list to return */
2804         page_t  *pp, *mcpl;
2805         int     contig, anyaddr, npages, getone = 0;
2806         mfn_t   lo_mfn;
2807         mfn_t   hi_mfn;
2808         pgcnt_t pfnalign = 0;
2809         int     align, sgllen;
2810         uint64_t pfnseg;
2811         pgcnt_t minctg;
2812 
2813         npages = *npagesp;
2814         ASSERT(mattr != NULL);
2815         lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2816         hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2817         sgllen = mattr->dma_attr_sgllen;
2818         pfnseg = mmu_btop(mattr->dma_attr_seg);
2819         align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2820         if (align > MMU_PAGESIZE)
2821                 pfnalign = mmu_btop(align);
2822 
2823         contig = flags & PG_PHYSCONTIG;
2824         if (npages == -1) {
2825                 npages = 1;
2826                 pfnalign = 0;
2827         }
2828         /*
2829          * Clear the contig flag if only one page is needed.
2830          */
2831         if (npages == 1) {
2832                 getone = 1;
2833                 contig = 0;
2834         }
2835 
2836         /*
2837          * Check if any page in the system is fine.
2838          */
2839         anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn;
2840         if (!contig && anyaddr && !pfnalign) {
2841                 flags &= ~PG_PHYSCONTIG;
2842                 plist = page_create_va(vp, off, npages * MMU_PAGESIZE,
2843                     flags, &kvseg, vaddr);
2844                 if (plist != NULL) {
2845                         *npagesp = 0;
2846                         return (plist);
2847                 }
2848         }
2849         plist = NULL;
2850         minctg = howmany(npages, sgllen);
2851         while (npages > sgllen || getone) {
2852                 if (minctg > npages)
2853                         minctg = npages;
2854                 mcpl = NULL;
2855                 /*
2856                  * We could want contig pages with no address range limits.
2857                  */
2858                 if (anyaddr && contig) {
2859                         /*
2860                          * Look for free contig pages to satisfy the request.
2861                          */
2862                         mcpl = find_contig_free(minctg, flags, pfnseg,
2863                             pfnalign);
2864                 }
2865                 /*
2866                  * Try the reserved io pools next
2867                  */
2868                 if (mcpl == NULL)
2869                         mcpl = page_io_pool_alloc(mattr, contig, minctg);
2870                 if (mcpl != NULL) {
2871                         pp = mcpl;
2872                         do {
2873                                 if (!page_hashin(pp, vp, off, NULL)) {
2874                                         panic("page_get_contigpages:"
2875                                             " hashin failed"
2876                                             " pp %p, vp %p, off %llx",
2877                                             (void *)pp, (void *)vp, off);
2878                                 }
2879                                 off += MMU_PAGESIZE;
2880                                 PP_CLRFREE(pp);
2881                                 PP_CLRAGED(pp);
2882                                 page_set_props(pp, P_REF);
2883                                 page_io_lock(pp);
2884                                 pp = pp->p_next;
2885                         } while (pp != mcpl);
2886                 } else {
2887                         /*
2888                          * Hypervisor exchange doesn't handle segment or
2889                          * alignment constraints
2890                          */
2891                         if (mattr->dma_attr_seg < mattr->dma_attr_addr_hi ||
2892                             pfnalign)
2893                                 goto fail;
2894                         /*
2895                          * Try exchanging pages with the hypervisor
2896                          */
2897                         mcpl = page_swap_with_hypervisor(vp, off, vaddr, mattr,
2898                             flags, minctg);
2899                         if (mcpl == NULL)
2900                                 goto fail;
2901                         off += minctg * MMU_PAGESIZE;
2902                 }
2903                 check_dma(mattr, mcpl, minctg);
2904                 /*
2905                  * Here with a minctg run of contiguous pages, add them to the
2906                  * list we will return for this request.
2907                  */
2908                 page_list_concat(&plist, &mcpl);
2909                 npages -= minctg;
2910                 *npagesp = npages;
2911                 sgllen--;
2912                 if (getone)
2913                         break;
2914         }
2915         return (plist);
2916 fail:
2917         return_partial_alloc(plist);
2918         return (NULL);
2919 }
2920 
2921 /*
2922  * Allocator for domain 0 I/O pages. We match the required
2923  * DMA attributes and contiguity constraints.
2924  */
2925 /*ARGSUSED*/
2926 page_t *
2927 page_create_io(
2928         struct vnode    *vp,
2929         u_offset_t      off,
2930         uint_t          bytes,
2931         uint_t          flags,
2932         struct as       *as,
2933         caddr_t         vaddr,
2934         ddi_dma_attr_t  *mattr)
2935 {
2936         page_t  *plist = NULL, *pp;
2937         int     npages = 0, contig, anyaddr, pages_req;
2938         mfn_t   lo_mfn;
2939         mfn_t   hi_mfn;
2940         pgcnt_t pfnalign = 0;
2941         int     align;
2942         int     is_domu = 0;
2943         int     dummy, bytes_got;
2944         mfn_t   max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
2945 
2946         ASSERT(mattr != NULL);
2947         lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2948         hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2949         align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2950         if (align > MMU_PAGESIZE)
2951                 pfnalign = mmu_btop(align);
2952 
2953         /*
2954          * Clear the contig flag if only one page is needed or the scatter
2955          * gather list length is >= npages.
2956          */
2957         pages_req = npages = mmu_btopr(bytes);
2958         contig = (flags & PG_PHYSCONTIG);
2959         bytes = P2ROUNDUP(bytes, MMU_PAGESIZE);
2960         if (bytes == MMU_PAGESIZE || mattr->dma_attr_sgllen >= npages)
2961                 contig = 0;
2962 
2963         /*
2964          * Check if any old page in the system is fine.
2965          * DomU should always go down this path.
2966          */
2967         is_domu = !DOMAIN_IS_INITDOMAIN(xen_info);
2968         anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign;
2969         if ((!contig && anyaddr) || is_domu) {
2970                 flags &= ~PG_PHYSCONTIG;
2971                 plist = page_create_va(vp, off, bytes, flags, &kvseg, vaddr);
2972                 if (plist != NULL)
2973                         return (plist);
2974                 else if (is_domu)
2975                         return (NULL); /* no memory available */
2976         }
2977         /*
2978          * DomU should never reach here
2979          */
2980         if (contig) {
2981                 plist = page_get_contigpages(vp, off, &npages, flags, vaddr,
2982                     mattr);
2983                 if (plist == NULL)
2984                         goto fail;
2985                 bytes_got = (pages_req - npages) << MMU_PAGESHIFT;
2986                 vaddr += bytes_got;
2987                 off += bytes_got;
2988                 /*
2989                  * We now have all the contiguous pages we need, but
2990                  * we may still need additional non-contiguous pages.
2991                  */
2992         }
        /*
         * Now loop collecting the requested number of pages.  These do
         * not have to be contiguous pages, but we will use the contig
         * page alloc code to get them since it will honor any other
         * constraints the pages may have.
         */
2999         while (npages--) {
3000                 dummy = -1;
3001                 pp = page_get_contigpages(vp, off, &dummy, flags, vaddr, mattr);
3002                 if (pp == NULL)
3003                         goto fail;
3004                 page_add(&plist, pp);
3005                 vaddr += MMU_PAGESIZE;
3006                 off += MMU_PAGESIZE;
3007         }
3008         return (plist);
3009 fail:
3010         /*
3011          * Failed to get enough pages, return ones we did get
3012          */
3013         return_partial_alloc(plist);
3014         return (NULL);
3015 }
3016 
3017 /*
3018  * Lock and return the page with the highest mfn that we can find.  last_mfn
3019  * holds the last one found, so the next search can start from there.  We
3020  * also keep a counter so that we don't loop forever if the machine has no
3021  * free pages.
3022  *
3023  * This is called from the balloon thread to find pages to give away.  new_high
3024  * is used when new mfns have been added to the system; we reset our
3025  * search if the new mfns are higher than our current search position.
3026  */
3027 page_t *
3028 page_get_high_mfn(mfn_t new_high)
3029 {
3030         static mfn_t last_mfn = 0;
3031         pfn_t pfn;
3032         page_t *pp;
3033         ulong_t loop_count = 0;
3034 
3035         if (new_high > last_mfn)
3036                 last_mfn = new_high;
3037 
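        /*
         * Scan downward from last_mfn, wrapping around to cached_max_mfn at
         * zero and skipping mfns that do not belong to this domain, until a
         * free page is found or mfn_count frames have been examined.
         */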
3038         for (; loop_count < mfn_count; loop_count++, last_mfn--) {
3039                 if (last_mfn == 0) {
3040                         last_mfn = cached_max_mfn;
3041                 }
3042 
3043                 pfn = mfn_to_pfn(last_mfn);
3044                 if (pfn & PFN_IS_FOREIGN_MFN)
3045                         continue;
3046 
3047                 /* See if the page is free.  If so, lock it. */
3048                 pp = page_numtopp_alloc(pfn);
3049                 if (pp == NULL)
3050                         continue;
3051                 PP_CLRFREE(pp);
3052 
3053                 ASSERT(PAGE_EXCL(pp));
3054                 ASSERT(pp->p_vnode == NULL);
3055                 ASSERT(!hat_page_is_mapped(pp));
3056                 last_mfn--;
3057                 return (pp);
3058         }
3059         return (NULL);
3060 }
3061 
3062 #else /* !__xpv */
3063 
3064 /*
3065  * get a page from any list for the given mnode
3066  */
3067 static page_t *
3068 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
3069     int mnode, int mtype, ddi_dma_attr_t *dma_attr)
3070 {
3071         kmutex_t                *pcm;
3072         int                     i;
3073         page_t                  *pp;
3074         page_t                  *first_pp;
3075         uint64_t                pgaddr;
3076         ulong_t                 bin;
3077         int                     mtypestart;
3078         int                     plw_initialized;
3079         page_list_walker_t      plw;
3080 
3081         VM_STAT_ADD(pga_vmstats.pgma_alloc);
3082 
3083         ASSERT((flags & PG_MATCH_COLOR) == 0);
3084         ASSERT(szc == 0);
3085         ASSERT(dma_attr != NULL);
3086 
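        /*
         * Select the starting memory type (mtype) for this request; a
         * negative mtype means this mnode has no pages that could satisfy it.
         */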
3087         MTYPE_START(mnode, mtype, flags);
3088         if (mtype < 0) {
3089                 VM_STAT_ADD(pga_vmstats.pgma_allocempty);
3090                 return (NULL);
3091         }
3092 
3093         mtypestart = mtype;
3094 
3095         bin = origbin;
3096 
3097         /*
3098          * check up to page_colors + 1 bins - origbin may be checked twice
3099          * because of BIN_STEP skip
3100          */
3101         do {
3102                 plw_initialized = 0;
3103 
3104                 for (plw.plw_count = 0;
3105                     plw.plw_count < page_colors; plw.plw_count++) {
3106 
3107                         if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
3108                                 goto nextfreebin;
3109 
3110                         pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
3111                         mutex_enter(pcm);
3112                         pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
3113                         first_pp = pp;
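                        /*
                         * Walk the list at most once (first_pp marks the
                         * starting point), skipping pages we cannot lock,
                         * until we find one whose address fits the DMA
                         * limits.
                         */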
3114                         while (pp != NULL) {
3115                                 if (page_trylock(pp, SE_EXCL) == 0) {
3116                                         pp = pp->p_next;
3117                                         if (pp == first_pp) {
3118                                                 pp = NULL;
3119                                         }
3120                                         continue;
3121                                 }
3122 
3123                                 ASSERT(PP_ISFREE(pp));
3124                                 ASSERT(PP_ISAGED(pp));
3125                                 ASSERT(pp->p_vnode == NULL);
3126                                 ASSERT(pp->p_hash == NULL);
3127                                 ASSERT(pp->p_offset == (u_offset_t)-1);
3128                                 ASSERT(pp->p_szc == szc);
3129                                 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3130                                 /* check if page within DMA attributes */
3131                                 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
3132                                 if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
3133                                     (pgaddr + MMU_PAGESIZE - 1 <=
3134                                     dma_attr->dma_attr_addr_hi)) {
3135                                         break;
3136                                 }
3137 
3138                                 /* continue looking */
3139                                 page_unlock(pp);
3140                                 pp = pp->p_next;
3141                                 if (pp == first_pp)
3142                                         pp = NULL;
3143 
3144                         }
3145                         if (pp != NULL) {
3146                                 ASSERT(mtype == PP_2_MTYPE(pp));
3147                                 ASSERT(pp->p_szc == 0);
3148 
3149                                 /* found a page with specified DMA attributes */
3150                                 page_sub(&PAGE_FREELISTS(mnode, szc, bin,
3151                                     mtype), pp);
3152                                 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
3153 
3154                                 if ((PP_ISFREE(pp) == 0) ||
3155                                     (PP_ISAGED(pp) == 0)) {
3156                                         cmn_err(CE_PANIC, "page %p is not free",
3157                                             (void *)pp);
3158                                 }
3159 
3160                                 mutex_exit(pcm);
3161                                 check_dma(dma_attr, pp, 1);
3162                                 VM_STAT_ADD(pga_vmstats.pgma_allocok);
3163                                 return (pp);
3164                         }
3165                         mutex_exit(pcm);
3166 nextfreebin:
3167                         if (plw_initialized == 0) {
3168                                 page_list_walk_init(szc, 0, bin, 1, 0, &plw);
3169                                 ASSERT(plw.plw_ceq_dif == page_colors);
3170                                 plw_initialized = 1;
3171                         }
3172 
3173                         if (plw.plw_do_split) {
3174                                 pp = page_freelist_split(szc, bin, mnode,
3175                                     mtype,
3176                                     mmu_btop(dma_attr->dma_attr_addr_lo),
3177                                     mmu_btop(dma_attr->dma_attr_addr_hi + 1),
3178                                     &plw);
3179                                 if (pp != NULL) {
3180                                         check_dma(dma_attr, pp, 1);
3181                                         return (pp);
3182                                 }
3183                         }
3184 
3185                         bin = page_list_walk_next_bin(szc, bin, &plw);
3186                 }
3187 
3188                 MTYPE_NEXT(mnode, mtype, flags);
3189         } while (mtype >= 0);
3190 
3191         /* failed to find a page in the freelist; try it in the cachelist */
3192 
3193         /* reset mtype start for cachelist search */
3194         mtype = mtypestart;
3195         ASSERT(mtype >= 0);
3196 
3197         /* start with the bin of matching color */
3198         bin = origbin;
3199 
3200         do {
3201                 for (i = 0; i <= page_colors; i++) {
3202                         if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
3203                                 goto nextcachebin;
3204                         pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3205                         mutex_enter(pcm);
3206                         pp = PAGE_CACHELISTS(mnode, bin, mtype);
3207                         first_pp = pp;
3208                         while (pp != NULL) {
3209                                 if (page_trylock(pp, SE_EXCL) == 0) {
3210                                         pp = pp->p_next;
3211                                         if (pp == first_pp)
3212                                                 pp = NULL;
3213                                         continue;
3214                                 }
3215                                 ASSERT(pp->p_vnode);
3216                                 ASSERT(PP_ISAGED(pp) == 0);
3217                                 ASSERT(pp->p_szc == 0);
3218                                 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3219 
3220                                 /* check if page within DMA attributes */
3221 
3222                                 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
3223                                 if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
3224                                     (pgaddr + MMU_PAGESIZE - 1 <=
3225                                     dma_attr->dma_attr_addr_hi)) {
3226                                         break;
3227                                 }
3228 
3229                                 /* continue looking */
3230                                 page_unlock(pp);
3231                                 pp = pp->p_next;
3232                                 if (pp == first_pp)
3233                                         pp = NULL;
3234                         }
3235 
3236                         if (pp != NULL) {
3237                                 ASSERT(mtype == PP_2_MTYPE(pp));
3238                                 ASSERT(pp->p_szc == 0);
3239 
3240                                 /* found a page with specified DMA attributes */
3241                                 page_sub(&PAGE_CACHELISTS(mnode, bin,
3242                                     mtype), pp);
3243                                 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
3244 
3245                                 mutex_exit(pcm);
3246                                 ASSERT(pp->p_vnode);
3247                                 ASSERT(PP_ISAGED(pp) == 0);
3248                                 check_dma(dma_attr, pp, 1);
3249                                 VM_STAT_ADD(pga_vmstats.pgma_allocok);
3250                                 return (pp);
3251                         }
3252                         mutex_exit(pcm);
3253 nextcachebin:
3254                         bin += (i == 0) ? BIN_STEP : 1;
3255                         bin &= page_colors_mask;
3256                 }
3257                 MTYPE_NEXT(mnode, mtype, flags);
3258         } while (mtype >= 0);
3259 
3260         VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
3261         return (NULL);
3262 }
3263 
3264 /*
3265  * This function is similar to page_get_freelist()/page_get_cachelist()
3266  * but it searches both the lists to find a page with the specified
3267  * color (or no color) and DMA attributes. The search is done in the
3268  * freelist first and then in the cache list within the highest memory
3269  * range (based on DMA attributes) before searching in the lower
3270  * memory ranges.
3271  *
3272  * Note: This function is called only by page_create_io().
3273  */
3274 /*ARGSUSED*/
3275 static page_t *
3276 page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
3277     size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
3278 {
3279         uint_t          bin;
3280         int             mtype;
3281         page_t          *pp;
3282         int             n;
3283         int             m;
3284         int             szc;
3285         int             fullrange;
3286         int             mnode;
3287         int             local_failed_stat = 0;
3288         lgrp_mnode_cookie_t     lgrp_cookie;
3289 
3290         VM_STAT_ADD(pga_vmstats.pga_alloc);
3291 
3292         /* only base pagesize currently supported */
3293         if (size != MMU_PAGESIZE)
3294                 return (NULL);
3295 
3296         /*
3297          * If we're passed a specific lgroup, we use it.  Otherwise,
3298          * assume first-touch placement is desired.
3299          */
3300         if (!LGRP_EXISTS(lgrp))
3301                 lgrp = lgrp_home_lgrp();
3302 
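        /* Choose the starting page color bin for this mapping. */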
3303         /* LINTED */
3304         AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3305 
3306         /*
3307          * Only hold one freelist or cachelist lock at a time; that way we
3308          * can start anywhere and need not worry about lock
3309          * ordering.
3310          */
3311         if (dma_attr == NULL) {
3312                 n = mtype16m;
3313                 m = mtypetop;
3314                 fullrange = 1;
3315                 VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
3316         } else {
3317                 pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
3318                 pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);
3319 
3320                 /*
3321                  * We can only guarantee alignment to a page boundary.
3322                  */
3323                 if (dma_attr->dma_attr_align > MMU_PAGESIZE)
3324                         return (NULL);
3325 
3326                 /* Sanity check the dma_attr */
3327                 if (pfnlo > pfnhi)
3328                         return (NULL);
3329 
3330                 n = pfn_2_mtype(pfnlo);
3331                 m = pfn_2_mtype(pfnhi);
3332 
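                /*
                 * fullrange is set when the DMA limits completely cover the
                 * pfn ranges of memory types n through m, so the normal
                 * freelist/cachelist routines can be used without checking
                 * each page's address against the DMA attributes.
                 */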
3333                 fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
3334                     (pfnhi >= mnoderanges[m].mnr_pfnhi));
3335         }
3336         VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);
3337 
3338         szc = 0;
3339 
3340         /* cycling through mtype is handled by RANGE0 if n == mtype16m */
3341         if (n == mtype16m) {
3342                 flags |= PGI_MT_RANGE0;
3343                 n = m;
3344         }
3345 
3346         /*
3347          * Try local memory node first, but try remote if we can't
3348          * get a page of the right color.
3349          */
3350         LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
3351         while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3352                 /*
3353                  * allocate pages from high pfn to low.
3354                  */
3355                 mtype = m;
3356                 do {
3357                         if (fullrange != 0) {
3358                                 pp = page_get_mnode_freelist(mnode,
3359                                     bin, mtype, szc, flags);
3360                                 if (pp == NULL) {
3361                                         pp = page_get_mnode_cachelist(
3362                                             bin, flags, mnode, mtype);
3363                                 }
3364                         } else {
3365                                 pp = page_get_mnode_anylist(bin, szc,
3366                                     flags, mnode, mtype, dma_attr);
3367                         }
3368                         if (pp != NULL) {
3369                                 VM_STAT_ADD(pga_vmstats.pga_allocok);
3370                                 check_dma(dma_attr, pp, 1);
3371                                 return (pp);
3372                         }
3373                 } while (mtype != n &&
3374                     (mtype = mnoderanges[mtype].mnr_next) != -1);
3375                 if (!local_failed_stat) {
3376                         lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3377                         local_failed_stat = 1;
3378                 }
3379         }
3380         VM_STAT_ADD(pga_vmstats.pga_allocfailed);
3381 
3382         return (NULL);
3383 }
3384 
3385 /*
3386  * page_create_io()
3387  *
3388  * This function is a copy of page_create_va() with an additional
3389  * argument 'mattr' that specifies DMA memory requirements to
3390  * the page list functions. It is used by the segkmem
3391  * allocator and therefore only creates new pages (i.e. PG_EXCL is
3392  * set).
3393  *
3394  * Note: This interface is currently used only by the x86 PSM and is
3395  *       not fully specified, so its commitment level is that of a
3396  *       private, x86-specific interface.  It uses the PSM-specific
3397  *       page_get_anylist() interface.
3398  */
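
/*
 * Illustrative sketch only, not part of this interface: a caller that needs
 * a physically contiguous buffer below 4GB might fill in a ddi_dma_attr_t
 * and call page_create_io() roughly as follows.  The attribute values,
 * flags, and the off/vaddr arguments here are hypothetical.
 *
 *	ddi_dma_attr_t attr = { 0 };
 *	page_t *pp;
 *
 *	attr.dma_attr_addr_lo = 0;
 *	attr.dma_attr_addr_hi = 0xffffffffULL;
 *	attr.dma_attr_align = MMU_PAGESIZE;
 *	attr.dma_attr_minxfer = 1;
 *	attr.dma_attr_sgllen = 1;
 *	pp = page_create_io(&kvp, off, ptob(4), PG_EXCL | PG_WAIT |
 *	    PG_PHYSCONTIG, &kas, vaddr, &attr);
 */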
3399 
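/*
 * Walk the page hash chain for (vp, off) and leave 'pp' pointing at the
 * matching page, or NULL if none; used below with the hash mutex held.
 */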
3400 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \
3401         for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
3402                 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
3403                         break; \
3404         } \
3405 }
3406 
3407 
3408 page_t *
3409 page_create_io(
3410         struct vnode    *vp,
3411         u_offset_t      off,
3412         uint_t          bytes,
3413         uint_t          flags,
3414         struct as       *as,
3415         caddr_t         vaddr,
3416         ddi_dma_attr_t  *mattr) /* DMA memory attributes if any */
3417 {
3418         page_t          *plist = NULL;
3419         uint_t          plist_len = 0;
3420         pgcnt_t         npages;
3421         page_t          *npp = NULL;
3422         uint_t          pages_req;
3423         page_t          *pp;
3424         kmutex_t        *phm = NULL;
3425         uint_t          index;
3426 
3427         TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
3428             "page_create_start:vp %p off %llx bytes %u flags %x",
3429             vp, off, bytes, flags);
3430 
3431         ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);
3432 
3433         pages_req = npages = mmu_btopr(bytes);
3434 
3435         /*
3436          * Do the freemem and pcf accounting.
3437          */
3438         if (!page_create_wait(npages, flags)) {
3439                 return (NULL);
3440         }
3441 
3442         TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
3443             "page_create_success:vp %p off %llx", vp, off);
3444 
3445         /*
3446          * If satisfying this request has left us with too little
3447          * memory, start the wheels turning to get some back.  The
3448          * first clause of the test prevents waking up the pageout
3449          * daemon in situations where it would decide that there's
3450          * nothing to do.
3451          */
3452         if (nscan < desscan && freemem < minfree) {
3453                 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
3454                     "pageout_cv_signal:freemem %ld", freemem);
3455                 cv_signal(&proc_pageout->p_cv);
3456         }
3457 
3458         if (flags & PG_PHYSCONTIG) {
3459 
3460                 plist = page_get_contigpage(&npages, mattr, 1);
3461                 if (plist == NULL) {
3462                         page_create_putback(npages);
3463                         return (NULL);
3464                 }
3465 
3466                 pp = plist;
3467 
3468                 do {
3469                         if (!page_hashin(pp, vp, off, NULL)) {
3470                                 panic("pg_creat_io: hashin failed %p %p %llx",
3471                                     (void *)pp, (void *)vp, off);
3472                         }
3473                         VM_STAT_ADD(page_create_new);
3474                         off += MMU_PAGESIZE;
3475                         PP_CLRFREE(pp);
3476                         PP_CLRAGED(pp);
3477                         page_set_props(pp, P_REF);
3478                         pp = pp->p_next;
3479                 } while (pp != plist);
3480 
3481                 if (!npages) {
3482                         check_dma(mattr, plist, pages_req);
3483                         return (plist);
3484                 } else {
3485                         vaddr += (pages_req - npages) << MMU_PAGESHIFT;
3486                 }
3487 
3488                 /*
3489                  * fall-thru:
3490                  *
3491                  * page_get_contigpage returns when npages <= sgllen.
3492                  * Grab the rest of the non-contig pages below from anylist.
3493                  */
3494         }
3495 
3496         /*
3497          * Loop around collecting the requested number of pages.
3498          * Most of the time, we have to `create' a new page. With
3499          * this in mind, pull the page off the free list before
3500          * getting the hash lock.  This will minimize the hash
3501          * lock hold time, nesting, and the like.  If it turns
3502          * out we don't need the page, we put it back at the end.
3503          */
3504         while (npages--) {
3505                 phm = NULL;
3506 
3507                 index = PAGE_HASH_FUNC(vp, off);
3508 top:
3509                 ASSERT(phm == NULL);
3510                 ASSERT(index == PAGE_HASH_FUNC(vp, off));
3511                 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3512 
3513                 if (npp == NULL) {
3514                         /*
3515                          * Try to get the page of any color either from
3516                          * the freelist or from the cache list.
3517                          */
3518                         npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
3519                             flags & ~PG_MATCH_COLOR, mattr, NULL);
3520                         if (npp == NULL) {
3521                                 if (mattr == NULL) {
3522                                         /*
3523                                          * Not looking for a special page;
3524                                          * panic!
3525                                          */
3526                                         panic("no page found %d", (int)npages);
3527                                 }
3528                                 /*
3529                                  * No page found! This can happen
3530                                  * if we are looking for a page
3531                                  * within a specific memory range
3532                                  * for DMA purposes. If PG_WAIT is
3533                                  * specified then we wait for a
3534                                  * while and then try again. The
3535                                  * wait could be forever if we
3536                                  * don't get the page(s) we need.
3537                                  *
3538                                  * Note: XXX We really need a mechanism
3539                                  * to wait for pages in the desired
3540                                  * range. For now, we wait for any
3541                                  * pages and see if we can use them.
3542                                  */
3543 
3544                                 if ((mattr != NULL) && (flags & PG_WAIT)) {
3545                                         delay(10);
3546                                         goto top;
3547                                 }
3548                                 goto fail; /* undo accounting stuff */
3549                         }
3550 
3551                         if (PP_ISAGED(npp) == 0) {
3552                                 /*
3553                                  * Since this page came from the
3554                                  * cachelist, we must destroy the
3555                                  * old vnode association.
3556                                  */
3557                                 page_hashout(npp, (kmutex_t *)NULL);
3558                         }
3559                 }
3560 
3561                 /*
3562                  * We own this page!
3563                  */
3564                 ASSERT(PAGE_EXCL(npp));
3565                 ASSERT(npp->p_vnode == NULL);
3566                 ASSERT(!hat_page_is_mapped(npp));
3567                 PP_CLRFREE(npp);
3568                 PP_CLRAGED(npp);
3569 
3570                 /*
3571                  * Here we have a page in our hot little mitts and are
3572                  * just waiting to stuff it on the appropriate lists.
3573                  * Get the mutex and check to see if it really does
3574                  * not exist.
3575                  */
3576                 phm = PAGE_HASH_MUTEX(index);
3577                 mutex_enter(phm);
3578                 PAGE_HASH_SEARCH(index, pp, vp, off);
3579                 if (pp == NULL) {
3580                         VM_STAT_ADD(page_create_new);
3581                         pp = npp;
3582                         npp = NULL;
3583                         if (!page_hashin(pp, vp, off, phm)) {
3584                                 /*
3585                                  * Since we hold the page hash mutex and
3586                                  * just searched for this page, page_hashin
3587                                  * had better not fail.  If it does, that
3588                  * means some thread did not follow the
3589                                  * page hash mutex rules.  Panic now and
3590                                  * get it over with.  As usual, go down
3591                                  * holding all the locks.
3592                                  */
3593                                 ASSERT(MUTEX_HELD(phm));
3594                                 panic("page_create: hashin fail %p %p %llx %p",
3595                                     (void *)pp, (void *)vp, off, (void *)phm);
3596 
3597                         }
3598                         ASSERT(MUTEX_HELD(phm));
3599                         mutex_exit(phm);
3600                         phm = NULL;
3601 
3602                         /*
3603                          * Hat layer locking need not be done to set
3604                          * the following bits since the page is not hashed
3605                          * and was on the free list (i.e., had no mappings).
3606                          *
3607                          * Set the reference bit to protect
3608                          * against immediate pageout
3609                          *
3610                          * XXXmh modify freelist code to set reference
3611                          * bit so we don't have to do it here.
3612                          */
3613                         page_set_props(pp, P_REF);
3614                 } else {
3615                         ASSERT(MUTEX_HELD(phm));
3616                         mutex_exit(phm);
3617                         phm = NULL;
3618                         /*
3619                          * NOTE: This should not happen for pages associated
3620                          *       with kernel vnode 'kvp'.
3621                          */
3622                         /* XX64 - to debug why this happens! */
3623                         ASSERT(!VN_ISKAS(vp));
3624                         if (VN_ISKAS(vp))
3625                                 cmn_err(CE_NOTE,
3626                                     "page_create: page not expected "
3627                                     "in hash list for kernel vnode - pp 0x%p",
3628                                     (void *)pp);
3629                         VM_STAT_ADD(page_create_exists);
3630                         goto fail;
3631                 }
3632 
3633                 /*
3634                  * Got a page!  It is locked.  Acquire the i/o
3635                  * lock since we are going to use the p_next and
3636                  * p_prev fields to link the requested pages together.
3637                  */
3638                 page_io_lock(pp);
3639                 page_add(&plist, pp);
3640                 plist = plist->p_next;
3641                 off += MMU_PAGESIZE;
3642                 vaddr += MMU_PAGESIZE;
3643         }
3644 
3645         check_dma(mattr, plist, pages_req);
3646         return (plist);
3647 
3648 fail:
3649         if (npp != NULL) {
3650                 /*
3651                  * Did not need this page after all.
3652                  * Put it back on the free list.
3653                  */
3654                 VM_STAT_ADD(page_create_putbacks);
3655                 PP_SETFREE(npp);
3656                 PP_SETAGED(npp);
3657                 npp->p_offset = (u_offset_t)-1;
3658                 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
3659                 page_unlock(npp);
3660         }
3661 
3662         /*
3663          * Give up the pages we already got.
3664          */
3665         while (plist != NULL) {
3666                 pp = plist;
3667                 page_sub(&plist, pp);
3668                 page_io_unlock(pp);
3669                 plist_len++;
3670                 /*LINTED: constant in conditional ctx*/
3671                 VN_DISPOSE(pp, B_INVAL, 0, kcred);
3672         }
3673 
3674         /*
3675          * VN_DISPOSE does freemem accounting for the pages in plist
3676          * by calling page_free. So, we need to undo the pcf accounting
3677          * for only the remaining pages.
3678          */
3679         VM_STAT_ADD(page_create_putbacks);
3680         page_create_putback(pages_req - plist_len);
3681 
3682         return (NULL);
3683 }
3684 #endif /* !__xpv */
3685 
3686 
3687 /*
3688  * Copy the data from the physical page represented by "frompp" to
3689  * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
3690  * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
3691  * level and no one sleeps with an active mapping there.
3692  *
3693  * Note that the ref/mod bits in the page_t's are not affected by
3694  * this operation, hence it is up to the caller to update them appropriately.
3695  */
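
/*
 * Hypothetical caller sketch (illustration only): both pages must already be
 * locked, and a zero return means the copy faulted and did not complete.
 *
 *	if (ppcopy(srcpp, dstpp) == 0)
 *		error = EIO;
 */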
3696 int
3697 ppcopy(page_t *frompp, page_t *topp)
3698 {
3699         caddr_t         pp_addr1;
3700         caddr_t         pp_addr2;
3701         hat_mempte_t    pte1;
3702         hat_mempte_t    pte2;
3703         kmutex_t        *ppaddr_mutex;
3704         label_t         ljb;
3705         int             ret = 1;
3706 
3707         ASSERT_STACK_ALIGNED();
3708         ASSERT(PAGE_LOCKED(frompp));
3709         ASSERT(PAGE_LOCKED(topp));
3710 
3711         if (kpm_enable) {
3712                 pp_addr1 = hat_kpm_page2va(frompp, 0);
3713                 pp_addr2 = hat_kpm_page2va(topp, 0);
3714                 kpreempt_disable();
3715         } else {
3716                 /*
3717                  * disable preemption so that the CPU can't change under us
3718                  */
3719                 kpreempt_disable();
3720 
3721                 pp_addr1 = CPU->cpu_caddr1;
3722                 pp_addr2 = CPU->cpu_caddr2;
3723                 pte1 = CPU->cpu_caddr1pte;
3724                 pte2 = CPU->cpu_caddr2pte;
3725 
3726                 ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
3727                 mutex_enter(ppaddr_mutex);
3728 
3729                 hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
3730                     PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
3731                 hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
3732                     PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
3733                     HAT_LOAD_NOCONSIST);
3734         }
3735 
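        /*
         * Perform the copy under on_fault() protection; if a fault occurs,
         * the copy is abandoned and failure is returned to the caller.
         */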
3736         if (on_fault(&ljb)) {
3737                 ret = 0;
3738                 goto faulted;
3739         }
3740         if (use_sse_pagecopy)
3741 #ifdef __xpv
3742                 page_copy_no_xmm(pp_addr2, pp_addr1);
3743 #else
3744                 hwblkpagecopy(pp_addr1, pp_addr2);
3745 #endif
3746         else
3747                 bcopy(pp_addr1, pp_addr2, PAGESIZE);
3748 
3749         no_fault();
3750 faulted:
3751         if (!kpm_enable) {
3752 #ifdef __xpv
3753                 /*
3754                  * We can't leave unused mappings lying around under the
3755                  * hypervisor, so blow them away.
3756                  */
3757                 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr1, 0,
3758                     UVMF_INVLPG | UVMF_LOCAL) < 0)
3759                         panic("HYPERVISOR_update_va_mapping() failed");
3760                 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
3761                     UVMF_INVLPG | UVMF_LOCAL) < 0)
3762                         panic("HYPERVISOR_update_va_mapping() failed");
3763 #endif
3764                 mutex_exit(ppaddr_mutex);
3765         }
3766         kpreempt_enable();
3767         return (ret);
3768 }
3769 
3770 void
3771 pagezero(page_t *pp, uint_t off, uint_t len)
3772 {
3773         ASSERT(PAGE_LOCKED(pp));
3774         pfnzero(page_pptonum(pp), off, len);
3775 }
3776 
3777 /*
3778  * Zero the physical page given by pfn, from off to off + len,
3779  * without changing the reference and modified bits of the page.
3780  *
3781  * We do this using CPU private page address #2; see ppcopy() for more info.
3782  * pfnzero() must not be called at interrupt level.
3783  */
3784 void
3785 pfnzero(pfn_t pfn, uint_t off, uint_t len)
3786 {
3787         caddr_t         pp_addr2;
3788         hat_mempte_t    pte2;
3789         kmutex_t        *ppaddr_mutex = NULL;
3790 
3791         ASSERT_STACK_ALIGNED();
3792         ASSERT(len <= MMU_PAGESIZE);
3793         ASSERT(off <= MMU_PAGESIZE);
3794         ASSERT(off + len <= MMU_PAGESIZE);
3795 
3796         if (kpm_enable && !pfn_is_foreign(pfn)) {
3797                 pp_addr2 = hat_kpm_pfn2va(pfn);
3798                 kpreempt_disable();
3799         } else {
3800                 kpreempt_disable();
3801 
3802                 pp_addr2 = CPU->cpu_caddr2;
3803                 pte2 = CPU->cpu_caddr2pte;
3804 
3805                 ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
3806                 mutex_enter(ppaddr_mutex);
3807 
3808                 hat_mempte_remap(pfn, pp_addr2, pte2,
3809                     PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
3810                     HAT_LOAD_NOCONSIST);
3811         }
3812 
3813         if (use_sse_pagezero) {
3814 #ifdef __xpv
3815                 uint_t rem;
3816 
3817                 /*
3818                  * zero a byte at a time until properly aligned for
3819                  * block_zero_no_xmm().
3820                  */
3821                 while (!P2NPHASE(off, ((uint_t)BLOCKZEROALIGN)) && len-- > 0)
3822                         pp_addr2[off++] = 0;
3823 
3824                 /*
3825                  * Now use faster block_zero_no_xmm() for any range
3826                  * that is properly aligned and sized.
3827                  */
3828                 rem = P2PHASE(len, ((uint_t)BLOCKZEROALIGN));
3829                 len -= rem;
3830                 if (len != 0) {
3831                         block_zero_no_xmm(pp_addr2 + off, len);
3832                         off += len;
3833                 }
3834 
3835                 /*
3836                  * zero remainder with byte stores.
3837                  */
3838                 while (rem-- > 0)
3839                         pp_addr2[off++] = 0;
3840 #else
3841                 hwblkclr(pp_addr2 + off, len);
3842 #endif
3843         } else {
3844                 bzero(pp_addr2 + off, len);
3845         }
3846 
3847         if (!kpm_enable || pfn_is_foreign(pfn)) {
3848 #ifdef __xpv
3849                 /*
3850                  * On the hypervisor this page might get used for a page
3851                  * table before any intervening change to this mapping,
3852                  * so blow it away.
3853                  */
3854                 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
3855                     UVMF_INVLPG) < 0)
3856                         panic("HYPERVISOR_update_va_mapping() failed");
3857 #endif
3858                 mutex_exit(ppaddr_mutex);
3859         }
3860 
3861         kpreempt_enable();
3862 }
3863 
3864 /*
3865  * Platform-dependent page scrub call.
3866  */
3867 void
3868 pagescrub(page_t *pp, uint_t off, uint_t len)
3869 {
3870         /*
3871          * For now, we rely on the fact that pagezero() will
3872          * always clear UEs.
3873          */
3874         pagezero(pp, off, len);
3875 }
3876 
3877 /*
3878  * Set up two private addresses on a given CPU for use in ppcopy().
3879  */
3880 void
3881 setup_vaddr_for_ppcopy(struct cpu *cpup)
3882 {
3883         void *addr;
3884         hat_mempte_t pte_pa;
3885 
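        /*
         * Allocate a page of kernel VA and a reusable PTE for each of the
         * two per-CPU mapping addresses used by ppcopy().
         */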
3886         addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
3887         pte_pa = hat_mempte_setup(addr);
3888         cpup->cpu_caddr1 = addr;
3889         cpup->cpu_caddr1pte = pte_pa;
3890 
3891         addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
3892         pte_pa = hat_mempte_setup(addr);
3893         cpup->cpu_caddr2 = addr;
3894         cpup->cpu_caddr2pte = pte_pa;
3895 
3896         mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
3897 }
3898 
3899 /*
3900  * Undo setup_vaddr_for_ppcopy
3901  */
3902 void
3903 teardown_vaddr_for_ppcopy(struct cpu *cpup)
3904 {
3905         mutex_destroy(&cpup->cpu_ppaddr_mutex);
3906 
3907         hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte);
3908         cpup->cpu_caddr2pte = 0;
3909         vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1));
3910         cpup->cpu_caddr2 = 0;
3911 
3912         hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte);
3913         cpup->cpu_caddr1pte = 0;
3914         vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1));
3915         cpup->cpu_caddr1 = 0;
3916 }
3917 
3918 /*
3919  * Function for flushing D-cache when performing module relocations
3920  * to an alternate mapping.  Unnecessary on Intel / AMD platforms.
3921  */
3922 void
3923 dcache_flushall()
3924 {}
3925 
3926 size_t
3927 exec_get_spslew(void)
3928 {
3929         return (0);
3930 }
3931 
3932 /*
3933  * Allocate a memory page.  The argument 'seed' can be any pseudo-random
3934  * number to vary where the pages come from.  This is quite a hacked up
3935  * method -- it works for now, but really needs to be fixed up a bit.
3936  *
3937  * We currently use page_create_va() on the kvp with fake offsets,
3938  * segments and virt address.  This is pretty bogus, but was copied from the
3939  * old hat_i86.c code.  A better approach would be to specify either mnode
3940  * random or mnode local and take a page from whatever color has the MOST
3941  * available; this would have a minimal impact on page coloring.
3942  */
3943 page_t *
3944 page_get_physical(uintptr_t seed)
3945 {
3946         page_t *pp;
3947         u_offset_t offset;
3948         static struct seg tmpseg;
3949         static uintptr_t ctr = 0;
3950 
3951         /*
3952          * This code is gross; we really need a simpler page allocator.
3953          *
3954          * We need to assign an offset for the page in order to call
3955          * page_create_va().  To avoid conflicts with other pages, we get
3956          * creative with the offset.  For 32 bits, we need an offset > 4 Gig;
3957          * for 64 bits, we need an offset somewhere in the VA hole.
3958          */
3959         offset = seed;
3960         if (offset > kernelbase)
3961                 offset -= kernelbase;
3962         offset <<= MMU_PAGESHIFT;
3963 #if defined(__amd64)
3964         offset += mmu.hole_start;       /* something in VA hole */
3965 #else
3966         offset += 1ULL << 40;     /* something > 4 Gig */
3967 #endif
3968 
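        /* Reserve the page (non-blocking); give up if the reservation fails. */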
3969         if (page_resv(1, KM_NOSLEEP) == 0)
3970                 return (NULL);
3971 
3972 #ifdef  DEBUG
3973         pp = page_exists(&kvp, offset);
3974         if (pp != NULL)
3975                 panic("page already exists %p", (void *)pp);
3976 #endif
3977 
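        /*
         * Create the page exclusively locked on the kernel vnode at the
         * fabricated offset, then drop the i/o lock and downgrade to a
         * shared lock for the caller.
         */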
3978         pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL,
3979             &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE));       /* changing VA usage */
3980         if (pp != NULL) {
3981                 page_io_unlock(pp);
3982                 page_downgrade(pp);
3983         }
3984         return (pp);
3985 }