1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 #include <sys/types.h>
  26 #include <sys/t_lock.h>
  27 #include <sys/param.h>
  28 #include <sys/sysmacros.h>
  29 #include <sys/tuneable.h>
  30 #include <sys/systm.h>
  31 #include <sys/vm.h>
  32 #include <sys/kmem.h>
  33 #include <sys/vmem.h>
  34 #include <sys/mman.h>
  35 #include <sys/cmn_err.h>
  36 #include <sys/debug.h>
  37 #include <sys/dumphdr.h>
  38 #include <sys/bootconf.h>
  39 #include <sys/lgrp.h>
  40 #include <vm/seg_kmem.h>
  41 #include <vm/hat.h>
  42 #include <vm/page.h>
  43 #include <vm/vm_dep.h>
  44 #include <vm/faultcode.h>
  45 #include <sys/promif.h>
  46 #include <vm/seg_kp.h>
  47 #include <sys/bitmap.h>
  48 #include <sys/mem_cage.h>
  49 
  50 #ifdef __sparc
  51 #include <sys/ivintr.h>
  52 #include <sys/panic.h>
  53 #endif
  54 
  55 /*
  56  * seg_kmem is the primary kernel memory segment driver.  It
  57  * maps the kernel heap [kernelheap, ekernelheap), module text,
  58  * and all memory which was allocated before the VM was initialized
  59  * into kas.
  60  *
  61  * Pages which belong to seg_kmem are hashed into &kvp vnode at
  62  * an offset equal to (u_offset_t)virt_addr, and have p_lckcnt >= 1.
  63  * They must never be paged out since segkmem_fault() is a no-op to
  64  * prevent recursive faults.
  65  *
  66  * Currently, seg_kmem pages are sharelocked (p_sharelock == 1) on
  67  * __x86 and are unlocked (p_sharelock == 0) on __sparc.  Once __x86
  68  * supports relocation the #ifdef kludges can be removed.
  69  *
  70  * seg_kmem pages may be subject to relocation by page_relocate(),
  71  * provided that the HAT supports it; if this is so, segkmem_reloc
  72  * will be set to a nonzero value. All boot time allocated memory as
  73  * well as static memory is considered off limits to relocation.
  74  * Pages are "relocatable" if p_state does not have P_NORELOC set, so
  75  * we request P_NORELOC pages for memory that isn't safe to relocate.
  76  *
  77  * The kernel heap is logically divided up into four pieces:
  78  *
  79  *   heap32_arena is for allocations that require 32-bit absolute
  80  *   virtual addresses (e.g. code that uses 32-bit pointers/offsets).
  81  *
  82  *   heap_core is for allocations that require 2GB *relative*
  83  *   offsets; in other words all memory from heap_core is within
  84  *   2GB of all other memory from the same arena. This is a requirement
  85  *   of the addressing modes of some processors in supervisor code.
  86  *
  87  *   heap_arena is the general heap arena.
  88  *
  89  *   static_arena is the static memory arena.  Allocations from it
  90  *   are not subject to relocation so it is safe to use the memory
  91  *   physical address as well as the virtual address (e.g. the VA to
  92  *   PA translations are static).  Caches may import from static_arena;
  93  *   all other static memory allocations should use static_alloc_arena.
  94  *
  95  * On some platforms which have limited virtual address space, seg_kmem
  96  * may share [kernelheap, ekernelheap) with seg_kp; if this is so,
  97  * segkp_bitmap is non-NULL, and each bit represents a page of virtual
  98  * address space which is actually seg_kp mapped.
  99  */
 100 
 101 extern ulong_t *segkp_bitmap;   /* Is set if segkp is from the kernel heap */
 102 
 103 char *kernelheap;               /* start of primary kernel heap */
 104 char *ekernelheap;              /* end of primary kernel heap */
 105 struct seg kvseg;               /* primary kernel heap segment */
 106 struct seg kvseg_core;          /* "core" kernel heap segment */
 107 struct seg kzioseg;             /* Segment for zio mappings */
 108 vmem_t *heap_arena;             /* primary kernel heap arena */
 109 vmem_t *heap_core_arena;        /* core kernel heap arena */
 110 char *heap_core_base;           /* start of core kernel heap arena */
 111 char *heap_lp_base;             /* start of kernel large page heap arena */
 112 char *heap_lp_end;              /* end of kernel large page heap arena */
 113 vmem_t *hat_memload_arena;      /* HAT translation data */
 114 struct seg kvseg32;             /* 32-bit kernel heap segment */
 115 vmem_t *heap32_arena;           /* 32-bit kernel heap arena */
 116 vmem_t *heaptext_arena;         /* heaptext arena */
 117 struct as kas;                  /* kernel address space */
 118 int segkmem_reloc;              /* enable/disable relocatable segkmem pages */
 119 vmem_t *static_arena;           /* arena for caches to import static memory */
 120 vmem_t *static_alloc_arena;     /* arena for allocating static memory */
 121 vmem_t *zio_arena = NULL;       /* arena for allocating zio memory */
 122 vmem_t *zio_alloc_arena = NULL; /* arena for allocating zio memory */
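/*
 * Illustrative sketch (not from the original source): per the block comment
 * above, kmem caches may import from static_arena, while direct allocations
 * of static memory should use static_alloc_arena.  A hypothetical client
 * needing a small wired buffer whose VA->PA translation never changes might
 * therefore do something like
 *
 *	void *buf = vmem_alloc(static_alloc_arena, PAGESIZE, VM_SLEEP);
 *	...
 *	vmem_free(static_alloc_arena, buf, PAGESIZE);
 *
 * while a kmem cache that needs static backing would name static_arena as
 * its vmem source (see the static_arena setup in kernelheap_init()).
 */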
 123 
 124 /*
 125  * seg_kmem driver can map part of the kernel heap with large pages.
 126  * Currently this functionality is implemented for sparc platforms only.
 127  *
 128  * The large page size "segkmem_lpsize" for kernel heap is selected in the
 129  * platform specific code. It can also be modified via /etc/system file.
 130  * Setting segkmem_lpsize to PAGESIZE in /etc/system disables usage of large
 131  * pages for kernel heap. "segkmem_lpshift" is adjusted appropriately to
 132  * match segkmem_lpsize.
 133  *
 134  * At boot time we carve from kernel heap arena a range of virtual addresses
 135  * that will be used for large page mappings. This range [heap_lp_base,
 136  * heap_lp_end) is set up as a separate vmem arena - "heap_lp_arena". We also
  137  * create "kmem_lp_arena", which caches memory already backed by large
 138  * pages. kmem_lp_arena imports virtual segments from heap_lp_arena.
 139  */
 140 
 141 size_t  segkmem_lpsize;
 142 static  uint_t  segkmem_lpshift = PAGESHIFT;
 143 int     segkmem_lpszc = 0;
 144 
 145 size_t  segkmem_kmemlp_quantum = 0x400000;      /* 4MB */
 146 size_t  segkmem_heaplp_quantum;
 147 vmem_t *heap_lp_arena;
 148 static  vmem_t *kmem_lp_arena;
 149 static  vmem_t *segkmem_ppa_arena;
 150 static  segkmem_lpcb_t segkmem_lpcb;
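/*
 * Illustrative sketch only; the real arenas are created later in boot by
 * segkmem_heap_lp_init().  The layering described above amounts to roughly
 *
 *	heap_lp_arena = vmem_create("heap_lp", heap_lp_base,
 *	    heap_lp_end - heap_lp_base, segkmem_heaplp_quantum,
 *	    NULL, NULL, NULL, 0, VM_SLEEP);
 *	kmem_lp_arena = vmem_create("kmem_lp", NULL, 0,
 *	    segkmem_kmemlp_quantum, segkmem_alloc_lpi, segkmem_free_lpi,
 *	    heap_lp_arena, 0, VM_SLEEP);
 *
 * i.e. kmem_lp_arena imports large-page-backed spans from heap_lp_arena
 * through segkmem_alloc_lpi()/segkmem_free_lpi(), defined below.
 */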
 151 
 152 /*
 153  * We use "segkmem_kmemlp_max" to limit the total amount of physical memory
 154  * consumed by the large page heap. By default this parameter is set to 1/8 of
 155  * physmem but can be adjusted through /etc/system either directly or
 156  * indirectly by setting "segkmem_kmemlp_pcnt" to the percent of physmem
 157  * we allow for large page heap.
 158  */
 159 size_t  segkmem_kmemlp_max;
 160 static  uint_t  segkmem_kmemlp_pcnt;
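/*
 * For example (values are illustrative only), an administrator could cap
 * the large page heap from /etc/system either directly:
 *
 *	set segkmem_kmemlp_max = 0x40000000
 *
 * or indirectly, as a percentage of physical memory:
 *
 *	set segkmem_kmemlp_pcnt = 25
 */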
 161 
 162 /*
  163  * Getting large pages for the kernel heap can be problematic due to
  164  * physical memory fragmentation, which is why we allow preallocating
  165  * "segkmem_kmemlp_min" bytes at boot time.
 166  */
 167 static  size_t  segkmem_kmemlp_min;
 168 
 169 /*
  170  * Throttling is used to avoid expensive attempts to allocate large pages
  171  * for the kernel heap when many successive attempts to do so have failed.
 172  */
 173 static  ulong_t segkmem_lpthrottle_max = 0x400000;
 174 static  ulong_t segkmem_lpthrottle_start = 0x40;
 175 static  ulong_t segkmem_use_lpthrottle = 1;
 176 
 177 /*
 178  * Freed pages accumulate on a garbage list until segkmem is ready,
 179  * at which point we call segkmem_gc() to free it all.
 180  */
 181 typedef struct segkmem_gc_list {
 182         struct segkmem_gc_list  *gc_next;
 183         vmem_t                  *gc_arena;
 184         size_t                  gc_size;
 185 } segkmem_gc_list_t;
 186 
 187 static segkmem_gc_list_t *segkmem_gc_list;
 188 
 189 /*
 190  * Allocations from the hat_memload arena add VM_MEMLOAD to their
 191  * vmflags so that segkmem_xalloc() can inform the hat layer that it needs
 192  * to take steps to prevent infinite recursion.  HAT allocations also
 193  * must be non-relocatable to prevent recursive page faults.
 194  */
 195 static void *
 196 hat_memload_alloc(vmem_t *vmp, size_t size, int flags)
 197 {
 198         flags |= (VM_MEMLOAD | VM_NORELOC);
 199         return (segkmem_alloc(vmp, size, flags));
 200 }
 201 
 202 /*
 203  * Allocations from static_arena arena (or any other arena that uses
 204  * segkmem_alloc_permanent()) require non-relocatable (permanently
 205  * wired) memory pages, since these pages are referenced by physical
 206  * as well as virtual address.
 207  */
 208 void *
 209 segkmem_alloc_permanent(vmem_t *vmp, size_t size, int flags)
 210 {
 211         return (segkmem_alloc(vmp, size, flags | VM_NORELOC));
 212 }
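/*
 * Illustrative sketch (hypothetical arena, not part of the original code):
 * a client arena that hands out permanently wired pages could be layered
 * on top of heap_arena with segkmem_alloc_permanent() as its import
 * function, mirroring how static_arena is created in kernelheap_init()
 * below:
 *
 *	vmem_t *wired_arena = vmem_create("example_wired", NULL, 0,
 *	    PAGESIZE, segkmem_alloc_permanent, segkmem_free, heap_arena,
 *	    0, VM_SLEEP);
 */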
 213 
 214 /*
 215  * Initialize kernel heap boundaries.
 216  */
 217 void
 218 kernelheap_init(
 219         void *heap_start,
 220         void *heap_end,
 221         char *first_avail,
 222         void *core_start,
 223         void *core_end)
 224 {
 225         uintptr_t textbase;
 226         size_t core_size;
 227         size_t heap_size;
 228         vmem_t *heaptext_parent;
 229         size_t  heap_lp_size = 0;
 230 #ifdef __sparc
 231         size_t kmem64_sz = kmem64_aligned_end - kmem64_base;
 232 #endif  /* __sparc */
 233 
 234         kernelheap = heap_start;
 235         ekernelheap = heap_end;
 236 
 237 #ifdef __sparc
 238         heap_lp_size = (((uintptr_t)heap_end - (uintptr_t)heap_start) / 4);
 239         /*
 240          * Bias heap_lp start address by kmem64_sz to reduce collisions
 241          * in 4M kernel TSB between kmem64 area and heap_lp
 242          */
 243         kmem64_sz = P2ROUNDUP(kmem64_sz, MMU_PAGESIZE256M);
 244         if (kmem64_sz <= heap_lp_size / 2)
 245                 heap_lp_size -= kmem64_sz;
 246         heap_lp_base = ekernelheap - heap_lp_size;
 247         heap_lp_end = heap_lp_base + heap_lp_size;
 248 #endif  /* __sparc */
 249 
 250         /*
 251          * If this platform has a 'core' heap area, then the space for
 252          * overflow module text should be carved out of the end of that
 253          * heap.  Otherwise, it gets carved out of the general purpose
 254          * heap.
 255          */
 256         core_size = (uintptr_t)core_end - (uintptr_t)core_start;
 257         if (core_size > 0) {
 258                 ASSERT(core_size >= HEAPTEXT_SIZE);
 259                 textbase = (uintptr_t)core_end - HEAPTEXT_SIZE;
 260                 core_size -= HEAPTEXT_SIZE;
 261         }
 262 #ifndef __sparc
 263         else {
 264                 ekernelheap -= HEAPTEXT_SIZE;
 265                 textbase = (uintptr_t)ekernelheap;
 266         }
 267 #endif
 268 
 269         heap_size = (uintptr_t)ekernelheap - (uintptr_t)kernelheap;
 270         heap_arena = vmem_init("heap", kernelheap, heap_size, PAGESIZE,
 271             segkmem_alloc, segkmem_free);
 272 
 273         if (core_size > 0) {
 274                 heap_core_arena = vmem_create("heap_core", core_start,
 275                     core_size, PAGESIZE, NULL, NULL, NULL, 0, VM_SLEEP);
 276                 heap_core_base = core_start;
 277         } else {
 278                 heap_core_arena = heap_arena;
 279                 heap_core_base = kernelheap;
 280         }
 281 
 282         /*
  283          * Reserve space for the large page heap.  If large pages for the
  284          * kernel heap are enabled, the large page heap arena will be created
  285          * later in the boot sequence in segkmem_heap_lp_init().  Otherwise
  286          * the allocated range will be returned to heap_arena.
 287          */
 288         if (heap_lp_size) {
 289                 (void) vmem_xalloc(heap_arena, heap_lp_size, PAGESIZE, 0, 0,
 290                     heap_lp_base, heap_lp_end,
 291                     VM_NOSLEEP | VM_BESTFIT | VM_PANIC);
 292         }
 293 
 294         /*
 295          * Remove the already-spoken-for memory range [kernelheap, first_avail).
 296          */
 297         (void) vmem_xalloc(heap_arena, first_avail - kernelheap, PAGESIZE,
 298             0, 0, kernelheap, first_avail, VM_NOSLEEP | VM_BESTFIT | VM_PANIC);
 299 
 300 #ifdef __sparc
 301         heap32_arena = vmem_create("heap32", (void *)SYSBASE32,
 302             SYSLIMIT32 - SYSBASE32 - HEAPTEXT_SIZE, PAGESIZE, NULL,
 303             NULL, NULL, 0, VM_SLEEP);
 304         /*
  305          * The PROM claims the physical and virtual resources used by panicbuf
  306          * and intr_vec_table.  So reserve the space for panicbuf, intr_vec_table,
  307          * and the reserved interrupt vector data structures from the 32-bit heap.
 308          */
 309         (void) vmem_xalloc(heap32_arena, PANICBUFSIZE, PAGESIZE, 0, 0,
 310             panicbuf, panicbuf + PANICBUFSIZE,
 311             VM_NOSLEEP | VM_BESTFIT | VM_PANIC);
 312 
 313         (void) vmem_xalloc(heap32_arena, IVSIZE, PAGESIZE, 0, 0,
 314             intr_vec_table, (caddr_t)intr_vec_table + IVSIZE,
 315             VM_NOSLEEP | VM_BESTFIT | VM_PANIC);
 316 
 317         textbase = SYSLIMIT32 - HEAPTEXT_SIZE;
 318         heaptext_parent = NULL;
 319 #else   /* __sparc */
 320         heap32_arena = heap_core_arena;
 321         heaptext_parent = heap_core_arena;
 322 #endif  /* __sparc */
 323 
 324         heaptext_arena = vmem_create("heaptext", (void *)textbase,
 325             HEAPTEXT_SIZE, PAGESIZE, NULL, NULL, heaptext_parent, 0, VM_SLEEP);
 326 
 327         /*
 328          * Create a set of arenas for memory with static translations
 329          * (e.g. VA -> PA translations cannot change).  Since using
 330          * kernel pages by physical address implies it isn't safe to
 331          * walk across page boundaries, the static_arena quantum must
 332          * be PAGESIZE.  Any kmem caches that require static memory
 333          * should source from static_arena, while direct allocations
 334          * should only use static_alloc_arena.
 335          */
 336         static_arena = vmem_create("static", NULL, 0, PAGESIZE,
 337             segkmem_alloc_permanent, segkmem_free, heap_arena, 0, VM_SLEEP);
 338         static_alloc_arena = vmem_create("static_alloc", NULL, 0,
 339             sizeof (uint64_t), vmem_alloc, vmem_free, static_arena,
 340             0, VM_SLEEP);
 341 
 342         /*
 343          * Create an arena for translation data (ptes, hmes, or hblks).
 344          * We need an arena for this because hat_memload() is essential
 345          * to vmem_populate() (see comments in common/os/vmem.c).
 346          *
 347          * Note: any kmem cache that allocates from hat_memload_arena
 348          * must be created as a KMC_NOHASH cache (i.e. no external slab
 349          * and bufctl structures to allocate) so that slab creation doesn't
 350          * require anything more than a single vmem_alloc().
 351          */
 352         hat_memload_arena = vmem_create("hat_memload", NULL, 0, PAGESIZE,
 353             hat_memload_alloc, segkmem_free, heap_arena, 0,
 354             VM_SLEEP | VMC_POPULATOR | VMC_DUMPSAFE);
 355 }
 356 
 357 void
 358 boot_mapin(caddr_t addr, size_t size)
 359 {
 360         caddr_t  eaddr;
 361         page_t  *pp;
 362         pfn_t    pfnum;
 363 
 364         if (page_resv(btop(size), KM_NOSLEEP) == 0)
 365                 panic("boot_mapin: page_resv failed");
 366 
 367         for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) {
 368                 pfnum = va_to_pfn(addr);
 369                 if (pfnum == PFN_INVALID)
 370                         continue;
 371                 if ((pp = page_numtopp_nolock(pfnum)) == NULL)
 372                         panic("boot_mapin(): No pp for pfnum = %lx", pfnum);
 373 
 374                 /*
  375                  * We must break up any large pages that may have constituent
  376                  * pages being utilized for BOP_ALLOC()'s before calling
  377                  * page_numtopp().  The locking code (i.e. page_reclaim())
  378                  * can't handle them.
 379                  */
 380                 if (pp->p_szc != 0)
 381                         page_boot_demote(pp);
 382 
 383                 pp = page_numtopp(pfnum, SE_EXCL);
 384                 if (pp == NULL || PP_ISFREE(pp))
 385                         panic("boot_alloc: pp is NULL or free");
 386 
 387                 /*
 388                  * If the cage is on but doesn't yet contain this page,
 389                  * mark it as non-relocatable.
 390                  */
 391                 if (kcage_on && !PP_ISNORELOC(pp)) {
 392                         PP_SETNORELOC(pp);
 393                         PLCNT_XFER_NORELOC(pp);
 394                 }
 395 
 396                 (void) page_hashin(pp, &kvp, (u_offset_t)(uintptr_t)addr, NULL);
 397                 pp->p_lckcnt = 1;
 398 #if defined(__x86)
 399                 page_downgrade(pp);
 400 #else
 401                 page_unlock(pp);
 402 #endif
 403         }
 404 }
 405 
 406 /*
 407  * Get pages from boot and hash them into the kernel's vp.
 408  * Used after page structs have been allocated, but before segkmem is ready.
 409  */
 410 void *
 411 boot_alloc(void *inaddr, size_t size, uint_t align)
 412 {
 413         caddr_t addr = inaddr;
 414 
 415         if (bootops == NULL)
 416                 prom_panic("boot_alloc: attempt to allocate memory after "
 417                     "BOP_GONE");
 418 
 419         size = ptob(btopr(size));
 420 #ifdef __sparc
 421         if (bop_alloc_chunk(addr, size, align) != (caddr_t)addr)
 422                 panic("boot_alloc: bop_alloc_chunk failed");
 423 #else
 424         if (BOP_ALLOC(bootops, addr, size, align) != addr)
 425                 panic("boot_alloc: BOP_ALLOC failed");
 426 #endif
 427         boot_mapin((caddr_t)addr, size);
 428         return (addr);
 429 }
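/*
 * Illustrative usage sketch (hypothetical caller): before segkmem is ready,
 * an early consumer that has already reserved a virtual range could back it
 * with boot pages roughly as
 *
 *	(void) boot_alloc(va, PAGESIZE, BO_NO_ALIGN);
 *
 * where "va" is the caller's reserved virtual address.  boot_alloc() rounds
 * the size up to whole pages, asks the boot loader to back the range, and
 * hashes the resulting pages into &kvp via boot_mapin().
 */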
 430 
 431 static void
 432 segkmem_badop()
 433 {
 434         panic("segkmem_badop");
 435 }
 436 
 437 #define SEGKMEM_BADOP(t)        (t(*)())segkmem_badop
 438 
 439 /*ARGSUSED*/
 440 static faultcode_t
 441 segkmem_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t size,
 442         enum fault_type type, enum seg_rw rw)
 443 {
 444         pgcnt_t npages;
 445         spgcnt_t pg;
 446         page_t *pp;
 447         struct vnode *vp = seg->s_data;
 448 
 449         ASSERT(RW_READ_HELD(&seg->s_as->a_lock));
 450 
 451         if (seg->s_as != &kas || size > seg->s_size ||
 452             addr < seg->s_base || addr + size > seg->s_base + seg->s_size)
 453                 panic("segkmem_fault: bad args");
 454 
 455         /*
 456          * If it is one of segkp pages, call segkp_fault.
 457          */
 458         if (segkp_bitmap && seg == &kvseg &&
 459             BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 460                 return (SEGOP_FAULT(hat, segkp, addr, size, type, rw));
 461 
 462         if (rw != S_READ && rw != S_WRITE && rw != S_OTHER)
 463                 return (FC_NOSUPPORT);
 464 
 465         npages = btopr(size);
 466 
 467         switch (type) {
 468         case F_SOFTLOCK:        /* lock down already-loaded translations */
 469                 for (pg = 0; pg < npages; pg++) {
 470                         pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr,
 471                             SE_SHARED);
 472                         if (pp == NULL) {
 473                                 /*
 474                                  * Hmm, no page. Does a kernel mapping
 475                                  * exist for it?
 476                                  */
 477                                 if (!hat_probe(kas.a_hat, addr)) {
 478                                         addr -= PAGESIZE;
 479                                         while (--pg >= 0) {
 480                                                 pp = page_find(vp, (u_offset_t)
 481                                                     (uintptr_t)addr);
 482                                                 if (pp)
 483                                                         page_unlock(pp);
 484                                                 addr -= PAGESIZE;
 485                                         }
 486                                         return (FC_NOMAP);
 487                                 }
 488                         }
 489                         addr += PAGESIZE;
 490                 }
 491                 if (rw == S_OTHER)
 492                         hat_reserve(seg->s_as, addr, size);
 493                 return (0);
 494         case F_SOFTUNLOCK:
 495                 while (npages--) {
 496                         pp = page_find(vp, (u_offset_t)(uintptr_t)addr);
 497                         if (pp)
 498                                 page_unlock(pp);
 499                         addr += PAGESIZE;
 500                 }
 501                 return (0);
 502         default:
 503                 return (FC_NOSUPPORT);
 504         }
 505         /*NOTREACHED*/
 506 }
 507 
 508 static int
 509 segkmem_setprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
 510 {
 511         ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock));
 512 
 513         if (seg->s_as != &kas || size > seg->s_size ||
 514             addr < seg->s_base || addr + size > seg->s_base + seg->s_size)
 515                 panic("segkmem_setprot: bad args");
 516 
 517         /*
 518          * If it is one of segkp pages, call segkp.
 519          */
 520         if (segkp_bitmap && seg == &kvseg &&
 521             BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 522                 return (SEGOP_SETPROT(segkp, addr, size, prot));
 523 
 524         if (prot == 0)
 525                 hat_unload(kas.a_hat, addr, size, HAT_UNLOAD);
 526         else
 527                 hat_chgprot(kas.a_hat, addr, size, prot);
 528         return (0);
 529 }
 530 
 531 /*
 532  * This is a dummy segkmem function overloaded to call segkp
 533  * when segkp is under the heap.
 534  */
 535 /* ARGSUSED */
 536 static int
 537 segkmem_checkprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
 538 {
 539         ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock));
 540 
 541         if (seg->s_as != &kas)
 542                 segkmem_badop();
 543 
 544         /*
 545          * If it is one of segkp pages, call into segkp.
 546          */
 547         if (segkp_bitmap && seg == &kvseg &&
 548             BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 549                 return (SEGOP_CHECKPROT(segkp, addr, size, prot));
 550 
 551         segkmem_badop();
 552         return (0);
 553 }
 554 
 555 /*
 556  * This is a dummy segkmem function overloaded to call segkp
 557  * when segkp is under the heap.
 558  */
 559 /* ARGSUSED */
 560 static int
 561 segkmem_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
 562 {
 563         ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock));
 564 
 565         if (seg->s_as != &kas)
 566                 segkmem_badop();
 567 
 568         /*
 569          * If it is one of segkp pages, call into segkp.
 570          */
 571         if (segkp_bitmap && seg == &kvseg &&
 572             BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 573                 return (SEGOP_KLUSTER(segkp, addr, delta));
 574 
 575         segkmem_badop();
 576         return (0);
 577 }
 578 
 579 static void
 580 segkmem_xdump_range(void *arg, void *start, size_t size)
 581 {
 582         struct as *as = arg;
 583         caddr_t addr = start;
 584         caddr_t addr_end = addr + size;
 585 
 586         while (addr < addr_end) {
 587                 pfn_t pfn = hat_getpfnum(kas.a_hat, addr);
 588                 if (pfn != PFN_INVALID && pfn <= physmax && pf_is_memory(pfn))
 589                         dump_addpage(as, addr, pfn);
 590                 addr += PAGESIZE;
 591                 dump_timeleft = dump_timeout;
 592         }
 593 }
 594 
 595 static void
 596 segkmem_dump_range(void *arg, void *start, size_t size)
 597 {
 598         caddr_t addr = start;
 599         caddr_t addr_end = addr + size;
 600 
 601         /*
  602          * If we are about to start dumping the range of addresses we
  603          * carved out of the kernel heap for the large page heap, walk
  604          * heap_lp_arena to find which segments are actually populated.
 605          */
 606         if (SEGKMEM_USE_LARGEPAGES &&
 607             addr == heap_lp_base && addr_end == heap_lp_end &&
 608             vmem_size(heap_lp_arena, VMEM_ALLOC) < size) {
 609                 vmem_walk(heap_lp_arena, VMEM_ALLOC | VMEM_REENTRANT,
 610                     segkmem_xdump_range, arg);
 611         } else {
 612                 segkmem_xdump_range(arg, start, size);
 613         }
 614 }
 615 
 616 static void
 617 segkmem_dump(struct seg *seg)
 618 {
 619         /*
 620          * The kernel's heap_arena (represented by kvseg) is a very large
 621          * VA space, most of which is typically unused.  To speed up dumping
 622          * we use vmem_walk() to quickly find the pieces of heap_arena that
 623          * are actually in use.  We do the same for heap32_arena and
 624          * heap_core.
 625          *
 626          * We specify VMEM_REENTRANT to vmem_walk() because dump_addpage()
 627          * may ultimately need to allocate memory.  Reentrant walks are
 628          * necessarily imperfect snapshots.  The kernel heap continues
 629          * to change during a live crash dump, for example.  For a normal
 630          * crash dump, however, we know that there won't be any other threads
 631          * messing with the heap.  Therefore, at worst, we may fail to dump
 632          * the pages that get allocated by the act of dumping; but we will
 633          * always dump every page that was allocated when the walk began.
 634          *
 635          * The other segkmem segments are dense (fully populated), so there's
 636          * no need to use this technique when dumping them.
 637          *
 638          * Note: when adding special dump handling for any new sparsely-
 639          * populated segments, be sure to add similar handling to the ::kgrep
 640          * code in mdb.
 641          */
 642         if (seg == &kvseg) {
 643                 vmem_walk(heap_arena, VMEM_ALLOC | VMEM_REENTRANT,
 644                     segkmem_dump_range, seg->s_as);
 645 #ifndef __sparc
 646                 vmem_walk(heaptext_arena, VMEM_ALLOC | VMEM_REENTRANT,
 647                     segkmem_dump_range, seg->s_as);
 648 #endif
 649         } else if (seg == &kvseg_core) {
 650                 vmem_walk(heap_core_arena, VMEM_ALLOC | VMEM_REENTRANT,
 651                     segkmem_dump_range, seg->s_as);
 652         } else if (seg == &kvseg32) {
 653                 vmem_walk(heap32_arena, VMEM_ALLOC | VMEM_REENTRANT,
 654                     segkmem_dump_range, seg->s_as);
 655                 vmem_walk(heaptext_arena, VMEM_ALLOC | VMEM_REENTRANT,
 656                     segkmem_dump_range, seg->s_as);
 657         } else if (seg == &kzioseg) {
 658                 /*
 659                  * We don't want to dump pages attached to kzioseg since they
 660                  * contain file data from ZFS.  If this page's segment is
 661                  * kzioseg return instead of writing it to the dump device.
 662                  */
 663                 return;
 664         } else {
 665                 segkmem_dump_range(seg->s_as, seg->s_base, seg->s_size);
 666         }
 667 }
 668 
 669 /*
 670  * lock/unlock kmem pages over a given range [addr, addr+len).
 671  * Returns a shadow list of pages in ppp. If there are holes
 672  * in the range (e.g. some of the kernel mappings do not have
 673  * underlying page_ts) returns ENOTSUP so that as_pagelock()
 674  * will handle the range via as_fault(F_SOFTLOCK).
 675  */
 676 /*ARGSUSED*/
 677 static int
 678 segkmem_pagelock(struct seg *seg, caddr_t addr, size_t len,
 679         page_t ***ppp, enum lock_type type, enum seg_rw rw)
 680 {
 681         page_t **pplist, *pp;
 682         pgcnt_t npages;
 683         spgcnt_t pg;
 684         size_t nb;
 685         struct vnode *vp = seg->s_data;
 686 
 687         ASSERT(ppp != NULL);
 688 
 689         /*
 690          * If it is one of segkp pages, call into segkp.
 691          */
 692         if (segkp_bitmap && seg == &kvseg &&
 693             BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 694                 return (SEGOP_PAGELOCK(segkp, addr, len, ppp, type, rw));
 695 
 696         npages = btopr(len);
 697         nb = sizeof (page_t *) * npages;
 698 
 699         if (type == L_PAGEUNLOCK) {
 700                 pplist = *ppp;
 701                 ASSERT(pplist != NULL);
 702 
 703                 for (pg = 0; pg < npages; pg++) {
 704                         pp = pplist[pg];
 705                         page_unlock(pp);
 706                 }
 707                 kmem_free(pplist, nb);
 708                 return (0);
 709         }
 710 
 711         ASSERT(type == L_PAGELOCK);
 712 
 713         pplist = kmem_alloc(nb, KM_NOSLEEP);
 714         if (pplist == NULL) {
 715                 *ppp = NULL;
 716                 return (ENOTSUP);       /* take the slow path */
 717         }
 718 
 719         for (pg = 0; pg < npages; pg++) {
 720                 pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr, SE_SHARED);
 721                 if (pp == NULL) {
 722                         while (--pg >= 0)
 723                                 page_unlock(pplist[pg]);
 724                         kmem_free(pplist, nb);
 725                         *ppp = NULL;
 726                         return (ENOTSUP);
 727                 }
 728                 pplist[pg] = pp;
 729                 addr += PAGESIZE;
 730         }
 731 
 732         *ppp = pplist;
 733         return (0);
 734 }
 735 
 736 /*
 737  * This is a dummy segkmem function overloaded to call segkp
 738  * when segkp is under the heap.
 739  */
 740 /* ARGSUSED */
 741 static int
 742 segkmem_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
 743 {
 744         ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock));
 745 
 746         if (seg->s_as != &kas)
 747                 segkmem_badop();
 748 
 749         /*
 750          * If it is one of segkp pages, call into segkp.
 751          */
 752         if (segkp_bitmap && seg == &kvseg &&
 753             BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 754                 return (SEGOP_GETMEMID(segkp, addr, memidp));
 755 
 756         segkmem_badop();
 757         return (0);
 758 }
 759 
 760 /*ARGSUSED*/
 761 static lgrp_mem_policy_info_t *
 762 segkmem_getpolicy(struct seg *seg, caddr_t addr)
 763 {
 764         return (NULL);
 765 }
 766 
 767 /*ARGSUSED*/
 768 static int
 769 segkmem_capable(struct seg *seg, segcapability_t capability)
 770 {
 771         if (capability == S_CAPABILITY_NOMINFLT)
 772                 return (1);
 773         return (0);
 774 }
 775 
 776 static struct seg_ops segkmem_ops = {
 777         SEGKMEM_BADOP(int),             /* dup */
 778         SEGKMEM_BADOP(int),             /* unmap */
 779         SEGKMEM_BADOP(void),            /* free */
 780         segkmem_fault,
 781         SEGKMEM_BADOP(faultcode_t),     /* faulta */
 782         segkmem_setprot,
 783         segkmem_checkprot,
 784         segkmem_kluster,
 785         SEGKMEM_BADOP(size_t),          /* swapout */
 786         SEGKMEM_BADOP(int),             /* sync */
 787         SEGKMEM_BADOP(size_t),          /* incore */
 788         SEGKMEM_BADOP(int),             /* lockop */
 789         SEGKMEM_BADOP(int),             /* getprot */
 790         SEGKMEM_BADOP(u_offset_t),      /* getoffset */
 791         SEGKMEM_BADOP(int),             /* gettype */
 792         SEGKMEM_BADOP(int),             /* getvp */
 793         SEGKMEM_BADOP(int),             /* advise */
 794         segkmem_dump,
 795         segkmem_pagelock,
 796         SEGKMEM_BADOP(int),             /* setpgsz */
 797         segkmem_getmemid,
 798         segkmem_getpolicy,              /* getpolicy */
 799         segkmem_capable,                /* capable */
 800 };
 801 
 802 int
 803 segkmem_zio_create(struct seg *seg)
 804 {
 805         ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock));
 806         seg->s_ops = &segkmem_ops;
 807         seg->s_data = &zvp;
 808         kas.a_size += seg->s_size;
 809         return (0);
 810 }
 811 
 812 int
 813 segkmem_create(struct seg *seg)
 814 {
 815         ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock));
 816         seg->s_ops = &segkmem_ops;
 817         seg->s_data = &kvp;
 818         kas.a_size += seg->s_size;
 819         return (0);
 820 }
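/*
 * Illustrative sketch (the actual calls live in platform startup code,
 * not here): a segkmem segment is typically attached to the kernel
 * address space and then initialized with segkmem_create(), along the
 * lines of
 *
 *	rw_enter(&kas.a_lock, RW_WRITER);
 *	(void) seg_attach(&kas, base, size, &kvseg);
 *	(void) segkmem_create(&kvseg);
 *	rw_exit(&kas.a_lock);
 *
 * which satisfies the RW_WRITE_HELD(&kas.a_lock) assertion above.
 */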
 821 
 822 /*ARGSUSED*/
 823 page_t *
 824 segkmem_page_create(void *addr, size_t size, int vmflag, void *arg)
 825 {
 826         struct seg kseg;
 827         int pgflags;
 828         struct vnode *vp = arg;
 829 
 830         if (vp == NULL)
 831                 vp = &kvp;
 832 
 833         kseg.s_as = &kas;
 834         pgflags = PG_EXCL;
 835 
 836         if (segkmem_reloc == 0 || (vmflag & VM_NORELOC))
 837                 pgflags |= PG_NORELOC;
 838         if ((vmflag & VM_NOSLEEP) == 0)
 839                 pgflags |= PG_WAIT;
 840         if (vmflag & VM_PANIC)
 841                 pgflags |= PG_PANIC;
 842         if (vmflag & VM_PUSHPAGE)
 843                 pgflags |= PG_PUSHPAGE;
 844         if (vmflag & VM_NORMALPRI) {
 845                 ASSERT(vmflag & VM_NOSLEEP);
 846                 pgflags |= PG_NORMALPRI;
 847         }
 848 
 849         return (page_create_va(vp, (u_offset_t)(uintptr_t)addr, size,
 850             pgflags, &kseg, addr));
 851 }
 852 
 853 /*
 854  * Allocate pages to back the virtual address range [addr, addr + size).
 855  * If addr is NULL, allocate the virtual address space as well.
 856  */
 857 void *
 858 segkmem_xalloc(vmem_t *vmp, void *inaddr, size_t size, int vmflag, uint_t attr,
 859         page_t *(*page_create_func)(void *, size_t, int, void *), void *pcarg)
 860 {
 861         page_t *ppl;
 862         caddr_t addr = inaddr;
 863         pgcnt_t npages = btopr(size);
 864         int allocflag;
 865 
 866         if (inaddr == NULL && (addr = vmem_alloc(vmp, size, vmflag)) == NULL)
 867                 return (NULL);
 868 
 869         ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0);
 870 
 871         if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) {
 872                 if (inaddr == NULL)
 873                         vmem_free(vmp, addr, size);
 874                 return (NULL);
 875         }
 876 
 877         ppl = page_create_func(addr, size, vmflag, pcarg);
 878         if (ppl == NULL) {
 879                 if (inaddr == NULL)
 880                         vmem_free(vmp, addr, size);
 881                 page_unresv(npages);
 882                 return (NULL);
 883         }
 884 
 885         /*
 886          * Under certain conditions, we need to let the HAT layer know
 887          * that it cannot safely allocate memory.  Allocations from
 888          * the hat_memload vmem arena always need this, to prevent
 889          * infinite recursion.
 890          *
 891          * In addition, the x86 hat cannot safely do memory
 892          * allocations while in vmem_populate(), because there
 893          * is no simple bound on its usage.
 894          */
 895         if (vmflag & VM_MEMLOAD)
 896                 allocflag = HAT_NO_KALLOC;
 897 #if defined(__x86)
 898         else if (vmem_is_populator())
 899                 allocflag = HAT_NO_KALLOC;
 900 #endif
 901         else
 902                 allocflag = 0;
 903 
 904         while (ppl != NULL) {
 905                 page_t *pp = ppl;
 906                 page_sub(&ppl, pp);
 907                 ASSERT(page_iolock_assert(pp));
 908                 ASSERT(PAGE_EXCL(pp));
 909                 page_io_unlock(pp);
 910                 hat_memload(kas.a_hat, (caddr_t)(uintptr_t)pp->p_offset, pp,
 911                     (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr,
 912                     HAT_LOAD_LOCK | allocflag);
 913                 pp->p_lckcnt = 1;
 914 #if defined(__x86)
 915                 page_downgrade(pp);
 916 #else
 917                 if (vmflag & SEGKMEM_SHARELOCKED)
 918                         page_downgrade(pp);
 919                 else
 920                         page_unlock(pp);
 921 #endif
 922         }
 923 
 924         return (addr);
 925 }
 926 
 927 static void *
 928 segkmem_alloc_vn(vmem_t *vmp, size_t size, int vmflag, struct vnode *vp)
 929 {
 930         void *addr;
 931         segkmem_gc_list_t *gcp, **prev_gcpp;
 932 
 933         ASSERT(vp != NULL);
 934 
 935         if (kvseg.s_base == NULL) {
 936 #ifndef __sparc
 937                 if (bootops->bsys_alloc == NULL)
 938                         halt("Memory allocation between bop_alloc() and "
 939                             "kmem_alloc().\n");
 940 #endif
 941 
 942                 /*
 943                  * There's not a lot of memory to go around during boot,
 944                  * so recycle it if we can.
 945                  */
 946                 for (prev_gcpp = &segkmem_gc_list; (gcp = *prev_gcpp) != NULL;
 947                     prev_gcpp = &gcp->gc_next) {
 948                         if (gcp->gc_arena == vmp && gcp->gc_size == size) {
 949                                 *prev_gcpp = gcp->gc_next;
 950                                 return (gcp);
 951                         }
 952                 }
 953 
 954                 addr = vmem_alloc(vmp, size, vmflag | VM_PANIC);
 955                 if (boot_alloc(addr, size, BO_NO_ALIGN) != addr)
 956                         panic("segkmem_alloc: boot_alloc failed");
 957                 return (addr);
 958         }
 959         return (segkmem_xalloc(vmp, NULL, size, vmflag, 0,
 960             segkmem_page_create, vp));
 961 }
 962 
 963 void *
 964 segkmem_alloc(vmem_t *vmp, size_t size, int vmflag)
 965 {
 966         return (segkmem_alloc_vn(vmp, size, vmflag, &kvp));
 967 }
 968 
 969 void *
 970 segkmem_zio_alloc(vmem_t *vmp, size_t size, int vmflag)
 971 {
 972         return (segkmem_alloc_vn(vmp, size, vmflag, &zvp));
 973 }
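/*
 * Illustrative sketch (hypothetical arena, not part of the original code):
 * segkmem_alloc()/segkmem_free() are the usual import and release functions
 * for arenas that want page-backed kernel heap, e.g.
 *
 *	vmem_t *example_arena = vmem_create("example", NULL, 0, PAGESIZE,
 *	    segkmem_alloc, segkmem_free, heap_arena, 0, VM_SLEEP);
 *
 * static_arena and hat_memload_arena in kernelheap_init() follow the same
 * pattern with their specialized allocation wrappers.
 */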
 974 
 975 /*
 976  * Any changes to this routine must also be carried over to
 977  * devmap_free_pages() in the seg_dev driver. This is because
 978  * we currently don't have a special kernel segment for non-paged
 979  * kernel memory that is exported by drivers to user space.
 980  */
 981 static void
 982 segkmem_free_vn(vmem_t *vmp, void *inaddr, size_t size, struct vnode *vp,
 983     void (*func)(page_t *))
 984 {
 985         page_t *pp;
 986         caddr_t addr = inaddr;
 987         caddr_t eaddr;
 988         pgcnt_t npages = btopr(size);
 989 
 990         ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0);
 991         ASSERT(vp != NULL);
 992 
 993         if (kvseg.s_base == NULL) {
 994                 segkmem_gc_list_t *gc = inaddr;
 995                 gc->gc_arena = vmp;
 996                 gc->gc_size = size;
 997                 gc->gc_next = segkmem_gc_list;
 998                 segkmem_gc_list = gc;
 999                 return;
1000         }
1001 
1002         hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK);
1003 
1004         for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) {
1005 #if defined(__x86)
1006                 pp = page_find(vp, (u_offset_t)(uintptr_t)addr);
1007                 if (pp == NULL)
1008                         panic("segkmem_free: page not found");
1009                 if (!page_tryupgrade(pp)) {
1010                         /*
1011                          * Some other thread has a sharelock. Wait for
1012                          * it to drop the lock so we can free this page.
1013                          */
1014                         page_unlock(pp);
1015                         pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr,
1016                             SE_EXCL);
1017                 }
1018 #else
1019                 pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr, SE_EXCL);
1020 #endif
1021                 if (pp == NULL)
1022                         panic("segkmem_free: page not found");
1023                 /* Clear p_lckcnt so page_destroy() doesn't update availrmem */
1024                 pp->p_lckcnt = 0;
1025                 if (func)
1026                         func(pp);
1027                 else
1028                         page_destroy(pp, 0);
1029         }
1030         if (func == NULL)
1031                 page_unresv(npages);
1032 
1033         if (vmp != NULL)
1034                 vmem_free(vmp, inaddr, size);
1035 
1036 }
1037 
1038 void
1039 segkmem_xfree(vmem_t *vmp, void *inaddr, size_t size, void (*func)(page_t *))
1040 {
1041         segkmem_free_vn(vmp, inaddr, size, &kvp, func);
1042 }
1043 
1044 void
1045 segkmem_free(vmem_t *vmp, void *inaddr, size_t size)
1046 {
1047         segkmem_free_vn(vmp, inaddr, size, &kvp, NULL);
1048 }
1049 
1050 void
1051 segkmem_zio_free(vmem_t *vmp, void *inaddr, size_t size)
1052 {
1053         segkmem_free_vn(vmp, inaddr, size, &zvp, NULL);
1054 }
1055 
1056 void
1057 segkmem_gc(void)
1058 {
1059         ASSERT(kvseg.s_base != NULL);
1060         while (segkmem_gc_list != NULL) {
1061                 segkmem_gc_list_t *gc = segkmem_gc_list;
1062                 segkmem_gc_list = gc->gc_next;
1063                 segkmem_free(gc->gc_arena, gc, gc->gc_size);
1064         }
1065 }
1066 
1067 /*
1068  * Legacy entry points from here to end of file.
1069  */
1070 void
1071 segkmem_mapin(struct seg *seg, void *addr, size_t size, uint_t vprot,
1072     pfn_t pfn, uint_t flags)
1073 {
1074         hat_unload(seg->s_as->a_hat, addr, size, HAT_UNLOAD_UNLOCK);
1075         hat_devload(seg->s_as->a_hat, addr, size, pfn, vprot,
1076             flags | HAT_LOAD_LOCK);
1077 }
1078 
1079 void
1080 segkmem_mapout(struct seg *seg, void *addr, size_t size)
1081 {
1082         hat_unload(seg->s_as->a_hat, addr, size, HAT_UNLOAD_UNLOCK);
1083 }
1084 
1085 void *
1086 kmem_getpages(pgcnt_t npages, int kmflag)
1087 {
1088         return (kmem_alloc(ptob(npages), kmflag));
1089 }
1090 
1091 void
1092 kmem_freepages(void *addr, pgcnt_t npages)
1093 {
1094         kmem_free(addr, ptob(npages));
1095 }
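/*
 * Illustrative usage of the legacy wrappers above (hypothetical caller):
 *
 *	void *p = kmem_getpages(4, KM_SLEEP);
 *	...
 *	kmem_freepages(p, 4);
 *
 * which is simply a kmem_alloc()/kmem_free() of ptob(4) bytes.
 */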
1096 
1097 /*
1098  * segkmem_page_create_large() allocates a large page to be used for the kmem
 1099  * caches.  If kpr is enabled we ask for a relocatable page unless requested
 1100  * otherwise.  If kpr is disabled we have to ask for a non-relocatable page.
1101  */
1102 static page_t *
1103 segkmem_page_create_large(void *addr, size_t size, int vmflag, void *arg)
1104 {
1105         int pgflags;
1106 
1107         pgflags = PG_EXCL;
1108 
1109         if (segkmem_reloc == 0 || (vmflag & VM_NORELOC))
1110                 pgflags |= PG_NORELOC;
1111         if (!(vmflag & VM_NOSLEEP))
1112                 pgflags |= PG_WAIT;
1113         if (vmflag & VM_PUSHPAGE)
1114                 pgflags |= PG_PUSHPAGE;
1115         if (vmflag & VM_NORMALPRI)
1116                 pgflags |= PG_NORMALPRI;
1117 
1118         return (page_create_va_large(&kvp, (u_offset_t)(uintptr_t)addr, size,
1119             pgflags, &kvseg, addr, arg));
1120 }
1121 
1122 /*
1123  * Allocate a large page to back the virtual address range
1124  * [addr, addr + size).  If addr is NULL, allocate the virtual address
1125  * space as well.
1126  */
1127 static void *
1128 segkmem_xalloc_lp(vmem_t *vmp, void *inaddr, size_t size, int vmflag,
1129     uint_t attr, page_t *(*page_create_func)(void *, size_t, int, void *),
1130     void *pcarg)
1131 {
1132         caddr_t addr = inaddr, pa;
1133         size_t  lpsize = segkmem_lpsize;
1134         pgcnt_t npages = btopr(size);
1135         pgcnt_t nbpages = btop(lpsize);
1136         pgcnt_t nlpages = size >> segkmem_lpshift;
1137         size_t  ppasize = nbpages * sizeof (page_t *);
1138         page_t *pp, *rootpp, **ppa, *pplist = NULL;
1139         int i;
1140 
1141         vmflag |= VM_NOSLEEP;
1142 
1143         if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) {
1144                 return (NULL);
1145         }
1146 
1147         /*
 1148          * Allocate the array we need for hat_memload_array().
 1149          * We use a separate arena to avoid recursion.
 1150          * We will not need this array once hat_memload_array() learns pp++.
1151          */
1152         if ((ppa = vmem_alloc(segkmem_ppa_arena, ppasize, vmflag)) == NULL) {
1153                 goto fail_array_alloc;
1154         }
1155 
1156         if (inaddr == NULL && (addr = vmem_alloc(vmp, size, vmflag)) == NULL)
1157                 goto fail_vmem_alloc;
1158 
1159         ASSERT(((uintptr_t)addr & (lpsize - 1)) == 0);
1160 
1161         /* create all the pages */
1162         for (pa = addr, i = 0; i < nlpages; i++, pa += lpsize) {
1163                 if ((pp = page_create_func(pa, lpsize, vmflag, pcarg)) == NULL)
1164                         goto fail_page_create;
1165                 page_list_concat(&pplist, &pp);
1166         }
1167 
 1168         /* at this point we have all the resources to complete the request */
1169         while ((rootpp = pplist) != NULL) {
1170                 for (i = 0; i < nbpages; i++) {
1171                         ASSERT(pplist != NULL);
1172                         pp = pplist;
1173                         page_sub(&pplist, pp);
1174                         ASSERT(page_iolock_assert(pp));
1175                         page_io_unlock(pp);
1176                         ppa[i] = pp;
1177                 }
1178                 /*
1179                  * Load the locked entry. It's OK to preload the entry into the
1180                  * TSB since we now support large mappings in the kernel TSB.
1181                  */
1182                 hat_memload_array(kas.a_hat,
1183                     (caddr_t)(uintptr_t)rootpp->p_offset, lpsize,
1184                     ppa, (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr,
1185                     HAT_LOAD_LOCK);
1186 
1187                 for (--i; i >= 0; --i) {
1188                         ppa[i]->p_lckcnt = 1;
1189                         page_unlock(ppa[i]);
1190                 }
1191         }
1192 
1193         vmem_free(segkmem_ppa_arena, ppa, ppasize);
1194         return (addr);
1195 
1196 fail_page_create:
1197         while ((rootpp = pplist) != NULL) {
1198                 for (i = 0, pp = pplist; i < nbpages; i++, pp = pplist) {
1199                         ASSERT(pp != NULL);
1200                         page_sub(&pplist, pp);
1201                         ASSERT(page_iolock_assert(pp));
1202                         page_io_unlock(pp);
1203                 }
1204                 page_destroy_pages(rootpp);
1205         }
1206 
1207         if (inaddr == NULL)
1208                 vmem_free(vmp, addr, size);
1209 
1210 fail_vmem_alloc:
1211         vmem_free(segkmem_ppa_arena, ppa, ppasize);
1212 
1213 fail_array_alloc:
1214         page_unresv(npages);
1215 
1216         return (NULL);
1217 }
1218 
1219 static void
1220 segkmem_free_one_lp(caddr_t addr, size_t size)
1221 {
1222         page_t          *pp, *rootpp = NULL;
1223         pgcnt_t         pgs_left = btopr(size);
1224 
1225         ASSERT(size == segkmem_lpsize);
1226 
1227         hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK);
1228 
1229         for (; pgs_left > 0; addr += PAGESIZE, pgs_left--) {
1230                 pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, SE_EXCL);
1231                 if (pp == NULL)
1232                         panic("segkmem_free_one_lp: page not found");
1233                 ASSERT(PAGE_EXCL(pp));
1234                 pp->p_lckcnt = 0;
1235                 if (rootpp == NULL)
1236                         rootpp = pp;
1237         }
1238         ASSERT(rootpp != NULL);
1239         page_destroy_pages(rootpp);
1240 
1241         /* page_unresv() is done by the caller */
1242 }
1243 
1244 /*
 1245  * This function is called to import new spans into vmem arenas such as
 1246  * kmem_default_arena and kmem_oversize_arena.  It first tries to import
 1247  * spans from the large page arena, kmem_lp_arena; to do this it might have
 1248  * to "upgrade" the requested size to the kmem_lp_arena quantum.  If it is
 1249  * unable to satisfy the upgraded request, it falls back to the regular
 1250  * segkmem_alloc(), which satisfies the request by importing from "*vmp".
1251  */
1252 /*ARGSUSED*/
1253 void *
1254 segkmem_alloc_lp(vmem_t *vmp, size_t *sizep, size_t align, int vmflag)
1255 {
1256         size_t size;
1257         kthread_t *t = curthread;
1258         segkmem_lpcb_t *lpcb = &segkmem_lpcb;
1259 
1260         ASSERT(sizep != NULL);
1261 
1262         size = *sizep;
1263 
1264         if (lpcb->lp_uselp && !(t->t_flag & T_PANIC) &&
1265             !(vmflag & SEGKMEM_SHARELOCKED)) {
1266 
1267                 size_t kmemlp_qnt = segkmem_kmemlp_quantum;
1268                 size_t asize = P2ROUNDUP(size, kmemlp_qnt);
1269                 void  *addr = NULL;
1270                 ulong_t *lpthrtp = &lpcb->lp_throttle;
1271                 ulong_t lpthrt = *lpthrtp;
1272                 int     dowakeup = 0;
1273                 int     doalloc = 1;
1274 
1275                 ASSERT(kmem_lp_arena != NULL);
1276                 ASSERT(asize >= size);
1277 
1278                 if (lpthrt != 0) {
1279                         /* try to update the throttle value */
1280                         lpthrt = atomic_inc_ulong_nv(lpthrtp);
1281                         if (lpthrt >= segkmem_lpthrottle_max) {
1282                                 lpthrt = atomic_cas_ulong(lpthrtp, lpthrt,
1283                                     segkmem_lpthrottle_max / 4);
1284                         }
1285 
1286                         /*
 1287                          * Once we get above the throttle start, do an
 1288                          * exponential backoff on trying large pages and reaping.
1289                          */
1290                         if (lpthrt > segkmem_lpthrottle_start &&
1291                             !ISP2(lpthrt)) {
1292                                 lpcb->allocs_throttled++;
1293                                 lpthrt--;
1294                                 if (ISP2(lpthrt))
1295                                         kmem_reap();
1296                                 return (segkmem_alloc(vmp, size, vmflag));
1297                         }
1298                 }
1299 
1300                 if (!(vmflag & VM_NOSLEEP) &&
1301                     segkmem_heaplp_quantum >= (8 * kmemlp_qnt) &&
1302                     vmem_size(kmem_lp_arena, VMEM_FREE) <= kmemlp_qnt &&
1303                     asize < (segkmem_heaplp_quantum - kmemlp_qnt)) {
1304 
1305                         /*
 1306                          * We are low on free memory in kmem_lp_arena,
 1307                          * so let only one thread allocate a heap_lp-
 1308                          * quantum-sized chunk that everybody is going
 1309                          * to share.
1310                          */
1311                         mutex_enter(&lpcb->lp_lock);
1312 
1313                         if (lpcb->lp_wait) {
1314 
1315                                 /* we are not the first one - wait */
1316                                 cv_wait(&lpcb->lp_cv, &lpcb->lp_lock);
1317                                 if (vmem_size(kmem_lp_arena, VMEM_FREE) <
1318                                     kmemlp_qnt)  {
1319                                         doalloc = 0;
1320                                 }
1321                         } else if (vmem_size(kmem_lp_arena, VMEM_FREE) <=
1322                             kmemlp_qnt) {
1323 
1324                                 /*
 1325                                  * We are the first one; make sure we import
 1326                                  * a large page.
1327                                  */
1328                                 if (asize == kmemlp_qnt)
1329                                         asize += kmemlp_qnt;
1330                                 dowakeup = 1;
1331                                 lpcb->lp_wait = 1;
1332                         }
1333 
1334                         mutex_exit(&lpcb->lp_lock);
1335                 }
1336 
1337                 /*
1338                  * VM_ABORT flag prevents sleeps in vmem_xalloc when
1339                  * large pages are not available. In that case this allocation
1340                  * attempt will fail and we will retry allocation with small
1341                  * pages. We also do not want to panic if this allocation fails
1342                  * because we are going to retry.
1343                  */
1344                 if (doalloc) {
1345                         addr = vmem_alloc(kmem_lp_arena, asize,
1346                             (vmflag | VM_ABORT) & ~VM_PANIC);
1347 
1348                         if (dowakeup) {
1349                                 mutex_enter(&lpcb->lp_lock);
1350                                 ASSERT(lpcb->lp_wait != 0);
1351                                 lpcb->lp_wait = 0;
1352                                 cv_broadcast(&lpcb->lp_cv);
1353                                 mutex_exit(&lpcb->lp_lock);
1354                         }
1355                 }
1356 
1357                 if (addr != NULL) {
1358                         *sizep = asize;
1359                         *lpthrtp = 0;
1360                         return (addr);
1361                 }
1362 
1363                 if (vmflag & VM_NOSLEEP)
1364                         lpcb->nosleep_allocs_failed++;
1365                 else
1366                         lpcb->sleep_allocs_failed++;
1367                 lpcb->alloc_bytes_failed += size;
1368 
 1369                 /* if large page throttling has not started yet, start it */
1370                 if (segkmem_use_lpthrottle && lpthrt == 0) {
1371                         lpthrt = atomic_cas_ulong(lpthrtp, lpthrt, 1);
1372                 }
1373         }
1374         return (segkmem_alloc(vmp, size, vmflag));
1375 }
1376 
1377 void
1378 segkmem_free_lp(vmem_t *vmp, void *inaddr, size_t size)
1379 {
1380         if (kmem_lp_arena == NULL || !IS_KMEM_VA_LARGEPAGE((caddr_t)inaddr)) {
1381                 segkmem_free(vmp, inaddr, size);
1382         } else {
1383                 vmem_free(kmem_lp_arena, inaddr, size);
1384         }
1385 }
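/*
 * Illustrative sketch (the real arena creation happens in kmem_init(),
 * outside this file): because segkmem_alloc_lp() may adjust the requested
 * size, arenas such as kmem_default_arena would import through the
 * size-adjusting variant of vmem arena creation, roughly
 *
 *	kmem_default_arena = vmem_xcreate("kmem_default", NULL, 0,
 *	    PAGESIZE, segkmem_alloc_lp, segkmem_free_lp, heap_arena,
 *	    0, VM_SLEEP);
 *
 * vmem_xcreate() is assumed here as the vmem_create() variant whose import
 * function takes a size_t pointer.
 */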
1386 
1387 /*
 1388  * segkmem_alloc_lpi() imports virtual memory from the large page heap
 1389  * arena into the kmem_lp arena.  In the process it maps the imported
 1390  * segment with large pages.
1391  */
1392 static void *
1393 segkmem_alloc_lpi(vmem_t *vmp, size_t size, int vmflag)
1394 {
1395         segkmem_lpcb_t *lpcb = &segkmem_lpcb;
1396         void  *addr;
1397 
1398         ASSERT(size != 0);
1399         ASSERT(vmp == heap_lp_arena);
1400 
 1401         /* do not allow the large page heap to grow beyond its limits */
1402         if (vmem_size(vmp, VMEM_ALLOC) >= segkmem_kmemlp_max) {
1403                 lpcb->allocs_limited++;
1404                 return (NULL);
1405         }
1406 
1407         addr = segkmem_xalloc_lp(vmp, NULL, size, vmflag, 0,
1408             segkmem_page_create_large, NULL);
1409         return (addr);
1410 }
1411 
1412 /*
 1413  * segkmem_free_lpi() returns virtual memory from the kmem_lp arena back
 1414  * into the large page heap arena.  Before doing this it unmaps the segment
 1415  * and frees the large pages used to map it.
1416  */
1417 static void
1418 segkmem_free_lpi(vmem_t *vmp, void *inaddr, size_t size)
1419 {
1420         pgcnt_t         nlpages = size >> segkmem_lpshift;
1421         size_t          lpsize = segkmem_lpsize;
1422         caddr_t         addr = inaddr;
1423         pgcnt_t         npages = btopr(size);
1424         int             i;
1425 
1426         ASSERT(vmp == heap_lp_arena);
1427         ASSERT(IS_KMEM_VA_LARGEPAGE(addr));
1428         ASSERT(((uintptr_t)inaddr & (lpsize - 1)) == 0);
1429 
1430         for (i = 0; i < nlpages; i++) {
1431                 segkmem_free_one_lp(addr, lpsize);
1432                 addr += lpsize;
1433         }
1434 
1435         page_unresv(npages);
1436 
1437         vmem_free(vmp, inaddr, size);
1438 }
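
     /*
      * Worked example (hypothetical sizes): with 8K base pages and a 4M
      * large page size (segkmem_lpshift == 22), freeing a 12M span gives
      * nlpages = 12M >> 22 = 3 large pages to tear down and
      * npages = btopr(12M) = 1536 base pages to un-reserve.
      */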
1439 
1440 /*
1441  * This function is called at system boot time by kmem_init right after
1442  * the /etc/system file has been read. Based on the hardware configuration
1443  * and the /etc/system settings it determines whether the system is going
1444  * to use large pages. The initialization necessary to actually start
1445  * using large pages happens later, when segkmem_heap_lp_init() is called.
1446  */
1447 int
1448 segkmem_lpsetup()
1449 {
1450         int use_large_pages = 0;
1451 
1452 #ifdef __sparc
1453 
1454         size_t memtotal = physmem * PAGESIZE;
1455 
1456         if (heap_lp_base == NULL) {
1457                 segkmem_lpsize = PAGESIZE;
1458                 return (0);
1459         }
1460 
1461         /* get the platform-dependent large page size for the kernel heap */
1462         segkmem_lpsize = get_segkmem_lpsize(segkmem_lpsize);
1463 
1464         if (segkmem_lpsize <= PAGESIZE) {
1465                 /*
1466                  * return the virtual space reserved for the large page
1467                  * kernel heap to the regular heap
1468                  */
1469                 vmem_xfree(heap_arena, heap_lp_base,
1470                     heap_lp_end - heap_lp_base);
1471                 heap_lp_base = NULL;
1472                 heap_lp_end = NULL;
1473                 segkmem_lpsize = PAGESIZE;
1474                 return (0);
1475         }
1476 
1477         /* set heap_lp quantum if necessary */
1478         if (segkmem_heaplp_quantum == 0 || !ISP2(segkmem_heaplp_quantum) ||
1479             P2PHASE(segkmem_heaplp_quantum, segkmem_lpsize)) {
1480                 segkmem_heaplp_quantum = segkmem_lpsize;
1481         }
1482 
1483         /* set kmem_lp quantum if necessary */
1484         if (segkmem_kmemlp_quantum == 0 || !ISP2(segkmem_kmemlp_quantum) ||
1485             segkmem_kmemlp_quantum > segkmem_heaplp_quantum) {
1486                 segkmem_kmemlp_quantum = segkmem_heaplp_quantum;
1487         }
1488 
1489         /* set total amount of memory allowed for large page kernel heap */
1490         if (segkmem_kmemlp_max == 0) {
1491                 if (segkmem_kmemlp_pcnt == 0 || segkmem_kmemlp_pcnt > 100)
1492                         segkmem_kmemlp_pcnt = 12;
1493                 segkmem_kmemlp_max = (memtotal * segkmem_kmemlp_pcnt) / 100;
1494         }
1495         segkmem_kmemlp_max = P2ROUNDUP(segkmem_kmemlp_max,
1496             segkmem_heaplp_quantum);
1497 
1498         /* fix the lp kmem preallocation request if necessary */
1499         if (segkmem_kmemlp_min) {
1500                 segkmem_kmemlp_min = P2ROUNDUP(segkmem_kmemlp_min,
1501                     segkmem_heaplp_quantum);
1502                 if (segkmem_kmemlp_min > segkmem_kmemlp_max)
1503                         segkmem_kmemlp_min = segkmem_kmemlp_max;
1504         }
1505 
1506         use_large_pages = 1;
1507         segkmem_lpszc = page_szc(segkmem_lpsize);
1508         segkmem_lpshift = page_get_shift(segkmem_lpszc);
1509 
1510 #endif
1511         return (use_large_pages);
1512 }
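
     /*
      * Illustrative /etc/system fragment (hypothetical values): because
      * segkmem_lpsetup() runs right after /etc/system has been processed,
      * the tunables it consults can be patched there, for example:
      *
      *    set segkmem_kmemlp_pcnt = 25
      *    set segkmem_kmemlp_min = 0x10000000
      *
      * The checks above still apply: invalid quanta are replaced with safe
      * defaults, an out-of-range percentage falls back to 12, and the
      * preallocation request is rounded up and clamped to
      * segkmem_kmemlp_max.
      */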
1513 
1514 void
1515 segkmem_zio_init(void *zio_mem_base, size_t zio_mem_size)
1516 {
1517         ASSERT(zio_mem_base != NULL);
1518         ASSERT(zio_mem_size != 0);
1519 
1520         /*
1521          * To reduce VA space fragmentation, we set up quantum caches for the
1522          * smaller sizes;  we chose 32k because that translates to 128k VA
1523          * slabs, which matches nicely with the common 128k zio_data bufs.
1524          */
1525         zio_arena = vmem_create("zfs_file_data", zio_mem_base, zio_mem_size,
1526             PAGESIZE, NULL, NULL, NULL, 32 * 1024, VM_SLEEP);
1527 
1528         zio_alloc_arena = vmem_create("zfs_file_data_buf", NULL, 0, PAGESIZE,
1529             segkmem_zio_alloc, segkmem_zio_free, zio_arena, 0, VM_SLEEP);
1530 
1531         ASSERT(zio_arena != NULL);
1532         ASSERT(zio_alloc_arena != NULL);
1533 }
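
     /*
      * Usage sketch (illustrative; the real consumers are the ZFS zio
      * kmem caches, not this file): file data buffers are carved out of
      * zio_alloc_arena, which imports page-backed VA from zio_arena:
      *
      *    void *buf = vmem_alloc(zio_alloc_arena, 128 * 1024, VM_SLEEP);
      *    ...
      *    vmem_free(zio_alloc_arena, buf, 128 * 1024);
      *
      * VA requests of 32k and below against zio_arena are satisfied from
      * its quantum caches, so small allocations and the common 128k data
      * buffers carve up the arena at compatible granularities.
      */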
1534 
1535 #ifdef __sparc
1536 
1537 
1538 static void *
1539 segkmem_alloc_ppa(vmem_t *vmp, size_t size, int vmflag)
1540 {
1541         size_t ppaquantum = btopr(segkmem_lpsize) * sizeof (page_t *);
1542         void   *addr;
1543 
1544         if (ppaquantum <= PAGESIZE)
1545                 return (segkmem_alloc(vmp, size, vmflag));
1546 
1547         ASSERT((size & (ppaquantum - 1)) == 0);
1548 
1549         addr = vmem_xalloc(vmp, size, ppaquantum, 0, 0, NULL, NULL, vmflag);
1550         if (addr != NULL && segkmem_xalloc(vmp, addr, size, vmflag, 0,
1551             segkmem_page_create, NULL) == NULL) {
1552                 vmem_xfree(vmp, addr, size);
1553                 addr = NULL;
1554         }
1555 
1556         return (addr);
1557 }
1558 
1559 static void
1560 segkmem_free_ppa(vmem_t *vmp, void *addr, size_t size)
1561 {
1562         size_t ppaquantum = btopr(segkmem_lpsize) * sizeof (page_t *);
1563 
1564         ASSERT(addr != NULL);
1565 
1566         if (ppaquantum <= PAGESIZE) {
1567                 segkmem_free(vmp, addr, size);
1568         } else {
1569                 segkmem_free(NULL, addr, size);
1570                 vmem_xfree(vmp, addr, size);
1571         }
1572 }
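
     /*
      * Worked example (hypothetical sizes, assuming 8K base pages and
      * 64-bit page_t pointers): for a 4M large page,
      * ppaquantum = btopr(4M) * 8 = 512 * 8 = 4K <= PAGESIZE, so the plain
      * segkmem_alloc()/segkmem_free() path is taken.  For a 256M large
      * page, ppaquantum = 32768 * 8 = 256K > PAGESIZE, so the ppa arrays
      * are carved out with vmem_xalloc() at ppaquantum alignment and
      * backed with pages via segkmem_xalloc().
      */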
1573 
1574 void
1575 segkmem_heap_lp_init()
1576 {
1577         segkmem_lpcb_t *lpcb = &segkmem_lpcb;
1578         size_t heap_lp_size = heap_lp_end - heap_lp_base;
1579         size_t lpsize = segkmem_lpsize;
1580         size_t ppaquantum;
1581         void   *addr;
1582 
1583         if (segkmem_lpsize <= PAGESIZE) {
1584                 ASSERT(heap_lp_base == NULL);
1585                 ASSERT(heap_lp_end == NULL);
1586                 return;
1587         }
1588 
1589         ASSERT(segkmem_heaplp_quantum >= lpsize);
1590         ASSERT((segkmem_heaplp_quantum & (lpsize - 1)) == 0);
1591         ASSERT(lpcb->lp_uselp == 0);
1592         ASSERT(heap_lp_base != NULL);
1593         ASSERT(heap_lp_end != NULL);
1594         ASSERT(heap_lp_base < heap_lp_end);
1595         ASSERT(heap_lp_arena == NULL);
1596         ASSERT(((uintptr_t)heap_lp_base & (lpsize - 1)) == 0);
1597         ASSERT(((uintptr_t)heap_lp_end & (lpsize - 1)) == 0);
1598 
1599         /* create large page heap arena */
1600         heap_lp_arena = vmem_create("heap_lp", heap_lp_base, heap_lp_size,
1601             segkmem_heaplp_quantum, NULL, NULL, NULL, 0, VM_SLEEP);
1602 
1603         ASSERT(heap_lp_arena != NULL);
1604 
1605         /* This arena caches memory already mapped with large pages */
1606         kmem_lp_arena = vmem_create("kmem_lp", NULL, 0, segkmem_kmemlp_quantum,
1607             segkmem_alloc_lpi, segkmem_free_lpi, heap_lp_arena, 0, VM_SLEEP);
1608 
1609         ASSERT(kmem_lp_arena != NULL);
1610 
1611         mutex_init(&lpcb->lp_lock, NULL, MUTEX_DEFAULT, NULL);
1612         cv_init(&lpcb->lp_cv, NULL, CV_DEFAULT, NULL);
1613 
1614         /*
1615          * This arena is used for the arrays of page_t pointers necessary
1616          * to call hat_memload_array().
1617          */
1618         ppaquantum = btopr(lpsize) * sizeof (page_t *);
1619         segkmem_ppa_arena = vmem_create("segkmem_ppa", NULL, 0, ppaquantum,
1620             segkmem_alloc_ppa, segkmem_free_ppa, heap_arena, ppaquantum,
1621             VM_SLEEP);
1622 
1623         ASSERT(segkmem_ppa_arena != NULL);
1624 
1625         /* preallocate some memory for the lp kernel heap */
1626         if (segkmem_kmemlp_min) {
1627 
1628                 ASSERT(P2PHASE(segkmem_kmemlp_min,
1629                     segkmem_heaplp_quantum) == 0);
1630 
1631                 if ((addr = segkmem_alloc_lpi(heap_lp_arena,
1632                     segkmem_kmemlp_min, VM_SLEEP)) != NULL) {
1633 
1634                         addr = vmem_add(kmem_lp_arena, addr,
1635                             segkmem_kmemlp_min, VM_SLEEP);
1636                         ASSERT(addr != NULL);
1637                 }
1638         }
1639 
1640         lpcb->lp_uselp = 1;
1641 }
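
     /*
      * Resulting arena hierarchy (sketch):
      *
      *    kmem_lp_arena     large-page backed kernel heap (used once
      *          |           lpcb->lp_uselp is set)
      *          | imports from heap_lp_arena via segkmem_alloc_lpi() /
      *          | segkmem_free_lpi(), which map/unmap spans with large pages
      *          v
      *    heap_lp_arena     VA-only arena over [heap_lp_base, heap_lp_end)
      *
      * segkmem_ppa_arena imports from heap_arena and supplies the
      * page_t pointer arrays used while loading large page mappings.
      */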
1642 
1643 #endif