/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */

/*
 * VM - Hardware Address Translation management for Spitfire MMU.
 *
 * This file implements the machine specific hardware translation
 * needed by the VM system.  The machine independent interface is
 * described in <vm/hat.h> while the machine dependent interface
 * and data structures are described in <vm/hat_sfmmu.h>.
 *
 * The hat layer manages the address translation hardware as a cache
 * driven by calls from the higher levels in the VM system.
 */

#include <sys/types.h>
#include <sys/kstat.h>
#include <vm/hat.h>
#include <vm/hat_sfmmu.h>
#include <vm/page.h>
#include <sys/pte.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/sysmacros.h>
#include <sys/machparam.h>
#include <sys/vtrace.h>
#include <sys/kmem.h>
#include <sys/mmu.h>
#include <sys/cmn_err.h>
#include <sys/cpu.h>
#include <sys/cpuvar.h>
#include <sys/debug.h>
#include <sys/lgrp.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/vmsystm.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kp.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/rm.h>
#include <sys/t_lock.h>
#include <sys/obpdefs.h>
#include <sys/vm_machparam.h>
#include <sys/var.h>
#include <sys/trap.h>
#include <sys/machtrap.h>
#include <sys/scb.h>
#include <sys/bitmap.h>
#include <sys/machlock.h>
#include <sys/membar.h>
#include <sys/atomic.h>
#include <sys/cpu_module.h>
#include <sys/prom_debug.h>
#include <sys/ksynch.h>
#include <sys/mem_config.h>
#include <sys/mem_cage.h>
#include <vm/vm_dep.h>
#include <sys/fpu/fpusystm.h>
#include <vm/mach_kpm.h>
#include <sys/callb.h>

#ifdef DEBUG
#define	SFMMU_VALIDATE_HMERID(hat, rid, saddr, len)			\
	if (SFMMU_IS_SHMERID_VALID(rid)) {				\
		caddr_t _eaddr = (saddr) + (len);			\
		sf_srd_t *_srdp;					\
		sf_region_t *_rgnp;					\
		ASSERT((rid) < SFMMU_MAX_HME_REGIONS);			\
		ASSERT(SF_RGNMAP_TEST(hat->sfmmu_hmeregion_map, rid));	\
		ASSERT((hat) != ksfmmup);				\
		_srdp = (hat)->sfmmu_srdp;				\
		ASSERT(_srdp != NULL);					\
		ASSERT(_srdp->srd_refcnt != 0);				\
		_rgnp = _srdp->srd_hmergnp[(rid)];			\
		ASSERT(_rgnp != NULL && _rgnp->rgn_id == rid);		\
		ASSERT(_rgnp->rgn_refcnt != 0);				\
		ASSERT(!(_rgnp->rgn_flags & SFMMU_REGION_FREE));	\
		ASSERT((_rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) ==	\
		    SFMMU_REGION_HME);					\
		ASSERT((saddr) >= _rgnp->rgn_saddr);			\
		ASSERT((saddr) < _rgnp->rgn_saddr + _rgnp->rgn_size);	\
		ASSERT(_eaddr > _rgnp->rgn_saddr);			\
		ASSERT(_eaddr <= _rgnp->rgn_saddr + _rgnp->rgn_size);	\
	}

#define	SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid)		\
{									\
	caddr_t _hsva;							\
	caddr_t _heva;							\
	caddr_t _rsva;							\
	caddr_t _reva;							\
	int	_ttesz = get_hblk_ttesz(hmeblkp);			\
	int	_flagtte;						\
	ASSERT((srdp)->srd_refcnt != 0);				\
	ASSERT((rid) < SFMMU_MAX_HME_REGIONS);				\
	ASSERT((rgnp)->rgn_id == rid);					\
	ASSERT(!((rgnp)->rgn_flags & SFMMU_REGION_FREE));		\
	ASSERT(((rgnp)->rgn_flags & SFMMU_REGION_TYPE_MASK) ==		\
	    SFMMU_REGION_HME);						\
	ASSERT(_ttesz <= (rgnp)->rgn_pgszc);				\
	_hsva = (caddr_t)get_hblk_base(hmeblkp);			\
	_heva = get_hblk_endaddr(hmeblkp);				\
	_rsva = (caddr_t)P2ALIGN(					\
	    (uintptr_t)(rgnp)->rgn_saddr, HBLK_MIN_BYTES);		\
	_reva = (caddr_t)P2ROUNDUP(					\
	    (uintptr_t)((rgnp)->rgn_saddr + (rgnp)->rgn_size),		\
	    HBLK_MIN_BYTES);						\
	ASSERT(_hsva >= _rsva);						\
	ASSERT(_hsva < _reva);						\
	ASSERT(_heva > _rsva);						\
	ASSERT(_heva <= _reva);						\
	_flagtte = (_ttesz < HBLK_MIN_TTESZ) ? HBLK_MIN_TTESZ :	\
	    _ttesz;							\
	ASSERT(rgnp->rgn_hmeflags & (0x1 << _flagtte));			\
}

#else /* DEBUG */
#define	SFMMU_VALIDATE_HMERID(hat, rid, addr, len)
#define	SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid)
#endif /* DEBUG */

#if defined(SF_ERRATA_57)
extern caddr_t errata57_limit;
#endif

#define	HME8BLK_SZ_RND		((roundup(HME8BLK_SZ, sizeof (int64_t))) /  \
				(sizeof (int64_t)))
#define	HBLK_RESERVE		((struct hme_blk *)hblk_reserve)

#define	HBLK_RESERVE_CNT	128
#define	HBLK_RESERVE_MIN	20

static struct hme_blk *freehblkp;
static kmutex_t freehblkp_lock;
static int freehblkcnt;

static int64_t hblk_reserve[HME8BLK_SZ_RND];
static kmutex_t hblk_reserve_lock;
static kthread_t *hblk_reserve_thread;

static nucleus_hblk8_info_t nucleus_hblk8;
static nucleus_hblk1_info_t nucleus_hblk1;

/*
 * Data to manage per-cpu hmeblk pending queues; hmeblks are queued here
 * after the initial phase of removing an hmeblk from the hash chain.  See
 * the detailed comment in sfmmu_hblk_hash_rm() for further details.
 */
static cpu_hme_pend_t *cpu_hme_pend;
static uint_t cpu_hme_pend_thresh;
/*
 * SFMMU specific hat functions
 */
void hat_pagecachectl(struct page *, int);

/* flags for hat_pagecachectl */
#define	HAT_CACHE	0x1
#define	HAT_UNCACHE	0x2
#define	HAT_TMPNC	0x4

/*
 * Flag to allow the creation of non-cacheable translations
 * to system memory.  It is off by default.  At the moment this
 * flag is used by the ecache error injector.  The error injector
 * will turn it on when creating such a translation and then shut it
 * off when it's finished.
 */

int	sfmmu_allow_nc_trans = 0;

/*
 * Flag to disable large page support.
 *	value of 1 => disable all large pages.
 *	bits 1, 2, and 3 are to disable 64K, 512K and 4M pages respectively.
 *
 * For example, use the value 0x4 to disable 512K pages.
 */
#define	LARGE_PAGES_OFF		0x1

/*
 * The disable_large_pages and disable_ism_large_pages variables control
 * hat_memload_array and the page sizes to be used by ISM and the kernel.
 *
 * The disable_auto_data_large_pages and disable_auto_text_large_pages variables
 * are only used to control which OOB pages to use at upper VM segment creation
 * time, and are set in hat_init_pagesizes and used in the map_pgsz* routines.
 * Their values may come from platform or CPU specific code to disable page
 * sizes that should not be used.
 *
 * WARNING: 512K pages are currently not supported for ISM/DISM.
 */
uint_t	disable_large_pages = 0;
uint_t	disable_ism_large_pages = (1 << TTE512K);
uint_t	disable_auto_data_large_pages = 0;
uint_t	disable_auto_text_large_pages = 0;
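
/*
 * For illustration: the disable_* words above are bit masks indexed by
 * TTE size code (TTE8K == 0, TTE64K == 1, TTE512K == 2, TTE4M == 3).
 * A hypothetical /etc/system setting such as
 *
 *	set disable_large_pages = 0xc
 *
 * sets the 512K and 4M bits ((1 << TTE512K) | (1 << TTE4M)) and thereby
 * restricts hat_memload_array() to 8K and 64K translations.
 */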

/*
 * Private sfmmu data structures for hat management
 */
static struct kmem_cache *sfmmuid_cache;
static struct kmem_cache *mmuctxdom_cache;

/*
 * Private sfmmu data structures for tsb management
 */
static struct kmem_cache *sfmmu_tsbinfo_cache;
static struct kmem_cache *sfmmu_tsb8k_cache;
static struct kmem_cache *sfmmu_tsb_cache[NLGRPS_MAX];
static vmem_t *kmem_bigtsb_arena;
static vmem_t *kmem_tsb_arena;

/*
 * sfmmu static variables for hmeblk resource management.
 */
static vmem_t *hat_memload1_arena; /* HAT translation arena for sfmmu1_cache */
static struct kmem_cache *sfmmu8_cache;
static struct kmem_cache *sfmmu1_cache;
static struct kmem_cache *pa_hment_cache;

static kmutex_t		ism_mlist_lock;	/* mutex for ism mapping list */
/*
 * private data for ism
 */
static struct kmem_cache *ism_blk_cache;
static struct kmem_cache *ism_ment_cache;
#define	ISMID_STARTADDR	NULL

/*
 * Region management data structures and function declarations.
 */

static void	sfmmu_leave_srd(sfmmu_t *);
static int	sfmmu_srdcache_constructor(void *, void *, int);
static void	sfmmu_srdcache_destructor(void *, void *);
static int	sfmmu_rgncache_constructor(void *, void *, int);
static void	sfmmu_rgncache_destructor(void *, void *);
static int	sfrgnmap_isnull(sf_region_map_t *);
static int	sfhmergnmap_isnull(sf_hmeregion_map_t *);
static int	sfmmu_scdcache_constructor(void *, void *, int);
static void	sfmmu_scdcache_destructor(void *, void *);
static void	sfmmu_rgn_cb_noop(caddr_t, caddr_t, caddr_t,
    size_t, void *, u_offset_t);

static uint_t srd_hashmask = SFMMU_MAX_SRD_BUCKETS - 1;
static sf_srd_bucket_t *srd_buckets;
static struct kmem_cache *srd_cache;
static uint_t srd_rgn_hashmask = SFMMU_MAX_REGION_BUCKETS - 1;
static struct kmem_cache *region_cache;
static struct kmem_cache *scd_cache;

#ifdef sun4v
int use_bigtsb_arena = 1;
#else
int use_bigtsb_arena = 0;
#endif

/* External /etc/system tunable, for turning on&off the shctx support */
int disable_shctx = 0;
/* Internal variable, set by MD if the HW supports shctx feature */
int shctx_on = 0;

#ifdef DEBUG
static void check_scd_sfmmu_list(sfmmu_t **, sfmmu_t *, int);
#endif
static void sfmmu_to_scd_list(sfmmu_t **, sfmmu_t *);
static void sfmmu_from_scd_list(sfmmu_t **, sfmmu_t *);

static sf_scd_t *sfmmu_alloc_scd(sf_srd_t *, sf_region_map_t *);
static void sfmmu_find_scd(sfmmu_t *);
static void sfmmu_join_scd(sf_scd_t *, sfmmu_t *);
static void sfmmu_finish_join_scd(sfmmu_t *);
static void sfmmu_leave_scd(sfmmu_t *, uchar_t);
static void sfmmu_destroy_scd(sf_srd_t *, sf_scd_t *, sf_region_map_t *);
static int sfmmu_alloc_scd_tsbs(sf_srd_t *, sf_scd_t *);
static void sfmmu_free_scd_tsbs(sfmmu_t *);
static void sfmmu_tsb_inv_ctx(sfmmu_t *);
static int find_ism_rid(sfmmu_t *, sfmmu_t *, caddr_t, uint_t *);
static void sfmmu_ism_hatflags(sfmmu_t *, int);
static int sfmmu_srd_lock_held(sf_srd_t *);
static void sfmmu_remove_scd(sf_scd_t **, sf_scd_t *);
static void sfmmu_add_scd(sf_scd_t **headp, sf_scd_t *);
static void sfmmu_link_scd_to_regions(sf_srd_t *, sf_scd_t *);
static void sfmmu_unlink_scd_from_regions(sf_srd_t *, sf_scd_t *);
static void sfmmu_link_to_hmeregion(sfmmu_t *, sf_region_t *);
static void sfmmu_unlink_from_hmeregion(sfmmu_t *, sf_region_t *);

/*
 * ``hat_lock'' is a hashed mutex lock for protecting sfmmu TSB lists,
 * HAT flags, synchronizing TLB/TSB coherency, and context management.
 * The lock is hashed on the sfmmup since the case where we need to lock
 * all processes is rare but does occur (e.g. we need to unload a shared
 * mapping from all processes using the mapping).  We have a lot of buckets,
 * and each slab of sfmmu_t's can use about a quarter of them, giving us
 * a fairly good distribution without wasting too much space and overhead
 * when we have to grab them all.
 */
#define	SFMMU_NUM_LOCK	128		/* must be power of two */
hatlock_t	hat_lock[SFMMU_NUM_LOCK];

/*
 * Hash algorithm optimized for a small number of slabs.
 *  7 is (highbit((sizeof sfmmu_t)) - 1)
 * This hash algorithm is based upon the knowledge that sfmmu_t's come from a
 * kmem_cache, and thus they will be sequential within that cache.  In
 * addition, each new slab will have a different "color" up to cache_maxcolor
 * which will skew the hashing for each successive slab which is allocated.
 * If the size of sfmmu_t changed to a larger size, this algorithm may need
 * to be revisited.
 */
#define	TSB_HASH_SHIFT_BITS (7)
#define	PTR_HASH(x) ((uintptr_t)x >> TSB_HASH_SHIFT_BITS)

#ifdef DEBUG
int tsb_hash_debug = 0;
#define	TSB_HASH(sfmmup)	\
	(tsb_hash_debug ? &hat_lock[0] : \
	&hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)])
#else	/* DEBUG */
#define	TSB_HASH(sfmmup)	&hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)]
#endif	/* DEBUG */
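
/*
 * A worked example of the hashing above, assuming sizeof (sfmmu_t) is a
 * little over 2^7 bytes so that highbit(sizeof (sfmmu_t)) - 1 == 7:
 * consecutive sfmmu_t's in a slab sit roughly 2^7 bytes apart, so
 * PTR_HASH() of neighboring sfmmu_t's differs by about one, and a slab's
 * worth of sfmmu_t's spreads across a contiguous run of hat_lock buckets.
 * The per-slab color offset then staggers where successive slabs' runs
 * begin.
 */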

/* sfmmu_replace_tsb() return codes. */
typedef enum tsb_replace_rc {
	TSB_SUCCESS,
	TSB_ALLOCFAIL,
	TSB_LOSTRACE,
	TSB_ALREADY_SWAPPED,
	TSB_CANTGROW
} tsb_replace_rc_t;

/*
 * Flags for TSB allocation routines.
 */
#define	TSB_ALLOC	0x01
#define	TSB_FORCEALLOC	0x02
#define	TSB_GROW	0x04
#define	TSB_SHRINK	0x08
#define	TSB_SWAPIN	0x10

/*
 * Support for HAT callbacks.
 */
#define	SFMMU_MAX_RELOC_CALLBACKS	10
int sfmmu_max_cb_id = SFMMU_MAX_RELOC_CALLBACKS;
static id_t sfmmu_cb_nextid = 0;
static id_t sfmmu_tsb_cb_id;
struct sfmmu_callback *sfmmu_cb_table;

kmutex_t	kpr_mutex;
kmutex_t	kpr_suspendlock;
kthread_t	*kreloc_thread;

/*
 * Enable VA->PA translation sanity checking on DEBUG kernels.
 * Disabled by default.  This is incompatible with some
 * drivers (error injector, RSM) so if it breaks you get
 * to keep both pieces.
 */
int hat_check_vtop = 0;

/*
 * Private sfmmu routines (prototypes)
 */
static struct hme_blk *sfmmu_shadow_hcreate(sfmmu_t *, caddr_t, int, uint_t);
static struct hme_blk *sfmmu_hblk_alloc(sfmmu_t *, caddr_t,
    struct hmehash_bucket *, uint_t, hmeblk_tag, uint_t, uint_t);
static caddr_t	sfmmu_hblk_unload(struct hat *, struct hme_blk *, caddr_t,
    caddr_t, demap_range_t *, uint_t);
static caddr_t	sfmmu_hblk_sync(struct hat *, struct hme_blk *, caddr_t,
    caddr_t, int);
static void	sfmmu_hblk_free(struct hme_blk **);
static void	sfmmu_hblks_list_purge(struct hme_blk **, int);
static uint_t	sfmmu_get_free_hblk(struct hme_blk **, uint_t);
static uint_t	sfmmu_put_free_hblk(struct hme_blk *, uint_t);
static struct hme_blk *sfmmu_hblk_steal(int);
static int	sfmmu_steal_this_hblk(struct hmehash_bucket *,
    struct hme_blk *, uint64_t, struct hme_blk *);
static caddr_t	sfmmu_hblk_unlock(struct hme_blk *, caddr_t, caddr_t);

static void	hat_do_memload_array(struct hat *, caddr_t, size_t,
    struct page **, uint_t, uint_t, uint_t);
static void	hat_do_memload(struct hat *, caddr_t, struct page *,
    uint_t, uint_t, uint_t);
static void	sfmmu_memload_batchsmall(struct hat *, caddr_t, page_t **,
    uint_t, uint_t, pgcnt_t, uint_t);
void		sfmmu_tteload(struct hat *, tte_t *, caddr_t, page_t *,
    uint_t);
static int	sfmmu_tteload_array(sfmmu_t *, tte_t *, caddr_t, page_t **,
    uint_t, uint_t);
static struct hmehash_bucket *sfmmu_tteload_acquire_hashbucket(sfmmu_t *,
    caddr_t, int, uint_t);
static struct hme_blk *sfmmu_tteload_find_hmeblk(sfmmu_t *,
    struct hmehash_bucket *, caddr_t, uint_t, uint_t, uint_t);
static int	sfmmu_tteload_addentry(sfmmu_t *, struct hme_blk *, tte_t *,
    caddr_t, page_t **, uint_t, uint_t);
static void	sfmmu_tteload_release_hashbucket(struct hmehash_bucket *);

static int	sfmmu_pagearray_setup(caddr_t, page_t **, tte_t *, int);
static pfn_t	sfmmu_uvatopfn(caddr_t, sfmmu_t *, tte_t *);
void		sfmmu_memtte(tte_t *, pfn_t, uint_t, int);
#ifdef VAC
static void	sfmmu_vac_conflict(struct hat *, caddr_t, page_t *);
static int	sfmmu_vacconflict_array(caddr_t, page_t *, int *);
int	tst_tnc(page_t *pp, pgcnt_t);
void	conv_tnc(page_t *pp, int);
#endif

static void	sfmmu_get_ctx(sfmmu_t *);
static void	sfmmu_free_sfmmu(sfmmu_t *);

static void	sfmmu_ttesync(struct hat *, caddr_t, tte_t *, page_t *);
static void	sfmmu_chgattr(struct hat *, caddr_t, size_t, uint_t, int);

cpuset_t	sfmmu_pageunload(page_t *, struct sf_hment *, int);
static void	hat_pagereload(struct page *, struct page *);
static cpuset_t	sfmmu_pagesync(page_t *, struct sf_hment *, uint_t);
#ifdef VAC
void	sfmmu_page_cache_array(page_t *, int, int, pgcnt_t);
static void	sfmmu_page_cache(page_t *, int, int, int);
#endif

cpuset_t	sfmmu_rgntlb_demap(caddr_t, sf_region_t *,
    struct hme_blk *, int);
static void	sfmmu_tlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *,
    pfn_t, int, int, int, int);
static void	sfmmu_ismtlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *,
    pfn_t, int);
static void	sfmmu_tlb_demap(caddr_t, sfmmu_t *, struct hme_blk *, int, int);
static void	sfmmu_tlb_range_demap(demap_range_t *);
static void	sfmmu_invalidate_ctx(sfmmu_t *);
static void	sfmmu_sync_mmustate(sfmmu_t *);

static void	sfmmu_tsbinfo_setup_phys(struct tsb_info *, pfn_t);
static int	sfmmu_tsbinfo_alloc(struct tsb_info **, int, int, uint_t,
    sfmmu_t *);
static void	sfmmu_tsb_free(struct tsb_info *);
static void	sfmmu_tsbinfo_free(struct tsb_info *);
static int	sfmmu_init_tsbinfo(struct tsb_info *, int, int, uint_t,
    sfmmu_t *);
static void	sfmmu_tsb_chk_reloc(sfmmu_t *, hatlock_t *);
static void	sfmmu_tsb_swapin(sfmmu_t *, hatlock_t *);
static int	sfmmu_select_tsb_szc(pgcnt_t);
static void	sfmmu_mod_tsb(sfmmu_t *, caddr_t, tte_t *, int);
#define	sfmmu_load_tsb(sfmmup, vaddr, tte, szc)	\
	sfmmu_mod_tsb(sfmmup, vaddr, tte, szc)
#define	sfmmu_unload_tsb(sfmmup, vaddr, szc)	\
	sfmmu_mod_tsb(sfmmup, vaddr, NULL, szc)
static void	sfmmu_copy_tsb(struct tsb_info *, struct tsb_info *);
static tsb_replace_rc_t sfmmu_replace_tsb(sfmmu_t *, struct tsb_info *, uint_t,
    hatlock_t *, uint_t);
static void	sfmmu_size_tsb(sfmmu_t *, int, uint64_t, uint64_t, int);

#ifdef VAC
void	sfmmu_cache_flush(pfn_t, int);
void	sfmmu_cache_flushcolor(int, pfn_t);
#endif
static caddr_t	sfmmu_hblk_chgattr(sfmmu_t *, struct hme_blk *, caddr_t,
    caddr_t, demap_range_t *, uint_t, int);

static uint64_t	sfmmu_vtop_attr(uint_t, int mode, tte_t *);
static uint_t	sfmmu_ptov_attr(tte_t *);
static caddr_t	sfmmu_hblk_chgprot(sfmmu_t *, struct hme_blk *, caddr_t,
    caddr_t, demap_range_t *, uint_t);
static uint_t	sfmmu_vtop_prot(uint_t, uint_t *);
static int	sfmmu_idcache_constructor(void *, void *, int);
static void	sfmmu_idcache_destructor(void *, void *);
static int	sfmmu_hblkcache_constructor(void *, void *, int);
static void	sfmmu_hblkcache_destructor(void *, void *);
static void	sfmmu_hblkcache_reclaim(void *);
static void	sfmmu_shadow_hcleanup(sfmmu_t *, struct hme_blk *,
    struct hmehash_bucket *);
static void	sfmmu_hblk_hash_rm(struct hmehash_bucket *, struct hme_blk *,
    struct hme_blk *, struct hme_blk **, int);
static void	sfmmu_hblk_hash_add(struct hmehash_bucket *, struct hme_blk *,
    uint64_t);
static struct hme_blk *sfmmu_check_pending_hblks(int);
static void	sfmmu_free_hblks(sfmmu_t *, caddr_t, caddr_t, int);
static void	sfmmu_cleanup_rhblk(sf_srd_t *, caddr_t, uint_t, int);
static void	sfmmu_unload_hmeregion_va(sf_srd_t *, uint_t, caddr_t, caddr_t,
    int, caddr_t *);
static void	sfmmu_unload_hmeregion(sf_srd_t *, sf_region_t *);

static void	sfmmu_rm_large_mappings(page_t *, int);

static void	hat_lock_init(void);
static void	hat_kstat_init(void);
static int	sfmmu_kstat_percpu_update(kstat_t *ksp, int rw);
static void	sfmmu_set_scd_rttecnt(sf_srd_t *, sf_scd_t *);
static int	sfmmu_is_rgnva(sf_srd_t *, caddr_t, ulong_t, ulong_t);
static void	sfmmu_check_page_sizes(sfmmu_t *, int);
int	fnd_mapping_sz(page_t *);
static void	iment_add(struct ism_ment *, struct hat *);
static void	iment_sub(struct ism_ment *, struct hat *);
static pgcnt_t	ism_tsb_entries(sfmmu_t *, int szc);
extern void	sfmmu_setup_tsbinfo(sfmmu_t *);
extern void	sfmmu_clear_utsbinfo(void);

static void	sfmmu_ctx_wrap_around(mmu_ctx_t *, boolean_t);

extern int vpm_enable;

/* kpm globals */
#ifdef	DEBUG
/*
 * Enable trap level tsbmiss handling
 */
int	kpm_tsbmtl = 1;

/*
 * Flush the TLB on kpm mapout.  Note: Xcalls are used (again) for the
 * required TLB shootdowns in this case, so handle w/ care.  Off by default.
 */
int	kpm_tlb_flush;
#endif	/* DEBUG */

static void *sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *, size_t, int);

#ifdef DEBUG
static void sfmmu_check_hblk_flist();
#endif

/*
 * Semi-private sfmmu data structures.  Some of them are initialized in
 * startup or in hat_init.  Some of them are private but accessed by
 * assembly code or mach_sfmmu.c
 */
struct hmehash_bucket *uhme_hash;	/* user hmeblk hash table */
struct hmehash_bucket *khme_hash;	/* kernel hmeblk hash table */
uint64_t	uhme_hash_pa;		/* PA of uhme_hash */
uint64_t	khme_hash_pa;		/* PA of khme_hash */
int		uhmehash_num;		/* # of buckets in user hash table */
int		khmehash_num;		/* # of buckets in kernel hash table */

uint_t		max_mmu_ctxdoms = 0;	/* max context domains in the system */
mmu_ctx_t	**mmu_ctxs_tbl;		/* global array of context domains */
uint64_t	mmu_saved_gnum = 0;	/* to init incoming MMUs' gnums */

#define	DEFAULT_NUM_CTXS_PER_MMU 8192
static uint_t	nctxs = DEFAULT_NUM_CTXS_PER_MMU;

int		cache;			/* describes system cache */

caddr_t		ktsb_base;	/* kernel 8k-indexed tsb base address */
uint64_t	ktsb_pbase;	/* kernel 8k-indexed tsb phys address */
int		ktsb_szcode;	/* kernel 8k-indexed tsb size code */
int		ktsb_sz;	/* kernel 8k-indexed tsb size */

caddr_t		ktsb4m_base;	/* kernel 4m-indexed tsb base address */
uint64_t	ktsb4m_pbase;	/* kernel 4m-indexed tsb phys address */
int		ktsb4m_szcode;	/* kernel 4m-indexed tsb size code */
int		ktsb4m_sz;	/* kernel 4m-indexed tsb size */

uint64_t	kpm_tsbbase;	/* kernel seg_kpm 4M TSB base address */
int		kpm_tsbsz;	/* kernel seg_kpm 4M TSB size code */
uint64_t	kpmsm_tsbbase;	/* kernel seg_kpm 8K TSB base address */
int		kpmsm_tsbsz;	/* kernel seg_kpm 8K TSB size code */

#ifndef sun4v
int		utsb_dtlb_ttenum = -1;	/* index in TLB for utsb locked TTE */
int		utsb4m_dtlb_ttenum = -1; /* index in TLB for 4M TSB TTE */
int		dtlb_resv_ttenum;	/* index in TLB of first reserved TTE */
caddr_t		utsb_vabase;		/* reserved kernel virtual memory */
caddr_t		utsb4m_vabase;		/* for trap handler TSB accesses */
#endif /* sun4v */
uint64_t	tsb_alloc_bytes = 0;	/* bytes allocated to TSBs */
vmem_t		*kmem_tsb_default_arena[NLGRPS_MAX];	/* For dynamic TSBs */
vmem_t		*kmem_bigtsb_default_arena[NLGRPS_MAX]; /* dynamic 256M TSBs */

/*
 * Size to use for TSB slabs.  Future platforms that support page sizes
 * larger than 4M may wish to change these values, and provide their own
 * assembly macros for building and decoding the TSB base register contents.
 * Note disable_large_pages will override the value set here.
 */
static	uint_t	tsb_slab_ttesz = TTE4M;
size_t	tsb_slab_size = MMU_PAGESIZE4M;
uint_t	tsb_slab_shift = MMU_PAGESHIFT4M;
/* PFN mask for TTE */
size_t	tsb_slab_mask = MMU_PAGEOFFSET4M >> MMU_PAGESHIFT;

/*
 * Size to use for TSB slabs.  These are used only when 256M tsb arenas
 * exist.
 */
static uint_t	bigtsb_slab_ttesz = TTE256M;
static size_t	bigtsb_slab_size = MMU_PAGESIZE256M;
static uint_t	bigtsb_slab_shift = MMU_PAGESHIFT256M;
/* 256M page alignment for 8K pfn */
static size_t	bigtsb_slab_mask = MMU_PAGEOFFSET256M >> MMU_PAGESHIFT;

/* largest TSB size to grow to, will be smaller on smaller memory systems */
static int	tsb_max_growsize = 0;
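
/*
 * For illustration, with the 4M defaults above (MMU_PAGESHIFT == 13,
 * MMU_PAGESHIFT4M == 22):
 *
 *	tsb_slab_size == 1 << 22 == 4MB
 *	tsb_slab_mask == (4MB - 1) >> 13 == 0x1ff
 *
 * i.e. the mask covers the 512 8K pfns that make up one 4M slab, which
 * is what the TSB base register building macros rely on.
 */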

/*
 * Tunable parameters dealing with TSB policies.
 */

/*
 * This undocumented tunable forces all 8K TSBs to be allocated from
 * the kernel heap rather than from the kmem_tsb_default_arena arenas.
 */
#ifdef	DEBUG
int	tsb_forceheap = 0;
#endif	/* DEBUG */

/*
 * Decide whether to use per-lgroup arenas, or one global set of
 * TSB arenas.  The default is not to break up per-lgroup, since
 * most platforms don't recognize any tangible benefit from it.
 */
int	tsb_lgrp_affinity = 0;

/*
 * Used for growing the TSB based on the process RSS.
 * tsb_rss_factor is based on the smallest TSB, and is
 * shifted by the TSB size to determine if we need to grow.
 * The default will grow the TSB if the number of TTEs for
 * this page size exceeds 75% of the number of TSB entries,
 * which should _almost_ eliminate all conflict misses
 * (at the expense of using up lots and lots of memory).
 */
#define	TSB_RSS_FACTOR		(TSB_ENTRIES(TSB_MIN_SZCODE) * 0.75)
#define	SFMMU_RSS_TSBSIZE(tsbszc)	(tsb_rss_factor << tsbszc)
#define	SELECT_TSB_SIZECODE(pgcnt) ( \
	(enable_tsb_rss_sizing)? sfmmu_select_tsb_szc(pgcnt) : \
	default_tsb_size)
#define	TSB_OK_SHRINK()	\
	(tsb_alloc_bytes > tsb_alloc_hiwater || freemem < desfree)
#define	TSB_OK_GROW()	\
	(tsb_alloc_bytes < tsb_alloc_hiwater && freemem > desfree)

int	enable_tsb_rss_sizing = 1;
int	tsb_rss_factor	= (int)TSB_RSS_FACTOR;

/* which TSB size code to use for new address spaces or if rss sizing off */
int default_tsb_size = TSB_8K_SZCODE;

static uint64_t tsb_alloc_hiwater; /* limit TSB reserved memory */
uint64_t tsb_alloc_hiwater_factor; /* tsb_alloc_hiwater = physmem / this */
#define	TSB_ALLOC_HIWATER_FACTOR_DEFAULT	32

#ifdef DEBUG
static int tsb_random_size = 0;	/* set to 1 to test random tsb sizes on alloc */
static int tsb_grow_stress = 0;	/* if set to 1, keep replacing TSB w/ random */
static int tsb_alloc_mtbf = 0;	/* fail allocation every n attempts */
static int tsb_alloc_fail_mtbf = 0;
static int tsb_alloc_count = 0;
#endif /* DEBUG */

/* if set to 1, will remap valid TTEs when growing TSB. */
int tsb_remap_ttes = 1;

/*
 * If we have more than this many mappings, allocate a second TSB.
 * This default is chosen because the I/D fully associative TLBs are
 * assumed to have at least 8 available entries.  Platforms with a
 * larger fully-associative TLB could probably override the default.
 */

#ifdef sun4v
int tsb_sectsb_threshold = 0;
#else
int tsb_sectsb_threshold = 8;
#endif
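
/*
 * A worked example of the RSS sizing policy above, assuming the usual
 * 16-byte TSB entries so that the smallest (8K) TSB holds 512 entries:
 *
 *	tsb_rss_factor == 512 * 0.75 == 384
 *	SFMMU_RSS_TSBSIZE(0) == 384	(8K TSB)
 *	SFMMU_RSS_TSBSIZE(1) == 768	(16K TSB)
 *
 * so a process whose resident page count for a given page size passes
 * 384 becomes a candidate for the next TSB size code, subject to
 * TSB_OK_GROW() above.
 */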

/*
 * kstat data
 */
struct sfmmu_global_stat sfmmu_global_stat;
struct sfmmu_tsbsize_stat sfmmu_tsbsize_stat;

/*
 * Global data
 */
sfmmu_t		*ksfmmup;		/* kernel's hat id */

#ifdef DEBUG
static void	chk_tte(tte_t *, tte_t *, tte_t *, struct hme_blk *);
#endif

/* sfmmu locking operations */
static kmutex_t *sfmmu_mlspl_enter(struct page *, int);
static int	sfmmu_mlspl_held(struct page *, int);

kmutex_t *sfmmu_page_enter(page_t *);
void	sfmmu_page_exit(kmutex_t *);
int	sfmmu_page_spl_held(struct page *);

/* sfmmu internal locking operations - accessed directly */
static void	sfmmu_mlist_reloc_enter(page_t *, page_t *,
    kmutex_t **, kmutex_t **);
static void	sfmmu_mlist_reloc_exit(kmutex_t *, kmutex_t *);
static hatlock_t *sfmmu_hat_enter(sfmmu_t *);
static hatlock_t *sfmmu_hat_tryenter(sfmmu_t *);
static void	sfmmu_hat_exit(hatlock_t *);
static void	sfmmu_hat_lock_all(void);
static void	sfmmu_hat_unlock_all(void);
static void	sfmmu_ismhat_enter(sfmmu_t *, int);
static void	sfmmu_ismhat_exit(sfmmu_t *, int);

kpm_hlk_t	*kpmp_table;
uint_t		kpmp_table_sz;	/* must be a power of 2 */
uchar_t		kpmp_shift;

kpm_shlk_t	*kpmp_stable;
uint_t		kpmp_stable_sz;	/* must be a power of 2 */

/*
 * SPL_TABLE_SIZE is 2 * NCPU, but no smaller than 128.
 * SPL_SHIFT is log2(SPL_TABLE_SIZE).
 */
#if ((2*NCPU_P2) > 128)
#define	SPL_SHIFT	((unsigned)(NCPU_LOG2 + 1))
#else
#define	SPL_SHIFT	7U
#endif
#define	SPL_TABLE_SIZE	(1U << SPL_SHIFT)
#define	SPL_MASK	(SPL_TABLE_SIZE - 1)

/*
 * We shift by PP_SHIFT to take care of the low-order 0 bits of a page_t
 * and by multiples of SPL_SHIFT to get as many varied bits as we can.
 */
#define	SPL_INDEX(pp) \
	((((uintptr_t)(pp) >> PP_SHIFT) ^ \
	((uintptr_t)(pp) >> (PP_SHIFT + SPL_SHIFT)) ^ \
	((uintptr_t)(pp) >> (PP_SHIFT + SPL_SHIFT * 2)) ^ \
	((uintptr_t)(pp) >> (PP_SHIFT + SPL_SHIFT * 3))) & \
	SPL_MASK)

#define	SPL_HASH(pp)	\
	(&sfmmu_page_lock[SPL_INDEX(pp)].pad_mutex)

static	pad_mutex_t	sfmmu_page_lock[SPL_TABLE_SIZE];

/* Array of mutexes protecting a page's mapping list and p_nrm field. */

#define	MML_TABLE_SIZE	SPL_TABLE_SIZE
#define	MLIST_HASH(pp)	(&mml_table[SPL_INDEX(pp)].pad_mutex)

static pad_mutex_t	mml_table[MML_TABLE_SIZE];

/*
 * hat_unload_callback() will group together callbacks in order
 * to avoid xt_sync() calls.  This is the maximum size of the group.
 */
#define	MAX_CB_ADDR	32

tte_t	hw_tte;
static ulong_t sfmmu_dmr_maxbit = DMR_MAXBIT;

static char	*mmu_ctx_kstat_names[] = {
	"mmu_ctx_tsb_exceptions",
	"mmu_ctx_tsb_raise_exception",
	"mmu_ctx_wrap_around",
};

/*
 * Wrapper for vmem_xalloc since vmem_create only allows limited
 * parameters for vm_source_alloc functions.  This function allows us
 * to specify alignment consistent with the size of the object being
 * allocated.
 */
static void *
sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *vmp, size_t size, int vmflag)
{
	return (vmem_xalloc(vmp, size, size, 0, 0, NULL, NULL, vmflag));
}
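
/*
 * Because the align argument equals size, allocations through this
 * wrapper are naturally aligned: a tsb_slab_size slab comes back
 * tsb_slab_size aligned, so each slab can be covered by exactly one
 * translation entry (see the kmem_tsb_arena comment in hat_init()).
 * A minimal usage sketch, mirroring the vmem_create() calls below:
 *
 *	arena = vmem_create("kmem_tsb", NULL, 0, tsb_slab_size,
 *	    sfmmu_vmem_xalloc_aligned_wrapper, vmem_xfree, heap_arena,
 *	    0, VM_SLEEP);
 */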

/* Common code for setting tsb_alloc_hiwater. */
#define	SFMMU_SET_TSB_ALLOC_HIWATER(pages)	tsb_alloc_hiwater = \
	ptob(pages) / tsb_alloc_hiwater_factor

/*
 * Set tsb_max_growsize to allow at most all of physical memory to be mapped by
 * a single TSB.  physmem is the number of physical pages so we need physmem 8K
 * TTEs to represent all those physical pages.  We round this up by using
 * 1<<highbit().  To figure out which size code to use, remember that the size
 * code is just an amount to shift the smallest TSB size to get the size of
 * this TSB.  So we subtract that size, TSB_START_SIZE, from highbit() (or
 * highbit() - 1) to get the size code for the smallest TSB that can represent
 * all of physical memory, while erring on the side of too much.
 *
 * Restrict tsb_max_growsize to make sure that:
 *	1) TSBs can't grow larger than the TSB slab size
 *	2) TSBs can't grow larger than UTSB_MAX_SZCODE.
 */
#define	SFMMU_SET_TSB_MAX_GROWSIZE(pages) {				\
	int	_i, _szc, _slabszc, _tsbszc;				\
									\
	_i = highbit(pages);						\
	if ((1 << (_i - 1)) == (pages))					\
		_i--;		/* 2^n case, round down */		\
	_szc = _i - TSB_START_SIZE;					\
	_slabszc = bigtsb_slab_shift - (TSB_START_SIZE + TSB_ENTRY_SHIFT); \
	_tsbszc = MIN(_szc, _slabszc);					\
	tsb_max_growsize = MIN(_tsbszc, UTSB_MAX_SZCODE);		\
}
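
/*
 * A worked example of the hiwater setting, assuming the default factor
 * of 32 and 8K pages: on a 4GB machine physmem is 524288 pages, so
 *
 *	tsb_alloc_hiwater = ptob(524288) / 32 = 4GB / 32 = 128MB
 *
 * i.e. at most 1/32 of physical memory may be tied up in TSB slabs
 * before TSB_OK_GROW() starts failing and TSB_OK_SHRINK() succeeding.
 */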

/*
 * Given a pointer to an sfmmu and a TTE size code, return a pointer to the
 * tsb_info which handles that TTE size.
 */
#define	SFMMU_GET_TSBINFO(tsbinfop, sfmmup, tte_szc) {		\
	(tsbinfop) = (sfmmup)->sfmmu_tsb;			\
	ASSERT(((tsbinfop)->tsb_flags & TSB_SHAREDCTX) ||	\
	    sfmmu_hat_lock_held(sfmmup));			\
	if ((tte_szc) >= TTE4M)	{				\
		ASSERT((tsbinfop) != NULL);			\
		(tsbinfop) = (tsbinfop)->tsb_next;		\
	}							\
}

/*
 * Macro to use to unload entries from the TSB.
 * It has knowledge of which page sizes get replicated in the TSB
 * and will call the appropriate unload routine for the appropriate size.
 */
#define	SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, ismhat)			\
{									\
	int ttesz = get_hblk_ttesz(hmeblkp);				\
	if (ttesz == TTE8K || ttesz == TTE4M) {				\
		sfmmu_unload_tsb(sfmmup, addr, ttesz);			\
	} else {							\
		caddr_t sva = ismhat ? addr :				\
		    (caddr_t)get_hblk_base(hmeblkp);			\
		caddr_t eva = sva + get_hblk_span(hmeblkp);		\
		ASSERT(addr >= sva && addr < eva);			\
		sfmmu_unload_tsb_range(sfmmup, sva, eva, ttesz);	\
	}								\
}


/* Update tsb_alloc_hiwater after memory is configured. */
/*ARGSUSED*/
static void
sfmmu_update_post_add(void *arg, pgcnt_t delta_pages)
{
	/* Assumes physmem has already been updated. */
	SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
	SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
}

/*
 * Update tsb_alloc_hiwater before memory is deleted.  We'll do nothing here
 * and update tsb_alloc_hiwater and tsb_max_growsize after the memory is
 * deleted.
 */
/*ARGSUSED*/
static int
sfmmu_update_pre_del(void *arg, pgcnt_t delta_pages)
{
	return (0);
}

/* Update tsb_alloc_hiwater after memory fails to be unconfigured. */
/*ARGSUSED*/
static void
sfmmu_update_post_del(void *arg, pgcnt_t delta_pages, int cancelled)
{
	/*
	 * Whether the delete was cancelled or not, just go ahead and update
	 * tsb_alloc_hiwater and tsb_max_growsize.
	 */
	SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
	SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
}

static kphysm_setup_vector_t sfmmu_update_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,	/* version */
	sfmmu_update_post_add,		/* post_add */
	sfmmu_update_pre_del,		/* pre_del */
	sfmmu_update_post_del		/* post_del */
};


/*
 * HME_BLK HASH PRIMITIVES
 */

/*
 * Enter a hme on the mapping list for page pp.
 * When large pages are more prevalent in the system, we might want to
 * keep the mapping list in ascending order by the hment size.  For now,
 * small pages are more frequent, so don't slow it down.
 */
#define	HME_ADD(hme, pp)					\
{								\
	ASSERT(sfmmu_mlist_held(pp));				\
								\
	hme->hme_prev = NULL;					\
	hme->hme_next = pp->p_mapping;				\
	hme->hme_page = pp;					\
	if (pp->p_mapping) {					\
		((struct sf_hment *)(pp->p_mapping))->hme_prev = hme;\
		ASSERT(pp->p_share > 0);			\
	} else  {						\
		/* EMPTY */					\
		ASSERT(pp->p_share == 0);			\
	}							\
	pp->p_mapping = hme;					\
	pp->p_share++;						\
}

/*
 * Remove a hme from the mapping list for page pp.
 * If we are unmapping a large translation, we need to make sure that the
 * change is reflected in the corresponding bit of the p_index field.
 */
#define	HME_SUB(hme, pp)					\
{								\
	ASSERT(sfmmu_mlist_held(pp));				\
	ASSERT(hme->hme_page == pp || IS_PAHME(hme));		\
								\
	if (pp->p_mapping == NULL) {				\
		panic("hme_remove - no mappings");		\
	}							\
								\
	membar_stst();	/* ensure previous stores finish */	\
								\
	ASSERT(pp->p_share > 0);				\
	pp->p_share--;						\
								\
	if (hme->hme_prev) {					\
		ASSERT(pp->p_mapping != hme);			\
		ASSERT(hme->hme_prev->hme_page == pp ||		\
		    IS_PAHME(hme->hme_prev));			\
		hme->hme_prev->hme_next = hme->hme_next;	\
	} else {						\
		ASSERT(pp->p_mapping == hme);			\
		pp->p_mapping = hme->hme_next;			\
		ASSERT((pp->p_mapping == NULL) ?		\
		    (pp->p_share == 0) : 1);			\
	}							\
								\
	if (hme->hme_next) {					\
		ASSERT(hme->hme_next->hme_page == pp ||		\
		    IS_PAHME(hme->hme_next));			\
		hme->hme_next->hme_prev = hme->hme_prev;	\
	}							\
								\
	/* zero out the entry */				\
	hme->hme_next = NULL;					\
	hme->hme_prev = NULL;					\
	hme->hme_page = NULL;					\
								\
	if (hme_size(hme) > TTE8K) {				\
		/* remove mappings for remainder of large pg */	\
		sfmmu_rm_large_mappings(pp, hme_size(hme));	\
	}							\
}

/*
 * This function returns the hment given the hme_blk and a vaddr.
 * It assumes addr has already been checked to belong to hme_blk's
 * range.
 */
#define	HBLKTOHME(hment, hmeblkp, addr)					\
{									\
	int index;							\
	HBLKTOHME_IDX(hment, hmeblkp, addr, index)			\
}

/*
 * Version of HBLKTOHME that also returns the index in hmeblkp
 * of the hment.
 */
#define	HBLKTOHME_IDX(hment, hmeblkp, addr, idx)			\
{									\
	ASSERT(in_hblk_range((hmeblkp), (addr)));			\
									\
	if (get_hblk_ttesz(hmeblkp) == TTE8K) {				\
		idx = (((uintptr_t)(addr) >> MMU_PAGESHIFT) & (NHMENTS-1)); \
	} else								\
		idx = 0;						\
									\
	(hment) = &(hmeblkp)->hblk_hme[idx];				\
}
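
/*
 * For illustration, assuming NHMENTS == 8: an 8K hme_blk spans 8
 * consecutive 8K pages, so for an addr 3 pages into the block
 *
 *	idx == ((uintptr_t)addr >> MMU_PAGESHIFT) & (NHMENTS - 1) == 3
 *
 * and hment points at hblk_hme[3].  Large-page blocks hold a single
 * hment, hence idx == 0 for anything bigger than TTE8K.
 */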

/*
 * Disable any page sizes not supported by the CPU
 */
void
hat_init_pagesizes()
{
	int 		i;

	mmu_exported_page_sizes = 0;
	for (i = TTE8K; i < max_mmu_page_sizes; i++) {

		szc_2_userszc[i] = (uint_t)-1;
		userszc_2_szc[i] = (uint_t)-1;

		if ((mmu_exported_pagesize_mask & (1 << i)) == 0) {
			disable_large_pages |= (1 << i);
		} else {
			szc_2_userszc[i] = mmu_exported_page_sizes;
			userszc_2_szc[mmu_exported_page_sizes] = i;
			mmu_exported_page_sizes++;
		}
	}

	disable_ism_large_pages |= disable_large_pages;
	disable_auto_data_large_pages = disable_large_pages;
	disable_auto_text_large_pages = disable_large_pages;

	/*
	 * Initialize mmu-specific large page sizes.
	 */
	if (&mmu_large_pages_disabled) {
		disable_large_pages |= mmu_large_pages_disabled(HAT_LOAD);
		disable_ism_large_pages |=
		    mmu_large_pages_disabled(HAT_LOAD_SHARE);
		disable_auto_data_large_pages |=
		    mmu_large_pages_disabled(HAT_AUTO_DATA);
		disable_auto_text_large_pages |=
		    mmu_large_pages_disabled(HAT_AUTO_TEXT);
	}
}
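
/*
 * For illustration, on a hypothetical CPU that exports 8K, 64K and 4M but
 * not 512K (mmu_exported_pagesize_mask == 0xb), the loop above yields
 *
 *	szc_2_userszc[] = { 0, 1, -1, 2 }
 *	userszc_2_szc[] = { TTE8K, TTE64K, TTE4M }
 *
 * so the user-visible page size indices stay dense while the unsupported
 * internal size code (TTE512K) is folded into disable_large_pages.
 */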

/*
 * Initialize the hardware address translation structures.
 */
void
hat_init(void)
{
	int 		i;
	uint_t		sz;
	size_t		size;

	hat_lock_init();
	hat_kstat_init();

	/*
	 * Hardware-only bits in a TTE
	 */
	MAKE_TTE_MASK(&hw_tte);

	hat_init_pagesizes();

	/* Initialize the hash locks */
	for (i = 0; i < khmehash_num; i++) {
		mutex_init(&khme_hash[i].hmehash_mutex, NULL,
		    MUTEX_DEFAULT, NULL);
		khme_hash[i].hmeh_nextpa = HMEBLK_ENDPA;
	}
	for (i = 0; i < uhmehash_num; i++) {
		mutex_init(&uhme_hash[i].hmehash_mutex, NULL,
		    MUTEX_DEFAULT, NULL);
		uhme_hash[i].hmeh_nextpa = HMEBLK_ENDPA;
	}
	khmehash_num--;		/* make sure counter starts from 0 */
	uhmehash_num--;		/* make sure counter starts from 0 */

	/*
	 * Allocate context domain structures.
	 *
	 * A platform may choose to modify max_mmu_ctxdoms in
	 * set_platform_defaults().  If a platform does not define
	 * a set_platform_defaults() or does not choose to modify
	 * max_mmu_ctxdoms, it gets one MMU context domain for every CPU.
	 *
	 * For all platforms that have CPUs sharing MMUs, this
	 * value must be defined.
	 */
	if (max_mmu_ctxdoms == 0)
		max_mmu_ctxdoms = max_ncpus;

	size = max_mmu_ctxdoms * sizeof (mmu_ctx_t *);
	mmu_ctxs_tbl = kmem_zalloc(size, KM_SLEEP);

	/* mmu_ctx_t is 64 bytes aligned */
	mmuctxdom_cache = kmem_cache_create("mmuctxdom_cache",
	    sizeof (mmu_ctx_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
	/*
	 * MMU context domain initialization for the Boot CPU.
	 * This needs the context domains array allocated above.
	 */
	mutex_enter(&cpu_lock);
	sfmmu_cpu_init(CPU);
	mutex_exit(&cpu_lock);

	/*
	 * Initialize the ism mapping list lock.
	 */

	mutex_init(&ism_mlist_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Each sfmmu structure carries an array of MMU context info
	 * structures, one per context domain.  The size of this array depends
	 * on the maximum number of context domains.  So, the size of the
	 * sfmmu structure varies per platform.
	 *
	 * sfmmu is allocated from static arena, because trap
	 * handler at TL > 0 is not allowed to touch kernel relocatable
	 * memory.  sfmmu's alignment is changed to 64 bytes from
	 * default 8 bytes, as the lower 6 bits will be used to pass
	 * pgcnt to vtag_flush_pgcnt_tl1.
	 */
	size = sizeof (sfmmu_t) + sizeof (sfmmu_ctx_t) * (max_mmu_ctxdoms - 1);

	sfmmuid_cache = kmem_cache_create("sfmmuid_cache", size,
	    64, sfmmu_idcache_constructor, sfmmu_idcache_destructor,
	    NULL, NULL, static_arena, 0);

	sfmmu_tsbinfo_cache = kmem_cache_create("sfmmu_tsbinfo_cache",
	    sizeof (struct tsb_info), 0, NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * Since we only use the tsb8k cache to "borrow" pages for TSBs
	 * from the heap when low on memory or when TSB_FORCEALLOC is
	 * specified, don't use magazines to cache them--we want to return
	 * them to the system as quickly as possible.
	 */
	sfmmu_tsb8k_cache = kmem_cache_create("sfmmu_tsb8k_cache",
	    MMU_PAGESIZE, MMU_PAGESIZE, NULL, NULL, NULL, NULL,
	    static_arena, KMC_NOMAGAZINE);

	/*
	 * Set tsb_alloc_hiwater to 1/tsb_alloc_hiwater_factor of physical
	 * memory, which corresponds to the old static reserve for TSBs.
	 * tsb_alloc_hiwater_factor defaults to 32.  This caps the amount of
	 * memory we'll allocate for TSB slabs; beyond this point TSB
	 * allocations will be taken from the kernel heap (via
	 * sfmmu_tsb8k_cache) and will be throttled as would any other kmem
	 * consumer.
	 */
	if (tsb_alloc_hiwater_factor == 0) {
		tsb_alloc_hiwater_factor = TSB_ALLOC_HIWATER_FACTOR_DEFAULT;
	}
	SFMMU_SET_TSB_ALLOC_HIWATER(physmem);

	for (sz = tsb_slab_ttesz; sz > 0; sz--) {
		if (!(disable_large_pages & (1 << sz)))
			break;
	}

	if (sz < tsb_slab_ttesz) {
		tsb_slab_ttesz = sz;
		tsb_slab_shift = MMU_PAGESHIFT + (sz << 1) + sz;
		tsb_slab_size = 1 << tsb_slab_shift;
		tsb_slab_mask = (1 << (tsb_slab_shift - MMU_PAGESHIFT)) - 1;
		use_bigtsb_arena = 0;
	} else if (use_bigtsb_arena &&
	    (disable_large_pages & (1 << bigtsb_slab_ttesz))) {
		use_bigtsb_arena = 0;
	}

	if (!use_bigtsb_arena) {
		bigtsb_slab_shift = tsb_slab_shift;
	}
	SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
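
	/*
	 * Note the shift arithmetic above: consecutive TTE size codes are
	 * 3 bits of page size apart, so MMU_PAGESHIFT + (sz << 1) + sz is
	 * just MMU_PAGESHIFT + 3 * sz.  E.g. if 4M pages were disabled and
	 * the loop settled on sz == TTE512K (2), the slab shift would be
	 * 13 + 6 == 19, i.e. 512K slabs.
	 */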

	/*
	 * On smaller memory systems, allocate TSB memory in smaller chunks
	 * than the default 4M slab size.  We also honor disable_large_pages
	 * here.
	 *
	 * The trap handlers need to be patched with the final slab shift,
	 * since they need to be able to construct the TSB pointer at runtime.
	 */
	if ((tsb_max_growsize <= TSB_512K_SZCODE) &&
	    !(disable_large_pages & (1 << TTE512K))) {
		tsb_slab_ttesz = TTE512K;
		tsb_slab_shift = MMU_PAGESHIFT512K;
		tsb_slab_size = MMU_PAGESIZE512K;
		tsb_slab_mask = MMU_PAGEOFFSET512K >> MMU_PAGESHIFT;
		use_bigtsb_arena = 0;
	}

	if (!use_bigtsb_arena) {
		bigtsb_slab_ttesz = tsb_slab_ttesz;
		bigtsb_slab_shift = tsb_slab_shift;
		bigtsb_slab_size = tsb_slab_size;
		bigtsb_slab_mask = tsb_slab_mask;
	}

	/*
	 * Set up memory callback to update tsb_alloc_hiwater and
	 * tsb_max_growsize.
	 */
	i = kphysm_setup_func_register(&sfmmu_update_vec, (void *) 0);
	ASSERT(i == 0);

	/*
	 * kmem_tsb_arena is the source from which large TSB slabs are
	 * drawn.  The quantum of this arena corresponds to the largest
	 * TSB size we can dynamically allocate for user processes.
	 * Currently it must also be a supported page size since we
	 * use exactly one translation entry to map each slab page.
	 *
	 * The per-lgroup kmem_tsb_default_arena arenas are the arenas from
	 * which most TSBs are allocated.  Since most TSB allocations are
	 * typically 8K we have a kmem cache we stack on top of each
	 * kmem_tsb_default_arena to speed up those allocations.
	 *
	 * Note the two-level scheme of arenas is required only
	 * because vmem_create doesn't allow us to specify alignment
	 * requirements.  If this ever changes the code could be
	 * simplified to use only one level of arenas.
	 *
	 * If 256M page support exists on sun4v, 256MB kmem_bigtsb_arena
	 * will be provided in addition to the 4M kmem_tsb_arena.
	 */
	if (use_bigtsb_arena) {
		kmem_bigtsb_arena = vmem_create("kmem_bigtsb", NULL, 0,
		    bigtsb_slab_size, sfmmu_vmem_xalloc_aligned_wrapper,
		    vmem_xfree, heap_arena, 0, VM_SLEEP);
	}

	kmem_tsb_arena = vmem_create("kmem_tsb", NULL, 0, tsb_slab_size,
	    sfmmu_vmem_xalloc_aligned_wrapper,
	    vmem_xfree, heap_arena, 0, VM_SLEEP);

	if (tsb_lgrp_affinity) {
		char s[50];
		for (i = 0; i < NLGRPS_MAX; i++) {
			if (use_bigtsb_arena) {
				(void) sprintf(s, "kmem_bigtsb_lgrp%d", i);
				kmem_bigtsb_default_arena[i] = vmem_create(s,
				    NULL, 0, 2 * tsb_slab_size,
				    sfmmu_tsb_segkmem_alloc,
				    sfmmu_tsb_segkmem_free, kmem_bigtsb_arena,
				    0, VM_SLEEP | VM_BESTFIT);
			}

			(void) sprintf(s, "kmem_tsb_lgrp%d", i);
			kmem_tsb_default_arena[i] = vmem_create(s,
			    NULL, 0, PAGESIZE, sfmmu_tsb_segkmem_alloc,
			    sfmmu_tsb_segkmem_free, kmem_tsb_arena, 0,
			    VM_SLEEP | VM_BESTFIT);

			(void) sprintf(s, "sfmmu_tsb_lgrp%d_cache", i);
			sfmmu_tsb_cache[i] = kmem_cache_create(s,
			    PAGESIZE, PAGESIZE, NULL, NULL, NULL, NULL,
			    kmem_tsb_default_arena[i], 0);
		}
	} else {
		if (use_bigtsb_arena) {
			kmem_bigtsb_default_arena[0] =
			    vmem_create("kmem_bigtsb_default", NULL, 0,
			    2 * tsb_slab_size, sfmmu_tsb_segkmem_alloc,
			    sfmmu_tsb_segkmem_free, kmem_bigtsb_arena, 0,
			    VM_SLEEP | VM_BESTFIT);
		}

		kmem_tsb_default_arena[0] = vmem_create("kmem_tsb_default",
		    NULL, 0, PAGESIZE, sfmmu_tsb_segkmem_alloc,
		    sfmmu_tsb_segkmem_free, kmem_tsb_arena, 0,
		    VM_SLEEP | VM_BESTFIT);
		sfmmu_tsb_cache[0] = kmem_cache_create("sfmmu_tsb_cache",
		    PAGESIZE, PAGESIZE, NULL, NULL, NULL, NULL,
		    kmem_tsb_default_arena[0], 0);
	}
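
	/*
	 * To make the layering concrete (an illustration, assuming the
	 * default tunables): an 8K user TSB is normally satisfied from
	 * sfmmu_tsb_cache, which refills from kmem_tsb_default_arena in
	 * PAGESIZE quanta, which in turn imports naturally aligned
	 * tsb_slab_size slabs from kmem_tsb_arena; larger TSBs are taken
	 * directly from kmem_tsb_default_arena.
	 */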

	sfmmu8_cache = kmem_cache_create("sfmmu8_cache", HME8BLK_SZ,
	    HMEBLK_ALIGN, sfmmu_hblkcache_constructor,
	    sfmmu_hblkcache_destructor,
	    sfmmu_hblkcache_reclaim, (void *)HME8BLK_SZ,
	    hat_memload_arena, KMC_NOHASH);

	hat_memload1_arena = vmem_create("hat_memload1", NULL, 0, PAGESIZE,
	    segkmem_alloc_permanent, segkmem_free, heap_arena, 0,
	    VMC_DUMPSAFE | VM_SLEEP);

	sfmmu1_cache = kmem_cache_create("sfmmu1_cache", HME1BLK_SZ,
	    HMEBLK_ALIGN, sfmmu_hblkcache_constructor,
	    sfmmu_hblkcache_destructor,
	    NULL, (void *)HME1BLK_SZ,
	    hat_memload1_arena, KMC_NOHASH);

	pa_hment_cache = kmem_cache_create("pa_hment_cache", PAHME_SZ,
	    0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);

	ism_blk_cache = kmem_cache_create("ism_blk_cache",
	    sizeof (ism_blk_t), ecache_alignsize, NULL, NULL,
	    NULL, NULL, static_arena, KMC_NOHASH);

	ism_ment_cache = kmem_cache_create("ism_ment_cache",
	    sizeof (ism_ment_t), 0, NULL, NULL,
	    NULL, NULL, NULL, 0);

	/*
	 * We grab the first hat for the kernel.
	 */
	AS_LOCK_ENTER(&kas, &kas.a_lock, RW_WRITER);
	kas.a_hat = hat_alloc(&kas);
	AS_LOCK_EXIT(&kas, &kas.a_lock);

	/*
	 * Initialize hblk_reserve.
	 */
	((struct hme_blk *)hblk_reserve)->hblk_nextpa =
	    va_to_pa((caddr_t)hblk_reserve);

#ifndef UTSB_PHYS
	/*
	 * Reserve some kernel virtual address space for the locked TTEs
	 * that allow us to probe the TSB from TL>0.
	 */
	utsb_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size,
	    0, 0, NULL, NULL, VM_SLEEP);
	utsb4m_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size,
	    0, 0, NULL, NULL, VM_SLEEP);
#endif

#ifdef VAC
	/*
	 * The big page VAC handling code assumes VAC
	 * will not be bigger than the smallest big
	 * page, which is 64K.
	 */
	if (TTEPAGES(TTE64K) < CACHE_NUM_COLOR) {
		cmn_err(CE_PANIC, "VAC too big!");
	}
#endif

	uhme_hash_pa = va_to_pa(uhme_hash);
	khme_hash_pa = va_to_pa(khme_hash);

	/*
	 * Initialize relocation locks.  kpr_suspendlock is held
	 * at PIL_MAX to prevent interrupts from pinning the holder
	 * of a suspended TTE which may access it leading to a
	 * deadlock condition.
	 */
	mutex_init(&kpr_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&kpr_suspendlock, NULL, MUTEX_SPIN, (void *)PIL_MAX);

	/*
	 * If Shared context support is disabled via /etc/system
	 * set shctx_on to 0 here if it was set to 1 earlier in boot
	 * sequence by cpu module initialization code.
	 */
	if (shctx_on && disable_shctx) {
		shctx_on = 0;
	}

	if (shctx_on) {
		srd_buckets = kmem_zalloc(SFMMU_MAX_SRD_BUCKETS *
		    sizeof (srd_buckets[0]), KM_SLEEP);
		for (i = 0; i < SFMMU_MAX_SRD_BUCKETS; i++) {
			mutex_init(&srd_buckets[i].srdb_lock, NULL,
			    MUTEX_DEFAULT, NULL);
		}

		srd_cache = kmem_cache_create("srd_cache", sizeof (sf_srd_t),
		    0, sfmmu_srdcache_constructor, sfmmu_srdcache_destructor,
		    NULL, NULL, NULL, 0);
		region_cache = kmem_cache_create("region_cache",
		    sizeof (sf_region_t), 0, sfmmu_rgncache_constructor,
		    sfmmu_rgncache_destructor, NULL, NULL, NULL, 0);
		scd_cache = kmem_cache_create("scd_cache", sizeof (sf_scd_t),
		    0, sfmmu_scdcache_constructor, sfmmu_scdcache_destructor,
		    NULL, NULL, NULL, 0);
	}

	/*
	 * Pre-allocate hrm_hashtab before enabling the collection of
	 * refmod statistics.  Allocating on the fly would mean running
	 * the risk of suffering recursive mutex enters or deadlocks.
	 */
	hrm_hashtab = kmem_zalloc(HRM_HASHSIZE * sizeof (struct hrmstat *),
	    KM_SLEEP);

	/* Allocate per-cpu pending freelist of hmeblks */
	cpu_hme_pend = kmem_zalloc((NCPU * sizeof (cpu_hme_pend_t)) + 64,
	    KM_SLEEP);
	cpu_hme_pend = (cpu_hme_pend_t *)P2ROUNDUP(
	    (uintptr_t)cpu_hme_pend, 64);

	for (i = 0; i < NCPU; i++) {
		mutex_init(&cpu_hme_pend[i].chp_mutex, NULL, MUTEX_DEFAULT,
		    NULL);
	}

	if (cpu_hme_pend_thresh == 0) {
		cpu_hme_pend_thresh = CPU_HME_PEND_THRESH;
	}
}
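
/*
 * A note on the cpu_hme_pend allocation above: over-allocating by 64
 * bytes and rounding the pointer up with P2ROUNDUP is a common idiom for
 * getting a cache-line-aligned array out of kmem_zalloc(), which does not
 * guarantee 64-byte alignment.  E.g. a buffer returned at ...0x38 is
 * bumped to the next ...0x40 boundary, keeping each per-CPU entry on its
 * own line.
 */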

/*
 * Initialize locking for the hat layer, called early during boot.
 */
static void
hat_lock_init()
{
	int i;

	/*
	 * initialize the array of mutexes protecting a page's mapping
	 * list and p_nrm field.
	 */
	for (i = 0; i < MML_TABLE_SIZE; i++)
		mutex_init(&mml_table[i].pad_mutex, NULL, MUTEX_DEFAULT, NULL);

	if (kpm_enable) {
		for (i = 0; i < kpmp_table_sz; i++) {
			mutex_init(&kpmp_table[i].khl_mutex, NULL,
			    MUTEX_DEFAULT, NULL);
		}
	}

	/*
	 * Initialize array of mutex locks that protects sfmmu fields and
	 * TSB lists.
	 */
	for (i = 0; i < SFMMU_NUM_LOCK; i++)
		mutex_init(HATLOCK_MUTEXP(&hat_lock[i]), NULL, MUTEX_DEFAULT,
		    NULL);
}

#define	SFMMU_KERNEL_MAXVA \
	(kmem64_base ? (uintptr_t)kmem64_end : (SYSLIMIT))

/*
 * Allocate a hat structure.
 * Called when an address space first uses a hat.
 */
struct hat *
hat_alloc(struct as *as)
{
	sfmmu_t *sfmmup;
	int i;
	uint64_t cnum;
	extern uint_t get_color_start(struct as *);

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
	sfmmup = kmem_cache_alloc(sfmmuid_cache, KM_SLEEP);
	sfmmup->sfmmu_as = as;
	sfmmup->sfmmu_flags = 0;
	sfmmup->sfmmu_tteflags = 0;
	sfmmup->sfmmu_rtteflags = 0;
	LOCK_INIT_CLEAR(&sfmmup->sfmmu_ctx_lock);

	if (as == &kas) {
		ksfmmup = sfmmup;
		sfmmup->sfmmu_cext = 0;
		cnum = KCONTEXT;

		sfmmup->sfmmu_clrstart = 0;
		sfmmup->sfmmu_tsb = NULL;
		/*
		 * hat_kern_setup() will call sfmmu_init_ktsbinfo()
		 * to setup tsb_info for ksfmmup.
		 */
	} else {

		/*
		 * Just set to invalid ctx.  When it faults, it will
		 * get a valid ctx.  This would avoid the situation
		 * where we get a ctx, but it gets stolen and then
		 * we fault when we try to run and so have to get
		 * another ctx.
		 */
		sfmmup->sfmmu_cext = 0;
		cnum = INVALID_CONTEXT;

		/* initialize original physical page coloring bin */
		sfmmup->sfmmu_clrstart = get_color_start(as);
#ifdef DEBUG
		if (tsb_random_size) {
			uint32_t randval = (uint32_t)gettick() >> 4;
			int size = randval % (tsb_max_growsize + 1);

			/* choose a random tsb size for stress testing */
			(void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb, size,
			    TSB8K|TSB64K|TSB512K, 0, sfmmup);
		} else
#endif /* DEBUG */
			(void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb,
			    default_tsb_size,
			    TSB8K|TSB64K|TSB512K, 0, sfmmup);
		sfmmup->sfmmu_flags = HAT_SWAPPED | HAT_ALLCTX_INVALID;
		ASSERT(sfmmup->sfmmu_tsb != NULL);
	}

	ASSERT(max_mmu_ctxdoms > 0);
	for (i = 0; i < max_mmu_ctxdoms; i++) {
		sfmmup->sfmmu_ctxs[i].cnum = cnum;
		sfmmup->sfmmu_ctxs[i].gnum = 0;
	}

	for (i = 0; i < max_mmu_page_sizes; i++) {
		sfmmup->sfmmu_ttecnt[i] = 0;
		sfmmup->sfmmu_scdrttecnt[i] = 0;
		sfmmup->sfmmu_ismttecnt[i] = 0;
		sfmmup->sfmmu_scdismttecnt[i] = 0;
		sfmmup->sfmmu_pgsz[i] = TTE8K;
	}
	sfmmup->sfmmu_tsb0_4minflcnt = 0;
	sfmmup->sfmmu_iblk = NULL;
	sfmmup->sfmmu_ismhat = 0;
	sfmmup->sfmmu_scdhat = 0;
	sfmmup->sfmmu_ismblkpa = (uint64_t)-1;
	if (sfmmup == ksfmmup) {
		CPUSET_ALL(sfmmup->sfmmu_cpusran);
	} else {
		CPUSET_ZERO(sfmmup->sfmmu_cpusran);
	}
	sfmmup->sfmmu_free = 0;
	sfmmup->sfmmu_rmstat = 0;
	sfmmup->sfmmu_clrbin = sfmmup->sfmmu_clrstart;
	cv_init(&sfmmup->sfmmu_tsb_cv, NULL, CV_DEFAULT, NULL);
	sfmmup->sfmmu_srdp = NULL;
	SF_RGNMAP_ZERO(sfmmup->sfmmu_region_map);
	bzero(sfmmup->sfmmu_hmeregion_links, SFMMU_L1_HMERLINKS_SIZE);
	sfmmup->sfmmu_scdp = NULL;
	sfmmup->sfmmu_scd_link.next = NULL;
	sfmmup->sfmmu_scd_link.prev = NULL;
	return (sfmmup);
}

/*
 * Create per-MMU context domain kstats for a given MMU ctx.
 */
static void
sfmmu_mmu_kstat_create(mmu_ctx_t *mmu_ctxp)
{
	mmu_ctx_stat_t	stat;
	kstat_t		*mmu_kstat;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(mmu_ctxp->mmu_kstat == NULL);

	mmu_kstat = kstat_create("unix", mmu_ctxp->mmu_idx, "mmu_ctx",
	    "hat", KSTAT_TYPE_NAMED, MMU_CTX_NUM_STATS, KSTAT_FLAG_VIRTUAL);

	if (mmu_kstat == NULL) {
		cmn_err(CE_WARN, "kstat_create for MMU %d failed",
		    mmu_ctxp->mmu_idx);
	} else {
		mmu_kstat->ks_data = mmu_ctxp->mmu_kstat_data;
		for (stat = 0; stat < MMU_CTX_NUM_STATS; stat++)
			kstat_named_init(&mmu_ctxp->mmu_kstat_data[stat],
			    mmu_ctx_kstat_names[stat], KSTAT_DATA_INT64);
		mmu_ctxp->mmu_kstat = mmu_kstat;
		kstat_install(mmu_kstat);
	}
}

/*
 * plat_cpuid_to_mmu_ctx_info() is a platform interface that returns MMU
 * context domain information for a given CPU.  If a platform does not
 * specify that interface, then the function below is used instead to return
 * default information.  The defaults are as follows:
 *
 *	- The number of MMU context IDs supported on any CPU in the
 *	  system is 8K.
 *	- There is one MMU context domain per CPU.
 */
/*ARGSUSED*/
static void
sfmmu_cpuid_to_mmu_ctx_info(processorid_t cpuid, mmu_ctx_info_t *infop)
{
	infop->mmu_nctxs = nctxs;
	infop->mmu_idx = cpu[cpuid]->cpu_seqid;
}
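
/*
 * A platform on which several cores share one MMU would instead provide
 * its own plat_cpuid_to_mmu_ctx_info().  A minimal hypothetical sketch,
 * assuming two cores per shared MMU:
 *
 *	void
 *	plat_cpuid_to_mmu_ctx_info(processorid_t cpuid, mmu_ctx_info_t *infop)
 *	{
 *		infop->mmu_nctxs = nctxs;
 *		infop->mmu_idx = cpu[cpuid]->cpu_seqid / 2;
 *	}
 *
 * so both cores land in the same context domain and share context IDs.
 */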

/*
 * Called during CPU initialization to set the MMU context-related information
 * for a CPU.
 *
 * cpu_lock serializes accesses to mmu_ctxs and mmu_saved_gnum.
 */
void
sfmmu_cpu_init(cpu_t *cp)
{
	mmu_ctx_info_t	info;
	mmu_ctx_t	*mmu_ctxp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (&plat_cpuid_to_mmu_ctx_info == NULL)
		sfmmu_cpuid_to_mmu_ctx_info(cp->cpu_id, &info);
	else
		plat_cpuid_to_mmu_ctx_info(cp->cpu_id, &info);

	ASSERT(info.mmu_idx < max_mmu_ctxdoms);

	if ((mmu_ctxp = mmu_ctxs_tbl[info.mmu_idx]) == NULL) {
		/* Each mmu_ctx is cacheline aligned. */
		mmu_ctxp = kmem_cache_alloc(mmuctxdom_cache, KM_SLEEP);
		bzero(mmu_ctxp, sizeof (mmu_ctx_t));

		mutex_init(&mmu_ctxp->mmu_lock, NULL, MUTEX_SPIN,
		    (void *)ipltospl(DISP_LEVEL));
		mmu_ctxp->mmu_idx = info.mmu_idx;
		mmu_ctxp->mmu_nctxs = info.mmu_nctxs;
		/*
		 * Globally, for the lifetime of a system,
		 * gnum must always increase.
		 * mmu_saved_gnum is protected by the cpu_lock.
		 */
		mmu_ctxp->mmu_gnum = mmu_saved_gnum + 1;
		mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS;

		sfmmu_mmu_kstat_create(mmu_ctxp);

		mmu_ctxs_tbl[info.mmu_idx] = mmu_ctxp;
	} else {
		ASSERT(mmu_ctxp->mmu_idx == info.mmu_idx);
		ASSERT(mmu_ctxp->mmu_nctxs <= info.mmu_nctxs);
	}

	/*
	 * The mmu_lock is acquired here to prevent races with
	 * the wrap-around code.
	 */
	mutex_enter(&mmu_ctxp->mmu_lock);

	mmu_ctxp->mmu_ncpus++;
	CPUSET_ADD(mmu_ctxp->mmu_cpuset, cp->cpu_id);
	CPU_MMU_IDX(cp) = info.mmu_idx;
	CPU_MMU_CTXP(cp) = mmu_ctxp;

	mutex_exit(&mmu_ctxp->mmu_lock);
}

static void
sfmmu_ctxdom_free(mmu_ctx_t *mmu_ctxp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(!MUTEX_HELD(&mmu_ctxp->mmu_lock));

	mutex_destroy(&mmu_ctxp->mmu_lock);

	if (mmu_ctxp->mmu_kstat)
		kstat_delete(mmu_ctxp->mmu_kstat);

	/* mmu_saved_gnum is protected by the cpu_lock. */
	if (mmu_saved_gnum < mmu_ctxp->mmu_gnum)
		mmu_saved_gnum = mmu_ctxp->mmu_gnum;

	kmem_cache_free(mmuctxdom_cache, mmu_ctxp);
}

/*
 * Called to perform MMU context-related cleanup for a CPU.
 */
void
sfmmu_cpu_cleanup(cpu_t *cp)
{
	mmu_ctx_t	*mmu_ctxp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	mmu_ctxp = CPU_MMU_CTXP(cp);
	ASSERT(mmu_ctxp != NULL);

	/*
	 * The mmu_lock is acquired here to prevent races with
	 * the wrap-around code.
	 */
	mutex_enter(&mmu_ctxp->mmu_lock);

	CPU_MMU_CTXP(cp) = NULL;

	CPUSET_DEL(mmu_ctxp->mmu_cpuset, cp->cpu_id);
	if (--mmu_ctxp->mmu_ncpus == 0) {
		mmu_ctxs_tbl[mmu_ctxp->mmu_idx] = NULL;
		mutex_exit(&mmu_ctxp->mmu_lock);
		sfmmu_ctxdom_free(mmu_ctxp);
		return;
	}

	mutex_exit(&mmu_ctxp->mmu_lock);
}

uint_t
sfmmu_ctxdom_nctxs(int idx)
{
	return (mmu_ctxs_tbl[idx]->mmu_nctxs);
}
1717 * 1718 * sfmmu_ctxdom_lock(void) locks all context domains and prevents new contexts 1719 * from being allocated. It acquires all hat_locks, which blocks most access to 1720 * context data, except for a few cases that are handled separately or are 1721 * harmless. It wraps each domain to increment gnum and invalidate on-CPU 1722 * contexts, and forces cnum to its max. As a result of this call all user 1723 * threads that are running on CPUs trap and try to perform wrap around but 1724 * can't because hat_locks are taken. Threads that were not on CPUs but started 1725 * by scheduler go to sfmmu_alloc_ctx() to aquire context without checking 1726 * hat_lock, but fail, because cnum == nctxs, and therefore also trap and block 1727 * on hat_lock trying to wrap. sfmmu_ctxdom_lock() must be called before CPUs 1728 * are paused, else it could deadlock acquiring locks held by paused CPUs. 1729 * 1730 * sfmmu_ctxdoms_remove() removes context domains from every CPUs and records 1731 * the CPUs that had them. It must be called after CPUs have been paused. This 1732 * ensures that no threads are in sfmmu_alloc_ctx() accessing domain data, 1733 * because pause_cpus sends a mondo interrupt to every CPU, and sfmmu_alloc_ctx 1734 * runs with interrupts disabled. When CPUs are later resumed, they may enter 1735 * sfmmu_alloc_ctx, but it will check for CPU_MMU_CTXP = NULL and immediately 1736 * return failure. Or, they will be blocked trying to acquire hat_lock. Thus 1737 * after sfmmu_ctxdoms_remove returns, we are guaranteed that no one is 1738 * accessing the old context domains. 1739 * 1740 * sfmmu_ctxdoms_update(void) frees space used by old context domains and 1741 * allocates new context domains based on hardware layout. It initializes 1742 * every CPU that had context domain before migration to have one again. 1743 * sfmmu_ctxdoms_update must be called after CPUs are resumed, else it 1744 * could deadlock acquiring locks held by paused CPUs. 1745 * 1746 * sfmmu_ctxdoms_unlock(void) releases all hat_locks after which user threads 1747 * acquire new context ids and continue execution. 1748 * 1749 * Therefore functions should be called in the following order: 1750 * suspend_routine() 1751 * sfmmu_ctxdom_lock() 1752 * pause_cpus() 1753 * suspend() 1754 * if (suspend failed) 1755 * sfmmu_ctxdom_unlock() 1756 * ... 1757 * sfmmu_ctxdom_remove() 1758 * resume_cpus() 1759 * sfmmu_ctxdom_update() 1760 * sfmmu_ctxdom_unlock() 1761 */ 1762 static cpuset_t sfmmu_ctxdoms_pset; 1763 1764 void 1765 sfmmu_ctxdoms_remove() 1766 { 1767 processorid_t id; 1768 cpu_t *cp; 1769 1770 /* 1771 * Record the CPUs that have domains in sfmmu_ctxdoms_pset, so they can 1772 * be restored post-migration. A CPU may be powered off and not have a 1773 * domain, for example. 1774 */ 1775 CPUSET_ZERO(sfmmu_ctxdoms_pset); 1776 1777 for (id = 0; id < NCPU; id++) { 1778 if ((cp = cpu[id]) != NULL && CPU_MMU_CTXP(cp) != NULL) { 1779 CPUSET_ADD(sfmmu_ctxdoms_pset, id); 1780 CPU_MMU_CTXP(cp) = NULL; 1781 } 1782 } 1783 } 1784 1785 void 1786 sfmmu_ctxdoms_lock(void) 1787 { 1788 int idx; 1789 mmu_ctx_t *mmu_ctxp; 1790 1791 sfmmu_hat_lock_all(); 1792 1793 /* 1794 * At this point, no thread can be in sfmmu_ctx_wrap_around, because 1795 * hat_lock is always taken before calling it. 1796 * 1797 * For each domain, set mmu_cnum to max so no more contexts can be 1798 * allocated, and wrap to flush on-CPU contexts and force threads to 1799 * acquire a new context when we later drop hat_lock after migration. 
1800 * Setting mmu_cnum may race with sfmmu_alloc_ctx which also sets cnum, 1801 * but the latter uses CAS and will miscompare and not overwrite it. 1802 */ 1803 kpreempt_disable(); /* required by sfmmu_ctx_wrap_around */ 1804 for (idx = 0; idx < max_mmu_ctxdoms; idx++) { 1805 if ((mmu_ctxp = mmu_ctxs_tbl[idx]) != NULL) { 1806 mutex_enter(&mmu_ctxp->mmu_lock); 1807 mmu_ctxp->mmu_cnum = mmu_ctxp->mmu_nctxs; 1808 /* make sure updated cnum visible */ 1809 membar_enter(); 1810 mutex_exit(&mmu_ctxp->mmu_lock); 1811 sfmmu_ctx_wrap_around(mmu_ctxp, B_FALSE); 1812 } 1813 } 1814 kpreempt_enable(); 1815 } 1816 1817 void 1818 sfmmu_ctxdoms_unlock(void) 1819 { 1820 sfmmu_hat_unlock_all(); 1821 } 1822 1823 void 1824 sfmmu_ctxdoms_update(void) 1825 { 1826 processorid_t id; 1827 cpu_t *cp; 1828 uint_t idx; 1829 mmu_ctx_t *mmu_ctxp; 1830 1831 /* 1832 * Free all context domains. As side effect, this increases 1833 * mmu_saved_gnum to the maximum gnum over all domains, which is used to 1834 * init gnum in the new domains, which therefore will be larger than the 1835 * sfmmu gnum for any process, guaranteeing that every process will see 1836 * a new generation and allocate a new context regardless of what new 1837 * domain it runs in. 1838 */ 1839 mutex_enter(&cpu_lock); 1840 1841 for (idx = 0; idx < max_mmu_ctxdoms; idx++) { 1842 if (mmu_ctxs_tbl[idx] != NULL) { 1843 mmu_ctxp = mmu_ctxs_tbl[idx]; 1844 mmu_ctxs_tbl[idx] = NULL; 1845 sfmmu_ctxdom_free(mmu_ctxp); 1846 } 1847 } 1848 1849 for (id = 0; id < NCPU; id++) { 1850 if (CPU_IN_SET(sfmmu_ctxdoms_pset, id) && 1851 (cp = cpu[id]) != NULL) 1852 sfmmu_cpu_init(cp); 1853 } 1854 mutex_exit(&cpu_lock); 1855 } 1856 #endif 1857 1858 /* 1859 * Hat_setup, makes an address space context the current active one. 1860 * In sfmmu this translates to setting the secondary context with the 1861 * corresponding context. 1862 */ 1863 void 1864 hat_setup(struct hat *sfmmup, int allocflag) 1865 { 1866 hatlock_t *hatlockp; 1867 1868 /* Init needs some special treatment. */ 1869 if (allocflag == HAT_INIT) { 1870 /* 1871 * Make sure that we have 1872 * 1. a TSB 1873 * 2. a valid ctx that doesn't get stolen after this point. 1874 */ 1875 hatlockp = sfmmu_hat_enter(sfmmup); 1876 1877 /* 1878 * Swap in the TSB. hat_init() allocates tsbinfos without 1879 * TSBs, but we need one for init, since the kernel does some 1880 * special things to set up its stack and needs the TSB to 1881 * resolve page faults. 1882 */ 1883 sfmmu_tsb_swapin(sfmmup, hatlockp); 1884 1885 sfmmu_get_ctx(sfmmup); 1886 1887 sfmmu_hat_exit(hatlockp); 1888 } else { 1889 ASSERT(allocflag == HAT_ALLOC); 1890 1891 hatlockp = sfmmu_hat_enter(sfmmup); 1892 kpreempt_disable(); 1893 1894 CPUSET_ADD(sfmmup->sfmmu_cpusran, CPU->cpu_id); 1895 /* 1896 * sfmmu_setctx_sec takes <pgsz|cnum> as a parameter, 1897 * pagesize bits don't matter in this case since we are passing 1898 * INVALID_CONTEXT to it. 1899 * Compatibility Note: hw takes care of MMU_SCONTEXT1 1900 */ 1901 sfmmu_setctx_sec(INVALID_CONTEXT); 1902 sfmmu_clear_utsbinfo(); 1903 1904 kpreempt_enable(); 1905 sfmmu_hat_exit(hatlockp); 1906 } 1907 } 1908 1909 /* 1910 * Free all the translation resources for the specified address space. 1911 * Called from as_free when an address space is being destroyed. 
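 *
 * A sketch of the expected teardown order in the caller (a paraphrase
 * of the as_free() path, not code copied from it):
 *
 *	hat_free_start(as->a_hat);
 *	(unmap all segments; this unloads the remaining translations,
 *	so every sfmmu_ttecnt[] count drops to zero)
 *	hat_free_end(as->a_hat);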
*/ 1913 void 1914 hat_free_start(struct hat *sfmmup) 1915 { 1916 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 1917 ASSERT(sfmmup != ksfmmup); 1918 1919 sfmmup->sfmmu_free = 1; 1920 if (sfmmup->sfmmu_scdp != NULL) { 1921 sfmmu_leave_scd(sfmmup, 0); 1922 } 1923 1924 ASSERT(sfmmup->sfmmu_scdp == NULL); 1925 } 1926 1927 void 1928 hat_free_end(struct hat *sfmmup) 1929 { 1930 int i; 1931 1932 ASSERT(sfmmup->sfmmu_free == 1); 1933 ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0); 1934 ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0); 1935 ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0); 1936 ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0); 1937 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0); 1938 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0); 1939 1940 if (sfmmup->sfmmu_rmstat) { 1941 hat_freestat(sfmmup->sfmmu_as, NULL); 1942 } 1943 1944 while (sfmmup->sfmmu_tsb != NULL) { 1945 struct tsb_info *next = sfmmup->sfmmu_tsb->tsb_next; 1946 sfmmu_tsbinfo_free(sfmmup->sfmmu_tsb); 1947 sfmmup->sfmmu_tsb = next; 1948 } 1949 1950 if (sfmmup->sfmmu_srdp != NULL) { 1951 sfmmu_leave_srd(sfmmup); 1952 ASSERT(sfmmup->sfmmu_srdp == NULL); 1953 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) { 1954 if (sfmmup->sfmmu_hmeregion_links[i] != NULL) { 1955 kmem_free(sfmmup->sfmmu_hmeregion_links[i], 1956 SFMMU_L2_HMERLINKS_SIZE); 1957 sfmmup->sfmmu_hmeregion_links[i] = NULL; 1958 } 1959 } 1960 } 1961 sfmmu_free_sfmmu(sfmmup); 1962 1963 #ifdef DEBUG 1964 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) { 1965 ASSERT(sfmmup->sfmmu_hmeregion_links[i] == NULL); 1966 } 1967 #endif 1968 1969 kmem_cache_free(sfmmuid_cache, sfmmup); 1970 } 1971 1972 /* 1973 * Duplicate the translations of one address space (as) into another (newas). 1974 */ 1975 /* ARGSUSED */ 1976 int 1977 hat_dup(struct hat *hat, struct hat *newhat, caddr_t addr, size_t len, 1978 uint_t flag) 1979 { 1980 sf_srd_t *srdp; 1981 sf_scd_t *scdp; 1982 int i; 1983 extern uint_t get_color_start(struct as *); 1984 1985 ASSERT((flag == 0) || (flag == HAT_DUP_ALL) || (flag == HAT_DUP_COW) || 1986 (flag == HAT_DUP_SRD)); 1987 ASSERT(hat != ksfmmup); 1988 ASSERT(newhat != ksfmmup); 1989 ASSERT(flag != HAT_DUP_ALL || hat->sfmmu_srdp == newhat->sfmmu_srdp); 1990 1991 if (flag == HAT_DUP_COW) { 1992 panic("hat_dup: HAT_DUP_COW not supported"); 1993 } 1994 1995 if (flag == HAT_DUP_SRD && ((srdp = hat->sfmmu_srdp) != NULL)) { 1996 ASSERT(srdp->srd_evp != NULL); 1997 VN_HOLD(srdp->srd_evp); 1998 ASSERT(srdp->srd_refcnt > 0); 1999 newhat->sfmmu_srdp = srdp; 2000 atomic_inc_32((volatile uint_t *)&srdp->srd_refcnt); 2001 } 2002 2003 /* 2004 * The HAT_DUP_ALL flag is used after the as duplication is done.
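 *
 * A hypothetical sketch of the two-phase use by the fork path (not
 * lifted from as_dup(); the variable names are made up): the SRD is
 * shared first, and HAT_DUP_ALL is passed once the segments have been
 * duplicated:
 *
 *	(void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
 *	(duplicate the segments of as into newas)
 *	(void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);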
*/ 2006 if (flag == HAT_DUP_ALL && ((srdp = newhat->sfmmu_srdp) != NULL)) { 2007 ASSERT(newhat->sfmmu_srdp->srd_refcnt >= 2); 2008 newhat->sfmmu_rtteflags = hat->sfmmu_rtteflags; 2009 if (hat->sfmmu_flags & HAT_4MTEXT_FLAG) { 2010 newhat->sfmmu_flags |= HAT_4MTEXT_FLAG; 2011 } 2012 2013 /* check if we need to join an scd */ 2014 if ((scdp = hat->sfmmu_scdp) != NULL && 2015 newhat->sfmmu_scdp != scdp) { 2016 int ret; 2017 SF_RGNMAP_IS_SUBSET(&newhat->sfmmu_region_map, 2018 &scdp->scd_region_map, ret); 2019 ASSERT(ret); 2020 sfmmu_join_scd(scdp, newhat); 2021 ASSERT(newhat->sfmmu_scdp == scdp && 2022 scdp->scd_refcnt >= 2); 2023 for (i = 0; i < max_mmu_page_sizes; i++) { 2024 newhat->sfmmu_ismttecnt[i] = 2025 hat->sfmmu_ismttecnt[i]; 2026 newhat->sfmmu_scdismttecnt[i] = 2027 hat->sfmmu_scdismttecnt[i]; 2028 } 2029 } 2030 2031 sfmmu_check_page_sizes(newhat, 1); 2032 } 2033 2034 if (flag == HAT_DUP_ALL && consistent_coloring == 0 && 2035 update_proc_pgcolorbase_after_fork != 0) { 2036 hat->sfmmu_clrbin = get_color_start(hat->sfmmu_as); 2037 } 2038 return (0); 2039 } 2040 2041 void 2042 hat_memload(struct hat *hat, caddr_t addr, struct page *pp, 2043 uint_t attr, uint_t flags) 2044 { 2045 hat_do_memload(hat, addr, pp, attr, flags, 2046 SFMMU_INVALID_SHMERID); 2047 } 2048 2049 void 2050 hat_memload_region(struct hat *hat, caddr_t addr, struct page *pp, 2051 uint_t attr, uint_t flags, hat_region_cookie_t rcookie) 2052 { 2053 uint_t rid; 2054 if (rcookie == HAT_INVALID_REGION_COOKIE) { 2055 hat_do_memload(hat, addr, pp, attr, flags, 2056 SFMMU_INVALID_SHMERID); 2057 return; 2058 } 2059 rid = (uint_t)((uint64_t)rcookie); 2060 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 2061 hat_do_memload(hat, addr, pp, attr, flags, rid); 2062 } 2063 2064 /* 2065 * Set up addr to map to page pp with protection attr. 2066 * As an optimization we also load the TSB with the 2067 * corresponding tte, but it is no big deal if the tte gets kicked out. 2068 */ 2069 static void 2070 hat_do_memload(struct hat *hat, caddr_t addr, struct page *pp, 2071 uint_t attr, uint_t flags, uint_t rid) 2072 { 2073 tte_t tte; 2074 2075 2076 ASSERT(hat != NULL); 2077 ASSERT(PAGE_LOCKED(pp)); 2078 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 2079 ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG)); 2080 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 2081 SFMMU_VALIDATE_HMERID(hat, rid, addr, MMU_PAGESIZE); 2082 2083 if (PP_ISFREE(pp)) { 2084 panic("hat_memload: loading a mapping to free page %p", 2085 (void *)pp); 2086 } 2087 2088 ASSERT((hat == ksfmmup) || 2089 AS_LOCK_HELD(hat->sfmmu_as, &hat->sfmmu_as->a_lock)); 2090 2091 if (flags & ~SFMMU_LOAD_ALLFLAG) 2092 cmn_err(CE_NOTE, "hat_memload: unsupported flags %d", 2093 flags & ~SFMMU_LOAD_ALLFLAG); 2094 2095 if (hat->sfmmu_rmstat) 2096 hat_resvstat(MMU_PAGESIZE, hat->sfmmu_as, addr); 2097 2098 #if defined(SF_ERRATA_57) 2099 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) && 2100 (addr < errata57_limit) && (attr & PROT_EXEC) && 2101 !(flags & HAT_LOAD_SHARE)) { 2102 cmn_err(CE_WARN, "hat_memload: illegal attempt to make user" 2103 " page executable"); 2104 attr &= ~PROT_EXEC; 2105 } 2106 #endif 2107 2108 sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K); 2109 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, flags, rid); 2110 2111 /* 2112 * Check TSB and TLB page sizes. 2113 */ 2114 if ((flags & HAT_LOAD_SHARE) == 0) { 2115 sfmmu_check_page_sizes(hat, 1); 2116 } 2117 } 2118 2119 /* 2120 * hat_devload can be called to map real memory (e.g.
/dev/kmem) and even though hat_devload will determine pf is 2122 * for memory, it will be unable to get a shared lock on the 2123 * page (because someone else has it exclusively) and will 2124 * pass pp = NULL. If tteload doesn't get a non-NULL 2125 * page pointer it can't cache memory. 2126 */ 2127 void 2128 hat_devload(struct hat *hat, caddr_t addr, size_t len, pfn_t pfn, 2129 uint_t attr, int flags) 2130 { 2131 tte_t tte; 2132 struct page *pp = NULL; 2133 int use_lgpg = 0; 2134 2135 ASSERT(hat != NULL); 2136 2137 ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG)); 2138 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 2139 ASSERT((hat == ksfmmup) || 2140 AS_LOCK_HELD(hat->sfmmu_as, &hat->sfmmu_as->a_lock)); 2141 if (len == 0) 2142 panic("hat_devload: zero len"); 2143 if (flags & ~SFMMU_LOAD_ALLFLAG) 2144 cmn_err(CE_NOTE, "hat_devload: unsupported flags %d", 2145 flags & ~SFMMU_LOAD_ALLFLAG); 2146 2147 #if defined(SF_ERRATA_57) 2148 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) && 2149 (addr < errata57_limit) && (attr & PROT_EXEC) && 2150 !(flags & HAT_LOAD_SHARE)) { 2151 cmn_err(CE_WARN, "hat_devload: illegal attempt to make user" 2152 " page executable"); 2153 attr &= ~PROT_EXEC; 2154 } 2155 #endif 2156 2157 /* 2158 * If it's a memory page find its pp 2159 */ 2160 if (!(flags & HAT_LOAD_NOCONSIST) && pf_is_memory(pfn)) { 2161 pp = page_numtopp_nolock(pfn); 2162 if (pp == NULL) { 2163 flags |= HAT_LOAD_NOCONSIST; 2164 } else { 2165 if (PP_ISFREE(pp)) { 2166 panic("hat_devload: loading " 2167 "a mapping to free page %p", 2168 (void *)pp); 2169 } 2170 if (!PAGE_LOCKED(pp) && !PP_ISNORELOC(pp)) { 2171 panic("hat_devload: loading a mapping " 2172 "to unlocked relocatable page %p", 2173 (void *)pp); 2174 } 2175 ASSERT(len == MMU_PAGESIZE); 2176 } 2177 } 2178 2179 if (hat->sfmmu_rmstat) 2180 hat_resvstat(len, hat->sfmmu_as, addr); 2181 2182 if (flags & HAT_LOAD_NOCONSIST) { 2183 attr |= SFMMU_UNCACHEVTTE; 2184 use_lgpg = 1; 2185 } 2186 if (!pf_is_memory(pfn)) { 2187 attr |= SFMMU_UNCACHEPTTE | HAT_NOSYNC; 2188 use_lgpg = 1; 2189 switch (attr & HAT_ORDER_MASK) { 2190 case HAT_STRICTORDER: 2191 case HAT_UNORDERED_OK: 2192 /* 2193 * we set the side effect bit for all non-memory 2194 * mappings unless merging is ok 2195 */ 2196 attr |= SFMMU_SIDEFFECT; 2197 break; 2198 case HAT_MERGING_OK: 2199 case HAT_LOADCACHING_OK: 2200 case HAT_STORECACHING_OK: 2201 break; 2202 default: 2203 panic("hat_devload: bad attr"); 2204 break; 2205 } 2206 } 2207 while (len) { 2208 if (!use_lgpg) { 2209 sfmmu_memtte(&tte, pfn, attr, TTE8K); 2210 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2211 flags, SFMMU_INVALID_SHMERID); 2212 len -= MMU_PAGESIZE; 2213 addr += MMU_PAGESIZE; 2214 pfn++; 2215 continue; 2216 } 2217 /* 2218 * try to use large pages, check va/pa alignments 2219 * Note that 32M/256M page sizes are not (yet) supported.
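 *
 * For example, a single 4M tte can be used only when at least 4M of
 * the request remains and both the virtual and the physical address
 * are 4M aligned, mirroring the checks right below:
 *
 *	len >= MMU_PAGESIZE4M &&
 *	((uintptr_t)addr & MMU_PAGEOFFSET4M) == 0 &&
 *	(mmu_ptob(pfn) & MMU_PAGEOFFSET4M) == 0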
*/ 2221 if ((len >= MMU_PAGESIZE4M) && 2222 !((uintptr_t)addr & MMU_PAGEOFFSET4M) && 2223 !(disable_large_pages & (1 << TTE4M)) && 2224 !(mmu_ptob(pfn) & MMU_PAGEOFFSET4M)) { 2225 sfmmu_memtte(&tte, pfn, attr, TTE4M); 2226 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2227 flags, SFMMU_INVALID_SHMERID); 2228 len -= MMU_PAGESIZE4M; 2229 addr += MMU_PAGESIZE4M; 2230 pfn += MMU_PAGESIZE4M / MMU_PAGESIZE; 2231 } else if ((len >= MMU_PAGESIZE512K) && 2232 !((uintptr_t)addr & MMU_PAGEOFFSET512K) && 2233 !(disable_large_pages & (1 << TTE512K)) && 2234 !(mmu_ptob(pfn) & MMU_PAGEOFFSET512K)) { 2235 sfmmu_memtte(&tte, pfn, attr, TTE512K); 2236 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2237 flags, SFMMU_INVALID_SHMERID); 2238 len -= MMU_PAGESIZE512K; 2239 addr += MMU_PAGESIZE512K; 2240 pfn += MMU_PAGESIZE512K / MMU_PAGESIZE; 2241 } else if ((len >= MMU_PAGESIZE64K) && 2242 !((uintptr_t)addr & MMU_PAGEOFFSET64K) && 2243 !(disable_large_pages & (1 << TTE64K)) && 2244 !(mmu_ptob(pfn) & MMU_PAGEOFFSET64K)) { 2245 sfmmu_memtte(&tte, pfn, attr, TTE64K); 2246 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2247 flags, SFMMU_INVALID_SHMERID); 2248 len -= MMU_PAGESIZE64K; 2249 addr += MMU_PAGESIZE64K; 2250 pfn += MMU_PAGESIZE64K / MMU_PAGESIZE; 2251 } else { 2252 sfmmu_memtte(&tte, pfn, attr, TTE8K); 2253 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2254 flags, SFMMU_INVALID_SHMERID); 2255 len -= MMU_PAGESIZE; 2256 addr += MMU_PAGESIZE; 2257 pfn++; 2258 } 2259 } 2260 2261 /* 2262 * Check TSB and TLB page sizes. 2263 */ 2264 if ((flags & HAT_LOAD_SHARE) == 0) { 2265 sfmmu_check_page_sizes(hat, 1); 2266 } 2267 } 2268 2269 void 2270 hat_memload_array(struct hat *hat, caddr_t addr, size_t len, 2271 struct page **pps, uint_t attr, uint_t flags) 2272 { 2273 hat_do_memload_array(hat, addr, len, pps, attr, flags, 2274 SFMMU_INVALID_SHMERID); 2275 } 2276 2277 void 2278 hat_memload_array_region(struct hat *hat, caddr_t addr, size_t len, 2279 struct page **pps, uint_t attr, uint_t flags, 2280 hat_region_cookie_t rcookie) 2281 { 2282 uint_t rid; 2283 if (rcookie == HAT_INVALID_REGION_COOKIE) { 2284 hat_do_memload_array(hat, addr, len, pps, attr, flags, 2285 SFMMU_INVALID_SHMERID); 2286 return; 2287 } 2288 rid = (uint_t)((uint64_t)rcookie); 2289 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 2290 hat_do_memload_array(hat, addr, len, pps, attr, flags, rid); 2291 } 2292 2293 /* 2294 * Map the largest extent possible out of the page array. The array may NOT 2295 * be in order. The largest possible mapping a page can have 2296 * is specified in the p_szc field. The p_szc field 2297 * cannot change as long as there are any mappings (large or small) 2298 * to any of the pages that make up the large page. (i.e., any 2299 * promotion/demotion of page size is not up to the hat but up to 2300 * the page free list manager). The array 2301 * should consist of properly aligned contiguous pages that are 2302 * part of a big page for a large mapping to be created.
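 *
 * A hedged caller sketch (hypothetical, segvn-style): to map one
 * properly aligned 512K large page, the caller would pass all of its
 * TTEPAGES(TTE512K) constituent pages in order:
 *
 *	struct page *pps[TTEPAGES(TTE512K)];
 *	(fill pps[] with the constituent pages, each with
 *	p_szc >= TTE512K, first page 512K aligned)
 *	hat_memload_array(as->a_hat, addr, MMU_PAGESIZE512K, pps,
 *	    PROT_READ | PROT_WRITE, HAT_LOAD);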
*/ 2304 static void 2305 hat_do_memload_array(struct hat *hat, caddr_t addr, size_t len, 2306 struct page **pps, uint_t attr, uint_t flags, uint_t rid) 2307 { 2308 int ttesz; 2309 size_t mapsz; 2310 pgcnt_t numpg, npgs; 2311 tte_t tte; 2312 page_t *pp; 2313 uint_t large_pages_disable; 2314 2315 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 2316 SFMMU_VALIDATE_HMERID(hat, rid, addr, len); 2317 2318 if (hat->sfmmu_rmstat) 2319 hat_resvstat(len, hat->sfmmu_as, addr); 2320 2321 #if defined(SF_ERRATA_57) 2322 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) && 2323 (addr < errata57_limit) && (attr & PROT_EXEC) && 2324 !(flags & HAT_LOAD_SHARE)) { 2325 cmn_err(CE_WARN, "hat_memload_array: illegal attempt to make " 2326 "user page executable"); 2327 attr &= ~PROT_EXEC; 2328 } 2329 #endif 2330 2331 /* Get number of pages */ 2332 npgs = len >> MMU_PAGESHIFT; 2333 2334 if (flags & HAT_LOAD_SHARE) { 2335 large_pages_disable = disable_ism_large_pages; 2336 } else { 2337 large_pages_disable = disable_large_pages; 2338 } 2339 2340 if (npgs < NHMENTS || large_pages_disable == LARGE_PAGES_OFF) { 2341 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs, 2342 rid); 2343 return; 2344 } 2345 2346 while (npgs >= NHMENTS) { 2347 pp = *pps; 2348 for (ttesz = pp->p_szc; ttesz != TTE8K; ttesz--) { 2349 /* 2350 * Check if this page size is disabled. 2351 */ 2352 if (large_pages_disable & (1 << ttesz)) 2353 continue; 2354 2355 numpg = TTEPAGES(ttesz); 2356 mapsz = numpg << MMU_PAGESHIFT; 2357 if ((npgs >= numpg) && 2358 IS_P2ALIGNED(addr, mapsz) && 2359 IS_P2ALIGNED(pp->p_pagenum, numpg)) { 2360 /* 2361 * At this point we have enough pages and 2362 * we know the virtual address and the pfn 2363 * are properly aligned. We still need 2364 * to check for physical contiguity, but since 2365 * it is very likely that this is the case 2366 * we will assume they are so and undo 2367 * the request if necessary. It would 2368 * be great if we could get a hint flag 2369 * like HAT_CONTIG which would tell us 2370 * the pages are contiguous for sure. 2371 */ 2372 sfmmu_memtte(&tte, (*pps)->p_pagenum, 2373 attr, ttesz); 2374 if (!sfmmu_tteload_array(hat, &tte, addr, 2375 pps, flags, rid)) { 2376 break; 2377 } 2378 } 2379 } 2380 if (ttesz == TTE8K) { 2381 /* 2382 * We were not able to map the array using a large page; 2383 * batch it an hmeblk, or a fraction of one, at a time. 2384 */ 2385 numpg = ((uintptr_t)addr >> MMU_PAGESHIFT) 2386 & (NHMENTS-1); 2387 numpg = NHMENTS - numpg; 2388 ASSERT(numpg <= npgs); 2389 mapsz = numpg * MMU_PAGESIZE; 2390 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, 2391 numpg, rid); 2392 } 2393 addr += mapsz; 2394 npgs -= numpg; 2395 pps += numpg; 2396 } 2397 2398 if (npgs) { 2399 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs, 2400 rid); 2401 } 2402 2403 /* 2404 * Check TSB and TLB page sizes. 2405 */ 2406 if ((flags & HAT_LOAD_SHARE) == 0) { 2407 sfmmu_check_page_sizes(hat, 1); 2408 } 2409 } 2410 2411 /* 2412 * Function tries to batch 8K pages into the same hme blk. 2413 */ 2414 static void 2415 sfmmu_memload_batchsmall(struct hat *hat, caddr_t vaddr, page_t **pps, 2416 uint_t attr, uint_t flags, pgcnt_t npgs, uint_t rid) 2417 { 2418 tte_t tte; 2419 page_t *pp; 2420 struct hmehash_bucket *hmebp; 2421 struct hme_blk *hmeblkp; 2422 int index; 2423 2424 while (npgs) { 2425 /* 2426 * Acquire the hash bucket. 2427 */ 2428 hmebp = sfmmu_tteload_acquire_hashbucket(hat, vaddr, TTE8K, 2429 rid); 2430 ASSERT(hmebp); 2431 2432 /* 2433 * Find the hment block.
*/ 2435 hmeblkp = sfmmu_tteload_find_hmeblk(hat, hmebp, vaddr, 2436 TTE8K, flags, rid); 2437 ASSERT(hmeblkp); 2438 2439 do { 2440 /* 2441 * Make the tte. 2442 */ 2443 pp = *pps; 2444 sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K); 2445 2446 /* 2447 * Add the translation. 2448 */ 2449 (void) sfmmu_tteload_addentry(hat, hmeblkp, &tte, 2450 vaddr, pps, flags, rid); 2451 2452 /* 2453 * Go to the next page. 2454 */ 2455 pps++; 2456 npgs--; 2457 2458 /* 2459 * Go to the next address. 2460 */ 2461 vaddr += MMU_PAGESIZE; 2462 2463 /* 2464 * Don't cross over into a different hmeblk. 2465 */ 2466 index = (int)(((uintptr_t)vaddr >> MMU_PAGESHIFT) & 2467 (NHMENTS-1)); 2468 2469 } while (index != 0 && npgs != 0); 2470 2471 /* 2472 * Release the hash bucket. 2473 */ 2474 2475 sfmmu_tteload_release_hashbucket(hmebp); 2476 } 2477 } 2478 2479 /* 2480 * Construct a tte for a page: 2481 * 2482 * tte_valid = 1 2483 * tte_size2 = size & TTE_SZ2_BITS (Panther and Olympus-C only) 2484 * tte_size = size 2485 * tte_nfo = attr & HAT_NOFAULT 2486 * tte_ie = attr & HAT_STRUCTURE_LE 2487 * tte_hmenum = hmenum 2488 * tte_pahi = pp->p_pagenum >> TTE_PASHIFT; 2489 * tte_palo = pp->p_pagenum & TTE_PALOMASK; 2490 * tte_ref = 1 (optimization) 2491 * tte_wr_perm = attr & PROT_WRITE; 2492 * tte_no_sync = attr & HAT_NOSYNC 2493 * tte_lock = attr & SFMMU_LOCKTTE 2494 * tte_cp = !(attr & SFMMU_UNCACHEPTTE) 2495 * tte_cv = !(attr & SFMMU_UNCACHEVTTE) 2496 * tte_e = attr & SFMMU_SIDEFFECT 2497 * tte_priv = !(attr & PROT_USER) 2498 * tte_hwwr = if nosync is set and it is writable we set the mod bit (opt) 2499 * tte_glb = 0 2500 */ 2501 void 2502 sfmmu_memtte(tte_t *ttep, pfn_t pfn, uint_t attr, int tte_sz) 2503 { 2504 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 2505 2506 ttep->tte_inthi = MAKE_TTE_INTHI(pfn, attr, tte_sz, 0 /* hmenum */); 2507 ttep->tte_intlo = MAKE_TTE_INTLO(pfn, attr, tte_sz, 0 /* hmenum */); 2508 2509 if (TTE_IS_NOSYNC(ttep)) { 2510 TTE_SET_REF(ttep); 2511 if (TTE_IS_WRITABLE(ttep)) { 2512 TTE_SET_MOD(ttep); 2513 } 2514 } 2515 if (TTE_IS_NFO(ttep) && TTE_IS_EXECUTABLE(ttep)) { 2516 panic("sfmmu_memtte: can't set both NFO and EXEC bits"); 2517 } 2518 } 2519 2520 /* 2521 * This function will add a translation to the hme_blk and allocate the 2522 * hme_blk if one does not exist. 2523 * If a page structure is specified then it will add the 2524 * corresponding hment to the mapping list. 2525 * It will also update the hmenum field for the tte. 2526 * 2527 * Currently this function is only used for kernel mappings. 2528 * So pass invalid region to sfmmu_tteload_array(). 2529 */ 2530 void 2531 sfmmu_tteload(struct hat *sfmmup, tte_t *ttep, caddr_t vaddr, page_t *pp, 2532 uint_t flags) 2533 { 2534 ASSERT(sfmmup == ksfmmup); 2535 (void) sfmmu_tteload_array(sfmmup, ttep, vaddr, &pp, flags, 2536 SFMMU_INVALID_SHMERID); 2537 } 2538 2539 /* 2540 * Load (ttep != NULL) or unload (ttep == NULL) one entry in the TSB. 2541 * Assumes that a particular page size may only be resident in one TSB. 2542 */ 2543 static void 2544 sfmmu_mod_tsb(sfmmu_t *sfmmup, caddr_t vaddr, tte_t *ttep, int ttesz) 2545 { 2546 struct tsb_info *tsbinfop = NULL; 2547 uint64_t tag; 2548 struct tsbe *tsbe_addr; 2549 uint64_t tsb_base; 2550 uint_t tsb_size; 2551 int vpshift = MMU_PAGESHIFT; 2552 int phys = 0; 2553 2554 if (sfmmup == ksfmmup) { /* No support for 32/256M ksfmmu pages */ 2555 phys = ktsb_phys; 2556 if (ttesz >= TTE4M) { 2557 #ifndef sun4v 2558 ASSERT((ttesz != TTE32M) && (ttesz != TTE256M)); 2559 #endif 2560 tsb_base = (phys)?
ktsb4m_pbase : (uint64_t)ktsb4m_base; 2561 tsb_size = ktsb4m_szcode; 2562 } else { 2563 tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base; 2564 tsb_size = ktsb_szcode; 2565 } 2566 } else { 2567 SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz); 2568 2569 /* 2570 * If there isn't a TSB for this page size, or the TSB is 2571 * swapped out, there is nothing to do. Note that the latter 2572 * case seems impossible but can occur if hat_pageunload() 2573 * is called on an ISM mapping while the process is swapped 2574 * out. 2575 */ 2576 if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED)) 2577 return; 2578 2579 /* 2580 * If another thread is in the middle of relocating a TSB 2581 * we can't unload the entry so set a flag so that the 2582 * TSB will be flushed before it can be accessed by the 2583 * process. 2584 */ 2585 if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) { 2586 if (ttep == NULL) 2587 tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED; 2588 return; 2589 } 2590 #if defined(UTSB_PHYS) 2591 phys = 1; 2592 tsb_base = (uint64_t)tsbinfop->tsb_pa; 2593 #else 2594 tsb_base = (uint64_t)tsbinfop->tsb_va; 2595 #endif 2596 tsb_size = tsbinfop->tsb_szc; 2597 } 2598 if (ttesz >= TTE4M) 2599 vpshift = MMU_PAGESHIFT4M; 2600 2601 tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size); 2602 tag = sfmmu_make_tsbtag(vaddr); 2603 2604 if (ttep == NULL) { 2605 sfmmu_unload_tsbe(tsbe_addr, tag, phys); 2606 } else { 2607 if (ttesz >= TTE4M) { 2608 SFMMU_STAT(sf_tsb_load4m); 2609 } else { 2610 SFMMU_STAT(sf_tsb_load8k); 2611 } 2612 2613 sfmmu_load_tsbe(tsbe_addr, tag, ttep, phys); 2614 } 2615 } 2616 2617 /* 2618 * Unmap all entries from [start, end) matching the given page size. 2619 * 2620 * This function is used primarily to unmap replicated 64K or 512K entries 2621 * from the TSB that are inserted using the base page size TSB pointer, but 2622 * it may also be called to unmap a range of addresses from the TSB. 2623 */ 2624 void 2625 sfmmu_unload_tsb_range(sfmmu_t *sfmmup, caddr_t start, caddr_t end, int ttesz) 2626 { 2627 struct tsb_info *tsbinfop; 2628 uint64_t tag; 2629 struct tsbe *tsbe_addr; 2630 caddr_t vaddr; 2631 uint64_t tsb_base; 2632 int vpshift, vpgsz; 2633 uint_t tsb_size; 2634 int phys = 0; 2635 2636 /* 2637 * Assumptions: 2638 * If ttesz == 8K, 64K or 512K, we walk through the range 8K 2639 * at a time shooting down any valid entries we encounter. 2640 * 2641 * If ttesz >= 4M we walk the range 4M at a time shooting 2642 * down any valid mappings we find. 2643 */ 2644 if (sfmmup == ksfmmup) { 2645 phys = ktsb_phys; 2646 if (ttesz >= TTE4M) { 2647 #ifndef sun4v 2648 ASSERT((ttesz != TTE32M) && (ttesz != TTE256M)); 2649 #endif 2650 tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base; 2651 tsb_size = ktsb4m_szcode; 2652 } else { 2653 tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base; 2654 tsb_size = ktsb_szcode; 2655 } 2656 } else { 2657 SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz); 2658 2659 /* 2660 * If there isn't a TSB for this page size, or the TSB is 2661 * swapped out, there is nothing to do. Note that the latter 2662 * case seems impossible but can occur if hat_pageunload() 2663 * is called on an ISM mapping while the process is swapped 2664 * out. 2665 */ 2666 if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED)) 2667 return; 2668 2669 /* 2670 * If another thread is in the middle of relocating a TSB 2671 * we can't unload the entry so set a flag so that the 2672 * TSB will be flushed before it can be accessed by the 2673 * process. 
*/ 2675 if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) { 2676 tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED; 2677 return; 2678 } 2679 #if defined(UTSB_PHYS) 2680 phys = 1; 2681 tsb_base = (uint64_t)tsbinfop->tsb_pa; 2682 #else 2683 tsb_base = (uint64_t)tsbinfop->tsb_va; 2684 #endif 2685 tsb_size = tsbinfop->tsb_szc; 2686 } 2687 if (ttesz >= TTE4M) { 2688 vpshift = MMU_PAGESHIFT4M; 2689 vpgsz = MMU_PAGESIZE4M; 2690 } else { 2691 vpshift = MMU_PAGESHIFT; 2692 vpgsz = MMU_PAGESIZE; 2693 } 2694 2695 for (vaddr = start; vaddr < end; vaddr += vpgsz) { 2696 tag = sfmmu_make_tsbtag(vaddr); 2697 tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size); 2698 sfmmu_unload_tsbe(tsbe_addr, tag, phys); 2699 } 2700 } 2701 2702 /* 2703 * Select the optimum TSB size given the number of mappings 2704 * that need to be cached. 2705 */ 2706 static int 2707 sfmmu_select_tsb_szc(pgcnt_t pgcnt) 2708 { 2709 int szc = 0; 2710 2711 #ifdef DEBUG 2712 if (tsb_grow_stress) { 2713 uint32_t randval = (uint32_t)gettick() >> 4; 2714 return (randval % (tsb_max_growsize + 1)); 2715 } 2716 #endif /* DEBUG */ 2717 2718 while ((szc < tsb_max_growsize) && (pgcnt > SFMMU_RSS_TSBSIZE(szc))) 2719 szc++; 2720 return (szc); 2721 } 2722 2723 /* 2724 * This function will add a translation to the hme_blk and allocate the 2725 * hme_blk if one does not exist. 2726 * If a page structure is specified then it will add the 2727 * corresponding hment to the mapping list. 2728 * It will also update the hmenum field for the tte. 2729 * Furthermore, it attempts to create a large page translation 2730 * for <addr,hat> at page array pps. It assumes addr and the first 2731 * pp are correctly aligned. It returns 0 if successful and 1 otherwise. 2732 */ 2733 static int 2734 sfmmu_tteload_array(sfmmu_t *sfmmup, tte_t *ttep, caddr_t vaddr, 2735 page_t **pps, uint_t flags, uint_t rid) 2736 { 2737 struct hmehash_bucket *hmebp; 2738 struct hme_blk *hmeblkp; 2739 int ret; 2740 uint_t size; 2741 2742 /* 2743 * Get mapping size. 2744 */ 2745 size = TTE_CSZ(ttep); 2746 ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size))); 2747 2748 /* 2749 * Acquire the hash bucket. 2750 */ 2751 hmebp = sfmmu_tteload_acquire_hashbucket(sfmmup, vaddr, size, rid); 2752 ASSERT(hmebp); 2753 2754 /* 2755 * Find the hment block. 2756 */ 2757 hmeblkp = sfmmu_tteload_find_hmeblk(sfmmup, hmebp, vaddr, size, flags, 2758 rid); 2759 ASSERT(hmeblkp); 2760 2761 /* 2762 * Add the translation. 2763 */ 2764 ret = sfmmu_tteload_addentry(sfmmup, hmeblkp, ttep, vaddr, pps, flags, 2765 rid); 2766 2767 /* 2768 * Release the hash bucket. 2769 */ 2770 sfmmu_tteload_release_hashbucket(hmebp); 2771 2772 return (ret); 2773 } 2774 2775 /* 2776 * Function locks and returns a pointer to the hash bucket for vaddr and size. 2777 */ 2778 static struct hmehash_bucket * 2779 sfmmu_tteload_acquire_hashbucket(sfmmu_t *sfmmup, caddr_t vaddr, int size, 2780 uint_t rid) 2781 { 2782 struct hmehash_bucket *hmebp; 2783 int hmeshift; 2784 void *htagid = sfmmutohtagid(sfmmup, rid); 2785 2786 ASSERT(htagid != NULL); 2787 2788 hmeshift = HME_HASH_SHIFT(size); 2789 2790 hmebp = HME_HASH_FUNCTION(htagid, vaddr, hmeshift); 2791 2792 SFMMU_HASH_LOCK(hmebp); 2793 2794 return (hmebp); 2795 } 2796 2797 /* 2798 * Function returns a pointer to an hmeblk in the hash bucket, hmebp. If the 2799 * hmeblk doesn't exist for the [sfmmup, vaddr & size] signature, an hmeblk is 2800 * allocated. 2801 */ 2802 static struct hme_blk * 2803 sfmmu_tteload_find_hmeblk(sfmmu_t *sfmmup, struct hmehash_bucket *hmebp, 2804 caddr_t vaddr, uint_t size, uint_t flags, uint_t rid) 2805 { 2806 hmeblk_tag hblktag; 2807 int hmeshift; 2808 struct hme_blk *hmeblkp, *pr_hblk, *list = NULL; 2809 2810 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size)); 2811 2812 hblktag.htag_id = sfmmutohtagid(sfmmup, rid); 2813 ASSERT(hblktag.htag_id != NULL); 2814 hmeshift = HME_HASH_SHIFT(size); 2815 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift); 2816 hblktag.htag_rehash = HME_HASH_REHASH(size); 2817 hblktag.htag_rid = rid; 2818 2819 ttearray_realloc: 2820 2821 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list); 2822 2823 /* 2824 * We block until hblk_reserve_lock is released; it's held by 2825 * the thread, temporarily using hblk_reserve, until hblk_reserve is 2826 * replaced by an hblk from sfmmu8_cache. 2827 */ 2828 if (hmeblkp == (struct hme_blk *)hblk_reserve && 2829 hblk_reserve_thread != curthread) { 2830 SFMMU_HASH_UNLOCK(hmebp); 2831 mutex_enter(&hblk_reserve_lock); 2832 mutex_exit(&hblk_reserve_lock); 2833 SFMMU_STAT(sf_hblk_reserve_hit); 2834 SFMMU_HASH_LOCK(hmebp); 2835 goto ttearray_realloc; 2836 } 2837 2838 if (hmeblkp == NULL) { 2839 hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size, 2840 hblktag, flags, rid); 2841 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared); 2842 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared); 2843 } else { 2844 /* 2845 * It is possible for 8k and 64k hblks to collide since they 2846 * have the same rehash value. This is because we 2847 * lazily free hblks and 8K/64K blks could be lingering. 2848 * If we find a size mismatch we free the block and try again. 2849 */ 2850 if (get_hblk_ttesz(hmeblkp) != size) { 2851 ASSERT(!hmeblkp->hblk_vcnt); 2852 ASSERT(!hmeblkp->hblk_hmecnt); 2853 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 2854 &list, 0); 2855 goto ttearray_realloc; 2856 } 2857 if (hmeblkp->hblk_shw_bit) { 2858 /* 2859 * if the hblk was previously used as a shadow hblk then 2860 * we will change it to a normal hblk 2861 */ 2862 ASSERT(!hmeblkp->hblk_shared); 2863 if (hmeblkp->hblk_shw_mask) { 2864 sfmmu_shadow_hcleanup(sfmmup, hmeblkp, hmebp); 2865 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 2866 goto ttearray_realloc; 2867 } else { 2868 hmeblkp->hblk_shw_bit = 0; 2869 } 2870 } 2871 SFMMU_STAT(sf_hblk_hit); 2872 } 2873 2874 /* 2875 * hat_memload() should never call kmem_cache_free() for kernel hmeblks; 2876 * see the block comment showing the stacktrace in sfmmu_hblk_alloc(); 2877 * set the flag parameter to 1 so that sfmmu_hblks_list_purge() will 2878 * just add these hmeblks to the per-cpu pending queue. 2879 */ 2880 sfmmu_hblks_list_purge(&list, 1); 2881 2882 ASSERT(get_hblk_ttesz(hmeblkp) == size); 2883 ASSERT(!hmeblkp->hblk_shw_bit); 2884 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared); 2885 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared); 2886 ASSERT(hmeblkp->hblk_tag.htag_rid == rid); 2887 2888 return (hmeblkp); 2889 } 2890 2891 /* 2892 * Function adds a tte entry into the hmeblk. It returns 0 if successful and 1 2893 * otherwise.
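 *
 * Callers treat a nonzero return as "the large mapping could not be
 * made" and fall back to smaller page sizes; hat_do_memload_array()
 * above, for instance, walks the tte sizes downward around:
 *
 *	if (!sfmmu_tteload_array(hat, &tte, addr, pps, flags, rid)) {
 *		break;
 *	}
 *
 * where the break is taken only on success.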
2894 */ 2895 static int 2896 sfmmu_tteload_addentry(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, tte_t *ttep, 2897 caddr_t vaddr, page_t **pps, uint_t flags, uint_t rid) 2898 { 2899 page_t *pp = *pps; 2900 int hmenum, size, remap; 2901 tte_t tteold, flush_tte; 2902 #ifdef DEBUG 2903 tte_t orig_old; 2904 #endif /* DEBUG */ 2905 struct sf_hment *sfhme; 2906 kmutex_t *pml, *pmtx; 2907 hatlock_t *hatlockp; 2908 int myflt; 2909 2910 /* 2911 * remove this panic when we decide to let user virtual address 2912 * space be >= USERLIMIT. 2913 */ 2914 if (!TTE_IS_PRIVILEGED(ttep) && vaddr >= (caddr_t)USERLIMIT) 2915 panic("user addr %p in kernel space", (void *)vaddr); 2916 #if defined(TTE_IS_GLOBAL) 2917 if (TTE_IS_GLOBAL(ttep)) 2918 panic("sfmmu_tteload: creating global tte"); 2919 #endif 2920 2921 #ifdef DEBUG 2922 if (pf_is_memory(sfmmu_ttetopfn(ttep, vaddr)) && 2923 !TTE_IS_PCACHEABLE(ttep) && !sfmmu_allow_nc_trans) 2924 panic("sfmmu_tteload: non cacheable memory tte"); 2925 #endif /* DEBUG */ 2926 2927 /* don't simulate dirty bit for writeable ISM/DISM mappings */ 2928 if ((flags & HAT_LOAD_SHARE) && TTE_IS_WRITABLE(ttep)) { 2929 TTE_SET_REF(ttep); 2930 TTE_SET_MOD(ttep); 2931 } 2932 2933 if ((flags & HAT_LOAD_SHARE) || !TTE_IS_REF(ttep) || 2934 !TTE_IS_MOD(ttep)) { 2935 /* 2936 * Don't load TSB for dummy as in ISM. Also don't preload 2937 * the TSB if the TTE isn't writable since we're likely to 2938 * fault on it again -- preloading can be fairly expensive. 2939 */ 2940 flags |= SFMMU_NO_TSBLOAD; 2941 } 2942 2943 size = TTE_CSZ(ttep); 2944 switch (size) { 2945 case TTE8K: 2946 SFMMU_STAT(sf_tteload8k); 2947 break; 2948 case TTE64K: 2949 SFMMU_STAT(sf_tteload64k); 2950 break; 2951 case TTE512K: 2952 SFMMU_STAT(sf_tteload512k); 2953 break; 2954 case TTE4M: 2955 SFMMU_STAT(sf_tteload4m); 2956 break; 2957 case (TTE32M): 2958 SFMMU_STAT(sf_tteload32m); 2959 ASSERT(mmu_page_sizes == max_mmu_page_sizes); 2960 break; 2961 case (TTE256M): 2962 SFMMU_STAT(sf_tteload256m); 2963 ASSERT(mmu_page_sizes == max_mmu_page_sizes); 2964 break; 2965 } 2966 2967 ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size))); 2968 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size)); 2969 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared); 2970 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared); 2971 2972 HBLKTOHME_IDX(sfhme, hmeblkp, vaddr, hmenum); 2973 2974 /* 2975 * Need to grab mlist lock here so that pageunload 2976 * will not change tte behind us. 2977 */ 2978 if (pp) { 2979 pml = sfmmu_mlist_enter(pp); 2980 } 2981 2982 sfmmu_copytte(&sfhme->hme_tte, &tteold); 2983 /* 2984 * Look for corresponding hment and if valid verify 2985 * pfns are equal. 
*/ 2987 remap = TTE_IS_VALID(&tteold); 2988 if (remap) { 2989 pfn_t new_pfn, old_pfn; 2990 2991 old_pfn = TTE_TO_PFN(vaddr, &tteold); 2992 new_pfn = TTE_TO_PFN(vaddr, ttep); 2993 2994 if (flags & HAT_LOAD_REMAP) { 2995 /* make sure we are remapping same type of pages */ 2996 if (pf_is_memory(old_pfn) != pf_is_memory(new_pfn)) { 2997 panic("sfmmu_tteload - tte remap io<->memory"); 2998 } 2999 if (old_pfn != new_pfn && 3000 (pp != NULL || sfhme->hme_page != NULL)) { 3001 panic("sfmmu_tteload - tte remap pp != NULL"); 3002 } 3003 } else if (old_pfn != new_pfn) { 3004 panic("sfmmu_tteload - tte remap, hmeblkp 0x%p", 3005 (void *)hmeblkp); 3006 } 3007 ASSERT(TTE_CSZ(&tteold) == TTE_CSZ(ttep)); 3008 } 3009 3010 if (pp) { 3011 if (size == TTE8K) { 3012 #ifdef VAC 3013 /* 3014 * Handle VAC consistency 3015 */ 3016 if (!remap && (cache & CACHE_VAC) && !PP_ISNC(pp)) { 3017 sfmmu_vac_conflict(sfmmup, vaddr, pp); 3018 } 3019 #endif 3020 3021 if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) { 3022 pmtx = sfmmu_page_enter(pp); 3023 PP_CLRRO(pp); 3024 sfmmu_page_exit(pmtx); 3025 } else if (!PP_ISMAPPED(pp) && 3026 (!TTE_IS_WRITABLE(ttep)) && !(PP_ISMOD(pp))) { 3027 pmtx = sfmmu_page_enter(pp); 3028 if (!(PP_ISMOD(pp))) { 3029 PP_SETRO(pp); 3030 } 3031 sfmmu_page_exit(pmtx); 3032 } 3033 3034 } else if (sfmmu_pagearray_setup(vaddr, pps, ttep, remap)) { 3035 /* 3036 * sfmmu_pagearray_setup failed so return 3037 */ 3038 sfmmu_mlist_exit(pml); 3039 return (1); 3040 } 3041 } 3042 3043 /* 3044 * Make sure hment is not on a mapping list. 3045 */ 3046 ASSERT(remap || (sfhme->hme_page == NULL)); 3047 3048 /* if it is not a remap then hme->next better be NULL */ 3049 ASSERT((!remap) ? sfhme->hme_next == NULL : 1); 3050 3051 if (flags & HAT_LOAD_LOCK) { 3052 if ((hmeblkp->hblk_lckcnt + 1) >= MAX_HBLK_LCKCNT) { 3053 panic("too high lckcnt-hmeblk %p", 3054 (void *)hmeblkp); 3055 } 3056 atomic_inc_32(&hmeblkp->hblk_lckcnt); 3057 3058 HBLK_STACK_TRACE(hmeblkp, HBLK_LOCK); 3059 } 3060 3061 #ifdef VAC 3062 if (pp && PP_ISNC(pp)) { 3063 /* 3064 * If the physical page is marked to be uncacheable, like 3065 * by a vac conflict, make sure the new mapping is also 3066 * uncacheable. 3067 */ 3068 TTE_CLR_VCACHEABLE(ttep); 3069 ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR); 3070 } 3071 #endif 3072 ttep->tte_hmenum = hmenum; 3073 3074 #ifdef DEBUG 3075 orig_old = tteold; 3076 #endif /* DEBUG */ 3077 3078 while (sfmmu_modifytte_try(&tteold, ttep, &sfhme->hme_tte) < 0) { 3079 if ((sfmmup == KHATID) && 3080 (flags & (HAT_LOAD_LOCK | HAT_LOAD_REMAP))) { 3081 sfmmu_copytte(&sfhme->hme_tte, &tteold); 3082 } 3083 #ifdef DEBUG 3084 chk_tte(&orig_old, &tteold, ttep, hmeblkp); 3085 #endif /* DEBUG */ 3086 } 3087 ASSERT(TTE_IS_VALID(&sfhme->hme_tte)); 3088 3089 if (!TTE_IS_VALID(&tteold)) { 3090 3091 atomic_inc_16(&hmeblkp->hblk_vcnt); 3092 if (rid == SFMMU_INVALID_SHMERID) { 3093 atomic_inc_ulong(&sfmmup->sfmmu_ttecnt[size]); 3094 } else { 3095 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 3096 sf_region_t *rgnp = srdp->srd_hmergnp[rid]; 3097 /* 3098 * We already accounted for region ttecnt's in sfmmu 3099 * during hat_join_region() processing. Here we 3100 * only update ttecnt's in the region structure.
3101 */ 3102 atomic_inc_ulong(&rgnp->rgn_ttecnt[size]); 3103 } 3104 } 3105 3106 myflt = (astosfmmu(curthread->t_procp->p_as) == sfmmup); 3107 if (size > TTE8K && (flags & HAT_LOAD_SHARE) == 0 && 3108 sfmmup != ksfmmup) { 3109 uchar_t tteflag = 1 << size; 3110 if (rid == SFMMU_INVALID_SHMERID) { 3111 if (!(sfmmup->sfmmu_tteflags & tteflag)) { 3112 hatlockp = sfmmu_hat_enter(sfmmup); 3113 sfmmup->sfmmu_tteflags |= tteflag; 3114 sfmmu_hat_exit(hatlockp); 3115 } 3116 } else if (!(sfmmup->sfmmu_rtteflags & tteflag)) { 3117 hatlockp = sfmmu_hat_enter(sfmmup); 3118 sfmmup->sfmmu_rtteflags |= tteflag; 3119 sfmmu_hat_exit(hatlockp); 3120 } 3121 /* 3122 * Update the current CPU tsbmiss area, so the current thread 3123 * won't need to take the tsbmiss for the new pagesize. 3124 * The other threads in the process will update their tsb 3125 * miss area lazily in sfmmu_tsbmiss_exception() when they 3126 * fail to find the translation for a newly added pagesize. 3127 */ 3128 if (size > TTE64K && myflt) { 3129 struct tsbmiss *tsbmp; 3130 kpreempt_disable(); 3131 tsbmp = &tsbmiss_area[CPU->cpu_id]; 3132 if (rid == SFMMU_INVALID_SHMERID) { 3133 if (!(tsbmp->uhat_tteflags & tteflag)) { 3134 tsbmp->uhat_tteflags |= tteflag; 3135 } 3136 } else { 3137 if (!(tsbmp->uhat_rtteflags & tteflag)) { 3138 tsbmp->uhat_rtteflags |= tteflag; 3139 } 3140 } 3141 kpreempt_enable(); 3142 } 3143 } 3144 3145 if (size >= TTE4M && (flags & HAT_LOAD_TEXT) && 3146 !SFMMU_FLAGS_ISSET(sfmmup, HAT_4MTEXT_FLAG)) { 3147 hatlockp = sfmmu_hat_enter(sfmmup); 3148 SFMMU_FLAGS_SET(sfmmup, HAT_4MTEXT_FLAG); 3149 sfmmu_hat_exit(hatlockp); 3150 } 3151 3152 flush_tte.tte_intlo = (tteold.tte_intlo ^ ttep->tte_intlo) & 3153 hw_tte.tte_intlo; 3154 flush_tte.tte_inthi = (tteold.tte_inthi ^ ttep->tte_inthi) & 3155 hw_tte.tte_inthi; 3156 3157 if (remap && (flush_tte.tte_inthi || flush_tte.tte_intlo)) { 3158 /* 3159 * If remap and new tte differs from old tte we need 3160 * to sync the mod bit and flush TLB/TSB. We don't 3161 * need to sync ref bit because we currently always set 3162 * ref bit in tteload. 3163 */ 3164 ASSERT(TTE_IS_REF(ttep)); 3165 if (TTE_IS_MOD(&tteold)) { 3166 sfmmu_ttesync(sfmmup, vaddr, &tteold, pp); 3167 } 3168 /* 3169 * hwtte bits shouldn't change for SRD hmeblks as long as SRD 3170 * hmes are only used for read only text. Adding this code for 3171 * completeness and future use of shared hmeblks with writable 3172 * mappings of VMODSORT vnodes. 3173 */ 3174 if (hmeblkp->hblk_shared) { 3175 cpuset_t cpuset = sfmmu_rgntlb_demap(vaddr, 3176 sfmmup->sfmmu_srdp->srd_hmergnp[rid], hmeblkp, 1); 3177 xt_sync(cpuset); 3178 SFMMU_STAT_ADD(sf_region_remap_demap, 1); 3179 } else { 3180 sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 0); 3181 xt_sync(sfmmup->sfmmu_cpusran); 3182 } 3183 } 3184 3185 if ((flags & SFMMU_NO_TSBLOAD) == 0) { 3186 /* 3187 * We only preload 8K and 4M mappings into the TSB, since 3188 * 64K and 512K mappings are replicated and hence don't 3189 * have a single, unique TSB entry. Ditto for 32M/256M. 3190 */ 3191 if (size == TTE8K || size == TTE4M) { 3192 sf_scd_t *scdp; 3193 hatlockp = sfmmu_hat_enter(sfmmup); 3194 /* 3195 * Don't preload private TSB if the mapping is used 3196 * by the shctx in the SCD. 
*/ 3198 scdp = sfmmup->sfmmu_scdp; 3199 if (rid == SFMMU_INVALID_SHMERID || scdp == NULL || 3200 !SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) { 3201 sfmmu_load_tsb(sfmmup, vaddr, &sfhme->hme_tte, 3202 size); 3203 } 3204 sfmmu_hat_exit(hatlockp); 3205 } 3206 } 3207 if (pp) { 3208 if (!remap) { 3209 HME_ADD(sfhme, pp); 3210 atomic_inc_16(&hmeblkp->hblk_hmecnt); 3211 ASSERT(hmeblkp->hblk_hmecnt > 0); 3212 3213 /* 3214 * Cannot ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS) 3215 * see pageunload() for comment. 3216 */ 3217 } 3218 sfmmu_mlist_exit(pml); 3219 } 3220 3221 return (0); 3222 } 3223 /* 3224 * Function unlocks hash bucket. 3225 */ 3226 static void 3227 sfmmu_tteload_release_hashbucket(struct hmehash_bucket *hmebp) 3228 { 3229 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 3230 SFMMU_HASH_UNLOCK(hmebp); 3231 } 3232 3233 /* 3234 * Function that checks and sets up the page array for a large 3235 * translation. Will set the p_vcolor, p_index, and p_ro fields. 3236 * Assumes addr and pfnum of the first page are properly aligned. 3237 * Will check for physical contiguity. If the check fails it returns 3238 * nonzero. 3239 */ 3240 static int 3241 sfmmu_pagearray_setup(caddr_t addr, page_t **pps, tte_t *ttep, int remap) 3242 { 3243 int i, index, ttesz; 3244 pfn_t pfnum; 3245 pgcnt_t npgs; 3246 page_t *pp, *pp1; 3247 kmutex_t *pmtx; 3248 #ifdef VAC 3249 int osz; 3250 int cflags = 0; 3251 int vac_err = 0; 3252 #endif 3253 int newidx = 0; 3254 3255 ttesz = TTE_CSZ(ttep); 3256 3257 ASSERT(ttesz > TTE8K); 3258 3259 npgs = TTEPAGES(ttesz); 3260 index = PAGESZ_TO_INDEX(ttesz); 3261 3262 pfnum = (*pps)->p_pagenum; 3263 ASSERT(IS_P2ALIGNED(pfnum, npgs)); 3264 3265 /* 3266 * Save the first pp so we can do HAT_TMPNC at the end. 3267 */ 3268 pp1 = *pps; 3269 #ifdef VAC 3270 osz = fnd_mapping_sz(pp1); 3271 #endif 3272 3273 for (i = 0; i < npgs; i++, pps++) { 3274 pp = *pps; 3275 ASSERT(PAGE_LOCKED(pp)); 3276 ASSERT(pp->p_szc >= ttesz); 3277 ASSERT(pp->p_szc == pp1->p_szc); 3278 ASSERT(sfmmu_mlist_held(pp)); 3279 3280 /* 3281 * XXX is it possible to maintain P_RO on the root only? 3282 */ 3283 if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) { 3284 pmtx = sfmmu_page_enter(pp); 3285 PP_CLRRO(pp); 3286 sfmmu_page_exit(pmtx); 3287 } else if (!PP_ISMAPPED(pp) && !TTE_IS_WRITABLE(ttep) && 3288 !PP_ISMOD(pp)) { 3289 pmtx = sfmmu_page_enter(pp); 3290 if (!(PP_ISMOD(pp))) { 3291 PP_SETRO(pp); 3292 } 3293 sfmmu_page_exit(pmtx); 3294 } 3295 3296 /* 3297 * If this is a remap we skip vac & contiguity checks. 3298 */ 3299 if (remap) 3300 continue; 3301 3302 /* 3303 * set p_vcolor and detect any vac conflicts. 3304 */ 3305 #ifdef VAC 3306 if (vac_err == 0) { 3307 vac_err = sfmmu_vacconflict_array(addr, pp, &cflags); 3308 3309 } 3310 #endif 3311 3312 /* 3313 * Save the current index in case we need to undo it. 3314 * Note: "PAGESZ_TO_INDEX(sz) (1 << (sz))" 3315 * "SFMMU_INDEX_SHIFT 6" 3316 * "SFMMU_INDEX_MASK ((1 << SFMMU_INDEX_SHIFT) - 1)" 3317 * "PP_MAPINDEX(p_index) (p_index & SFMMU_INDEX_MASK)" 3318 * 3319 * So: index = PAGESZ_TO_INDEX(ttesz); 3320 * if ttesz == 1 then index = 0x2 3321 * 2 then index = 0x4 3322 * 3 then index = 0x8 3323 * 4 then index = 0x10 3324 * 5 then index = 0x20 3325 * The code below checks if it's a new pagesize (i.e., newidx) 3326 * in case we need to take it back out of p_index, 3327 * and then ORs the new index into the existing index.
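 *
 * Worked example: a page that belongs to both a 64K (ttesz 1) and a
 * 512K (ttesz 3) mapping ends up with
 * PP_MAPINDEX(pp) == (0x2 | 0x8) == 0xa.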
*/ 3329 if ((PP_MAPINDEX(pp) & index) == 0) 3330 newidx = 1; 3331 pp->p_index = (PP_MAPINDEX(pp) | index); 3332 3333 /* 3334 * contiguity check 3335 */ 3336 if (pp->p_pagenum != pfnum) { 3337 /* 3338 * If we fail the contiguity test then 3339 * the only thing we need to fix is the p_index field. 3340 * We might get a few extra flushes but since this 3341 * path is rare that is ok. The p_ro field will 3342 * get automatically fixed on the next tteload to 3343 * the page. No TNC bit is set yet. 3344 */ 3345 while (i >= 0) { 3346 pp = *pps; 3347 if (newidx) 3348 pp->p_index = (PP_MAPINDEX(pp) & 3349 ~index); 3350 pps--; 3351 i--; 3352 } 3353 return (1); 3354 } 3355 pfnum++; 3356 addr += MMU_PAGESIZE; 3357 } 3358 3359 #ifdef VAC 3360 if (vac_err) { 3361 if (ttesz > osz) { 3362 /* 3363 * There are some smaller mappings that cause vac 3364 * conflicts. Convert all existing small mappings to 3365 * TNC. 3366 */ 3367 SFMMU_STAT_ADD(sf_uncache_conflict, npgs); 3368 sfmmu_page_cache_array(pp1, HAT_TMPNC, CACHE_FLUSH, 3369 npgs); 3370 } else { 3371 /* EMPTY */ 3372 /* 3373 * If there exists a big page mapping, 3374 * then the whole existing big page 3375 * already has the TNC setting. No need to convert to 3376 * TNC again. 3377 */ 3378 ASSERT(PP_ISTNC(pp1)); 3379 } 3380 } 3381 #endif /* VAC */ 3382 3383 return (0); 3384 } 3385 3386 #ifdef VAC 3387 /* 3388 * Routine that checks vac consistency for a large page. It also 3389 * sets the virtual color for all pp's of this big mapping. 3390 */ 3391 static int 3392 sfmmu_vacconflict_array(caddr_t addr, page_t *pp, int *cflags) 3393 { 3394 int vcolor, ocolor; 3395 3396 ASSERT(sfmmu_mlist_held(pp)); 3397 3398 if (PP_ISNC(pp)) { 3399 return (HAT_TMPNC); 3400 } 3401 3402 vcolor = addr_to_vcolor(addr); 3403 if (PP_NEWPAGE(pp)) { 3404 PP_SET_VCOLOR(pp, vcolor); 3405 return (0); 3406 } 3407 3408 ocolor = PP_GET_VCOLOR(pp); 3409 if (ocolor == vcolor) { 3410 return (0); 3411 } 3412 3413 if (!PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp)) { 3414 /* 3415 * The previous user of the page had a different color, 3416 * but since there are no current users 3417 * we just flush the cache and change the color. 3418 * As an optimization for large pages we flush the 3419 * entire cache of that color and set a flag. 3420 */ 3421 SFMMU_STAT(sf_pgcolor_conflict); 3422 if (!CacheColor_IsFlushed(*cflags, ocolor)) { 3423 CacheColor_SetFlushed(*cflags, ocolor); 3424 sfmmu_cache_flushcolor(ocolor, pp->p_pagenum); 3425 } 3426 PP_SET_VCOLOR(pp, vcolor); 3427 return (0); 3428 } 3429 3430 /* 3431 * We got a real conflict with a current mapping. 3432 * Set flags to start uncaching all mappings 3433 * and return failure so we restart looping 3434 * over the pp array from the beginning. 3435 */ 3436 return (HAT_TMPNC); 3437 } 3438 #endif /* VAC */ 3439 3440 /* 3441 * Creates a large page shadow hmeblk for a tte. 3442 * The purpose of this routine is to allow us to do quick unloads because 3443 * the vm layer can easily pass a very large but sparsely populated range.
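 *
 * The shadow hblk lives one tte size above the size being mapped,
 * except that 8K ttes are also shadowed at the 512K level (see the
 * size selection at the top of the routine):
 *
 *	TTE8K, TTE64K	-> shadow at TTE512K
 *	TTE512K		-> shadow at TTE4M
 *	TTE4M		-> shadow at TTE32M
 *	TTE32M		-> shadow at TTE256M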
*/ 3445 static struct hme_blk * 3446 sfmmu_shadow_hcreate(sfmmu_t *sfmmup, caddr_t vaddr, int ttesz, uint_t flags) 3447 { 3448 struct hmehash_bucket *hmebp; 3449 hmeblk_tag hblktag; 3450 int hmeshift, size, vshift; 3451 uint_t shw_mask, newshw_mask; 3452 struct hme_blk *hmeblkp; 3453 3454 ASSERT(sfmmup != KHATID); 3455 if (mmu_page_sizes == max_mmu_page_sizes) { 3456 ASSERT(ttesz < TTE256M); 3457 } else { 3458 ASSERT(ttesz < TTE4M); 3459 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0); 3460 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0); 3461 } 3462 3463 if (ttesz == TTE8K) { 3464 size = TTE512K; 3465 } else { 3466 size = ++ttesz; 3467 } 3468 3469 hblktag.htag_id = sfmmup; 3470 hmeshift = HME_HASH_SHIFT(size); 3471 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift); 3472 hblktag.htag_rehash = HME_HASH_REHASH(size); 3473 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 3474 hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift); 3475 3476 SFMMU_HASH_LOCK(hmebp); 3477 3478 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 3479 ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve); 3480 if (hmeblkp == NULL) { 3481 hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size, 3482 hblktag, flags, SFMMU_INVALID_SHMERID); 3483 } 3484 ASSERT(hmeblkp); 3485 if (!hmeblkp->hblk_shw_mask) { 3486 /* 3487 * if this is an unused hblk it was just allocated or could 3488 * potentially be a previous large page hblk, so we need to 3489 * set the shadow bit. 3490 */ 3491 ASSERT(!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt); 3492 hmeblkp->hblk_shw_bit = 1; 3493 } else if (hmeblkp->hblk_shw_bit == 0) { 3494 panic("sfmmu_shadow_hcreate: shw bit not set in hmeblkp 0x%p", 3495 (void *)hmeblkp); 3496 } 3497 ASSERT(hmeblkp->hblk_shw_bit == 1); 3498 ASSERT(!hmeblkp->hblk_shared); 3499 vshift = vaddr_to_vshift(hblktag, vaddr, size); 3500 ASSERT(vshift < 8); 3501 /* 3502 * Atomically set the shw mask bit 3503 */ 3504 do { 3505 shw_mask = hmeblkp->hblk_shw_mask; 3506 newshw_mask = shw_mask | (1 << vshift); 3507 newshw_mask = atomic_cas_32(&hmeblkp->hblk_shw_mask, shw_mask, 3508 newshw_mask); 3509 } while (newshw_mask != shw_mask); 3510 3511 SFMMU_HASH_UNLOCK(hmebp); 3512 3513 return (hmeblkp); 3514 } 3515 3516 /* 3517 * This routine cleans up a previous shadow hmeblk and changes it to 3518 * a regular hblk. This happens rarely, but it is possible 3519 * when a process wants to use large pages and there are hblks still 3520 * lying around from a previous as that used these hmeblks. 3521 * The alternative was to clean up the shadow hblks at unload time, 3522 * but since so few user processes actually use large pages, it is 3523 * better to be lazy and clean up at this time.
*/ 3525 static void 3526 sfmmu_shadow_hcleanup(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, 3527 struct hmehash_bucket *hmebp) 3528 { 3529 caddr_t addr, endaddr; 3530 int hashno, size; 3531 3532 ASSERT(hmeblkp->hblk_shw_bit); 3533 ASSERT(!hmeblkp->hblk_shared); 3534 3535 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 3536 3537 if (!hmeblkp->hblk_shw_mask) { 3538 hmeblkp->hblk_shw_bit = 0; 3539 return; 3540 } 3541 addr = (caddr_t)get_hblk_base(hmeblkp); 3542 endaddr = get_hblk_endaddr(hmeblkp); 3543 size = get_hblk_ttesz(hmeblkp); 3544 hashno = size - 1; 3545 ASSERT(hashno > 0); 3546 SFMMU_HASH_UNLOCK(hmebp); 3547 3548 sfmmu_free_hblks(sfmmup, addr, endaddr, hashno); 3549 3550 SFMMU_HASH_LOCK(hmebp); 3551 } 3552 3553 static void 3554 sfmmu_free_hblks(sfmmu_t *sfmmup, caddr_t addr, caddr_t endaddr, 3555 int hashno) 3556 { 3557 int hmeshift, shadow = 0; 3558 hmeblk_tag hblktag; 3559 struct hmehash_bucket *hmebp; 3560 struct hme_blk *hmeblkp; 3561 struct hme_blk *nx_hblk, *pr_hblk, *list = NULL; 3562 3563 ASSERT(hashno > 0); 3564 hblktag.htag_id = sfmmup; 3565 hblktag.htag_rehash = hashno; 3566 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 3567 3568 hmeshift = HME_HASH_SHIFT(hashno); 3569 3570 while (addr < endaddr) { 3571 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 3572 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 3573 SFMMU_HASH_LOCK(hmebp); 3574 /* inline HME_HASH_SEARCH */ 3575 hmeblkp = hmebp->hmeblkp; 3576 pr_hblk = NULL; 3577 while (hmeblkp) { 3578 if (HTAGS_EQ(hmeblkp->hblk_tag, hblktag)) { 3579 /* found hme_blk */ 3580 ASSERT(!hmeblkp->hblk_shared); 3581 if (hmeblkp->hblk_shw_bit) { 3582 if (hmeblkp->hblk_shw_mask) { 3583 shadow = 1; 3584 sfmmu_shadow_hcleanup(sfmmup, 3585 hmeblkp, hmebp); 3586 break; 3587 } else { 3588 hmeblkp->hblk_shw_bit = 0; 3589 } 3590 } 3591 3592 /* 3593 * Hblk_hmecnt and hblk_vcnt could be nonzero 3594 * since hblk_unload() does not guarantee that. 3595 * 3596 * XXX - this could cause tteload() to spin 3597 * where sfmmu_shadow_hcleanup() is called. 3598 */ 3599 } 3600 3601 nx_hblk = hmeblkp->hblk_next; 3602 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 3603 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 3604 &list, 0); 3605 } else { 3606 pr_hblk = hmeblkp; 3607 } 3608 hmeblkp = nx_hblk; 3609 } 3610 3611 SFMMU_HASH_UNLOCK(hmebp); 3612 3613 if (shadow) { 3614 /* 3615 * We found another shadow hblk, so we cleaned up its 3616 * children. We need to go back and clean up 3617 * the original hblk, so we don't change the 3618 * addr. 3619 */ 3620 shadow = 0; 3621 } else { 3622 addr = (caddr_t)roundup((uintptr_t)addr + 1, 3623 (1 << hmeshift)); 3624 } 3625 } 3626 sfmmu_hblks_list_purge(&list, 0); 3627 } 3628 3629 /* 3630 * This routine's job is to delete stale invalid shared hmeregion hmeblks that 3631 * may still linger on after pageunload.
3632 */ 3633 static void 3634 sfmmu_cleanup_rhblk(sf_srd_t *srdp, caddr_t addr, uint_t rid, int ttesz) 3635 { 3636 int hmeshift; 3637 hmeblk_tag hblktag; 3638 struct hmehash_bucket *hmebp; 3639 struct hme_blk *hmeblkp; 3640 struct hme_blk *pr_hblk; 3641 struct hme_blk *list = NULL; 3642 3643 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 3644 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 3645 3646 hmeshift = HME_HASH_SHIFT(ttesz); 3647 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 3648 hblktag.htag_rehash = ttesz; 3649 hblktag.htag_rid = rid; 3650 hblktag.htag_id = srdp; 3651 hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift); 3652 3653 SFMMU_HASH_LOCK(hmebp); 3654 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list); 3655 if (hmeblkp != NULL) { 3656 ASSERT(hmeblkp->hblk_shared); 3657 ASSERT(!hmeblkp->hblk_shw_bit); 3658 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) { 3659 panic("sfmmu_cleanup_rhblk: valid hmeblk"); 3660 } 3661 ASSERT(!hmeblkp->hblk_lckcnt); 3662 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 3663 &list, 0); 3664 } 3665 SFMMU_HASH_UNLOCK(hmebp); 3666 sfmmu_hblks_list_purge(&list, 0); 3667 } 3668 3669 /* ARGSUSED */ 3670 static void 3671 sfmmu_rgn_cb_noop(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr, 3672 size_t r_size, void *r_obj, u_offset_t r_objoff) 3673 { 3674 } 3675 3676 /* 3677 * Searches for an hmeblk which maps addr, then unloads this mapping 3678 * and updates *eaddrp, if the hmeblk is found. 3679 */ 3680 static void 3681 sfmmu_unload_hmeregion_va(sf_srd_t *srdp, uint_t rid, caddr_t addr, 3682 caddr_t eaddr, int ttesz, caddr_t *eaddrp) 3683 { 3684 int hmeshift; 3685 hmeblk_tag hblktag; 3686 struct hmehash_bucket *hmebp; 3687 struct hme_blk *hmeblkp; 3688 struct hme_blk *pr_hblk; 3689 struct hme_blk *list = NULL; 3690 3691 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 3692 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 3693 ASSERT(ttesz >= HBLK_MIN_TTESZ); 3694 3695 hmeshift = HME_HASH_SHIFT(ttesz); 3696 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 3697 hblktag.htag_rehash = ttesz; 3698 hblktag.htag_rid = rid; 3699 hblktag.htag_id = srdp; 3700 hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift); 3701 3702 SFMMU_HASH_LOCK(hmebp); 3703 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list); 3704 if (hmeblkp != NULL) { 3705 ASSERT(hmeblkp->hblk_shared); 3706 ASSERT(!hmeblkp->hblk_lckcnt); 3707 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) { 3708 *eaddrp = sfmmu_hblk_unload(NULL, hmeblkp, addr, 3709 eaddr, NULL, HAT_UNLOAD); 3710 ASSERT(*eaddrp > addr); 3711 } 3712 ASSERT(!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt); 3713 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 3714 &list, 0); 3715 } 3716 SFMMU_HASH_UNLOCK(hmebp); 3717 sfmmu_hblks_list_purge(&list, 0); 3718 } 3719 3720 static void 3721 sfmmu_unload_hmeregion(sf_srd_t *srdp, sf_region_t *rgnp) 3722 { 3723 int ttesz = rgnp->rgn_pgszc; 3724 size_t rsz = rgnp->rgn_size; 3725 caddr_t rsaddr = rgnp->rgn_saddr; 3726 caddr_t readdr = rsaddr + rsz; 3727 caddr_t rhsaddr; 3728 caddr_t va; 3729 uint_t rid = rgnp->rgn_id; 3730 caddr_t cbsaddr; 3731 caddr_t cbeaddr; 3732 hat_rgn_cb_func_t rcbfunc; 3733 ulong_t cnt; 3734 3735 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 3736 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 3737 3738 ASSERT(IS_P2ALIGNED(rsaddr, TTEBYTES(ttesz))); 3739 ASSERT(IS_P2ALIGNED(rsz, TTEBYTES(ttesz))); 3740 if (ttesz < HBLK_MIN_TTESZ) { 3741 ttesz = HBLK_MIN_TTESZ; 3742 rhsaddr = (caddr_t)P2ALIGN((uintptr_t)rsaddr, HBLK_MIN_BYTES); 3743 } else { 3744 rhsaddr = rsaddr; 3745 } 3746 3747 if ((rcbfunc = rgnp->rgn_cb_function) == NULL) 
		rcbfunc = sfmmu_rgn_cb_noop;
	}

	while (ttesz >= HBLK_MIN_TTESZ) {
		cbsaddr = rsaddr;
		cbeaddr = rsaddr;
		if (!(rgnp->rgn_hmeflags & (1 << ttesz))) {
			ttesz--;
			continue;
		}
		cnt = 0;
		va = rsaddr;
		while (va < readdr) {
			ASSERT(va >= rhsaddr);
			if (va != cbeaddr) {
				if (cbeaddr != cbsaddr) {
					ASSERT(cbeaddr > cbsaddr);
					(*rcbfunc)(cbsaddr, cbeaddr,
					    rsaddr, rsz, rgnp->rgn_obj,
					    rgnp->rgn_objoff);
				}
				cbsaddr = va;
				cbeaddr = va;
			}
			sfmmu_unload_hmeregion_va(srdp, rid, va, readdr,
			    ttesz, &cbeaddr);
			cnt++;
			va = rhsaddr + (cnt << TTE_PAGE_SHIFT(ttesz));
		}
		if (cbeaddr != cbsaddr) {
			ASSERT(cbeaddr > cbsaddr);
			(*rcbfunc)(cbsaddr, cbeaddr, rsaddr,
			    rsz, rgnp->rgn_obj, rgnp->rgn_objoff);
		}
		ttesz--;
	}
}

/*
 * Release one hardware address translation lock on the given address range.
 */
void
hat_unlock(struct hat *sfmmup, caddr_t addr, size_t len)
{
	struct hmehash_bucket *hmebp;
	hmeblk_tag hblktag;
	int hmeshift, hashno = 1;
	struct hme_blk *hmeblkp, *list = NULL;
	caddr_t endaddr;

	ASSERT(sfmmup != NULL);

	ASSERT((sfmmup == ksfmmup) ||
	    AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock));
	ASSERT((len & MMU_PAGEOFFSET) == 0);
	endaddr = addr + len;
	hblktag.htag_id = sfmmup;
	hblktag.htag_rid = SFMMU_INVALID_SHMERID;

	/*
	 * Spitfire supports 4 page sizes.
	 * Most pages are expected to be of the smallest page size (8K) and
	 * these will not need to be rehashed. 64K pages also don't need to be
	 * rehashed because an hmeblk spans 64K of address space. 512K pages
	 * might need 1 rehash and 4M pages might need 2 rehashes.
	 */
	while (addr < endaddr) {
		hmeshift = HME_HASH_SHIFT(hashno);
		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
		hblktag.htag_rehash = hashno;
		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);

		SFMMU_HASH_LOCK(hmebp);

		HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
		if (hmeblkp != NULL) {
			ASSERT(!hmeblkp->hblk_shared);
			/*
			 * If we encounter a shadow hmeblk then
			 * we know there are no valid hmeblks mapping
			 * this address at this size or larger.
			 * Just increment the address by the smallest
			 * page size.
			 */
			if (hmeblkp->hblk_shw_bit) {
				addr += MMU_PAGESIZE;
			} else {
				addr = sfmmu_hblk_unlock(hmeblkp, addr,
				    endaddr);
			}
			SFMMU_HASH_UNLOCK(hmebp);
			hashno = 1;
			continue;
		}
		SFMMU_HASH_UNLOCK(hmebp);

		if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
			/*
			 * We have traversed the whole list and rehashed
			 * if necessary without finding the address to unlock,
			 * which should never happen.
			 */
			panic("sfmmu_unlock: addr not found. "
			    "addr %p hat %p", (void *)addr, (void *)sfmmup);
		} else {
			hashno++;
		}
	}

	sfmmu_hblks_list_purge(&list, 0);
}

void
hat_unlock_region(struct hat *sfmmup, caddr_t addr, size_t len,
    hat_region_cookie_t rcookie)
{
	sf_srd_t *srdp;
	sf_region_t *rgnp;
	int ttesz;
	uint_t rid;
	caddr_t eaddr;
	caddr_t va;
	int hmeshift;
	hmeblk_tag hblktag;
	struct hmehash_bucket *hmebp;
	struct hme_blk *hmeblkp;
	struct hme_blk *pr_hblk;
	struct hme_blk *list;

	if (rcookie == HAT_INVALID_REGION_COOKIE) {
		hat_unlock(sfmmup, addr, len);
		return;
	}

	ASSERT(sfmmup != NULL);
	ASSERT(sfmmup != ksfmmup);

	srdp = sfmmup->sfmmu_srdp;
	rid = (uint_t)((uint64_t)rcookie);
	VERIFY3U(rid, <, SFMMU_MAX_HME_REGIONS);
	eaddr = addr + len;
	va = addr;
	list = NULL;
	rgnp = srdp->srd_hmergnp[rid];
	SFMMU_VALIDATE_HMERID(sfmmup, rid, addr, len);

	ASSERT(IS_P2ALIGNED(addr, TTEBYTES(rgnp->rgn_pgszc)));
	ASSERT(IS_P2ALIGNED(len, TTEBYTES(rgnp->rgn_pgszc)));
	if (rgnp->rgn_pgszc < HBLK_MIN_TTESZ) {
		ttesz = HBLK_MIN_TTESZ;
	} else {
		ttesz = rgnp->rgn_pgszc;
	}
	while (va < eaddr) {
		while (ttesz < rgnp->rgn_pgszc &&
		    IS_P2ALIGNED(va, TTEBYTES(ttesz + 1))) {
			ttesz++;
		}
		while (ttesz >= HBLK_MIN_TTESZ) {
			if (!(rgnp->rgn_hmeflags & (1 << ttesz))) {
				ttesz--;
				continue;
			}
			hmeshift = HME_HASH_SHIFT(ttesz);
			hblktag.htag_bspage = HME_HASH_BSPAGE(va, hmeshift);
			hblktag.htag_rehash = ttesz;
			hblktag.htag_rid = rid;
			hblktag.htag_id = srdp;
			hmebp = HME_HASH_FUNCTION(srdp, va, hmeshift);
			SFMMU_HASH_LOCK(hmebp);
			HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk,
			    &list);
			if (hmeblkp == NULL) {
				SFMMU_HASH_UNLOCK(hmebp);
				ttesz--;
				continue;
			}
			ASSERT(hmeblkp->hblk_shared);
			va = sfmmu_hblk_unlock(hmeblkp, va, eaddr);
			ASSERT(va >= eaddr ||
			    IS_P2ALIGNED((uintptr_t)va, TTEBYTES(ttesz)));
			SFMMU_HASH_UNLOCK(hmebp);
			break;
		}
		if (ttesz < HBLK_MIN_TTESZ) {
			panic("hat_unlock_region: addr not found "
			    "addr %p hat %p", (void *)va, (void *)sfmmup);
		}
	}
	sfmmu_hblks_list_purge(&list, 0);
}

/*
 * Function to unlock a range of addresses in an hmeblk. It returns the
 * next address that needs to be unlocked.
 * Should be called with the hash lock held.
 */
static caddr_t
sfmmu_hblk_unlock(struct hme_blk *hmeblkp, caddr_t addr, caddr_t endaddr)
{
	struct sf_hment *sfhme;
	tte_t tteold, ttemod;
	int ttesz, ret;

	ASSERT(in_hblk_range(hmeblkp, addr));
	ASSERT(hmeblkp->hblk_shw_bit == 0);

	endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
	ttesz = get_hblk_ttesz(hmeblkp);

	HBLKTOHME(sfhme, hmeblkp, addr);
	while (addr < endaddr) {
readtte:
		sfmmu_copytte(&sfhme->hme_tte, &tteold);
		if (TTE_IS_VALID(&tteold)) {

			ttemod = tteold;

			ret = sfmmu_modifytte_try(&tteold, &ttemod,
			    &sfhme->hme_tte);

			if (ret < 0)
				goto readtte;

			if (hmeblkp->hblk_lckcnt == 0)
				panic("zero hblk lckcnt");

			if (((uintptr_t)addr + TTEBYTES(ttesz)) >
			    (uintptr_t)endaddr)
				panic("can't unlock large tte");

			ASSERT(hmeblkp->hblk_lckcnt > 0);
			atomic_dec_32(&hmeblkp->hblk_lckcnt);
			HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK);
		} else {
			panic("sfmmu_hblk_unlock: invalid tte");
		}
		addr += TTEBYTES(ttesz);
		sfhme++;
	}
	return (addr);
}
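
/*
 * Editor's illustrative sketch (not part of the original source): the hash
 * probe pattern used by the hat_unlock() family above. For each candidate
 * page size the tag is rebuilt and a different hash chain is searched; only
 * the owner id, base-shifted page and rehash count identify an hmeblk. A
 * hypothetical probe helper, returning with the bucket still locked, might
 * look like:
 *
 *	static struct hme_blk *
 *	example_probe(sfmmu_t *sfmmup, caddr_t addr, int hashno,
 *	    struct hmehash_bucket **hmebpp)
 *	{
 *		hmeblk_tag tag;
 *		struct hme_blk *hmeblkp, *list = NULL;
 *		int hmeshift = HME_HASH_SHIFT(hashno);
 *
 *		tag.htag_id = sfmmup;
 *		tag.htag_rid = SFMMU_INVALID_SHMERID;
 *		tag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
 *		tag.htag_rehash = hashno;
 *		*hmebpp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
 *		SFMMU_HASH_LOCK(*hmebpp);
 *		HME_HASH_SEARCH(*hmebpp, tag, hmeblkp, &list);
 *		return (hmeblkp);
 *	}
 *
 * The caller is responsible for dropping the bucket lock and purging 'list'.
 */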

/*
 * Physical Address Mapping Framework
 *
 * General rules:
 *
 * (1) Applies only to seg_kmem memory pages. To make things easier,
 *     seg_kpm addresses are also accepted by the routines, but nothing
 *     is done with them since by definition their PA mappings are static.
 * (2) hat_add_callback() may only be called while holding the page lock
 *     SE_SHARED or SE_EXCL of the underlying page (e.g., as_pagelock()),
 *     or passing the HAC_PAGELOCK flag.
 * (3) prehandler() and posthandler() may not call hat_add_callback() or
 *     hat_delete_callback(), nor should they allocate memory. Post quiesce
 *     callbacks may not sleep or acquire adaptive mutex locks.
 * (4) Either prehandler() or posthandler() (but not both) may be specified
 *     as being NULL. Specifying an errhandler() is optional.
 *
 * Details of using the framework:
 *
 * registering a callback (hat_register_callback())
 *
 *	Pass prehandler, posthandler, errhandler addresses
 *	as described below. If the capture_cpus argument is nonzero,
 *	the suspend callback to the prehandler will occur with CPUs
 *	captured and executing xc_loop(), and CPUs will remain
 *	captured until after the posthandler suspend callback
 *	occurs.
 *
 * adding a callback (hat_add_callback())
 *
 *	as_pagelock();
 *	hat_add_callback();
 *	save returned pfn in private data structures or program registers;
 *	as_pageunlock();
 *
 * prehandler()
 *
 *	Stop all accesses by physical address to this memory page.
 *	Called twice: the first, PRESUSPEND, is a context in which it is
 *	safe to acquire adaptive locks. The second, SUSPEND, is called at
 *	high PIL with CPUs captured, so adaptive locks may NOT be acquired
 *	(and all spin locks must be XCALL_PIL or higher locks).
 *
 *	May return the following errors:
 *		EIO:	A fatal error has occurred. This will result in panic.
 *		EAGAIN:	The page cannot be suspended. This will fail the
 *			relocation.
 *		0:	Success.
 *
 * posthandler()
 *
 *	Save new pfn in private data structures or program registers;
 *	not allowed to fail (non-zero return values will result in panic).
 *
 * errhandler()
 *
 *	Called when an error occurs related to the callback. Currently
 *	the only such error is HAT_CB_ERR_LEAKED, which indicates that a
 *	page is being freed while there are still outstanding callback(s)
 *	registered on the page.
 *
 * removing a callback (hat_delete_callback(); e.g., prior to freeing memory)
 *
 *	stop using physical address
 *	hat_delete_callback();
 *
 */

/*
 * Register a callback class. Each subsystem should do this once and
 * cache the id_t returned for use in setting up and tearing down callbacks.
 *
 * There is no facility for removing callback IDs once they are created;
 * the "key" should be unique for each module, so in case a module is unloaded
 * and subsequently re-loaded, we can recycle the module's previous entry.
 */
id_t
hat_register_callback(int key,
    int (*prehandler)(caddr_t, uint_t, uint_t, void *),
    int (*posthandler)(caddr_t, uint_t, uint_t, void *, pfn_t),
    int (*errhandler)(caddr_t, uint_t, uint_t, void *),
    int capture_cpus)
{
	id_t id;

	/*
	 * Search the table for a pre-existing callback associated with
	 * the identifier "key". If one exists, we re-use that entry in
	 * the table for this instance, otherwise we assign the next
	 * available table slot.
	 */
	for (id = 0; id < sfmmu_max_cb_id; id++) {
		if (sfmmu_cb_table[id].key == key)
			break;
	}

	if (id == sfmmu_max_cb_id) {
		id = sfmmu_cb_nextid++;
		if (id >= sfmmu_max_cb_id)
			panic("hat_register_callback: out of callback IDs");
	}

	ASSERT(prehandler != NULL || posthandler != NULL);

	sfmmu_cb_table[id].key = key;
	sfmmu_cb_table[id].prehandler = prehandler;
	sfmmu_cb_table[id].posthandler = posthandler;
	sfmmu_cb_table[id].errhandler = errhandler;
	sfmmu_cb_table[id].capture_cpus = capture_cpus;

	return (id);
}
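
/*
 * Editor's illustrative sketch (not part of the original source): how a
 * client subsystem is expected to use hat_register_callback(). The key,
 * handler names and module names below are hypothetical:
 *
 *	static id_t mydrv_cb_id;
 *
 *	void
 *	mydrv_init(void)
 *	{
 *		mydrv_cb_id = hat_register_callback(MYDRV_KEY,
 *		    mydrv_prehandler, mydrv_posthandler, NULL, 1);
 *	}
 *
 * MYDRV_KEY must be unique to the module so that a later reload of the
 * module recycles the same table entry, as described above.
 */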

#define	HAC_COOKIE_NONE	(void *)-1

/*
 * Add relocation callbacks to the specified addr/len which will be called
 * when relocating the associated page. See the description of pre and
 * posthandler above for more details.
 *
 * If HAC_PAGELOCK is included in flags, the underlying memory page is
 * locked internally so the caller must be able to deal with the callback
 * running even before this function has returned. If HAC_PAGELOCK is not
 * set, it is assumed that the underlying memory pages are locked.
 *
 * Since the caller must track the individual page boundaries anyway,
 * we only allow a callback to be added to a single page (large
 * or small). Thus [addr, addr + len) MUST be contained within a single
 * page.
 *
 * Registering multiple callbacks on the same [addr, addr+len) is supported,
 * _provided_that_ a unique parameter is specified for each callback.
 * If multiple callbacks are registered on the same range the callback will
 * be invoked with each unique parameter. Registering the same callback with
 * the same argument more than once will result in corrupted kernel state.
 *
 * Returns the pfn of the underlying kernel page in *rpfn
 * on success, or PFN_INVALID on failure.
 *
 * cookiep (if passed) provides storage space for an opaque cookie
 * to return later to hat_delete_callback(). This cookie makes the callback
 * deletion significantly quicker by avoiding a potentially lengthy hash
 * search.
 *
 * Return values:
 *	0:	success
 *	ENOMEM:	memory allocation failure (e.g. flags was passed as
 *		HAC_NOSLEEP)
 *	EINVAL:	callback ID is not valid
 *	ENXIO:	["vaddr", "vaddr" + len) is not mapped in the kernel's
 *		address space
 *	ERANGE:	["vaddr", "vaddr" + len) crosses a page boundary
 */
int
hat_add_callback(id_t callback_id, caddr_t vaddr, uint_t len, uint_t flags,
    void *pvt, pfn_t *rpfn, void **cookiep)
{
	struct hmehash_bucket *hmebp;
	hmeblk_tag hblktag;
	struct hme_blk *hmeblkp;
	int hmeshift, hashno;
	caddr_t saddr, eaddr, baseaddr;
	struct pa_hment *pahmep;
	struct sf_hment *sfhmep, *osfhmep;
	kmutex_t *pml;
	tte_t tte;
	page_t *pp;
	vnode_t *vp;
	u_offset_t off;
	pfn_t pfn;
	int kmflags = (flags & HAC_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
	int locked = 0;

	/*
	 * For KPM mappings, just return the physical address since we
	 * don't need to register any callbacks.
	 */
	if (IS_KPM_ADDR(vaddr)) {
		uint64_t paddr;
		SFMMU_KPM_VTOP(vaddr, paddr);
		*rpfn = btop(paddr);
		if (cookiep != NULL)
			*cookiep = HAC_COOKIE_NONE;
		return (0);
	}

	if (callback_id < (id_t)0 || callback_id >= sfmmu_cb_nextid) {
		*rpfn = PFN_INVALID;
		return (EINVAL);
	}

	if ((pahmep = kmem_cache_alloc(pa_hment_cache, kmflags)) == NULL) {
		*rpfn = PFN_INVALID;
		return (ENOMEM);
	}

	sfhmep = &pahmep->sfment;

	saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK);
	eaddr = saddr + len;

rehash:
	/* Find the mapping(s) for this page */
	for (hashno = TTE64K, hmeblkp = NULL;
	    hmeblkp == NULL && hashno <= mmu_hashcnt;
	    hashno++) {
		hmeshift = HME_HASH_SHIFT(hashno);
		hblktag.htag_id = ksfmmup;
		hblktag.htag_rid = SFMMU_INVALID_SHMERID;
		hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift);
		hblktag.htag_rehash = hashno;
		hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift);

		SFMMU_HASH_LOCK(hmebp);

		HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);

		if (hmeblkp == NULL)
			SFMMU_HASH_UNLOCK(hmebp);
	}

	if (hmeblkp == NULL) {
		kmem_cache_free(pa_hment_cache, pahmep);
		*rpfn = PFN_INVALID;
		return (ENXIO);
	}

	ASSERT(!hmeblkp->hblk_shared);

	HBLKTOHME(osfhmep, hmeblkp, saddr);
	sfmmu_copytte(&osfhmep->hme_tte, &tte);

	if (!TTE_IS_VALID(&tte)) {
		SFMMU_HASH_UNLOCK(hmebp);
		kmem_cache_free(pa_hment_cache, pahmep);
		*rpfn = PFN_INVALID;
		return (ENXIO);
	}

	/*
	 * Make sure the boundaries for the callback fall within this
	 * single mapping.
	 */
	baseaddr = (caddr_t)get_hblk_base(hmeblkp);
	ASSERT(saddr >= baseaddr);
	if (eaddr > saddr + TTEBYTES(TTE_CSZ(&tte))) {
		SFMMU_HASH_UNLOCK(hmebp);
		kmem_cache_free(pa_hment_cache, pahmep);
		*rpfn = PFN_INVALID;
		return (ERANGE);
	}

	pfn = sfmmu_ttetopfn(&tte, vaddr);

	/*
	 * The pfn may not have a page_t underneath in which case we
	 * just return it. This can happen if we are doing I/O to a
	 * static portion of the kernel's address space, for instance.
	 */
	pp = osfhmep->hme_page;
	if (pp == NULL) {
		SFMMU_HASH_UNLOCK(hmebp);
		kmem_cache_free(pa_hment_cache, pahmep);
		*rpfn = pfn;
		if (cookiep)
			*cookiep = HAC_COOKIE_NONE;
		return (0);
	}
	ASSERT(pp == PP_PAGEROOT(pp));

	vp = pp->p_vnode;
	off = pp->p_offset;

	pml = sfmmu_mlist_enter(pp);

	if (flags & HAC_PAGELOCK) {
		if (!page_trylock(pp, SE_SHARED)) {
			/*
			 * Somebody is holding SE_EXCL lock. Might even be
			 * hat_page_relocate(). Drop all our locks, look up
			 * the page in &kvp, and retry. If it doesn't exist
			 * in &kvp and &zvp, then we must be dealing with a
			 * kernel mapped page which doesn't actually belong
			 * to segkmem so we punt.
			 */
			sfmmu_mlist_exit(pml);
			SFMMU_HASH_UNLOCK(hmebp);
			pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED);

			/* check zvp before giving up */
			if (pp == NULL)
				pp = page_lookup(&zvp, (u_offset_t)saddr,
				    SE_SHARED);

			/* Okay, we didn't find it, give up */
			if (pp == NULL) {
				kmem_cache_free(pa_hment_cache, pahmep);
				*rpfn = pfn;
				if (cookiep)
					*cookiep = HAC_COOKIE_NONE;
				return (0);
			}
			page_unlock(pp);
			goto rehash;
		}
		locked = 1;
	}

	if (!PAGE_LOCKED(pp) && !panicstr)
		panic("hat_add_callback: page 0x%p not locked", (void *)pp);

	if (osfhmep->hme_page != pp || pp->p_vnode != vp ||
	    pp->p_offset != off) {
		/*
		 * The page moved before we got our hands on it. Drop
		 * all the locks and try again.
		 */
		ASSERT((flags & HAC_PAGELOCK) != 0);
		sfmmu_mlist_exit(pml);
		SFMMU_HASH_UNLOCK(hmebp);
		page_unlock(pp);
		locked = 0;
		goto rehash;
	}

	if (!VN_ISKAS(vp)) {
		/*
		 * This is not a segkmem page but another page which
		 * has been kernel mapped. It had better have at least
		 * a share lock on it. Return the pfn.
		 */
		sfmmu_mlist_exit(pml);
		SFMMU_HASH_UNLOCK(hmebp);
		if (locked)
			page_unlock(pp);
		kmem_cache_free(pa_hment_cache, pahmep);
		ASSERT(PAGE_LOCKED(pp));
		*rpfn = pfn;
		if (cookiep)
			*cookiep = HAC_COOKIE_NONE;
		return (0);
	}

	/*
	 * Set up this pa_hment and link its embedded dummy sf_hment into
	 * the mapping list.
	 */
	pp->p_share++;
	pahmep->cb_id = callback_id;
	pahmep->addr = vaddr;
	pahmep->len = len;
	pahmep->refcnt = 1;
	pahmep->flags = 0;
	pahmep->pvt = pvt;

	sfhmep->hme_tte.ll = 0;
	sfhmep->hme_data = pahmep;
	sfhmep->hme_prev = osfhmep;
	sfhmep->hme_next = osfhmep->hme_next;

	if (osfhmep->hme_next)
		osfhmep->hme_next->hme_prev = sfhmep;

	osfhmep->hme_next = sfhmep;

	sfmmu_mlist_exit(pml);
	SFMMU_HASH_UNLOCK(hmebp);

	if (locked)
		page_unlock(pp);

	*rpfn = pfn;
	if (cookiep)
		*cookiep = (void *)pahmep;

	return (0);
}
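
/*
 * Editor's illustrative sketch (not part of the original source): a typical
 * add/delete pairing under HAC_PAGELOCK. The buffer, length and private
 * pointer names are hypothetical; note that the same pvt/vaddr/len triple
 * passed to hat_add_callback() identifies the callback to
 * hat_delete_callback() when no cookie is kept:
 *
 *	pfn_t pfn;
 *	void *cookie;
 *
 *	if (hat_add_callback(mydrv_cb_id, buf, len, HAC_PAGELOCK,
 *	    mydrv_pvt, &pfn, &cookie) != 0)
 *		return (EIO);
 *	... program the device with pfn; mydrv_posthandler() rewrites
 *	    it if the page is relocated ...
 *	hat_delete_callback(buf, len, mydrv_pvt, HAC_PAGELOCK, cookie);
 *
 * Passing cookiep is optional but avoids the p_mapping list search at
 * deletion time, as noted above.
 */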

/*
 * Remove the relocation callbacks from the specified addr/len.
 */
void
hat_delete_callback(caddr_t vaddr, uint_t len, void *pvt, uint_t flags,
    void *cookie)
{
	struct hmehash_bucket *hmebp;
	hmeblk_tag hblktag;
	struct hme_blk *hmeblkp;
	int hmeshift, hashno;
	caddr_t saddr;
	struct pa_hment *pahmep;
	struct sf_hment *sfhmep, *osfhmep;
	kmutex_t *pml;
	tte_t tte;
	page_t *pp;
	vnode_t *vp;
	u_offset_t off;
	int locked = 0;

	/*
	 * If the cookie is HAC_COOKIE_NONE then there is no pa_hment to
	 * remove so just return.
	 */
	if (cookie == HAC_COOKIE_NONE || IS_KPM_ADDR(vaddr))
		return;

	saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK);

rehash:
	/* Find the mapping(s) for this page */
	for (hashno = TTE64K, hmeblkp = NULL;
	    hmeblkp == NULL && hashno <= mmu_hashcnt;
	    hashno++) {
		hmeshift = HME_HASH_SHIFT(hashno);
		hblktag.htag_id = ksfmmup;
		hblktag.htag_rid = SFMMU_INVALID_SHMERID;
		hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift);
		hblktag.htag_rehash = hashno;
		hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift);

		SFMMU_HASH_LOCK(hmebp);

		HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);

		if (hmeblkp == NULL)
			SFMMU_HASH_UNLOCK(hmebp);
	}

	if (hmeblkp == NULL)
		return;

	ASSERT(!hmeblkp->hblk_shared);

	HBLKTOHME(osfhmep, hmeblkp, saddr);

	sfmmu_copytte(&osfhmep->hme_tte, &tte);
	if (!TTE_IS_VALID(&tte)) {
		SFMMU_HASH_UNLOCK(hmebp);
		return;
	}

	pp = osfhmep->hme_page;
	if (pp == NULL) {
		SFMMU_HASH_UNLOCK(hmebp);
		ASSERT(cookie == NULL);
		return;
	}

	vp = pp->p_vnode;
	off = pp->p_offset;

	pml = sfmmu_mlist_enter(pp);

	if (flags & HAC_PAGELOCK) {
		if (!page_trylock(pp, SE_SHARED)) {
			/*
			 * Somebody is holding SE_EXCL lock. Might even be
			 * hat_page_relocate(). Drop all our locks, look up
			 * the page in &kvp, and retry. If it doesn't exist
			 * in &kvp and &zvp, then we must be dealing with a
			 * kernel mapped page which doesn't actually belong
			 * to segkmem so we punt.
			 */
			sfmmu_mlist_exit(pml);
			SFMMU_HASH_UNLOCK(hmebp);
			pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED);
			/* check zvp before giving up */
			if (pp == NULL)
				pp = page_lookup(&zvp, (u_offset_t)saddr,
				    SE_SHARED);

			if (pp == NULL) {
				ASSERT(cookie == NULL);
				return;
			}
			page_unlock(pp);
			goto rehash;
		}
		locked = 1;
	}

	ASSERT(PAGE_LOCKED(pp));

	if (osfhmep->hme_page != pp || pp->p_vnode != vp ||
	    pp->p_offset != off) {
		/*
		 * The page moved before we got our hands on it. Drop
		 * all the locks and try again.
		 */
		ASSERT((flags & HAC_PAGELOCK) != 0);
		sfmmu_mlist_exit(pml);
		SFMMU_HASH_UNLOCK(hmebp);
		page_unlock(pp);
		locked = 0;
		goto rehash;
	}

	if (!VN_ISKAS(vp)) {
		/*
		 * This is not a segkmem page but another page which
		 * has been kernel mapped.
		 */
		sfmmu_mlist_exit(pml);
		SFMMU_HASH_UNLOCK(hmebp);
		if (locked)
			page_unlock(pp);
		ASSERT(cookie == NULL);
		return;
	}

	if (cookie != NULL) {
		pahmep = (struct pa_hment *)cookie;
		sfhmep = &pahmep->sfment;
	} else {
		for (sfhmep = pp->p_mapping; sfhmep != NULL;
		    sfhmep = sfhmep->hme_next) {

			/*
			 * skip va<->pa mappings
			 */
			if (!IS_PAHME(sfhmep))
				continue;

			pahmep = sfhmep->hme_data;
			ASSERT(pahmep != NULL);

			/*
			 * if pa_hment matches, remove it
			 */
			if ((pahmep->pvt == pvt) &&
			    (pahmep->addr == vaddr) &&
			    (pahmep->len == len)) {
				break;
			}
		}
	}

	if (sfhmep == NULL) {
		if (!panicstr) {
			panic("hat_delete_callback: pa_hment not found, pp %p",
			    (void *)pp);
		}
		return;
	}

	/*
	 * Note: at this point a valid kernel mapping must still be
	 * present on this page.
	 */
	pp->p_share--;
	if (pp->p_share <= 0)
		panic("hat_delete_callback: zero p_share");

	if (--pahmep->refcnt == 0) {
		if (pahmep->flags != 0)
			panic("hat_delete_callback: pa_hment is busy");

		/*
		 * Remove sfhmep from the mapping list for the page.
		 */
		if (sfhmep->hme_prev) {
			sfhmep->hme_prev->hme_next = sfhmep->hme_next;
		} else {
			pp->p_mapping = sfhmep->hme_next;
		}

		if (sfhmep->hme_next)
			sfhmep->hme_next->hme_prev = sfhmep->hme_prev;

		sfmmu_mlist_exit(pml);
		SFMMU_HASH_UNLOCK(hmebp);

		if (locked)
			page_unlock(pp);

		kmem_cache_free(pa_hment_cache, pahmep);
		return;
	}

	sfmmu_mlist_exit(pml);
	SFMMU_HASH_UNLOCK(hmebp);
	if (locked)
		page_unlock(pp);
}

/*
 * hat_probe returns 1 if the translation for the address 'addr' is
 * loaded, zero otherwise.
 *
 * hat_probe should be used only for advisory purposes because it may
 * occasionally return the wrong value. The implementation must guarantee
 * that returning the wrong value is a very rare event. hat_probe is used
 * to implement optimizations in the segment drivers.
 */
int
hat_probe(struct hat *sfmmup, caddr_t addr)
{
	pfn_t pfn;
	tte_t tte;

	ASSERT(sfmmup != NULL);

	ASSERT((sfmmup == ksfmmup) ||
	    AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock));

	if (sfmmup == ksfmmup) {
		while ((pfn = sfmmu_vatopfn(addr, sfmmup, &tte))
		    == PFN_SUSPENDED) {
			sfmmu_vatopfn_suspended(addr, sfmmup, &tte);
		}
	} else {
		pfn = sfmmu_uvatopfn(addr, sfmmup, NULL);
	}

	if (pfn != PFN_INVALID)
		return (1);
	else
		return (0);
}

ssize_t
hat_getpagesize(struct hat *sfmmup, caddr_t addr)
{
	tte_t tte;

	if (sfmmup == ksfmmup) {
		if (sfmmu_vatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
			return (-1);
		}
	} else {
		if (sfmmu_uvatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
			return (-1);
		}
	}

	ASSERT(TTE_IS_VALID(&tte));
	return (TTEBYTES(TTE_CSZ(&tte)));
}

uint_t
hat_getattr(struct hat *sfmmup, caddr_t addr, uint_t *attr)
{
	tte_t tte;

	if (sfmmup == ksfmmup) {
		if (sfmmu_vatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
			tte.ll = 0;
		}
	} else {
		if (sfmmu_uvatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
			tte.ll = 0;
		}
	}
	if (TTE_IS_VALID(&tte)) {
		*attr = sfmmu_ptov_attr(&tte);
		return (0);
	}
	*attr = 0;
	return ((uint_t)0xffffffff);
}

/*
 * Enables more attributes on the specified address range (i.e. logical OR).
 */
void
hat_setattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr)
{
	ASSERT(hat->sfmmu_as != NULL);

	sfmmu_chgattr(hat, addr, len, attr, SFMMU_SETATTR);
}

/*
 * Assigns attributes to the specified address range. All the attributes
 * are specified.
 */
void
hat_chgattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr)
{
	ASSERT(hat->sfmmu_as != NULL);

	sfmmu_chgattr(hat, addr, len, attr, SFMMU_CHGATTR);
}

/*
 * Removes attributes from the specified address range (i.e. logical NAND).
 */
void
hat_clrattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr)
{
	ASSERT(hat->sfmmu_as != NULL);

	sfmmu_chgattr(hat, addr, len, attr, SFMMU_CLRATTR);
}
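
/*
 * Editor's illustrative sketch (not part of the original source): the three
 * attribute entry points above differ only in how the new bits combine with
 * the existing ones. For a hypothetical mapping at 'va' in hat 'hp':
 *
 *	hat_setattr(hp, va, MMU_PAGESIZE, PROT_WRITE);
 *		ORs write permission into the existing attributes
 *	hat_clrattr(hp, va, MMU_PAGESIZE, PROT_WRITE);
 *		NANDs write permission out of the existing attributes
 *	hat_chgattr(hp, va, MMU_PAGESIZE, PROT_READ | PROT_USER);
 *		replaces the attributes with exactly the named set
 *
 * hat_getattr(hp, va, &attr) reads the result back; it returns 0 on
 * success and 0xffffffff when no valid translation exists.
 */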

/*
 * Change attributes on an address range to that specified by attr and mode.
 */
static void
sfmmu_chgattr(struct hat *sfmmup, caddr_t addr, size_t len, uint_t attr,
    int mode)
{
	struct hmehash_bucket *hmebp;
	hmeblk_tag hblktag;
	int hmeshift, hashno = 1;
	struct hme_blk *hmeblkp, *list = NULL;
	caddr_t endaddr;
	cpuset_t cpuset;
	demap_range_t dmr;

	CPUSET_ZERO(cpuset);

	ASSERT((sfmmup == ksfmmup) ||
	    AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock));
	ASSERT((len & MMU_PAGEOFFSET) == 0);
	ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0);

	if ((attr & PROT_USER) && (mode != SFMMU_CLRATTR) &&
	    ((addr + len) > (caddr_t)USERLIMIT)) {
		panic("user addr %p in kernel space",
		    (void *)addr);
	}

	endaddr = addr + len;
	hblktag.htag_id = sfmmup;
	hblktag.htag_rid = SFMMU_INVALID_SHMERID;
	DEMAP_RANGE_INIT(sfmmup, &dmr);

	while (addr < endaddr) {
		hmeshift = HME_HASH_SHIFT(hashno);
		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
		hblktag.htag_rehash = hashno;
		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);

		SFMMU_HASH_LOCK(hmebp);

		HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
		if (hmeblkp != NULL) {
			ASSERT(!hmeblkp->hblk_shared);
			/*
			 * We've encountered a shadow hmeblk so skip the range
			 * of the next smaller mapping size.
			 */
			if (hmeblkp->hblk_shw_bit) {
				ASSERT(sfmmup != ksfmmup);
				ASSERT(hashno > 1);
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(hashno - 1));
			} else {
				addr = sfmmu_hblk_chgattr(sfmmup,
				    hmeblkp, addr, endaddr, &dmr, attr, mode);
			}
			SFMMU_HASH_UNLOCK(hmebp);
			hashno = 1;
			continue;
		}
		SFMMU_HASH_UNLOCK(hmebp);

		if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
			/*
			 * We have traversed the whole list and rehashed
			 * if necessary without finding the address to chgattr.
			 * This is ok, so we increment the address by the
			 * smallest hmeblk range for kernel mappings or for
			 * user mappings with no large pages, and the largest
			 * hmeblk range, to account for shadow hmeblks, for
			 * user mappings with large pages and continue.
			 */
			if (sfmmup == ksfmmup)
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(1));
			else
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(hashno));
			hashno = 1;
		} else {
			hashno++;
		}
	}

	sfmmu_hblks_list_purge(&list, 0);
	DEMAP_RANGE_FLUSH(&dmr);
	cpuset = sfmmup->sfmmu_cpusran;
	xt_sync(cpuset);
}
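
/*
 * Editor's illustrative sketch (not part of the original source): the
 * demap-range pattern used above and by the other range operations in this
 * file. TLB demaps are batched into a demap_range_t and flushed once,
 * followed by a single xt_sync() cross-call, instead of paying one
 * cross-call per page:
 *
 *	demap_range_t dmr;
 *
 *	DEMAP_RANGE_INIT(sfmmup, &dmr);
 *	for each page whose tte actually changed:
 *		DEMAP_RANGE_MARKPG(&dmr, addr);
 *		DEMAP_RANGE_NEXTPG(&dmr);
 *	DEMAP_RANGE_FLUSH(&dmr);
 *	xt_sync(sfmmup->sfmmu_cpusran);
 *
 * The batch is flushed early (DEMAP_RANGE_FLUSH) whenever addresses are
 * skipped or the mapping's page size stops matching DEMAP_RANGE_PGSZ().
 */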

/*
 * This function changes attributes on a range of addresses in an hmeblk.
 * It returns the next address whose attributes need to be changed.
 * It should be called with the hash lock held.
 * XXX It should be possible to optimize chgattr by not flushing every time,
 * but on the other hand:
 * 1. do one flush crosscall.
 * 2. only flush if we are increasing permissions (make sure this will work).
 */
static caddr_t
sfmmu_hblk_chgattr(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
    caddr_t endaddr, demap_range_t *dmrp, uint_t attr, int mode)
{
	tte_t tte, tteattr, tteflags, ttemod;
	struct sf_hment *sfhmep;
	int ttesz;
	struct page *pp = NULL;
	kmutex_t *pml, *pmtx;
	int ret;
	int use_demap_range;
#if defined(SF_ERRATA_57)
	int check_exec;
#endif

	ASSERT(in_hblk_range(hmeblkp, addr));
	ASSERT(hmeblkp->hblk_shw_bit == 0);
	ASSERT(!hmeblkp->hblk_shared);

	endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
	ttesz = get_hblk_ttesz(hmeblkp);

	/*
	 * Flush the current demap region if addresses have been
	 * skipped or the page size doesn't match.
	 */
	use_demap_range = (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp));
	if (use_demap_range) {
		DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr);
	} else if (dmrp != NULL) {
		DEMAP_RANGE_FLUSH(dmrp);
	}

	tteattr.ll = sfmmu_vtop_attr(attr, mode, &tteflags);
#if defined(SF_ERRATA_57)
	check_exec = (sfmmup != ksfmmup) &&
	    AS_TYPE_64BIT(sfmmup->sfmmu_as) &&
	    TTE_IS_EXECUTABLE(&tteattr);
#endif
	HBLKTOHME(sfhmep, hmeblkp, addr);
	while (addr < endaddr) {
		sfmmu_copytte(&sfhmep->hme_tte, &tte);
		if (TTE_IS_VALID(&tte)) {
			if ((tte.ll & tteflags.ll) == tteattr.ll) {
				/*
				 * if the new attr is the same as the old,
				 * continue
				 */
				goto next_addr;
			}
			if (!TTE_IS_WRITABLE(&tteattr)) {
				/*
				 * make sure we clear the hw modify bit if
				 * we are removing write permission
				 */
				tteflags.tte_intlo |= TTE_HWWR_INT;
			}

			pml = NULL;
			pp = sfhmep->hme_page;
			if (pp) {
				pml = sfmmu_mlist_enter(pp);
			}

			if (pp != sfhmep->hme_page) {
				/*
				 * tte must have been unloaded.
				 */
				ASSERT(pml);
				sfmmu_mlist_exit(pml);
				continue;
			}

			ASSERT(pp == NULL || sfmmu_mlist_held(pp));

			ttemod = tte;
			ttemod.ll = (ttemod.ll & ~tteflags.ll) | tteattr.ll;
			ASSERT(TTE_TO_TTEPFN(&ttemod) == TTE_TO_TTEPFN(&tte));

#if defined(SF_ERRATA_57)
			if (check_exec && addr < errata57_limit)
				ttemod.tte_exec_perm = 0;
#endif
			ret = sfmmu_modifytte_try(&tte, &ttemod,
			    &sfhmep->hme_tte);

			if (ret < 0) {
				/* tte changed underneath us */
				if (pml) {
					sfmmu_mlist_exit(pml);
				}
				continue;
			}

			if (tteflags.tte_intlo & TTE_HWWR_INT) {
				/*
				 * need to sync if we are clearing modify bit.
				 */
				sfmmu_ttesync(sfmmup, addr, &tte, pp);
			}

			if (pp && PP_ISRO(pp)) {
				if (tteattr.tte_intlo & TTE_WRPRM_INT) {
					pmtx = sfmmu_page_enter(pp);
					PP_CLRRO(pp);
					sfmmu_page_exit(pmtx);
				}
			}

			if (ret > 0 && use_demap_range) {
				DEMAP_RANGE_MARKPG(dmrp, addr);
			} else if (ret > 0) {
				sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
			}

			if (pml) {
				sfmmu_mlist_exit(pml);
			}
		}
next_addr:
		addr += TTEBYTES(ttesz);
		sfhmep++;
		DEMAP_RANGE_NEXTPG(dmrp);
	}
	return (addr);
}
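
/*
 * Editor's illustrative sketch (not part of the original source): the
 * read/modify/retry idiom used above, and throughout this file, to update
 * a tte that hardware or another thread may change concurrently.
 * sfmmu_modifytte_try() only installs 'ttemod' if the tte still matches
 * the expected value; a negative return means it changed underneath us
 * and the whole read-modify cycle must be redone:
 *
 *	do {
 *		sfmmu_copytte(&sfhmep->hme_tte, &tte);
 *		ttemod = tte;
 *		... apply the desired change to ttemod ...
 *		ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhmep->hme_tte);
 *	} while (ret < 0);
 *
 * As the callers above interpret it, a positive return means the change
 * requires a TLB demap while a zero return means it does not.
 */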

/*
 * This routine converts virtual attributes to physical ones. It will
 * update the tteflags field with the tte mask corresponding to the
 * attributes affected and it returns the new attributes. It will also
 * clear the modify bit if we are taking away write permission. This is
 * necessary since the modify bit is the hardware permission bit and we
 * need to clear it in order to detect write faults.
 */
static uint64_t
sfmmu_vtop_attr(uint_t attr, int mode, tte_t *ttemaskp)
{
	tte_t ttevalue;

	ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));

	switch (mode) {
	case SFMMU_CHGATTR:
		/* all attributes specified */
		ttevalue.tte_inthi = MAKE_TTEATTR_INTHI(attr);
		ttevalue.tte_intlo = MAKE_TTEATTR_INTLO(attr);
		ttemaskp->tte_inthi = TTEINTHI_ATTR;
		ttemaskp->tte_intlo = TTEINTLO_ATTR;
		break;
	case SFMMU_SETATTR:
		ASSERT(!(attr & ~HAT_PROT_MASK));
		ttemaskp->ll = 0;
		ttevalue.ll = 0;
		/*
		 * a valid tte implies exec and read for sfmmu
		 * so no need to do anything about them.
		 * since privileged access implies user access
		 * PROT_USER doesn't make sense either.
		 */
		if (attr & PROT_WRITE) {
			ttemaskp->tte_intlo |= TTE_WRPRM_INT;
			ttevalue.tte_intlo |= TTE_WRPRM_INT;
		}
		break;
	case SFMMU_CLRATTR:
		/* attributes will be nand with current ones */
		if (attr & ~(PROT_WRITE | PROT_USER)) {
			panic("sfmmu: attr %x not supported", attr);
		}
		ttemaskp->ll = 0;
		ttevalue.ll = 0;
		if (attr & PROT_WRITE) {
			/* clear both writable and modify bit */
			ttemaskp->tte_intlo |= TTE_WRPRM_INT | TTE_HWWR_INT;
		}
		if (attr & PROT_USER) {
			ttemaskp->tte_intlo |= TTE_PRIV_INT;
			ttevalue.tte_intlo |= TTE_PRIV_INT;
		}
		break;
	default:
		panic("sfmmu_vtop_attr: bad mode %x", mode);
	}
	ASSERT(TTE_TO_TTEPFN(&ttevalue) == 0);
	return (ttevalue.ll);
}

static uint_t
sfmmu_ptov_attr(tte_t *ttep)
{
	uint_t attr;

	ASSERT(TTE_IS_VALID(ttep));

	attr = PROT_READ;

	if (TTE_IS_WRITABLE(ttep)) {
		attr |= PROT_WRITE;
	}
	if (TTE_IS_EXECUTABLE(ttep)) {
		attr |= PROT_EXEC;
	}
	if (!TTE_IS_PRIVILEGED(ttep)) {
		attr |= PROT_USER;
	}
	if (TTE_IS_NFO(ttep)) {
		attr |= HAT_NOFAULT;
	}
	if (TTE_IS_NOSYNC(ttep)) {
		attr |= HAT_NOSYNC;
	}
	if (TTE_IS_SIDEFFECT(ttep)) {
		attr |= SFMMU_SIDEFFECT;
	}
	if (!TTE_IS_VCACHEABLE(ttep)) {
		attr |= SFMMU_UNCACHEVTTE;
	}
	if (!TTE_IS_PCACHEABLE(ttep)) {
		attr |= SFMMU_UNCACHEPTTE;
	}
	return (attr);
}
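
/*
 * Editor's illustrative sketch (not part of the original source): how the
 * mask/value pair produced by sfmmu_vtop_attr() is applied. For
 * SFMMU_CLRATTR with PROT_WRITE, the mask selects the writable and
 * hardware-modify bits while the value is zero, so
 *
 *	ttemod.ll = (tte.ll & ~tteflags.ll) | tteattr.ll;
 *
 * clears both bits and leaves every other field of the tte untouched.
 * sfmmu_ptov_attr() performs the reverse translation when hat_getattr()
 * needs to report the attributes of an existing translation.
 */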

/*
 * hat_chgprot is a deprecated hat call. New segment drivers
 * should store all attributes and use hat_*attr calls.
 *
 * Change the protections in the virtual address range
 * given to the specified virtual protection. If vprot is ~PROT_WRITE,
 * then remove write permission, leaving the other
 * permissions unchanged. If vprot is ~PROT_USER, remove user permissions.
 */
void
hat_chgprot(struct hat *sfmmup, caddr_t addr, size_t len, uint_t vprot)
{
	struct hmehash_bucket *hmebp;
	hmeblk_tag hblktag;
	int hmeshift, hashno = 1;
	struct hme_blk *hmeblkp, *list = NULL;
	caddr_t endaddr;
	cpuset_t cpuset;
	demap_range_t dmr;

	ASSERT((len & MMU_PAGEOFFSET) == 0);
	ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0);

	ASSERT(sfmmup->sfmmu_as != NULL);

	CPUSET_ZERO(cpuset);

	if ((vprot != (uint_t)~PROT_WRITE) && (vprot & PROT_USER) &&
	    ((addr + len) > (caddr_t)USERLIMIT)) {
		panic("user addr %p vprot %x in kernel space",
		    (void *)addr, vprot);
	}
	endaddr = addr + len;
	hblktag.htag_id = sfmmup;
	hblktag.htag_rid = SFMMU_INVALID_SHMERID;
	DEMAP_RANGE_INIT(sfmmup, &dmr);

	while (addr < endaddr) {
		hmeshift = HME_HASH_SHIFT(hashno);
		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
		hblktag.htag_rehash = hashno;
		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);

		SFMMU_HASH_LOCK(hmebp);

		HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
		if (hmeblkp != NULL) {
			ASSERT(!hmeblkp->hblk_shared);
			/*
			 * We've encountered a shadow hmeblk so skip the range
			 * of the next smaller mapping size.
			 */
			if (hmeblkp->hblk_shw_bit) {
				ASSERT(sfmmup != ksfmmup);
				ASSERT(hashno > 1);
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(hashno - 1));
			} else {
				addr = sfmmu_hblk_chgprot(sfmmup, hmeblkp,
				    addr, endaddr, &dmr, vprot);
			}
			SFMMU_HASH_UNLOCK(hmebp);
			hashno = 1;
			continue;
		}
		SFMMU_HASH_UNLOCK(hmebp);

		if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
			/*
			 * We have traversed the whole list and rehashed
			 * if necessary without finding the address to chgprot.
			 * This is ok, so we increment the address by the
			 * smallest hmeblk range for kernel mappings and the
			 * largest hmeblk range, to account for shadow hmeblks,
			 * for user mappings and continue.
			 */
			if (sfmmup == ksfmmup)
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(1));
			else
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(hashno));
			hashno = 1;
		} else {
			hashno++;
		}
	}

	sfmmu_hblks_list_purge(&list, 0);
	DEMAP_RANGE_FLUSH(&dmr);
	cpuset = sfmmup->sfmmu_cpusran;
	xt_sync(cpuset);
}
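
/*
 * Editor's illustrative sketch (not part of the original source): the
 * special vprot encodings accepted by this deprecated interface, for a
 * hypothetical user mapping at 'va' in hat 'hp':
 *
 *	hat_chgprot(hp, va, MMU_PAGESIZE, (uint_t)~PROT_WRITE);
 *		removes write permission only
 *	hat_chgprot(hp, va, MMU_PAGESIZE, (uint_t)~PROT_USER);
 *		removes user permission only
 *	hat_chgprot(hp, va, MMU_PAGESIZE, PROT_USER | PROT_READ);
 *		sets the protections to exactly the named set
 *
 * New code should use the hat_*attr calls instead, per the comment above.
 */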

/*
 * This function changes protections on a range of addresses in an hmeblk.
 * It returns the next address whose protections need to be changed.
 * It should be called with the hash lock held.
 * XXX It should be possible to optimize chgprot by not flushing every time,
 * but on the other hand:
 * 1. do one flush crosscall.
 * 2. only flush if we are increasing permissions (make sure this will work).
 */
static caddr_t
sfmmu_hblk_chgprot(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
    caddr_t endaddr, demap_range_t *dmrp, uint_t vprot)
{
	uint_t pprot;
	tte_t tte, ttemod;
	struct sf_hment *sfhmep;
	uint_t tteflags;
	int ttesz;
	struct page *pp = NULL;
	kmutex_t *pml, *pmtx;
	int ret;
	int use_demap_range;
#if defined(SF_ERRATA_57)
	int check_exec;
#endif

	ASSERT(in_hblk_range(hmeblkp, addr));
	ASSERT(hmeblkp->hblk_shw_bit == 0);
	ASSERT(!hmeblkp->hblk_shared);

#ifdef DEBUG
	if (get_hblk_ttesz(hmeblkp) != TTE8K &&
	    (endaddr < get_hblk_endaddr(hmeblkp))) {
		panic("sfmmu_hblk_chgprot: partial chgprot of large page");
	}
#endif /* DEBUG */

	endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
	ttesz = get_hblk_ttesz(hmeblkp);

	pprot = sfmmu_vtop_prot(vprot, &tteflags);
#if defined(SF_ERRATA_57)
	check_exec = (sfmmup != ksfmmup) &&
	    AS_TYPE_64BIT(sfmmup->sfmmu_as) &&
	    ((vprot & PROT_EXEC) == PROT_EXEC);
#endif
	HBLKTOHME(sfhmep, hmeblkp, addr);

	/*
	 * Flush the current demap region if addresses have been
	 * skipped or the page size doesn't match.
	 */
	use_demap_range = (TTEBYTES(ttesz) == MMU_PAGESIZE);
	if (use_demap_range) {
		DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr);
	} else if (dmrp != NULL) {
		DEMAP_RANGE_FLUSH(dmrp);
	}

	while (addr < endaddr) {
		sfmmu_copytte(&sfhmep->hme_tte, &tte);
		if (TTE_IS_VALID(&tte)) {
			if (TTE_GET_LOFLAGS(&tte, tteflags) == pprot) {
				/*
				 * if the new protection is the same as
				 * the old, continue
				 */
				goto next_addr;
			}
			pml = NULL;
			pp = sfhmep->hme_page;
			if (pp) {
				pml = sfmmu_mlist_enter(pp);
			}
			if (pp != sfhmep->hme_page) {
				/*
				 * tte must have been unloaded
				 * underneath us. Recheck.
				 */
				ASSERT(pml);
				sfmmu_mlist_exit(pml);
				continue;
			}

			ASSERT(pp == NULL || sfmmu_mlist_held(pp));

			ttemod = tte;
			TTE_SET_LOFLAGS(&ttemod, tteflags, pprot);
#if defined(SF_ERRATA_57)
			if (check_exec && addr < errata57_limit)
				ttemod.tte_exec_perm = 0;
#endif
			ret = sfmmu_modifytte_try(&tte, &ttemod,
			    &sfhmep->hme_tte);

			if (ret < 0) {
				/* tte changed underneath us */
				if (pml) {
					sfmmu_mlist_exit(pml);
				}
				continue;
			}

			if (tteflags & TTE_HWWR_INT) {
				/*
				 * need to sync if we are clearing modify bit.
				 */
				sfmmu_ttesync(sfmmup, addr, &tte, pp);
			}

			if (pp && PP_ISRO(pp)) {
				if (pprot & TTE_WRPRM_INT) {
					pmtx = sfmmu_page_enter(pp);
					PP_CLRRO(pp);
					sfmmu_page_exit(pmtx);
				}
			}

			if (ret > 0 && use_demap_range) {
				DEMAP_RANGE_MARKPG(dmrp, addr);
			} else if (ret > 0) {
				sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
			}

			if (pml) {
				sfmmu_mlist_exit(pml);
			}
		}
next_addr:
		addr += TTEBYTES(ttesz);
		sfhmep++;
		DEMAP_RANGE_NEXTPG(dmrp);
	}
	return (addr);
}

/*
 * This routine is deprecated and should only be used by hat_chgprot.
 * The correct routine is sfmmu_vtop_attr.
 * This routine converts virtual page protections to physical ones.
 * It will update the tteflags field with the tte mask corresponding to the
 * protections affected and it returns the new protections. It will also
 * clear the modify bit if we are taking away write permission. This is
 * necessary since the modify bit is the hardware permission bit and we
 * need to clear it in order to detect write faults.
 * It accepts the following special protections:
 * ~PROT_WRITE = remove write permissions.
 * ~PROT_USER = remove user permissions.
 */
static uint_t
sfmmu_vtop_prot(uint_t vprot, uint_t *tteflagsp)
{
	if (vprot == (uint_t)~PROT_WRITE) {
		*tteflagsp = TTE_WRPRM_INT | TTE_HWWR_INT;
		return (0);		/* will cause wrprm to be cleared */
	}
	if (vprot == (uint_t)~PROT_USER) {
		*tteflagsp = TTE_PRIV_INT;
		return (0);		/* will cause privprm to be cleared */
	}
	if ((vprot == 0) || (vprot == PROT_USER) ||
	    ((vprot & PROT_ALL) != vprot)) {
		panic("sfmmu_vtop_prot -- bad prot %x", vprot);
	}

	switch (vprot) {
	case (PROT_READ):
	case (PROT_EXEC):
	case (PROT_EXEC | PROT_READ):
		*tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT;
		return (TTE_PRIV_INT);		/* set prv and clr wrt */
	case (PROT_WRITE):
	case (PROT_WRITE | PROT_READ):
	case (PROT_EXEC | PROT_WRITE):
	case (PROT_EXEC | PROT_WRITE | PROT_READ):
		*tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT;
		return (TTE_PRIV_INT | TTE_WRPRM_INT);	/* set prv and wrt */
	case (PROT_USER | PROT_READ):
	case (PROT_USER | PROT_EXEC):
	case (PROT_USER | PROT_EXEC | PROT_READ):
		*tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT;
		return (0);			/* clr prv and wrt */
	case (PROT_USER | PROT_WRITE):
	case (PROT_USER | PROT_WRITE | PROT_READ):
	case (PROT_USER | PROT_EXEC | PROT_WRITE):
	case (PROT_USER | PROT_EXEC | PROT_WRITE | PROT_READ):
		*tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT;
		return (TTE_WRPRM_INT);		/* clr prv and set wrt */
	default:
		panic("sfmmu_vtop_prot -- bad prot %x", vprot);
	}
	return (0);
}

/*
 * Alternate unload for very large virtual ranges. With a true 64 bit VA,
 * the normal algorithm would take too long for a very large VA range with
 * few real mappings. This routine just walks through all HMEs in the global
 * hash table to find and remove mappings.
 */
static void
hat_unload_large_virtual(
	struct hat *sfmmup,
	caddr_t startaddr,
	size_t len,
	uint_t flags,
	hat_callback_t *callback)
{
	struct hmehash_bucket *hmebp;
	struct hme_blk *hmeblkp;
	struct hme_blk *pr_hblk = NULL;
	struct hme_blk *nx_hblk;
	struct hme_blk *list = NULL;
	int i;
	demap_range_t dmr, *dmrp;
	cpuset_t cpuset;
	caddr_t endaddr = startaddr + len;
	caddr_t sa;
	caddr_t ea;
	caddr_t cb_sa[MAX_CB_ADDR];
	caddr_t cb_ea[MAX_CB_ADDR];
	int addr_cnt = 0;
	int a = 0;

	if (sfmmup->sfmmu_free) {
		dmrp = NULL;
	} else {
		dmrp = &dmr;
		DEMAP_RANGE_INIT(sfmmup, dmrp);
	}

	/*
	 * Loop through all the hash buckets of HME blocks looking for matches.
	 */
	for (i = 0; i <= UHMEHASH_SZ; i++) {
		hmebp = &uhme_hash[i];
		SFMMU_HASH_LOCK(hmebp);
		hmeblkp = hmebp->hmeblkp;
		pr_hblk = NULL;
		while (hmeblkp) {
			nx_hblk = hmeblkp->hblk_next;

			/*
			 * skip if not this context, if a shadow block or
			 * if the mapping is not in the requested range
			 */
			if (hmeblkp->hblk_tag.htag_id != sfmmup ||
			    hmeblkp->hblk_shw_bit ||
			    (sa = (caddr_t)get_hblk_base(hmeblkp)) >= endaddr ||
			    (ea = get_hblk_endaddr(hmeblkp)) <= startaddr) {
				pr_hblk = hmeblkp;
				goto next_block;
			}

			ASSERT(!hmeblkp->hblk_shared);
			/*
			 * unload if there are any current valid mappings
			 */
			if (hmeblkp->hblk_vcnt != 0 ||
			    hmeblkp->hblk_hmecnt != 0)
				(void) sfmmu_hblk_unload(sfmmup, hmeblkp,
				    sa, ea, dmrp, flags);

			/*
			 * on unmap we also release the HME block itself, once
			 * all mappings are gone.
			 */
			if ((flags & HAT_UNLOAD_UNMAP) != 0 &&
			    !hmeblkp->hblk_vcnt &&
			    !hmeblkp->hblk_hmecnt) {
				ASSERT(!hmeblkp->hblk_lckcnt);
				sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
				    &list, 0);
			} else {
				pr_hblk = hmeblkp;
			}

			if (callback == NULL)
				goto next_block;

			/*
			 * HME blocks may span more than one page, but we may be
			 * unmapping only one page, so check for a smaller range
			 * for the callback
			 */
			if (sa < startaddr)
				sa = startaddr;
			if (--ea > endaddr)
				ea = endaddr - 1;

			cb_sa[addr_cnt] = sa;
			cb_ea[addr_cnt] = ea;
			if (++addr_cnt == MAX_CB_ADDR) {
				if (dmrp != NULL) {
					DEMAP_RANGE_FLUSH(dmrp);
					cpuset = sfmmup->sfmmu_cpusran;
					xt_sync(cpuset);
				}

				for (a = 0; a < MAX_CB_ADDR; ++a) {
					callback->hcb_start_addr = cb_sa[a];
					callback->hcb_end_addr = cb_ea[a];
					callback->hcb_function(callback);
				}
				addr_cnt = 0;
			}

next_block:
			hmeblkp = nx_hblk;
		}
		SFMMU_HASH_UNLOCK(hmebp);
	}

	sfmmu_hblks_list_purge(&list, 0);
	if (dmrp != NULL) {
		DEMAP_RANGE_FLUSH(dmrp);
		cpuset = sfmmup->sfmmu_cpusran;
		xt_sync(cpuset);
	}

	for (a = 0; a < addr_cnt; ++a) {
		callback->hcb_start_addr = cb_sa[a];
		callback->hcb_end_addr = cb_ea[a];
		callback->hcb_function(callback);
	}

	/*
	 * Check TSB and TLB page sizes if the process isn't exiting.
	 */
	if (!sfmmup->sfmmu_free)
		sfmmu_check_page_sizes(sfmmup, 0);
}
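
/*
 * Editor's illustrative sketch (not part of the original source): the
 * callback batching used above. Unloaded sub-ranges are staged in the
 * cb_sa[]/cb_ea[] arrays, and the pending TLB demaps are flushed with a
 * single xt_sync() before each batch of callbacks is delivered:
 *
 *	cb_sa[addr_cnt] = sa;
 *	cb_ea[addr_cnt] = ea;
 *	if (++addr_cnt == MAX_CB_ADDR) {
 *		flush the demap range and xt_sync();
 *		deliver the MAX_CB_ADDR staged callbacks;
 *		addr_cnt = 0;
 *	}
 *
 * Flushing before delivery matters: once a callback fires, the caller may
 * free or reuse the underlying pages, so no stale translations may remain
 * visible to other CPUs. The cost is at most one cross-call round per
 * MAX_CB_ADDR callbacks rather than one per page.
 */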

/*
 * Unload all the mappings in the range [addr..addr+len). addr and len must
 * be MMU_PAGESIZE aligned.
 */

extern struct seg *segkmap;
#define	ISSEGKMAP(sfmmup, addr)	(sfmmup == ksfmmup && \
	segkmap->s_base <= (addr) && (addr) < (segkmap->s_base + segkmap->s_size))


void
hat_unload_callback(
	struct hat *sfmmup,
	caddr_t addr,
	size_t len,
	uint_t flags,
	hat_callback_t *callback)
{
	struct hmehash_bucket *hmebp;
	hmeblk_tag hblktag;
	int hmeshift, hashno, iskernel;
	struct hme_blk *hmeblkp, *pr_hblk, *list = NULL;
	caddr_t endaddr;
	cpuset_t cpuset;
	int addr_count = 0;
	int a;
	caddr_t cb_start_addr[MAX_CB_ADDR];
	caddr_t cb_end_addr[MAX_CB_ADDR];
	int issegkmap = ISSEGKMAP(sfmmup, addr);
	demap_range_t dmr, *dmrp;

	ASSERT(sfmmup->sfmmu_as != NULL);

	ASSERT((sfmmup == ksfmmup) || (flags & HAT_UNLOAD_OTHER) ||
	    AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock));

	ASSERT(sfmmup != NULL);
	ASSERT((len & MMU_PAGEOFFSET) == 0);
	ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET));

	/*
	 * Probing through a large VA range (say 63 bits) will be slow, even
	 * at 4 Meg steps between the probes. So, when the virtual address
	 * range is very large, search the HME entries for what to unload.
	 *
	 *	len >> TTE_PAGE_SHIFT(TTE4M) is the # of 4Meg probes we'd need
	 *
	 *	UHMEHASH_SZ is the number of hash buckets to examine
	 */
	if (sfmmup != KHATID && (len >> TTE_PAGE_SHIFT(TTE4M)) > UHMEHASH_SZ) {
		hat_unload_large_virtual(sfmmup, addr, len, flags, callback);
		return;
	}

	CPUSET_ZERO(cpuset);

	/*
	 * If the process is exiting, we can save a lot of fuss since
	 * we'll flush the TLB when we free the ctx anyway.
	 */
	if (sfmmup->sfmmu_free) {
		dmrp = NULL;
	} else {
		dmrp = &dmr;
		DEMAP_RANGE_INIT(sfmmup, dmrp);
	}

	endaddr = addr + len;
	hblktag.htag_id = sfmmup;
	hblktag.htag_rid = SFMMU_INVALID_SHMERID;

	/*
	 * It is likely for the vm to call unload over a wide range of
	 * addresses that are actually very sparsely populated by
	 * translations. In order to speed this up the sfmmu hat supports
	 * the concept of shadow hmeblks. Dummy large page hmeblks that
	 * correspond to actual small translations are allocated at tteload
	 * time and are referred to as shadow hmeblks. Now, during unload
	 * time, we first check if we have a shadow hmeblk for that
	 * translation. The absence of one means the corresponding address
	 * range is empty and can be skipped.
	 *
	 * The kernel is an exception to the above statement, and that is why
	 * we don't use shadow hmeblks for it and hash starting from the
	 * smallest page size.
	 */
	if (sfmmup == KHATID) {
		iskernel = 1;
		hashno = TTE64K;
	} else {
		iskernel = 0;
		if (mmu_page_sizes == max_mmu_page_sizes) {
			hashno = TTE256M;
		} else {
			hashno = TTE4M;
		}
	}
	while (addr < endaddr) {
		hmeshift = HME_HASH_SHIFT(hashno);
		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
		hblktag.htag_rehash = hashno;
		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);

		SFMMU_HASH_LOCK(hmebp);

		HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
		if (hmeblkp == NULL) {
			/*
			 * didn't find an hmeblk. skip the appropriate
			 * address range.
			 */
			SFMMU_HASH_UNLOCK(hmebp);
			if (iskernel) {
				if (hashno < mmu_hashcnt) {
					hashno++;
					continue;
				} else {
					hashno = TTE64K;
					addr = (caddr_t)roundup((uintptr_t)addr
					    + 1, MMU_PAGESIZE64K);
					continue;
				}
			}
			addr = (caddr_t)roundup((uintptr_t)addr + 1,
			    (1 << hmeshift));
			if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
				ASSERT(hashno == TTE64K);
				continue;
			}
			if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {
				hashno = TTE512K;
				continue;
			}
			if (mmu_page_sizes == max_mmu_page_sizes) {
				if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {
					hashno = TTE4M;
					continue;
				}
				if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {
					hashno = TTE32M;
					continue;
				}
				hashno = TTE256M;
				continue;
			} else {
				hashno = TTE4M;
				continue;
			}
		}
		ASSERT(hmeblkp);
		ASSERT(!hmeblkp->hblk_shared);
		if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
			/*
			 * If the valid count is zero we can skip the range
			 * mapped by this hmeblk.
			 * We free hblks in the case of HAT_UNMAP. HAT_UNMAP
			 * is used by segment drivers as a hint
			 * that the mapping resource won't be used any longer.
			 * The best example of this is during exit().
			 */
			addr = (caddr_t)roundup((uintptr_t)addr + 1,
			    get_hblk_span(hmeblkp));
			if ((flags & HAT_UNLOAD_UNMAP) ||
			    (iskernel && !issegkmap)) {
				sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
				    &list, 0);
			}
			SFMMU_HASH_UNLOCK(hmebp);

			if (iskernel) {
				hashno = TTE64K;
				continue;
			}
			if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
				ASSERT(hashno == TTE64K);
				continue;
			}
			if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {
				hashno = TTE512K;
				continue;
			}
			if (mmu_page_sizes == max_mmu_page_sizes) {
				if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {
					hashno = TTE4M;
					continue;
				}
				if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {
					hashno = TTE32M;
					continue;
				}
				hashno = TTE256M;
				continue;
			} else {
				hashno = TTE4M;
				continue;
			}
		}
		if (hmeblkp->hblk_shw_bit) {
			/*
			 * If we encounter a shadow hmeblk we know there are
			 * smaller sized hmeblks mapping the same address
			 * space. Decrement the hash size and rehash.
			 */
			ASSERT(sfmmup != KHATID);
			hashno--;
			SFMMU_HASH_UNLOCK(hmebp);
			continue;
		}

		/*
		 * track callback address ranges.
		 * only start a new range when it's not contiguous
		 */
		if (callback != NULL) {
			if (addr_count > 0 &&
			    addr == cb_end_addr[addr_count - 1])
				--addr_count;
			else
				cb_start_addr[addr_count] = addr;
		}

		addr = sfmmu_hblk_unload(sfmmup, hmeblkp, addr, endaddr,
		    dmrp, flags);

		if (callback != NULL)
			cb_end_addr[addr_count++] = addr;

		if (((flags & HAT_UNLOAD_UNMAP) || (iskernel && !issegkmap)) &&
		    !hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
			sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 0);
		}
		SFMMU_HASH_UNLOCK(hmebp);

		/*
		 * Notify our caller as to exactly which pages
		 * have been unloaded. We do these in clumps,
		 * to minimize the number of xt_sync()s that need to occur.
		 */
		if (callback != NULL && addr_count == MAX_CB_ADDR) {
			if (dmrp != NULL) {
				DEMAP_RANGE_FLUSH(dmrp);
				cpuset = sfmmup->sfmmu_cpusran;
				xt_sync(cpuset);
			}

			for (a = 0; a < MAX_CB_ADDR; ++a) {
				callback->hcb_start_addr = cb_start_addr[a];
				callback->hcb_end_addr = cb_end_addr[a];
				callback->hcb_function(callback);
			}
			addr_count = 0;
		}
		if (iskernel) {
			hashno = TTE64K;
			continue;
		}
		if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
			ASSERT(hashno == TTE64K);
			continue;
		}
		if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {
			hashno = TTE512K;
			continue;
		}
		if (mmu_page_sizes == max_mmu_page_sizes) {
			if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {
				hashno = TTE4M;
				continue;
			}
			if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {
				hashno = TTE32M;
				continue;
			}
			hashno = TTE256M;
		} else {
			hashno = TTE4M;
		}
	}

	sfmmu_hblks_list_purge(&list, 0);
	if (dmrp != NULL) {
		DEMAP_RANGE_FLUSH(dmrp);
		cpuset = sfmmup->sfmmu_cpusran;
		xt_sync(cpuset);
	}
	if (callback && addr_count != 0) {
		for (a = 0; a < addr_count; ++a) {
			callback->hcb_start_addr = cb_start_addr[a];
			callback->hcb_end_addr = cb_end_addr[a];
			callback->hcb_function(callback);
		}
	}

	/*
	 * Check TSB and TLB page sizes if the process isn't exiting.
	 */
	if (!sfmmup->sfmmu_free)
		sfmmu_check_page_sizes(sfmmup, 0);
}

/*
 * Unload all the mappings in the range [addr..addr+len). addr and len must
 * be MMU_PAGESIZE aligned.
 */
void
hat_unload(struct hat *sfmmup, caddr_t addr, size_t len, uint_t flags)
{
	hat_unload_callback(sfmmup, addr, len, flags, NULL);
}


/*
 * Find the largest mapping size for this page.
 */
int
fnd_mapping_sz(page_t *pp)
{
	int sz;
	int p_index;

	p_index = PP_MAPINDEX(pp);

	sz = 0;
	p_index >>= 1;	/* don't care about 8K bit */
	for (; p_index; p_index >>= 1) {
		sz++;
	}

	return (sz);
}
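
/*
 * Editor's illustrative sketch (not part of the original source): a worked
 * example of fnd_mapping_sz(). PP_MAPINDEX(pp) keeps one bit per page size
 * with which the page participates in a mapping, bit 0 being 8K. For a
 * hypothetical page mapped with both 8K and 512K translations, p_index
 * starts out as binary 101:
 *
 *	p_index >>= 1;		binary 10 (the 8K bit is dropped)
 *	first iteration:	sz = 1, p_index becomes binary 1
 *	second iteration:	sz = 2, p_index becomes 0, loop ends
 *
 * and 2 is returned, i.e. the 512K size code, the largest mapping present.
 */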
 */
static caddr_t
sfmmu_hblk_unload(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
    caddr_t endaddr, demap_range_t *dmrp, uint_t flags)
{
	tte_t	tte, ttemod;
	struct	sf_hment *sfhmep;
	int	ttesz;
	long	ttecnt;
	page_t *pp;
	kmutex_t *pml;
	int ret;
	int use_demap_range;

	ASSERT(in_hblk_range(hmeblkp, addr));
	ASSERT(!hmeblkp->hblk_shw_bit);
	ASSERT(sfmmup != NULL || hmeblkp->hblk_shared);
	ASSERT(sfmmup == NULL || !hmeblkp->hblk_shared);
	ASSERT(dmrp == NULL || !hmeblkp->hblk_shared);

#ifdef DEBUG
	if (get_hblk_ttesz(hmeblkp) != TTE8K &&
	    (endaddr < get_hblk_endaddr(hmeblkp))) {
		panic("sfmmu_hblk_unload: partial unload of large page");
	}
#endif /* DEBUG */

	endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
	ttesz = get_hblk_ttesz(hmeblkp);

	use_demap_range = ((dmrp == NULL) ||
	    (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp)));

	if (use_demap_range) {
		DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr);
	} else if (dmrp != NULL) {
		DEMAP_RANGE_FLUSH(dmrp);
	}
	ttecnt = 0;
	HBLKTOHME(sfhmep, hmeblkp, addr);

	while (addr < endaddr) {
		pml = NULL;
		sfmmu_copytte(&sfhmep->hme_tte, &tte);
		if (TTE_IS_VALID(&tte)) {
			pp = sfhmep->hme_page;
			if (pp != NULL) {
				pml = sfmmu_mlist_enter(pp);
			}

			/*
			 * Verify if hme still points to 'pp' now that
			 * we have p_mapping lock.
			 */
			if (sfhmep->hme_page != pp) {
				if (pp != NULL && sfhmep->hme_page != NULL) {
					ASSERT(pml != NULL);
					sfmmu_mlist_exit(pml);
					/* Re-start this iteration. */
					continue;
				}
				ASSERT((pp != NULL) &&
				    (sfhmep->hme_page == NULL));
				goto tte_unloaded;
			}

			/*
			 * From this point on we have both the HASH and
			 * p_mapping locks.
			 */
			ASSERT(pp == sfhmep->hme_page);
			ASSERT(pp == NULL || sfmmu_mlist_held(pp));

			/*
			 * We need to loop on modify tte because it is
			 * possible for pagesync to come along and
			 * change the software bits beneath us.
			 *
			 * Page_unload can also invalidate the tte after
			 * we read tte outside of p_mapping lock.
			 */
again:
			ttemod = tte;

			TTE_SET_INVALID(&ttemod);
			ret = sfmmu_modifytte_try(&tte, &ttemod,
			    &sfhmep->hme_tte);

			if (ret <= 0) {
				if (TTE_IS_VALID(&tte)) {
					ASSERT(ret < 0);
					goto again;
				}
				if (pp != NULL) {
					panic("sfmmu_hblk_unload: pp = 0x%p "
					    "tte became invalid under mlist"
					    " lock = 0x%p", (void *)pp,
					    (void *)pml);
				}
				continue;
			}

			if (!(flags & HAT_UNLOAD_NOSYNC)) {
				sfmmu_ttesync(sfmmup, addr, &tte, pp);
			}

			/*
			 * Ok- we invalidated the tte. Do the rest of the job.
			 */
			ttecnt++;

			if (flags & HAT_UNLOAD_UNLOCK) {
				ASSERT(hmeblkp->hblk_lckcnt > 0);
				atomic_dec_32(&hmeblkp->hblk_lckcnt);
				HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK);
			}

			/*
			 * Normally we would need to flush the page
			 * from the virtual cache at this point in
			 * order to prevent a potential cache alias
			 * inconsistency.
			 * The particular scenario we need to worry
			 * about is:
			 * Given:  va1 and va2 are two virtual addresses
			 * that alias and map the same physical
			 * address.
			 * 1.   mapping exists from va1 to pa and data
			 *	has been read into the cache.
			 * 2.   unload va1.
			 * 3.   load va2 and modify data using va2.
			 * 4.   unload va2.
			 * 5.   load va1 and reference data.  Unless we
			 *	flush the data cache when we unload we will
			 *	get stale data.
			 * Fortunately, page coloring eliminates the
			 * above scenario by remembering the color a
			 * physical page was last or is currently
			 * mapped to.  Now, we delay the flush until
			 * the loading of translations.  Only when the
			 * new translation is of a different color
			 * are we forced to flush.
			 */
			if (use_demap_range) {
				/*
				 * Mark this page as needing a demap.
				 */
				DEMAP_RANGE_MARKPG(dmrp, addr);
			} else {
				ASSERT(sfmmup != NULL);
				ASSERT(!hmeblkp->hblk_shared);
				sfmmu_tlb_demap(addr, sfmmup, hmeblkp,
				    sfmmup->sfmmu_free, 0);
			}

			if (pp) {
				/*
				 * Remove the hment from the mapping list
				 */
				ASSERT(hmeblkp->hblk_hmecnt > 0);

				/*
				 * Again, we cannot
				 * ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS);
				 */
				HME_SUB(sfhmep, pp);
				membar_stst();
				atomic_dec_16(&hmeblkp->hblk_hmecnt);
			}

			ASSERT(hmeblkp->hblk_vcnt > 0);
			atomic_dec_16(&hmeblkp->hblk_vcnt);

			ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt ||
			    !hmeblkp->hblk_lckcnt);

#ifdef VAC
			if (pp && (pp->p_nrm & (P_KPMC | P_KPMS | P_TNC))) {
				if (PP_ISTNC(pp)) {
					/*
					 * If the page was temporarily
					 * uncached, try to recache
					 * it.  Note that HME_SUB() was
					 * called above so p_index and
					 * mlist had been updated.
					 */
					conv_tnc(pp, ttesz);
				} else if (pp->p_mapping == NULL) {
					ASSERT(kpm_enable);
					/*
					 * Page is marked to be in VAC conflict
					 * to an existing kpm mapping and/or is
					 * kpm mapped using only the regular
					 * pagesize.
					 */
					sfmmu_kpm_hme_unload(pp);
				}
			}
#endif /* VAC */
		} else if ((pp = sfhmep->hme_page) != NULL) {
			/*
			 * TTE is invalid but the hme
			 * still exists.  Let pageunload
			 * complete its job.
			 */
			ASSERT(pml == NULL);
			pml = sfmmu_mlist_enter(pp);
			if (sfhmep->hme_page != NULL) {
				sfmmu_mlist_exit(pml);
				continue;
			}
			ASSERT(sfhmep->hme_page == NULL);
		} else if (hmeblkp->hblk_hmecnt != 0) {
			/*
			 * pageunload may not have finished decrementing
			 * hblk_vcnt and hblk_hmecnt. Find page_t if any and
			 * wait for pageunload to finish. Rely on pageunload
			 * to decrement hblk_hmecnt after hblk_vcnt.
			 */
			pfn_t pfn = TTE_TO_TTEPFN(&tte);
			ASSERT(pml == NULL);
			if (pf_is_memory(pfn)) {
				pp = page_numtopp_nolock(pfn);
				if (pp != NULL) {
					pml = sfmmu_mlist_enter(pp);
					sfmmu_mlist_exit(pml);
					pml = NULL;
				}
			}
		}

tte_unloaded:
		/*
		 * At this point, the tte we are looking at
		 * should be unloaded, and hme has been unlinked
		 * from page too. This is important because in
		 * pageunload, it does ttesync() then HME_SUB.
		 * We need to make sure HME_SUB has been completed
		 * so we know ttesync() has been completed. Otherwise,
		 * at exit time, after return from hat layer, VM will
		 * release the as structure which hat_setstat() (called
		 * by ttesync()) needs.
		 */
#ifdef DEBUG
		{
			tte_t	dtte;

			ASSERT(sfhmep->hme_page == NULL);

			sfmmu_copytte(&sfhmep->hme_tte, &dtte);
			ASSERT(!TTE_IS_VALID(&dtte));
		}
#endif

		if (pml) {
			sfmmu_mlist_exit(pml);
		}

		addr += TTEBYTES(ttesz);
		sfhmep++;
		DEMAP_RANGE_NEXTPG(dmrp);
	}
	/*
	 * For shared hmeblks this routine is only called when region is freed
	 * and no longer referenced.  So no need to decrement ttecnt
	 * in the region structure here.
	 */
	if (ttecnt > 0 && sfmmup != NULL) {
		atomic_add_long(&sfmmup->sfmmu_ttecnt[ttesz], -ttecnt);
	}
	return (addr);
}

/*
 * Invalidate a virtual address range for the local CPU.
 * For best performance ensure that the va range is completely
 * mapped, otherwise the entire TLB will be flushed.
 */
void
hat_flush_range(struct hat *sfmmup, caddr_t va, size_t size)
{
	ssize_t sz;
	caddr_t endva = va + size;

	while (va < endva) {
		sz = hat_getpagesize(sfmmup, va);
		if (sz < 0) {
			vtag_flushall();
			break;
		}
		vtag_flushpage(va, (uint64_t)sfmmup);
		va += sz;
	}
}

/*
 * Synchronize all the mappings in the range [addr..addr+len).
 * Can be called with clearflag having two states:
 * HAT_SYNC_DONTZERO means just return the rm stats
 * HAT_SYNC_ZERORM means zero rm bits in the tte and return the stats
 */
void
hat_sync(struct hat *sfmmup, caddr_t addr, size_t len, uint_t clearflag)
{
	struct hmehash_bucket *hmebp;
	hmeblk_tag hblktag;
	int hmeshift, hashno = 1;
	struct hme_blk *hmeblkp, *list = NULL;
	caddr_t endaddr;
	cpuset_t cpuset;

	ASSERT((sfmmup == ksfmmup) ||
	    AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock));
	ASSERT((len & MMU_PAGEOFFSET) == 0);
	ASSERT((clearflag == HAT_SYNC_DONTZERO) ||
	    (clearflag == HAT_SYNC_ZERORM));

	CPUSET_ZERO(cpuset);

	endaddr = addr + len;
	hblktag.htag_id = sfmmup;
	hblktag.htag_rid = SFMMU_INVALID_SHMERID;

	/*
	 * Spitfire supports 4 page sizes.
	 * Most pages are expected to be of the smallest page
	 * size (8K) and these will not need to be rehashed. 64K
	 * pages also don't need to be rehashed because an hmeblk
	 * spans 64K of address space. 512K pages might need 1 rehash
	 * and 4M pages 2 rehashes.
	 */
	while (addr < endaddr) {
		hmeshift = HME_HASH_SHIFT(hashno);
		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
		hblktag.htag_rehash = hashno;
		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);

		SFMMU_HASH_LOCK(hmebp);

		HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
		if (hmeblkp != NULL) {
			ASSERT(!hmeblkp->hblk_shared);
			/*
			 * We've encountered a shadow hmeblk so skip the range
			 * of the next smaller mapping size.
			 */
			if (hmeblkp->hblk_shw_bit) {
				ASSERT(sfmmup != ksfmmup);
				ASSERT(hashno > 1);
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(hashno - 1));
			} else {
				addr = sfmmu_hblk_sync(sfmmup, hmeblkp,
				    addr, endaddr, clearflag);
			}
			SFMMU_HASH_UNLOCK(hmebp);
			hashno = 1;
			continue;
		}
		SFMMU_HASH_UNLOCK(hmebp);

		if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
			/*
			 * We have traversed the whole list and rehashed
			 * if necessary without finding the address to sync.
			 * This is ok so we increment the address by the
			 * smallest hmeblk range for kernel mappings and the
			 * largest hmeblk range, to account for shadow hmeblks,
			 * for user mappings and continue.
			 */
			if (sfmmup == ksfmmup)
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(1));
			else
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(hashno));
			hashno = 1;
		} else {
			hashno++;
		}
	}
	sfmmu_hblks_list_purge(&list, 0);
	cpuset = sfmmup->sfmmu_cpusran;
	xt_sync(cpuset);
}

static caddr_t
sfmmu_hblk_sync(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
    caddr_t endaddr, int clearflag)
{
	tte_t	tte, ttemod;
	struct sf_hment *sfhmep;
	int ttesz;
	struct page *pp;
	kmutex_t *pml;
	int ret;

	ASSERT(hmeblkp->hblk_shw_bit == 0);
	ASSERT(!hmeblkp->hblk_shared);

	endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));

	ttesz = get_hblk_ttesz(hmeblkp);
	HBLKTOHME(sfhmep, hmeblkp, addr);

	while (addr < endaddr) {
		sfmmu_copytte(&sfhmep->hme_tte, &tte);
		if (TTE_IS_VALID(&tte)) {
			pml = NULL;
			pp = sfhmep->hme_page;
			if (pp) {
				pml = sfmmu_mlist_enter(pp);
			}
			if (pp != sfhmep->hme_page) {
				/*
				 * tte must have been unloaded
				 * underneath us.  Recheck
				 */
				ASSERT(pml);
				sfmmu_mlist_exit(pml);
				continue;
			}

			ASSERT(pp == NULL || sfmmu_mlist_held(pp));

			if (clearflag == HAT_SYNC_ZERORM) {
				ttemod = tte;
				TTE_CLR_RM(&ttemod);
				ret = sfmmu_modifytte_try(&tte, &ttemod,
				    &sfhmep->hme_tte);
				if (ret < 0) {
					if (pml) {
						sfmmu_mlist_exit(pml);
					}
					continue;
				}

				if (ret > 0) {
					sfmmu_tlb_demap(addr, sfmmup,
					    hmeblkp, 0, 0);
				}
			}
			sfmmu_ttesync(sfmmup, addr, &tte, pp);
			if (pml) {
				sfmmu_mlist_exit(pml);
			}
		}
		addr += TTEBYTES(ttesz);
		sfhmep++;
	}
	return (addr);
}

/*
 * This function will sync a tte to the page struct and it will
 * update the hat stats. Currently it allows us to pass a NULL pp
 * and we will simply update the stats. We may want to change this
 * so we only keep stats for pages backed by pp's.
 */
static void
sfmmu_ttesync(struct hat *sfmmup, caddr_t addr, tte_t *ttep, page_t *pp)
{
	uint_t rm = 0;
	int	sz;
	pgcnt_t	npgs;

	ASSERT(TTE_IS_VALID(ttep));

	if (TTE_IS_NOSYNC(ttep)) {
		return;
	}

	if (TTE_IS_REF(ttep)) {
		rm = P_REF;
	}
	if (TTE_IS_MOD(ttep)) {
		rm |= P_MOD;
	}

	if (rm == 0) {
		return;
	}

	sz = TTE_CSZ(ttep);
	if (sfmmup != NULL && sfmmup->sfmmu_rmstat) {
		int i;
		caddr_t	vaddr = addr;

		for (i = 0; i < TTEPAGES(sz); i++, vaddr += MMU_PAGESIZE) {
			hat_setstat(sfmmup->sfmmu_as, vaddr, MMU_PAGESIZE, rm);
		}

	}

	/*
	 * XXX I want to use cas to update nrm bits but they
	 * currently belong in common/vm and not in hat where
	 * they should be.
	 * The nrm bits are protected by the same mutex as
	 * the one that protects the page's mapping list.
	 */
	if (!pp)
		return;
	ASSERT(sfmmu_mlist_held(pp));
	/*
	 * If the tte is for a large page, we need to sync all the
	 * pages covered by the tte.
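	 *
	 * Illustrative sketch (editorial addition, not part of the
	 * original source): for a 4M tte this means all TTEPAGES(TTE4M)
	 * constituent 8K pages receive the same ref/mod bits,
	 * conceptually:
	 *
	 *	pp = PP_GROUPLEADER(pp, sz);
	 *	for (npgs = TTEPAGES(sz); npgs != 0; npgs--) {
	 *		hat_page_setattr(pp, rm);
	 *		pp = PP_PAGENEXT(pp);
	 *	}
	 *
	 * which is what the do/while loop below implements, with a
	 * skip-if-already-set check.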
	 */
	if (sz != TTE8K) {
		ASSERT(pp->p_szc != 0);
		pp = PP_GROUPLEADER(pp, sz);
		ASSERT(sfmmu_mlist_held(pp));
	}

	/* Get number of pages from tte size. */
	npgs = TTEPAGES(sz);

	do {
		ASSERT(pp);
		ASSERT(sfmmu_mlist_held(pp));
		if (((rm & P_REF) != 0 && !PP_ISREF(pp)) ||
		    ((rm & P_MOD) != 0 && !PP_ISMOD(pp)))
			hat_page_setattr(pp, rm);

		/*
		 * Are we done? If not, we must have a large mapping.
		 * For large mappings we need to sync the rest of the pages
		 * covered by this tte; go to the next page.
		 */
	} while (--npgs > 0 && (pp = PP_PAGENEXT(pp)));
}

/*
 * Execute pre-callback handler of each pa_hment linked to pp
 *
 * Inputs:
 *   flag: either HAT_PRESUSPEND or HAT_SUSPEND.
 *   capture_cpus: pointer to return value (below)
 *
 * Returns:
 *   Propagates the subsystem callback return values back to the caller;
 *   returns 0 on success.  If capture_cpus is non-NULL, the value returned
 *   is zero if all of the pa_hments are of a type that do not require
 *   capturing CPUs prior to suspending the mapping, else it is 1.
 */
static int
hat_pageprocess_precallbacks(struct page *pp, uint_t flag, int *capture_cpus)
{
	struct sf_hment	*sfhmep;
	struct pa_hment *pahmep;
	int (*f)(caddr_t, uint_t, uint_t, void *);
	int		ret;
	id_t		id;
	int		locked = 0;
	kmutex_t	*pml;

	ASSERT(PAGE_EXCL(pp));
	if (!sfmmu_mlist_held(pp)) {
		pml = sfmmu_mlist_enter(pp);
		locked = 1;
	}

	if (capture_cpus)
		*capture_cpus = 0;

top:
	for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
		/*
		 * skip sf_hments corresponding to VA<->PA mappings;
		 * for pa_hment's, hme_tte.ll is zero
		 */
		if (!IS_PAHME(sfhmep))
			continue;

		pahmep = sfhmep->hme_data;
		ASSERT(pahmep != NULL);

		/*
		 * skip if pre-handler has been called earlier in this loop
		 */
		if (pahmep->flags & flag)
			continue;

		id = pahmep->cb_id;
		ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid);
		if (capture_cpus && sfmmu_cb_table[id].capture_cpus != 0)
			*capture_cpus = 1;
		if ((f = sfmmu_cb_table[id].prehandler) == NULL) {
			pahmep->flags |= flag;
			continue;
		}

		/*
		 * Drop the mapping list lock to avoid locking order issues.
		 */
		if (locked)
			sfmmu_mlist_exit(pml);

		ret = f(pahmep->addr, pahmep->len, flag, pahmep->pvt);
		if (ret != 0)
			return (ret);	/* caller must do the cleanup */

		if (locked) {
			pml = sfmmu_mlist_enter(pp);
			pahmep->flags |= flag;
			goto top;
		}

		pahmep->flags |= flag;
	}

	if (locked)
		sfmmu_mlist_exit(pml);

	return (0);
}

/*
 * Execute post-callback handler of each pa_hment linked to pp
 *
 * Same overall assumptions and restrictions apply as for
 * hat_pageprocess_precallbacks().
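 *
 * Illustrative sketch (editorial addition, not part of the original
 * source): the relocation path below pairs the pre- and post-handlers
 * roughly as
 *
 *	if (hat_pageprocess_precallbacks(pp, HAT_PRESUSPEND, &cap) != 0)
 *		undo with hat_pageprocess_postcallbacks(pp,
 *		    HAT_POSTUNSUSPEND) and bail out;
 *	if (hat_pageprocess_precallbacks(pp, HAT_SUSPEND, NULL) != 0)
 *		bail out;
 *	... suspend, copy and reload the page ...
 *	hat_pageprocess_postcallbacks(pp, HAT_UNSUSPEND);
 *	hat_pageprocess_postcallbacks(pp, HAT_POSTUNSUSPEND);
 *
 * as can be seen in hat_page_relocate() further down.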
 */
static void
hat_pageprocess_postcallbacks(struct page *pp, uint_t flag)
{
	pfn_t pgpfn = pp->p_pagenum;
	pfn_t pgmask = btop(page_get_pagesize(pp->p_szc)) - 1;
	pfn_t newpfn;
	struct sf_hment *sfhmep;
	struct pa_hment *pahmep;
	int (*f)(caddr_t, uint_t, uint_t, void *, pfn_t);
	id_t	id;
	int	locked = 0;
	kmutex_t *pml;

	ASSERT(PAGE_EXCL(pp));
	if (!sfmmu_mlist_held(pp)) {
		pml = sfmmu_mlist_enter(pp);
		locked = 1;
	}

top:
	for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
		/*
		 * skip sf_hments corresponding to VA<->PA mappings;
		 * for pa_hment's, hme_tte.ll is zero
		 */
		if (!IS_PAHME(sfhmep))
			continue;

		pahmep = sfhmep->hme_data;
		ASSERT(pahmep != NULL);

		if ((pahmep->flags & flag) == 0)
			continue;

		pahmep->flags &= ~flag;

		id = pahmep->cb_id;
		ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid);
		if ((f = sfmmu_cb_table[id].posthandler) == NULL)
			continue;

		/*
		 * Convert the base page PFN into the constituent PFN
		 * which is needed by the callback handler.
		 */
		newpfn = pgpfn | (btop((uintptr_t)pahmep->addr) & pgmask);

		/*
		 * Drop the mapping list lock to avoid locking order issues.
		 */
		if (locked)
			sfmmu_mlist_exit(pml);

		if (f(pahmep->addr, pahmep->len, flag, pahmep->pvt, newpfn)
		    != 0)
			panic("sfmmu: posthandler failed");

		if (locked) {
			pml = sfmmu_mlist_enter(pp);
			goto top;
		}
	}

	if (locked)
		sfmmu_mlist_exit(pml);
}

/*
 * Suspend locked kernel mapping
 */
void
hat_pagesuspend(struct page *pp)
{
	struct sf_hment *sfhmep;
	sfmmu_t *sfmmup;
	tte_t tte, ttemod;
	struct hme_blk *hmeblkp;
	caddr_t addr;
	int index, cons;
	cpuset_t cpuset;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(sfmmu_mlist_held(pp));

	mutex_enter(&kpr_suspendlock);

	/*
	 * We're about to suspend a kernel mapping so mark this thread as
	 * non-traceable by DTrace. This prevents us from running into issues
	 * with probe context trying to touch a suspended page
	 * in the relocation codepath itself.
	 */
	curthread->t_flag |= T_DONTDTRACE;

	index = PP_MAPINDEX(pp);
	cons = TTE8K;

retry:
	for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {

		if (IS_PAHME(sfhmep))
			continue;

		if (get_hblk_ttesz(sfmmu_hmetohblk(sfhmep)) != cons)
			continue;

		/*
		 * Loop until we successfully set the suspend bit in
		 * the TTE.
		 */
again:
		sfmmu_copytte(&sfhmep->hme_tte, &tte);
		ASSERT(TTE_IS_VALID(&tte));

		ttemod = tte;
		TTE_SET_SUSPEND(&ttemod);
		if (sfmmu_modifytte_try(&tte, &ttemod,
		    &sfhmep->hme_tte) < 0)
			goto again;

		/*
		 * Invalidate TSB entry
		 */
		hmeblkp = sfmmu_hmetohblk(sfhmep);

		sfmmup = hblktosfmmu(hmeblkp);
		ASSERT(sfmmup == ksfmmup);
		ASSERT(!hmeblkp->hblk_shared);

		addr = tte_to_vaddr(hmeblkp, tte);

		/*
		 * No need to make sure that the TSB for this sfmmu is
		 * not being relocated since it is ksfmmup and thus it
		 * will never be relocated.
		 */
		SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0);

		/*
		 * Update xcall stats
		 */
		cpuset = cpu_ready_set;
		CPUSET_DEL(cpuset, CPU->cpu_id);

		/* LINTED: constant in conditional context */
		SFMMU_XCALL_STATS(ksfmmup);

		/*
		 * Flush TLB entry on remote CPUs
		 */
		xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr,
		    (uint64_t)ksfmmup);
		xt_sync(cpuset);

		/*
		 * Flush TLB entry on local CPU
		 */
		vtag_flushpage(addr, (uint64_t)ksfmmup);
	}

	while (index != 0) {
		index = index >> 1;
		if (index != 0)
			cons++;
		if (index & 0x1) {
			pp = PP_GROUPLEADER(pp, cons);
			goto retry;
		}
	}
}

#ifdef	DEBUG

#define	N_PRLE	1024
struct prle {
	page_t *targ;
	page_t *repl;
	int status;
	int pausecpus;
	hrtime_t whence;
};

static struct prle page_relocate_log[N_PRLE];
static int prl_entry;
static kmutex_t prl_mutex;

#define	PAGE_RELOCATE_LOG(t, r, s, p)					\
	mutex_enter(&prl_mutex);					\
	page_relocate_log[prl_entry].targ = *(t);			\
	page_relocate_log[prl_entry].repl = *(r);			\
	page_relocate_log[prl_entry].status = (s);			\
	page_relocate_log[prl_entry].pausecpus = (p);			\
	page_relocate_log[prl_entry].whence = gethrtime();		\
	prl_entry = (prl_entry == (N_PRLE - 1))? 0 : prl_entry + 1;	\
	mutex_exit(&prl_mutex);

#else	/* !DEBUG */
#define	PAGE_RELOCATE_LOG(t, r, s, p)
#endif

/*
 * Core Kernel Page Relocation Algorithm
 *
 * Input:
 *
 * target :	constituent pages are SE_EXCL locked.
 * replacement:	constituent pages are SE_EXCL locked.
 *
 * Output:
 *
 * nrelocp:	number of pages relocated
 */
int
hat_page_relocate(page_t **target, page_t **replacement, spgcnt_t *nrelocp)
{
	page_t *targ, *repl;
	page_t *tpp, *rpp;
	kmutex_t *low, *high;
	spgcnt_t npages, i;
	page_t *pl = NULL;
	int old_pil;
	cpuset_t cpuset;
	int cap_cpus;
	int ret;
#ifdef VAC
	int cflags = 0;
#endif

	if (!kcage_on || PP_ISNORELOC(*target)) {
		PAGE_RELOCATE_LOG(target, replacement, EAGAIN, -1);
		return (EAGAIN);
	}

	mutex_enter(&kpr_mutex);
	kreloc_thread = curthread;

	targ = *target;
	repl = *replacement;
	ASSERT(repl != NULL);
	ASSERT(targ->p_szc == repl->p_szc);

	npages = page_get_pagecnt(targ->p_szc);

	/*
	 * unload VA<->PA mappings that are not locked
	 */
	tpp = targ;
	for (i = 0; i < npages; i++) {
		(void) hat_pageunload(tpp, SFMMU_KERNEL_RELOC);
		tpp++;
	}

	/*
	 * Do "presuspend" callbacks, in a context from which we can still
	 * block as needed. Note that we don't hold the mapping list lock
	 * of "targ" at this point due to potential locking order issues;
	 * we assume that between the hat_pageunload() above and holding
	 * the SE_EXCL lock that the mapping list *cannot* change at this
	 * point.
	 */
	ret = hat_pageprocess_precallbacks(targ, HAT_PRESUSPEND, &cap_cpus);
	if (ret != 0) {
		/*
		 * EIO translates to fatal error, for all others cleanup
		 * and return EAGAIN.
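		 *
		 * Illustrative sketch (editorial addition, not part of
		 * the original source): a caller is expected to treat
		 * EAGAIN as "this page can't be relocated right now",
		 * e.g.:
		 *
		 *	if (hat_page_relocate(&targ, &repl, &nreloc)
		 *	    == EAGAIN) {
		 *		unlock and free repl, then retry or give up
		 *	}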
		 */
		ASSERT(ret != EIO);
		hat_pageprocess_postcallbacks(targ, HAT_POSTUNSUSPEND);
		PAGE_RELOCATE_LOG(target, replacement, ret, -1);
		kreloc_thread = NULL;
		mutex_exit(&kpr_mutex);
		return (EAGAIN);
	}

	/*
	 * acquire p_mapping list lock for both the target and replacement
	 * root pages.
	 *
	 * low and high refer to the need to grab the mlist locks in a
	 * specific order to prevent race conditions.  Thus the
	 * lower lock must be grabbed before the higher lock.
	 *
	 * This will block hat_unload's accessing p_mapping list.  Since
	 * we have SE_EXCL lock, hat_memload and hat_pageunload will be
	 * blocked.  Thus, no one else will be accessing the p_mapping list
	 * while we suspend and reload the locked mapping below.
	 */
	tpp = targ;
	rpp = repl;
	sfmmu_mlist_reloc_enter(tpp, rpp, &low, &high);

	kpreempt_disable();

	/*
	 * We raise our PIL to 13 so that we don't get captured by
	 * another CPU or pinned by an interrupt thread.  We can't go to
	 * PIL 14 since the nexus driver(s) may need to interrupt at
	 * that level in the case of IOMMU pseudo mappings.
	 */
	cpuset = cpu_ready_set;
	CPUSET_DEL(cpuset, CPU->cpu_id);
	if (!cap_cpus || CPUSET_ISNULL(cpuset)) {
		old_pil = splr(XCALL_PIL);
	} else {
		old_pil = -1;
		xc_attention(cpuset);
	}
	ASSERT(getpil() == XCALL_PIL);

	/*
	 * Now do suspend callbacks.  In the case of an IOMMU mapping
	 * this will suspend all DMA activity to the page while it is
	 * being relocated.  Since we are well above LOCK_LEVEL and CPUs
	 * may be captured at this point we should have acquired any needed
	 * locks in the presuspend callback.
	 */
	ret = hat_pageprocess_precallbacks(targ, HAT_SUSPEND, NULL);
	if (ret != 0) {
		repl = targ;
		goto suspend_fail;
	}

	/*
	 * Raise the PIL yet again, this time to block all high-level
	 * interrupts on this CPU. This is necessary to prevent an
	 * interrupt routine from pinning the thread which holds the
	 * mapping suspended and then touching the suspended page.
	 *
	 * Once the page is suspended we also need to be careful to
	 * avoid calling any functions which touch any seg_kmem memory
	 * since that memory may be backed by the very page we are
	 * relocating in here!
	 */
	hat_pagesuspend(targ);

	/*
	 * Now that we are confident everybody has stopped using this page,
	 * copy the page contents.  Note we use a physical copy to prevent
	 * locking issues and to avoid fpRAS because we can't handle it in
	 * this context.
	 */
	for (i = 0; i < npages; i++, tpp++, rpp++) {
#ifdef VAC
		/*
		 * If the replacement has a different vcolor than
		 * the one being replaced, we need to handle the VAC
		 * consistency for it just as if we were setting up
		 * a new mapping to it.
		 */
		if ((PP_GET_VCOLOR(rpp) != NO_VCOLOR) &&
		    (tpp->p_vcolor != rpp->p_vcolor) &&
		    !CacheColor_IsFlushed(cflags, PP_GET_VCOLOR(rpp))) {
			CacheColor_SetFlushed(cflags, PP_GET_VCOLOR(rpp));
			sfmmu_cache_flushcolor(PP_GET_VCOLOR(rpp),
			    rpp->p_pagenum);
		}
#endif
		/*
		 * Copy the contents of the page.
		 */
		ppcopy_kernel(tpp, rpp);
	}

	tpp = targ;
	rpp = repl;
	for (i = 0; i < npages; i++, tpp++, rpp++) {
		/*
		 * Copy attributes.  VAC consistency was handled above,
		 * if required.
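		 *
		 * Illustrative sketch (editorial addition, not part of
		 * the original source): cflags above acts as a bitmask
		 * of virtual colors already flushed, so each color is
		 * flushed at most once per relocation:
		 *
		 *	if (!CacheColor_IsFlushed(cflags, vcolor)) {
		 *		CacheColor_SetFlushed(cflags, vcolor);
		 *		sfmmu_cache_flushcolor(vcolor, pfn);
		 *	}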
		 */
		rpp->p_nrm = tpp->p_nrm;
		tpp->p_nrm = 0;
		rpp->p_index = tpp->p_index;
		tpp->p_index = 0;
#ifdef VAC
		rpp->p_vcolor = tpp->p_vcolor;
#endif
	}

	/*
	 * First, unsuspend the page, if we set the suspend bit, and transfer
	 * the mapping list from the target page to the replacement page.
	 * Next process postcallbacks; since pa_hment's are linked only to the
	 * p_mapping list of root page, we don't iterate over the constituent
	 * pages.
	 */
	hat_pagereload(targ, repl);

suspend_fail:
	hat_pageprocess_postcallbacks(repl, HAT_UNSUSPEND);

	/*
	 * Now lower our PIL and release any captured CPUs since we
	 * are out of the "danger zone".  After this it will again be
	 * safe to acquire adaptive mutex locks, or to drop them...
	 */
	if (old_pil != -1) {
		splx(old_pil);
	} else {
		xc_dismissed(cpuset);
	}

	kpreempt_enable();

	sfmmu_mlist_reloc_exit(low, high);

	/*
	 * Postsuspend callbacks should drop any locks held across
	 * the suspend callbacks.  As before, we don't hold the mapping
	 * list lock at this point; our assumption is that the mapping
	 * list still can't change due to our holding SE_EXCL lock and
	 * there being no unlocked mappings left. Hence the restriction
	 * on calling context to hat_delete_callback().
	 */
	hat_pageprocess_postcallbacks(repl, HAT_POSTUNSUSPEND);
	if (ret != 0) {
		/*
		 * The second presuspend call failed: we got here through
		 * the suspend_fail label above.
		 */
		ASSERT(ret != EIO);
		PAGE_RELOCATE_LOG(target, replacement, ret, cap_cpus);
		kreloc_thread = NULL;
		mutex_exit(&kpr_mutex);
		return (EAGAIN);
	}

	/*
	 * Now that we're out of the performance critical section we can
	 * take care of updating the hash table, since we still
	 * hold all the pages locked SE_EXCL at this point we
	 * needn't worry about things changing out from under us.
	 */
	tpp = targ;
	rpp = repl;
	for (i = 0; i < npages; i++, tpp++, rpp++) {

		/*
		 * replace targ with replacement in page_hash table
		 */
		targ = tpp;
		page_relocate_hash(rpp, targ);

		/*
		 * concatenate target; caller of platform_page_relocate()
		 * expects target to be concatenated after returning.
		 */
		ASSERT(targ->p_next == targ);
		ASSERT(targ->p_prev == targ);
		page_list_concat(&pl, &targ);
	}

	ASSERT(*target == pl);
	*nrelocp = npages;
	PAGE_RELOCATE_LOG(target, replacement, 0, cap_cpus);
	kreloc_thread = NULL;
	mutex_exit(&kpr_mutex);
	return (0);
}

/*
 * Called when stray pa_hments are found attached to a page which is
 * being freed.  Notify the subsystem which attached the pa_hment of
 * the error if it registered a suitable handler, else panic.
 */
static void
sfmmu_pahment_leaked(struct pa_hment *pahmep)
{
	id_t cb_id = pahmep->cb_id;

	ASSERT(cb_id >= (id_t)0 && cb_id < sfmmu_cb_nextid);
	if (sfmmu_cb_table[cb_id].errhandler != NULL) {
		if (sfmmu_cb_table[cb_id].errhandler(pahmep->addr, pahmep->len,
		    HAT_CB_ERR_LEAKED, pahmep->pvt) == 0)
			return;		/* non-fatal */
	}
	panic("pa_hment leaked: 0x%p", (void *)pahmep);
}

/*
 * Remove all mappings to page 'pp'.
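 *
 * Illustrative sketch (editorial addition, not part of the original
 * source): a typical caller holds the page SE_EXCL and drops all
 * translations before freeing or relocating it:
 *
 *	if (page_tryupgrade(pp)) {
 *		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
 *		ASSERT(!PP_ISMAPPED(pp));
 *	}
 *
 * HAT_FORCE_PGUNLOAD is the usual forceflag; SFMMU_KERNEL_RELOC is
 * reserved for the kernel page relocation path above.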
 */
int
hat_pageunload(struct page *pp, uint_t forceflag)
{
	struct page *origpp = pp;
	struct sf_hment *sfhme, *tmphme;
	struct hme_blk *hmeblkp;
	kmutex_t *pml;
#ifdef VAC
	kmutex_t *pmtx;
#endif
	cpuset_t cpuset, tset;
	int index, cons;
	int pa_hments;

	ASSERT(PAGE_EXCL(pp));

	tmphme = NULL;
	pa_hments = 0;
	CPUSET_ZERO(cpuset);

	pml = sfmmu_mlist_enter(pp);

#ifdef VAC
	if (pp->p_kpmref)
		sfmmu_kpm_pageunload(pp);
	ASSERT(!PP_ISMAPPED_KPM(pp));
#endif
	/*
	 * Clear vpm reference. Since the page is exclusively locked
	 * vpm cannot be referencing it.
	 */
	if (vpm_enable) {
		pp->p_vpmref = 0;
	}

	index = PP_MAPINDEX(pp);
	cons = TTE8K;
retry:
	for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
		tmphme = sfhme->hme_next;

		if (IS_PAHME(sfhme)) {
			ASSERT(sfhme->hme_data != NULL);
			pa_hments++;
			continue;
		}

		hmeblkp = sfmmu_hmetohblk(sfhme);

		/*
		 * If there are kernel mappings, don't unload them;
		 * they will be suspended.
		 */
		if (forceflag == SFMMU_KERNEL_RELOC && hmeblkp->hblk_lckcnt &&
		    hmeblkp->hblk_tag.htag_id == ksfmmup)
			continue;

		tset = sfmmu_pageunload(pp, sfhme, cons);
		CPUSET_OR(cpuset, tset);
	}

	while (index != 0) {
		index = index >> 1;
		if (index != 0)
			cons++;
		if (index & 0x1) {
			/* Go to leading page */
			pp = PP_GROUPLEADER(pp, cons);
			ASSERT(sfmmu_mlist_held(pp));
			goto retry;
		}
	}

	/*
	 * cpuset may be empty if the page was only mapped by segkpm,
	 * in which case we won't actually cross-trap.
	 */
	xt_sync(cpuset);

	/*
	 * The page should have no mappings at this point, unless
	 * we were called from hat_page_relocate() in which case we
	 * leave the locked mappings which will be suspended later.
	 */
	ASSERT(!PP_ISMAPPED(origpp) || pa_hments ||
	    (forceflag == SFMMU_KERNEL_RELOC));

#ifdef VAC
	if (PP_ISTNC(pp)) {
		if (cons == TTE8K) {
			pmtx = sfmmu_page_enter(pp);
			PP_CLRTNC(pp);
			sfmmu_page_exit(pmtx);
		} else {
			conv_tnc(pp, cons);
		}
	}
#endif	/* VAC */

	if (pa_hments && forceflag != SFMMU_KERNEL_RELOC) {
		/*
		 * Unlink any pa_hments and free them, calling back
		 * the responsible subsystem to notify it of the error.
		 * This can occur in situations such as drivers leaking
		 * DMA handles: naughty, but common enough that we'd like
		 * to keep the system running rather than bringing it
		 * down with an obscure error like "pa_hment leaked"
		 * which doesn't aid the user in debugging their driver.
		 */
		for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
			tmphme = sfhme->hme_next;
			if (IS_PAHME(sfhme)) {
				struct pa_hment *pahmep = sfhme->hme_data;
				sfmmu_pahment_leaked(pahmep);
				HME_SUB(sfhme, pp);
				kmem_cache_free(pa_hment_cache, pahmep);
			}
		}

		ASSERT(!PP_ISMAPPED(origpp));
	}

	sfmmu_mlist_exit(pml);

	return (0);
}

cpuset_t
sfmmu_pageunload(page_t *pp, struct sf_hment *sfhme, int cons)
{
	struct hme_blk *hmeblkp;
	sfmmu_t *sfmmup;
	tte_t tte, ttemod;
#ifdef DEBUG
	tte_t orig_old;
#endif /* DEBUG */
	caddr_t addr;
	int ttesz;
	int ret;
	cpuset_t cpuset;

	ASSERT(pp != NULL);
	ASSERT(sfmmu_mlist_held(pp));
	ASSERT(!PP_ISKAS(pp));

	CPUSET_ZERO(cpuset);

	hmeblkp = sfmmu_hmetohblk(sfhme);

readtte:
	sfmmu_copytte(&sfhme->hme_tte, &tte);
	if (TTE_IS_VALID(&tte)) {
		sfmmup = hblktosfmmu(hmeblkp);
		ttesz = get_hblk_ttesz(hmeblkp);
		/*
		 * Only unload mappings of 'cons' size.
		 */
		if (ttesz != cons)
			return (cpuset);

		/*
		 * Note that we have p_mapping lock, but no hash lock here.
		 * hblk_unload() has to have both hash lock AND p_mapping
		 * lock before it tries to modify tte. So, the tte could
		 * not become invalid in the sfmmu_modifytte_try() below.
		 */
		ttemod = tte;
#ifdef DEBUG
		orig_old = tte;
#endif /* DEBUG */

		TTE_SET_INVALID(&ttemod);
		ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte);
		if (ret < 0) {
#ifdef DEBUG
			/* only R/M bits can change. */
			chk_tte(&orig_old, &tte, &ttemod, hmeblkp);
#endif /* DEBUG */
			goto readtte;
		}

		if (ret == 0) {
			panic("pageunload: cas failed?");
		}

		addr = tte_to_vaddr(hmeblkp, tte);

		if (hmeblkp->hblk_shared) {
			sf_srd_t *srdp = (sf_srd_t *)sfmmup;
			uint_t rid = hmeblkp->hblk_tag.htag_rid;
			sf_region_t *rgnp;
			ASSERT(SFMMU_IS_SHMERID_VALID(rid));
			ASSERT(rid < SFMMU_MAX_HME_REGIONS);
			ASSERT(srdp != NULL);
			rgnp = srdp->srd_hmergnp[rid];
			SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid);
			cpuset = sfmmu_rgntlb_demap(addr, rgnp, hmeblkp, 1);
			sfmmu_ttesync(NULL, addr, &tte, pp);
			ASSERT(rgnp->rgn_ttecnt[ttesz] > 0);
			atomic_dec_ulong(&rgnp->rgn_ttecnt[ttesz]);
		} else {
			sfmmu_ttesync(sfmmup, addr, &tte, pp);
			atomic_dec_ulong(&sfmmup->sfmmu_ttecnt[ttesz]);

			/*
			 * We need to flush the page from the virtual cache
			 * in order to prevent a virtual cache alias
			 * inconsistency. The particular scenario we need
			 * to worry about is:
			 * Given:  va1 and va2 are two virtual addresses
			 * that alias and will map the same physical address.
			 * 1.   mapping exists from va1 to pa and data has
			 *	been read into the cache.
			 * 2.   unload va1.
			 * 3.   load va2 and modify data using va2.
			 * 4.   unload va2.
			 * 5.   load va1 and reference data.  Unless we flush
			 *	the data cache when we unload we will get
			 *	stale data.
			 * This scenario is taken care of by using virtual
			 * page coloring.
			 */
			if (sfmmup->sfmmu_ismhat) {
				/*
				 * Flush TSBs, TLBs and caches
				 * of every process
				 * sharing this ism segment.
				 */
				sfmmu_hat_lock_all();
				mutex_enter(&ism_mlist_lock);
				kpreempt_disable();
				sfmmu_ismtlbcache_demap(addr, sfmmup, hmeblkp,
				    pp->p_pagenum, CACHE_NO_FLUSH);
				kpreempt_enable();
				mutex_exit(&ism_mlist_lock);
				sfmmu_hat_unlock_all();
				cpuset = cpu_ready_set;
			} else {
				sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
				cpuset = sfmmup->sfmmu_cpusran;
			}
		}

		/*
		 * Hme_sub has to run after ttesync() and a_rss update.
		 * See hblk_unload().
		 */
		HME_SUB(sfhme, pp);
		membar_stst();

		/*
		 * We can not make ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS)
		 * since pteload may have done a HME_ADD() right after
		 * we did the HME_SUB() above.  Hmecnt is now maintained
		 * by cas only; no lock guarantees its value.  The only
		 * guarantee we have is that the hmecnt should not be less
		 * than what it should be, so the hblk will not be taken
		 * away.  It's also important that we decremented the
		 * hmecnt after we are done with hmeblkp so that this
		 * hmeblk won't be stolen.
		 */
		ASSERT(hmeblkp->hblk_hmecnt > 0);
		ASSERT(hmeblkp->hblk_vcnt > 0);
		atomic_dec_16(&hmeblkp->hblk_vcnt);
		atomic_dec_16(&hmeblkp->hblk_hmecnt);
		/*
		 * This is bug 4063182.
		 * XXX: fixme
		 * ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt ||
		 *	!hmeblkp->hblk_lckcnt);
		 */
	} else {
		panic("invalid tte? pp %p &tte %p",
		    (void *)pp, (void *)&tte);
	}

	return (cpuset);
}

/*
 * While relocating a kernel page, this function will move the mappings
 * from tpp to dpp and modify any associated data with these mappings.
 * It also unsuspends the suspended kernel mapping.
 */
static void
hat_pagereload(struct page *tpp, struct page *dpp)
{
	struct sf_hment *sfhme;
	tte_t tte, ttemod;
	int index, cons;

	ASSERT(getpil() == PIL_MAX);
	ASSERT(sfmmu_mlist_held(tpp));
	ASSERT(sfmmu_mlist_held(dpp));

	index = PP_MAPINDEX(tpp);
	cons = TTE8K;

	/* Update real mappings to the page */
retry:
	for (sfhme = tpp->p_mapping; sfhme != NULL; sfhme = sfhme->hme_next) {
		if (IS_PAHME(sfhme))
			continue;
		sfmmu_copytte(&sfhme->hme_tte, &tte);
		ttemod = tte;

		/*
		 * replace old pfn with new pfn in TTE
		 */
		PFN_TO_TTE(ttemod, dpp->p_pagenum);

		/*
		 * clear suspend bit
		 */
		ASSERT(TTE_IS_SUSPEND(&ttemod));
		TTE_CLR_SUSPEND(&ttemod);

		if (sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte) < 0)
			panic("hat_pagereload(): sfmmu_modifytte_try() failed");

		/*
		 * set hme_page point to new page
		 */
		sfhme->hme_page = dpp;
	}

	/*
	 * move p_mapping list from old page to new page
	 */
	dpp->p_mapping = tpp->p_mapping;
	tpp->p_mapping = NULL;
	dpp->p_share = tpp->p_share;
	tpp->p_share = 0;

	while (index != 0) {
		index = index >> 1;
		if (index != 0)
			cons++;
		if (index & 0x1) {
			tpp = PP_GROUPLEADER(tpp, cons);
			dpp = PP_GROUPLEADER(dpp, cons);
			goto retry;
		}
	}

	curthread->t_flag &= ~T_DONTDTRACE;
	mutex_exit(&kpr_suspendlock);
}

uint_t
hat_pagesync(struct page *pp, uint_t clearflag)
{
	struct sf_hment *sfhme, *tmphme = NULL;
	struct hme_blk *hmeblkp;
	kmutex_t *pml;
	cpuset_t cpuset, tset;
	int	index, cons;
	extern	ulong_t po_share;
	page_t	*save_pp = pp;
	int	stop_on_sh = 0;
	uint_t	shcnt;

	CPUSET_ZERO(cpuset);

	if (PP_ISRO(pp) && (clearflag & HAT_SYNC_STOPON_MOD)) {
		return (PP_GENERIC_ATTR(pp));
	}

	if ((clearflag & HAT_SYNC_ZERORM) == 0) {
		if ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(pp)) {
			return (PP_GENERIC_ATTR(pp));
		}
		if ((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(pp)) {
			return (PP_GENERIC_ATTR(pp));
		}
		if (clearflag & HAT_SYNC_STOPON_SHARED) {
			if (pp->p_share > po_share) {
				hat_page_setattr(pp, P_REF);
				return (PP_GENERIC_ATTR(pp));
			}
			stop_on_sh = 1;
			shcnt = 0;
		}
	}

	clearflag &= ~HAT_SYNC_STOPON_SHARED;
	pml = sfmmu_mlist_enter(pp);
	index = PP_MAPINDEX(pp);
	cons = TTE8K;
retry:
	for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
		/*
		 * We need to save the next hment on the list since
		 * it is possible for pagesync to remove an invalid hment
		 * from the list.
		 */
		tmphme = sfhme->hme_next;
		if (IS_PAHME(sfhme))
			continue;
		/*
		 * If we are looking for large mappings and this hme doesn't
		 * reach the range we are seeking, just ignore it.
		 */
		hmeblkp = sfmmu_hmetohblk(sfhme);

		if (hme_size(sfhme) < cons)
			continue;

		if (stop_on_sh) {
			if (hmeblkp->hblk_shared) {
				sf_srd_t *srdp = hblktosrd(hmeblkp);
				uint_t rid = hmeblkp->hblk_tag.htag_rid;
				sf_region_t *rgnp;
				ASSERT(SFMMU_IS_SHMERID_VALID(rid));
				ASSERT(rid < SFMMU_MAX_HME_REGIONS);
				ASSERT(srdp != NULL);
				rgnp = srdp->srd_hmergnp[rid];
				SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp,
				    rgnp, rid);
				shcnt += rgnp->rgn_refcnt;
			} else {
				shcnt++;
			}
			if (shcnt > po_share) {
				/*
				 * tell the pager to spare the page this time
				 * around.
				 */
				hat_page_setattr(save_pp, P_REF);
				index = 0;
				break;
			}
		}
		tset = sfmmu_pagesync(pp, sfhme,
		    clearflag & ~HAT_SYNC_STOPON_RM);
		CPUSET_OR(cpuset, tset);

		/*
		 * If clearflag is HAT_SYNC_DONTZERO, break out as soon
		 * as the "ref" or "mod" is set or share cnt exceeds po_share.
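		 *
		 * Illustrative sketch (editorial addition, not part of
		 * the original source; one plausible consumer pattern,
		 * shown only as an example of the flag semantics):
		 *
		 *	attr = hat_pagesync(pp,
		 *	    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF);
		 *	if (attr & P_REF)
		 *		the page was recently referenced; spare it
		 *
		 * so the early break below avoids visiting every mapping.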
		 */
		if ((clearflag & ~HAT_SYNC_STOPON_RM) == HAT_SYNC_DONTZERO &&
		    (((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(save_pp)) ||
		    ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(save_pp)))) {
			index = 0;
			break;
		}
	}

	while (index) {
		index = index >> 1;
		cons++;
		if (index & 0x1) {
			/* Go to leading page */
			pp = PP_GROUPLEADER(pp, cons);
			goto retry;
		}
	}

	xt_sync(cpuset);
	sfmmu_mlist_exit(pml);
	return (PP_GENERIC_ATTR(save_pp));
}

/*
 * Get all the hardware dependent attributes for a page struct
 */
static cpuset_t
sfmmu_pagesync(struct page *pp, struct sf_hment *sfhme,
    uint_t clearflag)
{
	caddr_t addr;
	tte_t tte, ttemod;
	struct hme_blk *hmeblkp;
	int ret;
	sfmmu_t *sfmmup;
	cpuset_t cpuset;

	ASSERT(pp != NULL);
	ASSERT(sfmmu_mlist_held(pp));
	ASSERT((clearflag == HAT_SYNC_DONTZERO) ||
	    (clearflag == HAT_SYNC_ZERORM));

	SFMMU_STAT(sf_pagesync);

	CPUSET_ZERO(cpuset);

sfmmu_pagesync_retry:

	sfmmu_copytte(&sfhme->hme_tte, &tte);
	if (TTE_IS_VALID(&tte)) {
		hmeblkp = sfmmu_hmetohblk(sfhme);
		sfmmup = hblktosfmmu(hmeblkp);
		addr = tte_to_vaddr(hmeblkp, tte);
		if (clearflag == HAT_SYNC_ZERORM) {
			ttemod = tte;
			TTE_CLR_RM(&ttemod);
			ret = sfmmu_modifytte_try(&tte, &ttemod,
			    &sfhme->hme_tte);
			if (ret < 0) {
				/*
				 * cas failed and the new value is not what
				 * we want.
				 */
				goto sfmmu_pagesync_retry;
			}

			if (ret > 0) {
				/* we win the cas */
				if (hmeblkp->hblk_shared) {
					sf_srd_t *srdp = (sf_srd_t *)sfmmup;
					uint_t rid =
					    hmeblkp->hblk_tag.htag_rid;
					sf_region_t *rgnp;
					ASSERT(SFMMU_IS_SHMERID_VALID(rid));
					ASSERT(rid < SFMMU_MAX_HME_REGIONS);
					ASSERT(srdp != NULL);
					rgnp = srdp->srd_hmergnp[rid];
					SFMMU_VALIDATE_SHAREDHBLK(hmeblkp,
					    srdp, rgnp, rid);
					cpuset = sfmmu_rgntlb_demap(addr,
					    rgnp, hmeblkp, 1);
				} else {
					sfmmu_tlb_demap(addr, sfmmup, hmeblkp,
					    0, 0);
					cpuset = sfmmup->sfmmu_cpusran;
				}
			}
		}
		sfmmu_ttesync(hmeblkp->hblk_shared ? NULL : sfmmup, addr,
		    &tte, pp);
	}
	return (cpuset);
}

/*
 * Remove write permission from a mapping to a page, so that
 * we can detect the next modification of it. This requires modifying
 * the TTE then invalidating (demap) any TLB entry using that TTE.
 * This code is similar to sfmmu_pagesync().
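 *
 * Illustrative sketch (editorial addition, not part of the original
 * source): conceptually, the next store through any demapped
 * translation then takes a protection fault, and the fault path
 * reinstates the writable tte with the mod bit set:
 *
 *	store to va  ->  TLB miss/protection fault
 *	    ->  fault handler sets the mod bit, makes the tte writable
 *	    ->  hat_page_setattr(pp, P_MOD) records the page as dirty
 *
 * which is how VMODSORT (see hat_page_clrattr() below) learns that a
 * clean page has become dirty again.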
 */
static cpuset_t
sfmmu_pageclrwrt(struct page *pp, struct sf_hment *sfhme)
{
	caddr_t addr;
	tte_t tte;
	tte_t ttemod;
	struct hme_blk *hmeblkp;
	int ret;
	sfmmu_t *sfmmup;
	cpuset_t cpuset;

	ASSERT(pp != NULL);
	ASSERT(sfmmu_mlist_held(pp));

	CPUSET_ZERO(cpuset);
	SFMMU_STAT(sf_clrwrt);

retry:

	sfmmu_copytte(&sfhme->hme_tte, &tte);
	if (TTE_IS_VALID(&tte) && TTE_IS_WRITABLE(&tte)) {
		hmeblkp = sfmmu_hmetohblk(sfhme);
		sfmmup = hblktosfmmu(hmeblkp);
		addr = tte_to_vaddr(hmeblkp, tte);

		ttemod = tte;
		TTE_CLR_WRT(&ttemod);
		TTE_CLR_MOD(&ttemod);
		ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte);

		/*
		 * if cas failed and the new value is not what
		 * we want retry
		 */
		if (ret < 0)
			goto retry;

		/* we win the cas */
		if (ret > 0) {
			if (hmeblkp->hblk_shared) {
				sf_srd_t *srdp = (sf_srd_t *)sfmmup;
				uint_t rid = hmeblkp->hblk_tag.htag_rid;
				sf_region_t *rgnp;
				ASSERT(SFMMU_IS_SHMERID_VALID(rid));
				ASSERT(rid < SFMMU_MAX_HME_REGIONS);
				ASSERT(srdp != NULL);
				rgnp = srdp->srd_hmergnp[rid];
				SFMMU_VALIDATE_SHAREDHBLK(hmeblkp,
				    srdp, rgnp, rid);
				cpuset = sfmmu_rgntlb_demap(addr,
				    rgnp, hmeblkp, 1);
			} else {
				sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
				cpuset = sfmmup->sfmmu_cpusran;
			}
		}
	}

	return (cpuset);
}

/*
 * Walk all mappings of a page, removing write permission and clearing the
 * ref/mod bits. This code is similar to hat_pagesync().
 */
static void
hat_page_clrwrt(page_t *pp)
{
	struct sf_hment *sfhme;
	struct sf_hment *tmphme = NULL;
	kmutex_t *pml;
	cpuset_t cpuset;
	cpuset_t tset;
	int	index;
	int	cons;

	CPUSET_ZERO(cpuset);

	pml = sfmmu_mlist_enter(pp);
	index = PP_MAPINDEX(pp);
	cons = TTE8K;
retry:
	for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
		tmphme = sfhme->hme_next;

		/*
		 * If we are looking for large mappings and this hme doesn't
		 * reach the range we are seeking, just ignore it.
		 */

		if (hme_size(sfhme) < cons)
			continue;

		tset = sfmmu_pageclrwrt(pp, sfhme);
		CPUSET_OR(cpuset, tset);
	}

	while (index) {
		index = index >> 1;
		cons++;
		if (index & 0x1) {
			/* Go to leading page */
			pp = PP_GROUPLEADER(pp, cons);
			goto retry;
		}
	}

	xt_sync(cpuset);
	sfmmu_mlist_exit(pml);
}

/*
 * Set the given REF/MOD/RO bits for the given page.
 * For a vnode with a sorted v_pages list, we need to change
 * the attributes and the v_pages list together under page_vnode_mutex.
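 *
 * Illustrative sketch (editorial addition, not part of the original
 * source): callers pass some subset of the three bits, e.g.
 *
 *	hat_page_setattr(pp, P_REF);		mark referenced
 *	hat_page_setattr(pp, P_REF | P_MOD);	mark referenced and dirty
 *
 * and may OR in P_NSH to suppress the v_pages reshuffle done below.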
 */
void
hat_page_setattr(page_t *pp, uint_t flag)
{
	vnode_t		*vp = pp->p_vnode;
	page_t		**listp;
	kmutex_t	*pmtx;
	kmutex_t	*vphm = NULL;
	int		noshuffle;

	noshuffle = flag & P_NSH;
	flag &= ~P_NSH;

	ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));

	/*
	 * nothing to do if attribute already set
	 */
	if ((pp->p_nrm & flag) == flag)
		return;

	if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp) &&
	    !noshuffle) {
		vphm = page_vnode_mutex(vp);
		mutex_enter(vphm);
	}

	pmtx = sfmmu_page_enter(pp);
	pp->p_nrm |= flag;
	sfmmu_page_exit(pmtx);

	if (vphm != NULL) {
		/*
		 * Some File Systems examine v_pages for NULL w/o
		 * grabbing the vphm mutex. Must not let it become NULL when
		 * pp is the only page on the list.
		 */
		if (pp->p_vpnext != pp) {
			page_vpsub(&vp->v_pages, pp);
			if (vp->v_pages != NULL)
				listp = &vp->v_pages->p_vpprev->p_vpnext;
			else
				listp = &vp->v_pages;
			page_vpadd(listp, pp);
		}
		mutex_exit(vphm);
	}
}

void
hat_page_clrattr(page_t *pp, uint_t flag)
{
	vnode_t		*vp = pp->p_vnode;
	kmutex_t	*pmtx;

	ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));

	pmtx = sfmmu_page_enter(pp);

	/*
	 * Caller is expected to hold page's io lock for VMODSORT to work
	 * correctly with pvn_vplist_dirty() and pvn_getdirty() when mod
	 * bit is cleared.
	 * We don't assert this, to avoid tripping some existing third
	 * party code. The dirty page is moved back to top of the v_page
	 * list after IO is done in pvn_write_done().
	 */
	pp->p_nrm &= ~flag;
	sfmmu_page_exit(pmtx);

	if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp)) {

		/*
		 * VMODSORT works by removing write permissions and getting
		 * a fault when a page is made dirty. At this point
		 * we need to remove write permission from all mappings
		 * to this page.
		 */
		hat_page_clrwrt(pp);
	}
}

uint_t
hat_page_getattr(page_t *pp, uint_t flag)
{
	ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));
	return ((uint_t)(pp->p_nrm & flag));
}

/*
 * DEBUG kernels: verify that a kernel va<->pa translation
 * is safe by checking the underlying page_t is in a page
 * relocation-safe state.
 */
#ifdef	DEBUG
void
sfmmu_check_kpfn(pfn_t pfn)
{
	page_t *pp;
	int index, cons;

	if (hat_check_vtop == 0)
		return;

	if (kvseg.s_base == NULL || panicstr)
		return;

	pp = page_numtopp_nolock(pfn);
	if (!pp)
		return;

	if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp))
		return;

	/*
	 * Handed a large kernel page, we dig up the root page since we
	 * know the root page might have the lock also.
	 */
	if (pp->p_szc != 0) {
		index = PP_MAPINDEX(pp);
		cons = TTE8K;
again:
		while (index != 0) {
			index >>= 1;
			if (index != 0)
				cons++;
			if (index & 0x1) {
				pp = PP_GROUPLEADER(pp, cons);
				goto again;
			}
		}
	}

	if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp))
		return;

	/*
	 * Pages need to be locked or allocated "permanent" (either from
	 * static_arena arena or explicitly setting PG_NORELOC when calling
	 * page_create_va()) for VA->PA translations to be valid.
	 */
	if (!PP_ISNORELOC(pp))
		panic("Illegal VA->PA translation, pp 0x%p not permanent",
		    (void *)pp);
	else
		panic("Illegal VA->PA translation, pp 0x%p not locked",
		    (void *)pp);
}
#endif	/* DEBUG */

/*
 * Returns a page frame number for a given virtual address.
 * Returns PFN_INVALID to indicate an invalid mapping.
 */
pfn_t
hat_getpfnum(struct hat *hat, caddr_t addr)
{
	pfn_t pfn;
	tte_t tte;

	/*
	 * We would like to
	 * ASSERT(AS_LOCK_HELD(as, &as->a_lock));
	 * but we can't because the iommu driver will call this
	 * routine at interrupt time and it can't grab the as lock
	 * or it will deadlock: A thread could have the as lock
	 * and be waiting for io.  The io can't complete
	 * because the interrupt thread is blocked trying to grab
	 * the as lock.
	 */

	if (hat == ksfmmup) {
		if (IS_KMEM_VA_LARGEPAGE(addr)) {
			ASSERT(segkmem_lpszc > 0);
			pfn = sfmmu_kvaszc2pfn(addr, segkmem_lpszc);
			if (pfn != PFN_INVALID) {
				sfmmu_check_kpfn(pfn);
				return (pfn);
			}
		} else if (segkpm && IS_KPM_ADDR(addr)) {
			return (sfmmu_kpm_vatopfn(addr));
		}
		while ((pfn = sfmmu_vatopfn(addr, ksfmmup, &tte))
		    == PFN_SUSPENDED) {
			sfmmu_vatopfn_suspended(addr, ksfmmup, &tte);
		}
		sfmmu_check_kpfn(pfn);
		return (pfn);
	} else {
		return (sfmmu_uvatopfn(addr, hat, NULL));
	}
}

/*
 * This routine will return both pfn and tte for the vaddr.
 */
static pfn_t
sfmmu_uvatopfn(caddr_t vaddr, struct hat *sfmmup, tte_t *ttep)
{
	struct hmehash_bucket *hmebp;
	hmeblk_tag hblktag;
	int hmeshift, hashno = 1;
	struct hme_blk *hmeblkp = NULL;
	tte_t tte;

	struct sf_hment *sfhmep;
	pfn_t pfn;

	/* support for ISM */
	ism_map_t	*ism_map;
	ism_blk_t	*ism_blkp;
	int		i;
	sfmmu_t *ism_hatid = NULL;
	sfmmu_t *locked_hatid = NULL;
	sfmmu_t	*sv_sfmmup = sfmmup;
	caddr_t	sv_vaddr = vaddr;
	sf_srd_t *srdp;

	if (ttep == NULL) {
		ttep = &tte;
	} else {
		ttep->ll = 0;
	}

	ASSERT(sfmmup != ksfmmup);
	SFMMU_STAT(sf_user_vtop);
	/*
	 * Set ism_hatid if vaddr falls in an ISM segment.
	 */
	ism_blkp = sfmmup->sfmmu_iblk;
	if (ism_blkp != NULL) {
		sfmmu_ismhat_enter(sfmmup, 0);
		locked_hatid = sfmmup;
	}
	while (ism_blkp != NULL && ism_hatid == NULL) {
		ism_map = ism_blkp->iblk_maps;
		for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) {
			if (vaddr >= ism_start(ism_map[i]) &&
			    vaddr < ism_end(ism_map[i])) {
				sfmmup = ism_hatid = ism_map[i].imap_ismhat;
				vaddr = (caddr_t)(vaddr -
				    ism_start(ism_map[i]));
				break;
			}
		}
		ism_blkp = ism_blkp->iblk_next;
	}
	if (locked_hatid) {
		sfmmu_ismhat_exit(locked_hatid, 0);
	}

	hblktag.htag_id = sfmmup;
	hblktag.htag_rid = SFMMU_INVALID_SHMERID;
	do {
		hmeshift = HME_HASH_SHIFT(hashno);
		hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift);
		hblktag.htag_rehash = hashno;
		hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift);

		SFMMU_HASH_LOCK(hmebp);

		HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);
		if (hmeblkp != NULL) {
			ASSERT(!hmeblkp->hblk_shared);
			HBLKTOHME(sfhmep, hmeblkp, vaddr);
			sfmmu_copytte(&sfhmep->hme_tte, ttep);
			SFMMU_HASH_UNLOCK(hmebp);
			if (TTE_IS_VALID(ttep)) {
				pfn = TTE_TO_PFN(vaddr, ttep);
				return (pfn);
			}
			break;
		}
		SFMMU_HASH_UNLOCK(hmebp);
		hashno++;
	} while (HME_REHASH(sfmmup) && (hashno <= mmu_hashcnt));

	if (SF_HMERGNMAP_ISNULL(sv_sfmmup)) {
		return (PFN_INVALID);
	}
	srdp = sv_sfmmup->sfmmu_srdp;
	ASSERT(srdp != NULL);
	ASSERT(srdp->srd_refcnt != 0);
	hblktag.htag_id = srdp;
	hashno = 1;
	do {
		hmeshift = HME_HASH_SHIFT(hashno);
		hblktag.htag_bspage = HME_HASH_BSPAGE(sv_vaddr, hmeshift);
		hblktag.htag_rehash = hashno;
		hmebp = HME_HASH_FUNCTION(srdp, sv_vaddr, hmeshift);

		SFMMU_HASH_LOCK(hmebp);
		for (hmeblkp = hmebp->hmeblkp; hmeblkp != NULL;
		    hmeblkp = hmeblkp->hblk_next) {
			uint_t rid;
			sf_region_t *rgnp;
			caddr_t rsaddr;
			caddr_t readdr;

			if (!HTAGS_EQ_SHME(hmeblkp->hblk_tag, hblktag,
			    sv_sfmmup->sfmmu_hmeregion_map)) {
				continue;
			}
			ASSERT(hmeblkp->hblk_shared);
			rid = hmeblkp->hblk_tag.htag_rid;
			ASSERT(SFMMU_IS_SHMERID_VALID(rid));
			ASSERT(rid < SFMMU_MAX_HME_REGIONS);
			rgnp = srdp->srd_hmergnp[rid];
			SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid);
			HBLKTOHME(sfhmep, hmeblkp, sv_vaddr);
			sfmmu_copytte(&sfhmep->hme_tte, ttep);
			rsaddr = rgnp->rgn_saddr;
			readdr = rsaddr + rgnp->rgn_size;
#ifdef DEBUG
			if (TTE_IS_VALID(ttep) ||
			    get_hblk_ttesz(hmeblkp) > TTE8K) {
				caddr_t eva = tte_to_evaddr(hmeblkp, ttep);
				ASSERT(eva > sv_vaddr);
				ASSERT(sv_vaddr >= rsaddr);
				ASSERT(sv_vaddr < readdr);
				ASSERT(eva <= readdr);
			}
#endif /* DEBUG */
			/*
			 * Continue the search if we
			 * found an invalid 8K tte outside of the area
			 * covered by this hmeblk's region.
			 */
			if (TTE_IS_VALID(ttep)) {
				SFMMU_HASH_UNLOCK(hmebp);
				pfn = TTE_TO_PFN(sv_vaddr, ttep);
				return (pfn);
			} else if (get_hblk_ttesz(hmeblkp) > TTE8K ||
			    (sv_vaddr >= rsaddr && sv_vaddr < readdr)) {
				SFMMU_HASH_UNLOCK(hmebp);
				pfn = PFN_INVALID;
				return (pfn);
			}
		}
		SFMMU_HASH_UNLOCK(hmebp);
		hashno++;
	} while (hashno <= mmu_hashcnt);
	return (PFN_INVALID);
}


/*
 * For compatibility with AT&T and later optimizations
 */
/* ARGSUSED */
void
hat_map(struct hat *hat, caddr_t addr, size_t len, uint_t flags)
{
	ASSERT(hat != NULL);
}

/*
 * Return the number of mappings to a particular page.  This number is an
 * approximation of the number of people sharing the page.
 *
 * shared hmeblks or ism hmeblks are counted as 1 mapping here.
 * hat_page_checkshare() can be used to compare a threshold against a share
 * count that reflects the number of region sharers, albeit at higher cost.
 */
ulong_t
hat_page_getshare(page_t *pp)
{
	page_t *spp = pp;	/* start page */
	kmutex_t *pml;
	ulong_t	cnt;
	int index, sz = TTE64K;

	/*
	 * We need to grab the mlist lock to make sure any outstanding
	 * load/unloads complete.  Otherwise we could return zero
	 * even though the unload(s) haven't finished yet.
	 */
	pml = sfmmu_mlist_enter(spp);
	cnt = spp->p_share;

#ifdef VAC
	if (kpm_enable)
		cnt += spp->p_kpmref;
#endif
	if (vpm_enable && pp->p_vpmref) {
		cnt += 1;
	}

	/*
	 * If we have any large mappings, we count the number of
	 * mappings that this large page is part of.
	 */
	index = PP_MAPINDEX(spp);
	index >>= 1;
	while (index) {
		pp = PP_GROUPLEADER(spp, sz);
		if ((index & 0x1) && pp != spp) {
			cnt += pp->p_share;
			spp = pp;
		}
		index >>= 1;
		sz++;
	}
	sfmmu_mlist_exit(pml);
	return (cnt);
}

/*
 * Return 1 if the number of mappings exceeds sh_thresh. Return 0
 * otherwise. Count shared hmeblks by region's refcnt.
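 *
 * Illustrative sketch (editorial addition, not part of the original
 * source): a policy check that would undercount region sharers with
 * hat_page_getshare() can use this predicate instead, e.g.
 *
 *	if (hat_page_checkshare(pp, po_share))
 *		treat pp as heavily shared
 *
 * at the cost of walking the mapping list until the threshold is hit.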
7948 */ 7949 int 7950 hat_page_checkshare(page_t *pp, ulong_t sh_thresh) 7951 { 7952 kmutex_t *pml; 7953 ulong_t cnt = 0; 7954 int index, sz = TTE8K; 7955 struct sf_hment *sfhme, *tmphme = NULL; 7956 struct hme_blk *hmeblkp; 7957 7958 pml = sfmmu_mlist_enter(pp); 7959 7960 #ifdef VAC 7961 if (kpm_enable) 7962 cnt = pp->p_kpmref; 7963 #endif 7964 7965 if (vpm_enable && pp->p_vpmref) { 7966 cnt += 1; 7967 } 7968 7969 if (pp->p_share + cnt > sh_thresh) { 7970 sfmmu_mlist_exit(pml); 7971 return (1); 7972 } 7973 7974 index = PP_MAPINDEX(pp); 7975 7976 again: 7977 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 7978 tmphme = sfhme->hme_next; 7979 if (IS_PAHME(sfhme)) { 7980 continue; 7981 } 7982 7983 hmeblkp = sfmmu_hmetohblk(sfhme); 7984 if (hme_size(sfhme) != sz) { 7985 continue; 7986 } 7987 7988 if (hmeblkp->hblk_shared) { 7989 sf_srd_t *srdp = hblktosrd(hmeblkp); 7990 uint_t rid = hmeblkp->hblk_tag.htag_rid; 7991 sf_region_t *rgnp; 7992 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 7993 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 7994 ASSERT(srdp != NULL); 7995 rgnp = srdp->srd_hmergnp[rid]; 7996 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, 7997 rgnp, rid); 7998 cnt += rgnp->rgn_refcnt; 7999 } else { 8000 cnt++; 8001 } 8002 if (cnt > sh_thresh) { 8003 sfmmu_mlist_exit(pml); 8004 return (1); 8005 } 8006 } 8007 8008 index >>= 1; 8009 sz++; 8010 while (index) { 8011 pp = PP_GROUPLEADER(pp, sz); 8012 ASSERT(sfmmu_mlist_held(pp)); 8013 if (index & 0x1) { 8014 goto again; 8015 } 8016 index >>= 1; 8017 sz++; 8018 } 8019 sfmmu_mlist_exit(pml); 8020 return (0); 8021 } 8022 8023 /* 8024 * Unload all large mappings to the pp and reset the p_szc field of every 8025 * constituent page according to the remaining mappings. 8026 * 8027 * pp must be locked SE_EXCL. Even though no other constituent pages are 8028 * locked, it's legal to unload the large mappings to the pp because all 8029 * constituent pages of large locked mappings have to be locked SE_SHARED. 8030 * This means that if we have an SE_EXCL lock on one of the constituent 8031 * pages, none of the large mappings to pp are locked. 8032 * 8033 * Decrease the p_szc field starting from the last constituent page and 8034 * ending with the root page. This method is used because other threads rely 8035 * on the root's p_szc to find the lock to synchronize on. After a root 8036 * page_t's p_szc is demoted, other threads will succeed in 8037 * sfmmu_mlspl_enter(). This ensures that p_szc changes of the constituent 8038 * pages appear atomic for all threads that use sfmmu_mlspl_enter() to 8039 * examine the p_szc field. 8040 * This mechanism is only used for file system pages where it's not always 8041 * possible to get SE_EXCL locks on all constituent pages to demote the size 8042 * code (as is done for anonymous or kernel large pages). 8043 * 8044 * See more comments in front of sfmmu_mlspl_enter().
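* * Worked example (illustration only, assuming an 8K base page size): a 512K page has TTEPAGES(TTE512K) == 64 constituents; their p_szc fields are rewritten from constituent 63 back toward the root, and the root's own p_szc is written last, so a thread racing through sfmmu_mlspl_enter() sees either the old large size code on the root or a fully demoted group, never a partial one.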
8045 */ 8046 void 8047 hat_page_demote(page_t *pp) 8048 { 8049 int index; 8050 int sz; 8051 cpuset_t cpuset; 8052 int sync = 0; 8053 page_t *rootpp; 8054 struct sf_hment *sfhme; 8055 struct sf_hment *tmphme = NULL; 8056 struct hme_blk *hmeblkp; 8057 uint_t pszc; 8058 page_t *lastpp; 8059 cpuset_t tset; 8060 pgcnt_t npgs; 8061 kmutex_t *pml; 8062 kmutex_t *pmtx = NULL; 8063 8064 ASSERT(PAGE_EXCL(pp)); 8065 ASSERT(!PP_ISFREE(pp)); 8066 ASSERT(!PP_ISKAS(pp)); 8067 ASSERT(page_szc_lock_assert(pp)); 8068 pml = sfmmu_mlist_enter(pp); 8069 8070 pszc = pp->p_szc; 8071 if (pszc == 0) { 8072 goto out; 8073 } 8074 8075 index = PP_MAPINDEX(pp) >> 1; 8076 8077 if (index) { 8078 CPUSET_ZERO(cpuset); 8079 sz = TTE64K; 8080 sync = 1; 8081 } 8082 8083 while (index) { 8084 if (!(index & 0x1)) { 8085 index >>= 1; 8086 sz++; 8087 continue; 8088 } 8089 ASSERT(sz <= pszc); 8090 rootpp = PP_GROUPLEADER(pp, sz); 8091 for (sfhme = rootpp->p_mapping; sfhme; sfhme = tmphme) { 8092 tmphme = sfhme->hme_next; 8093 ASSERT(!IS_PAHME(sfhme)); 8094 hmeblkp = sfmmu_hmetohblk(sfhme); 8095 if (hme_size(sfhme) != sz) { 8096 continue; 8097 } 8098 tset = sfmmu_pageunload(rootpp, sfhme, sz); 8099 CPUSET_OR(cpuset, tset); 8100 } 8101 if (index >>= 1) { 8102 sz++; 8103 } 8104 } 8105 8106 ASSERT(!PP_ISMAPPED_LARGE(pp)); 8107 8108 if (sync) { 8109 xt_sync(cpuset); 8110 #ifdef VAC 8111 if (PP_ISTNC(pp)) { 8112 conv_tnc(rootpp, sz); 8113 } 8114 #endif /* VAC */ 8115 } 8116 8117 pmtx = sfmmu_page_enter(pp); 8118 8119 ASSERT(pp->p_szc == pszc); 8120 rootpp = PP_PAGEROOT(pp); 8121 ASSERT(rootpp->p_szc == pszc); 8122 lastpp = PP_PAGENEXT_N(rootpp, TTEPAGES(pszc) - 1); 8123 8124 while (lastpp != rootpp) { 8125 sz = PP_MAPINDEX(lastpp) ? fnd_mapping_sz(lastpp) : 0; 8126 ASSERT(sz < pszc); 8127 npgs = (sz == 0) ? 1 : TTEPAGES(sz); 8128 ASSERT(P2PHASE(lastpp->p_pagenum, npgs) == npgs - 1); 8129 while (--npgs > 0) { 8130 lastpp->p_szc = (uchar_t)sz; 8131 lastpp = PP_PAGEPREV(lastpp); 8132 } 8133 if (sz) { 8134 /* 8135 * make sure before current root's pszc 8136 * is updated all updates to constituent pages pszc 8137 * fields are globally visible. 8138 */ 8139 membar_producer(); 8140 } 8141 lastpp->p_szc = sz; 8142 ASSERT(IS_P2ALIGNED(lastpp->p_pagenum, TTEPAGES(sz))); 8143 if (lastpp != rootpp) { 8144 lastpp = PP_PAGEPREV(lastpp); 8145 } 8146 } 8147 if (sz == 0) { 8148 /* the loop above doesn't cover this case */ 8149 rootpp->p_szc = 0; 8150 } 8151 out: 8152 ASSERT(pp->p_szc == 0); 8153 if (pmtx != NULL) { 8154 sfmmu_page_exit(pmtx); 8155 } 8156 sfmmu_mlist_exit(pml); 8157 } 8158 8159 /* 8160 * Refresh the HAT ismttecnt[] element for size szc. 8161 * Caller must have set ISM busy flag to prevent mapping 8162 * lists from changing while we're traversing them. 
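* * Typical call pattern (sketch; this mirrors what hat_share()/hat_unshare() below do after editing the ISM maps): * * sfmmu_ismhat_enter(sfmmup, 0); * for (i = 0; i <= ismszc; i++) * (void) ism_tsb_entries(sfmmup, i); * sfmmu_ismhat_exit(sfmmup, 0);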
8163 */ 8164 pgcnt_t 8165 ism_tsb_entries(sfmmu_t *sfmmup, int szc) 8166 { 8167 ism_blk_t *ism_blkp = sfmmup->sfmmu_iblk; 8168 ism_map_t *ism_map; 8169 pgcnt_t npgs = 0; 8170 pgcnt_t npgs_scd = 0; 8171 int j; 8172 sf_scd_t *scdp; 8173 uchar_t rid; 8174 8175 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 8176 scdp = sfmmup->sfmmu_scdp; 8177 8178 for (; ism_blkp != NULL; ism_blkp = ism_blkp->iblk_next) { 8179 ism_map = ism_blkp->iblk_maps; 8180 for (j = 0; j < ISM_MAP_SLOTS && ism_map[j].imap_ismhat; j++) { 8181 rid = ism_map[j].imap_rid; 8182 ASSERT(rid == SFMMU_INVALID_ISMRID || 8183 rid < sfmmup->sfmmu_srdp->srd_next_ismrid); 8184 8185 if (scdp != NULL && rid != SFMMU_INVALID_ISMRID && 8186 SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid)) { 8187 /* ISM is in sfmmup's SCD */ 8188 npgs_scd += 8189 ism_map[j].imap_ismhat->sfmmu_ttecnt[szc]; 8190 } else { 8191 /* ISM is not in SCD */ 8192 npgs += 8193 ism_map[j].imap_ismhat->sfmmu_ttecnt[szc]; 8194 } 8195 } 8196 } 8197 sfmmup->sfmmu_ismttecnt[szc] = npgs; 8198 sfmmup->sfmmu_scdismttecnt[szc] = npgs_scd; 8199 return (npgs); 8200 } 8201 8202 /* 8203 * Yield the memory claim requirement for an address space. 8204 * 8205 * This is currently implemented as the number of bytes that have active 8206 * hardware translations that have page structures. Therefore, it can 8207 * underestimate the traditional resident set size, e.g., if the 8208 * physical page is present and the hardware translation is missing; 8209 * and it can overestimate the rss, e.g., if there are active 8210 * translations to a frame buffer with page structs. 8211 * Also, it does not take sharing into account. 8212 * 8213 * Note that we don't acquire locks here since this function is most often 8214 * called from the clock thread. 8215 */ 8216 size_t 8217 hat_get_mapped_size(struct hat *hat) 8218 { 8219 size_t assize = 0; 8220 int i; 8221 8222 if (hat == NULL) 8223 return (0); 8224 8225 for (i = 0; i < mmu_page_sizes; i++) 8226 assize += ((pgcnt_t)hat->sfmmu_ttecnt[i] + 8227 (pgcnt_t)hat->sfmmu_scdrttecnt[i]) * TTEBYTES(i); 8228 8229 if (hat->sfmmu_iblk == NULL) 8230 return (assize); 8231 8232 for (i = 0; i < mmu_page_sizes; i++) 8233 assize += ((pgcnt_t)hat->sfmmu_ismttecnt[i] + 8234 (pgcnt_t)hat->sfmmu_scdismttecnt[i]) * TTEBYTES(i); 8235 8236 return (assize); 8237 } 8238 8239 int 8240 hat_stats_enable(struct hat *hat) 8241 { 8242 hatlock_t *hatlockp; 8243 8244 hatlockp = sfmmu_hat_enter(hat); 8245 hat->sfmmu_rmstat++; 8246 sfmmu_hat_exit(hatlockp); 8247 return (1); 8248 } 8249 8250 void 8251 hat_stats_disable(struct hat *hat) 8252 { 8253 hatlock_t *hatlockp; 8254 8255 hatlockp = sfmmu_hat_enter(hat); 8256 hat->sfmmu_rmstat--; 8257 sfmmu_hat_exit(hatlockp); 8258 } 8259 8260 /* 8261 * Routines for entering or removing ourselves from the 8262 * ism_hat's mapping list. This is used for both private and 8263 * SCD hats.
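* * Callers must hold ism_mlist_lock; the pattern used by hat_share()/hat_unshare() below is simply: * * mutex_enter(&ism_mlist_lock); * iment_add(ism_ment, ism_hatid); * mutex_exit(&ism_mlist_lock);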
8264 */ 8265 static void 8266 iment_add(struct ism_ment *iment, struct hat *ism_hat) 8267 { 8268 ASSERT(MUTEX_HELD(&ism_mlist_lock)); 8269 8270 iment->iment_prev = NULL; 8271 iment->iment_next = ism_hat->sfmmu_iment; 8272 if (ism_hat->sfmmu_iment) { 8273 ism_hat->sfmmu_iment->iment_prev = iment; 8274 } 8275 ism_hat->sfmmu_iment = iment; 8276 } 8277 8278 static void 8279 iment_sub(struct ism_ment *iment, struct hat *ism_hat) 8280 { 8281 ASSERT(MUTEX_HELD(&ism_mlist_lock)); 8282 8283 if (ism_hat->sfmmu_iment == NULL) { 8284 panic("ism map entry remove - no entries"); 8285 } 8286 8287 if (iment->iment_prev) { 8288 ASSERT(ism_hat->sfmmu_iment != iment); 8289 iment->iment_prev->iment_next = iment->iment_next; 8290 } else { 8291 ASSERT(ism_hat->sfmmu_iment == iment); 8292 ism_hat->sfmmu_iment = iment->iment_next; 8293 } 8294 8295 if (iment->iment_next) { 8296 iment->iment_next->iment_prev = iment->iment_prev; 8297 } 8298 8299 /* 8300 * zero out the entry 8301 */ 8302 iment->iment_next = NULL; 8303 iment->iment_prev = NULL; 8304 iment->iment_hat = NULL; 8305 iment->iment_base_va = 0; 8306 } 8307 8308 /* 8309 * Hat_share()/unshare() return an (non-zero) error 8310 * when saddr and daddr are not properly aligned. 8311 * 8312 * The top level mapping element determines the alignment 8313 * requirement for saddr and daddr, depending on different 8314 * architectures. 8315 * 8316 * When hat_share()/unshare() are not supported, 8317 * HATOP_SHARE()/UNSHARE() return 0 8318 */ 8319 int 8320 hat_share(struct hat *sfmmup, caddr_t addr, 8321 struct hat *ism_hatid, caddr_t sptaddr, size_t len, uint_t ismszc) 8322 { 8323 ism_blk_t *ism_blkp; 8324 ism_blk_t *new_iblk; 8325 ism_map_t *ism_map; 8326 ism_ment_t *ism_ment; 8327 int i, added; 8328 hatlock_t *hatlockp; 8329 int reload_mmu = 0; 8330 uint_t ismshift = page_get_shift(ismszc); 8331 size_t ismpgsz = page_get_pagesize(ismszc); 8332 uint_t ismmask = (uint_t)ismpgsz - 1; 8333 size_t sh_size = ISM_SHIFT(ismshift, len); 8334 ushort_t ismhatflag; 8335 hat_region_cookie_t rcookie; 8336 sf_scd_t *old_scdp; 8337 8338 #ifdef DEBUG 8339 caddr_t eaddr = addr + len; 8340 #endif /* DEBUG */ 8341 8342 ASSERT(ism_hatid != NULL && sfmmup != NULL); 8343 ASSERT(sptaddr == ISMID_STARTADDR); 8344 /* 8345 * Check the alignment. 8346 */ 8347 if (!ISM_ALIGNED(ismshift, addr) || !ISM_ALIGNED(ismshift, sptaddr)) 8348 return (EINVAL); 8349 8350 /* 8351 * Check size alignment. 8352 */ 8353 if (!ISM_ALIGNED(ismshift, len)) 8354 return (EINVAL); 8355 8356 /* 8357 * Allocate ism_ment for the ism_hat's mapping list, and an 8358 * ism map blk in case we need one. We must do our 8359 * allocations before acquiring locks to prevent a deadlock 8360 * in the kmem allocator on the mapping list lock. 8361 */ 8362 new_iblk = kmem_cache_alloc(ism_blk_cache, KM_SLEEP); 8363 ism_ment = kmem_cache_alloc(ism_ment_cache, KM_SLEEP); 8364 8365 /* 8366 * Serialize ISM mappings with the ISM busy flag, and also the 8367 * trap handlers. 8368 */ 8369 sfmmu_ismhat_enter(sfmmup, 0); 8370 8371 /* 8372 * Allocate an ism map blk if necessary. 8373 */ 8374 if (sfmmup->sfmmu_iblk == NULL) { 8375 sfmmup->sfmmu_iblk = new_iblk; 8376 bzero(new_iblk, sizeof (*new_iblk)); 8377 new_iblk->iblk_nextpa = (uint64_t)-1; 8378 membar_stst(); /* make sure next ptr visible to all CPUs */ 8379 sfmmup->sfmmu_ismblkpa = va_to_pa((caddr_t)new_iblk); 8380 reload_mmu = 1; 8381 new_iblk = NULL; 8382 } 8383 8384 #ifdef DEBUG 8385 /* 8386 * Make sure mapping does not already exist. 
8387 */ 8388 ism_blkp = sfmmup->sfmmu_iblk; 8389 while (ism_blkp != NULL) { 8390 ism_map = ism_blkp->iblk_maps; 8391 for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) { 8392 if ((addr >= ism_start(ism_map[i]) && 8393 addr < ism_end(ism_map[i])) || 8394 (eaddr > ism_start(ism_map[i]) && 8395 eaddr <= ism_end(ism_map[i]))) { 8396 panic("sfmmu_share: Already mapped!"); 8397 } 8398 } 8399 ism_blkp = ism_blkp->iblk_next; 8400 } 8401 #endif /* DEBUG */ 8402 8403 ASSERT(ismszc >= TTE4M); 8404 if (ismszc == TTE4M) { 8405 ismhatflag = HAT_4M_FLAG; 8406 } else if (ismszc == TTE32M) { 8407 ismhatflag = HAT_32M_FLAG; 8408 } else if (ismszc == TTE256M) { 8409 ismhatflag = HAT_256M_FLAG; 8410 } 8411 /* 8412 * Add mapping to first available mapping slot. 8413 */ 8414 ism_blkp = sfmmup->sfmmu_iblk; 8415 added = 0; 8416 while (!added) { 8417 ism_map = ism_blkp->iblk_maps; 8418 for (i = 0; i < ISM_MAP_SLOTS; i++) { 8419 if (ism_map[i].imap_ismhat == NULL) { 8420 8421 ism_map[i].imap_ismhat = ism_hatid; 8422 ism_map[i].imap_vb_shift = (uchar_t)ismshift; 8423 ism_map[i].imap_rid = SFMMU_INVALID_ISMRID; 8424 ism_map[i].imap_hatflags = ismhatflag; 8425 ism_map[i].imap_sz_mask = ismmask; 8426 /* 8427 * imap_seg is checked in ISM_CHECK to see if it 8428 * is non-NULL; if so, the other info is assumed 8429 * to be valid. 8430 */ 8430 membar_stst(); 8431 ism_map[i].imap_seg = (uintptr_t)addr | sh_size; 8432 ism_map[i].imap_ment = ism_ment; 8433 8434 /* 8435 * Now add ourselves to the ism_hat's 8436 * mapping list. 8437 */ 8438 ism_ment->iment_hat = sfmmup; 8439 ism_ment->iment_base_va = addr; 8440 ism_hatid->sfmmu_ismhat = 1; 8441 mutex_enter(&ism_mlist_lock); 8442 iment_add(ism_ment, ism_hatid); 8443 mutex_exit(&ism_mlist_lock); 8444 added = 1; 8445 break; 8446 } 8447 } 8448 if (!added && ism_blkp->iblk_next == NULL) { 8449 ism_blkp->iblk_next = new_iblk; 8450 new_iblk = NULL; 8451 bzero(ism_blkp->iblk_next, 8452 sizeof (*ism_blkp->iblk_next)); 8453 ism_blkp->iblk_next->iblk_nextpa = (uint64_t)-1; 8454 membar_stst(); 8455 ism_blkp->iblk_nextpa = 8456 va_to_pa((caddr_t)ism_blkp->iblk_next); 8457 } 8458 ism_blkp = ism_blkp->iblk_next; 8459 } 8460 8461 /* 8462 * After calling hat_join_region, sfmmup may join a new SCD or 8463 * move from the old scd to a new scd, in which case, we want to 8464 * shrink the sfmmup's private tsb size, i.e., pass shrink to 8465 * sfmmu_check_page_sizes at the end of this routine. 8466 */ 8467 old_scdp = sfmmup->sfmmu_scdp; 8468 8469 rcookie = hat_join_region(sfmmup, addr, len, (void *)ism_hatid, 0, 8470 PROT_ALL, ismszc, NULL, HAT_REGION_ISM); 8471 if (rcookie != HAT_INVALID_REGION_COOKIE) { 8472 ism_map[i].imap_rid = (uchar_t)((uint64_t)rcookie); 8473 } 8474 /* 8475 * Update our counters for this sfmmup's ism mappings. 8476 */ 8477 for (i = 0; i <= ismszc; i++) { 8478 if (!(disable_ism_large_pages & (1 << i))) 8479 (void) ism_tsb_entries(sfmmup, i); 8480 } 8481 8482 /* 8483 * For ISM and DISM we do not support 512K pages, so we only 8484 * search the 4M and 8K/64K hashes for 4 pagesize cpus, and search the 8485 * 256M or 32M, and 4M and 8K/64K hashes for 6 pagesize cpus. 8486 * 8487 * Need to set 32M/256M ISM flags to make sure 8488 * sfmmu_check_page_sizes() enables them on Panther.
8489 */ 8490 ASSERT((disable_ism_large_pages & (1 << TTE512K)) != 0); 8491 8492 switch (ismszc) { 8493 case TTE256M: 8494 if (!SFMMU_FLAGS_ISSET(sfmmup, HAT_256M_ISM)) { 8495 hatlockp = sfmmu_hat_enter(sfmmup); 8496 SFMMU_FLAGS_SET(sfmmup, HAT_256M_ISM); 8497 sfmmu_hat_exit(hatlockp); 8498 } 8499 break; 8500 case TTE32M: 8501 if (!SFMMU_FLAGS_ISSET(sfmmup, HAT_32M_ISM)) { 8502 hatlockp = sfmmu_hat_enter(sfmmup); 8503 SFMMU_FLAGS_SET(sfmmup, HAT_32M_ISM); 8504 sfmmu_hat_exit(hatlockp); 8505 } 8506 break; 8507 default: 8508 break; 8509 } 8510 8511 /* 8512 * If we updated the ismblkpa for this HAT we must make 8513 * sure all CPUs running this process reload their tsbmiss area. 8514 * Otherwise they will fail to load the mappings in the tsbmiss 8515 * handler and will loop calling pagefault(). 8516 */ 8517 if (reload_mmu) { 8518 hatlockp = sfmmu_hat_enter(sfmmup); 8519 sfmmu_sync_mmustate(sfmmup); 8520 sfmmu_hat_exit(hatlockp); 8521 } 8522 8523 sfmmu_ismhat_exit(sfmmup, 0); 8524 8525 /* 8526 * Free up ismblk if we didn't use it. 8527 */ 8528 if (new_iblk != NULL) 8529 kmem_cache_free(ism_blk_cache, new_iblk); 8530 8531 /* 8532 * Check TSB and TLB page sizes. 8533 */ 8534 if (sfmmup->sfmmu_scdp != NULL && old_scdp != sfmmup->sfmmu_scdp) { 8535 sfmmu_check_page_sizes(sfmmup, 0); 8536 } else { 8537 sfmmu_check_page_sizes(sfmmup, 1); 8538 } 8539 return (0); 8540 } 8541 8542 /* 8543 * hat_unshare removes exactly one ism_map from 8544 * this process's as. It expects multiple calls 8545 * to hat_unshare for multiple shm segments. 8546 */ 8547 void 8548 hat_unshare(struct hat *sfmmup, caddr_t addr, size_t len, uint_t ismszc) 8549 { 8550 ism_map_t *ism_map; 8551 ism_ment_t *free_ment = NULL; 8552 ism_blk_t *ism_blkp; 8553 struct hat *ism_hatid; 8554 int found, i; 8555 hatlock_t *hatlockp; 8556 struct tsb_info *tsbinfo; 8557 uint_t ismshift = page_get_shift(ismszc); 8558 size_t sh_size = ISM_SHIFT(ismshift, len); 8559 uchar_t ism_rid; 8560 sf_scd_t *old_scdp; 8561 8562 ASSERT(ISM_ALIGNED(ismshift, addr)); 8563 ASSERT(ISM_ALIGNED(ismshift, len)); 8564 ASSERT(sfmmup != NULL); 8565 ASSERT(sfmmup != ksfmmup); 8566 8567 ASSERT(sfmmup->sfmmu_as != NULL); 8568 8569 /* 8570 * Make sure that during the entire time ISM mappings are removed, 8571 * the trap handlers serialize behind us, and that no one else 8572 * can be mucking with ISM mappings. This also lets us get away 8573 * with not doing expensive cross calls to flush the TLB -- we 8574 * just discard the context, flush the entire TSB, and call it 8575 * a day. 8576 */ 8577 sfmmu_ismhat_enter(sfmmup, 0); 8578 8579 /* 8580 * Remove the mapping. 8581 * 8582 * We can't have any holes in the ism map. 8583 * The tsb miss code while searching the ism map will 8584 * stop on an empty map slot. So we must move 8585 * everyone past the hole up 1 if any. 8586 * 8587 * Also empty ism map blks are not freed until the 8588 * process exits. This is to prevent a MT race condition 8589 * between sfmmu_unshare() and sfmmu_tsbmiss_exception(). 
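* * (Compaction example, for illustration: removing B from slots [A, B, C, empty] below yields [A, C, empty, empty], so the first empty slot still correctly terminates the tsb miss code's scan.)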
8590 */ 8591 found = 0; 8592 ism_blkp = sfmmup->sfmmu_iblk; 8593 while (!found && ism_blkp != NULL) { 8594 ism_map = ism_blkp->iblk_maps; 8595 for (i = 0; i < ISM_MAP_SLOTS; i++) { 8596 if (addr == ism_start(ism_map[i]) && 8597 sh_size == (size_t)(ism_size(ism_map[i]))) { 8598 found = 1; 8599 break; 8600 } 8601 } 8602 if (!found) 8603 ism_blkp = ism_blkp->iblk_next; 8604 } 8605 8606 if (found) { 8607 ism_hatid = ism_map[i].imap_ismhat; 8608 ism_rid = ism_map[i].imap_rid; 8609 ASSERT(ism_hatid != NULL); 8610 ASSERT(ism_hatid->sfmmu_ismhat == 1); 8611 8612 /* 8613 * After hat_leave_region, the sfmmup may leave SCD, 8614 * in which case, we want to grow the private tsb size when 8615 * calling sfmmu_check_page_sizes at the end of the routine. 8616 */ 8617 old_scdp = sfmmup->sfmmu_scdp; 8618 /* 8619 * Then remove ourselves from the region. 8620 */ 8621 if (ism_rid != SFMMU_INVALID_ISMRID) { 8622 hat_leave_region(sfmmup, (void *)((uint64_t)ism_rid), 8623 HAT_REGION_ISM); 8624 } 8625 8626 /* 8627 * And now guarantee that any other cpu 8628 * that tries to process an ISM miss 8629 * will go to tl=0. 8630 */ 8631 hatlockp = sfmmu_hat_enter(sfmmup); 8632 sfmmu_invalidate_ctx(sfmmup); 8633 sfmmu_hat_exit(hatlockp); 8634 8635 /* 8636 * Remove ourselves from the ism mapping list. 8637 */ 8638 mutex_enter(&ism_mlist_lock); 8639 iment_sub(ism_map[i].imap_ment, ism_hatid); 8640 mutex_exit(&ism_mlist_lock); 8641 free_ment = ism_map[i].imap_ment; 8642 8643 /* 8644 * We delete the ism map by copying 8645 * the next map over the current one. 8646 * We will take the next one in the maps 8647 * array or from the next ism_blk. 8648 */ 8649 while (ism_blkp != NULL) { 8650 ism_map = ism_blkp->iblk_maps; 8651 while (i < (ISM_MAP_SLOTS - 1)) { 8652 ism_map[i] = ism_map[i + 1]; 8653 i++; 8654 } 8655 /* i == (ISM_MAP_SLOTS - 1) */ 8656 ism_blkp = ism_blkp->iblk_next; 8657 if (ism_blkp != NULL) { 8658 ism_map[i] = ism_blkp->iblk_maps[0]; 8659 i = 0; 8660 } else { 8661 ism_map[i].imap_seg = 0; 8662 ism_map[i].imap_vb_shift = 0; 8663 ism_map[i].imap_rid = SFMMU_INVALID_ISMRID; 8664 ism_map[i].imap_hatflags = 0; 8665 ism_map[i].imap_sz_mask = 0; 8666 ism_map[i].imap_ismhat = NULL; 8667 ism_map[i].imap_ment = NULL; 8668 } 8669 } 8670 8671 /* 8672 * Now flush entire TSB for the process, since 8673 * demapping page by page can be too expensive. 8674 * We don't have to flush the TLB here anymore 8675 * since we switch to a new TLB ctx instead. 8676 * Also, there is no need to flush if the process 8677 * is exiting since the TSB will be freed later. 8678 */ 8679 if (!sfmmup->sfmmu_free) { 8680 hatlockp = sfmmu_hat_enter(sfmmup); 8681 for (tsbinfo = sfmmup->sfmmu_tsb; tsbinfo != NULL; 8682 tsbinfo = tsbinfo->tsb_next) { 8683 if (tsbinfo->tsb_flags & TSB_SWAPPED) 8684 continue; 8685 if (tsbinfo->tsb_flags & TSB_RELOC_FLAG) { 8686 tsbinfo->tsb_flags |= 8687 TSB_FLUSH_NEEDED; 8688 continue; 8689 } 8690 8691 sfmmu_inv_tsb(tsbinfo->tsb_va, 8692 TSB_BYTES(tsbinfo->tsb_szc)); 8693 } 8694 sfmmu_hat_exit(hatlockp); 8695 } 8696 } 8697 8698 /* 8699 * Update our counters for this sfmmup's ism mappings. 8700 */ 8701 for (i = 0; i <= ismszc; i++) { 8702 if (!(disable_ism_large_pages & (1 << i))) 8703 (void) ism_tsb_entries(sfmmup, i); 8704 } 8705 8706 sfmmu_ismhat_exit(sfmmup, 0); 8707 8708 /* 8709 * We must do our freeing here after dropping locks 8710 * to prevent a deadlock in the kmem allocator on the 8711 * mapping list lock. 
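* * (This is the mirror image of hat_share() above, which performs its kmem allocations before taking any of these locks for the same reason.)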
8712 */ 8713 if (free_ment != NULL) 8714 kmem_cache_free(ism_ment_cache, free_ment); 8715 8716 /* 8717 * Check TSB and TLB page sizes if the process isn't exiting. 8718 */ 8719 if (!sfmmup->sfmmu_free) { 8720 if (found && old_scdp != NULL && sfmmup->sfmmu_scdp == NULL) { 8721 sfmmu_check_page_sizes(sfmmup, 1); 8722 } else { 8723 sfmmu_check_page_sizes(sfmmup, 0); 8724 } 8725 } 8726 } 8727 8728 /* ARGSUSED */ 8729 static int 8730 sfmmu_idcache_constructor(void *buf, void *cdrarg, int kmflags) 8731 { 8732 /* void *buf is sfmmu_t pointer */ 8733 bzero(buf, sizeof (sfmmu_t)); 8734 8735 return (0); 8736 } 8737 8738 /* ARGSUSED */ 8739 static void 8740 sfmmu_idcache_destructor(void *buf, void *cdrarg) 8741 { 8742 /* void *buf is sfmmu_t pointer */ 8743 } 8744 8745 /* 8746 * setup kmem hmeblks by bzeroing all members and initializing the nextpa 8747 * field to be the pa of this hmeblk 8748 */ 8749 /* ARGSUSED */ 8750 static int 8751 sfmmu_hblkcache_constructor(void *buf, void *cdrarg, int kmflags) 8752 { 8753 struct hme_blk *hmeblkp; 8754 8755 bzero(buf, (size_t)cdrarg); 8756 hmeblkp = (struct hme_blk *)buf; 8757 hmeblkp->hblk_nextpa = va_to_pa((caddr_t)hmeblkp); 8758 8759 #ifdef HBLK_TRACE 8760 mutex_init(&hmeblkp->hblk_audit_lock, NULL, MUTEX_DEFAULT, NULL); 8761 #endif /* HBLK_TRACE */ 8762 8763 return (0); 8764 } 8765 8766 /* ARGSUSED */ 8767 static void 8768 sfmmu_hblkcache_destructor(void *buf, void *cdrarg) 8769 { 8770 8771 #ifdef HBLK_TRACE 8772 8773 struct hme_blk *hmeblkp; 8774 8775 hmeblkp = (struct hme_blk *)buf; 8776 mutex_destroy(&hmeblkp->hblk_audit_lock); 8777 8778 #endif /* HBLK_TRACE */ 8779 } 8780 8781 #define SFMMU_CACHE_RECLAIM_SCAN_RATIO 8 8782 static int sfmmu_cache_reclaim_scan_ratio = SFMMU_CACHE_RECLAIM_SCAN_RATIO; 8783 /* 8784 * The kmem allocator will callback into our reclaim routine when the system 8785 * is running low in memory. We traverse the hash and free up all unused but 8786 * still cached hme_blks. We also traverse the free list and free them up 8787 * as well. 
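* * Registration sketch (abridged; the actual kmem_cache_create() call lives in the cache-setup code elsewhere in this file, and 'align', 'private', 'vmp', and 'flags' here are placeholders): * * cache = kmem_cache_create("sfmmu8_cache", hme8blk_sz, align, * sfmmu_hblkcache_constructor, sfmmu_hblkcache_destructor, * sfmmu_hblkcache_reclaim, private, vmp, flags);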
8788 */ 8789 /*ARGSUSED*/ 8790 static void 8791 sfmmu_hblkcache_reclaim(void *cdrarg) 8792 { 8793 int i; 8794 struct hmehash_bucket *hmebp; 8795 struct hme_blk *hmeblkp, *nx_hblk, *pr_hblk = NULL; 8796 static struct hmehash_bucket *uhmehash_reclaim_hand; 8797 static struct hmehash_bucket *khmehash_reclaim_hand; 8798 struct hme_blk *list = NULL, *last_hmeblkp; 8799 cpuset_t cpuset = cpu_ready_set; 8800 cpu_hme_pend_t *cpuhp; 8801 8802 /* Free up hmeblks on the cpu pending lists */ 8803 for (i = 0; i < NCPU; i++) { 8804 cpuhp = &cpu_hme_pend[i]; 8805 if (cpuhp->chp_listp != NULL) { 8806 mutex_enter(&cpuhp->chp_mutex); 8807 if (cpuhp->chp_listp == NULL) { 8808 mutex_exit(&cpuhp->chp_mutex); 8809 continue; 8810 } 8811 for (last_hmeblkp = cpuhp->chp_listp; 8812 last_hmeblkp->hblk_next != NULL; 8813 last_hmeblkp = last_hmeblkp->hblk_next) 8814 ; 8815 last_hmeblkp->hblk_next = list; 8816 list = cpuhp->chp_listp; 8817 cpuhp->chp_listp = NULL; 8818 cpuhp->chp_count = 0; 8819 mutex_exit(&cpuhp->chp_mutex); 8820 } 8821 8822 } 8823 8824 if (list != NULL) { 8825 kpreempt_disable(); 8826 CPUSET_DEL(cpuset, CPU->cpu_id); 8827 xt_sync(cpuset); 8828 xt_sync(cpuset); 8829 kpreempt_enable(); 8830 sfmmu_hblk_free(&list); 8831 list = NULL; 8832 } 8833 8834 hmebp = uhmehash_reclaim_hand; 8835 if (hmebp == NULL || hmebp > &uhme_hash[UHMEHASH_SZ]) 8836 uhmehash_reclaim_hand = hmebp = uhme_hash; 8837 uhmehash_reclaim_hand += UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; 8838 8839 for (i = UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; i; i--) { 8840 if (SFMMU_HASH_LOCK_TRYENTER(hmebp) != 0) { 8841 hmeblkp = hmebp->hmeblkp; 8842 pr_hblk = NULL; 8843 while (hmeblkp) { 8844 nx_hblk = hmeblkp->hblk_next; 8845 if (!hmeblkp->hblk_vcnt && 8846 !hmeblkp->hblk_hmecnt) { 8847 sfmmu_hblk_hash_rm(hmebp, hmeblkp, 8848 pr_hblk, &list, 0); 8849 } else { 8850 pr_hblk = hmeblkp; 8851 } 8852 hmeblkp = nx_hblk; 8853 } 8854 SFMMU_HASH_UNLOCK(hmebp); 8855 } 8856 if (hmebp++ == &uhme_hash[UHMEHASH_SZ]) 8857 hmebp = uhme_hash; 8858 } 8859 8860 hmebp = khmehash_reclaim_hand; 8861 if (hmebp == NULL || hmebp > &khme_hash[KHMEHASH_SZ]) 8862 khmehash_reclaim_hand = hmebp = khme_hash; 8863 khmehash_reclaim_hand += KHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; 8864 8865 for (i = KHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; i; i--) { 8866 if (SFMMU_HASH_LOCK_TRYENTER(hmebp) != 0) { 8867 hmeblkp = hmebp->hmeblkp; 8868 pr_hblk = NULL; 8869 while (hmeblkp) { 8870 nx_hblk = hmeblkp->hblk_next; 8871 if (!hmeblkp->hblk_vcnt && 8872 !hmeblkp->hblk_hmecnt) { 8873 sfmmu_hblk_hash_rm(hmebp, hmeblkp, 8874 pr_hblk, &list, 0); 8875 } else { 8876 pr_hblk = hmeblkp; 8877 } 8878 hmeblkp = nx_hblk; 8879 } 8880 SFMMU_HASH_UNLOCK(hmebp); 8881 } 8882 if (hmebp++ == &khme_hash[KHMEHASH_SZ]) 8883 hmebp = khme_hash; 8884 } 8885 sfmmu_hblks_list_purge(&list, 0); 8886 } 8887 8888 /* 8889 * sfmmu_get_ppvcolor should become a vm_machdep or hatop interface. 8890 * The same goes for sfmmu_get_addrvcolor(). 8891 * 8892 * This function will return the virtual color for the specified page. The 8893 * virtual color corresponds to this page's current mapping or its last 8894 * mapping. It is used by memory allocators to choose addresses with the 8895 * correct alignment so vac consistency is automatically maintained. If the 8896 * page has no color it returns -1.
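* * Hypothetical allocator usage (sketch): * * int c = sfmmu_get_ppvcolor(pp); * if (c != -1) * choose a va such that addr_to_vcolor(va) == c;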
8897 */ 8898 /*ARGSUSED*/ 8899 int 8900 sfmmu_get_ppvcolor(struct page *pp) 8901 { 8902 #ifdef VAC 8903 int color; 8904 8905 if (!(cache & CACHE_VAC) || PP_NEWPAGE(pp)) { 8906 return (-1); 8907 } 8908 color = PP_GET_VCOLOR(pp); 8909 ASSERT(color < mmu_btop(shm_alignment)); 8910 return (color); 8911 #else 8912 return (-1); 8913 #endif /* VAC */ 8914 } 8915 8916 /* 8917 * This function will return the desired alignment for vac consistency 8918 * (vac color) given a virtual address. If no vac is present it returns -1. 8919 */ 8920 /*ARGSUSED*/ 8921 int 8922 sfmmu_get_addrvcolor(caddr_t vaddr) 8923 { 8924 #ifdef VAC 8925 if (cache & CACHE_VAC) { 8926 return (addr_to_vcolor(vaddr)); 8927 } else { 8928 return (-1); 8929 } 8930 #else 8931 return (-1); 8932 #endif /* VAC */ 8933 } 8934 8935 #ifdef VAC 8936 /* 8937 * Check for conflicts. 8938 * A conflict exists if the new and existing mappings do not match in 8939 * their "shm_alignment" fields. If conflicts exist, the existing mappings 8940 * are flushed unless one of them is locked. If one of them is locked, then 8941 * the mappings are flushed and converted to non-cacheable mappings. 8942 */ 8943 static void 8944 sfmmu_vac_conflict(struct hat *hat, caddr_t addr, page_t *pp) 8945 { 8946 struct hat *tmphat; 8947 struct sf_hment *sfhmep, *tmphme = NULL; 8948 struct hme_blk *hmeblkp; 8949 int vcolor; 8950 tte_t tte; 8951 8952 ASSERT(sfmmu_mlist_held(pp)); 8953 ASSERT(!PP_ISNC(pp)); /* page better be cacheable */ 8954 8955 vcolor = addr_to_vcolor(addr); 8956 if (PP_NEWPAGE(pp)) { 8957 PP_SET_VCOLOR(pp, vcolor); 8958 return; 8959 } 8960 8961 if (PP_GET_VCOLOR(pp) == vcolor) { 8962 return; 8963 } 8964 8965 if (!PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp)) { 8966 /* 8967 * Previous user of page had a different color 8968 * but since there are no current users 8969 * we just flush the cache and change the color. 8970 */ 8971 SFMMU_STAT(sf_pgcolor_conflict); 8972 sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp)); 8973 PP_SET_VCOLOR(pp, vcolor); 8974 return; 8975 } 8976 8977 /* 8978 * If we get here we have a vac conflict with a current 8979 * mapping. VAC conflict policy is as follows. 8980 * - The default is to unload the other mappings unless: 8981 * - If we have a large mapping we uncache the page. 8982 * We need to uncache the rest of the large page too. 8983 * - If any of the mappings are locked we uncache the page. 8984 * - If the requested mapping is inconsistent 8985 * with another mapping and that mapping 8986 * is in the same address space, we have to 8987 * make it non-cached. The default thing 8988 * to do is unload the inconsistent mapping, 8989 * but if they are in the same address space 8990 * we run the risk of unmapping the pc or the 8991 * stack, which we will use as we return to the user, 8992 * in which case we can then fault on the thing 8993 * we just unloaded and get into an infinite loop. 8994 */ 8995 if (PP_ISMAPPED_LARGE(pp)) { 8996 int sz; 8997 8998 /* 8999 * Existing mapping is for big pages. We don't unload 9000 * existing big mappings to satisfy new mappings. 9001 * Always convert all mappings to TNC. 9002 */ 9003 sz = fnd_mapping_sz(pp); 9004 pp = PP_GROUPLEADER(pp, sz); 9005 SFMMU_STAT_ADD(sf_uncache_conflict, TTEPAGES(sz)); 9006 sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH, 9007 TTEPAGES(sz)); 9008 9009 return; 9010 } 9011 9012 /* 9013 * Check if any mapping is in the same address space, or is locked, 9014 * since in either case we need to uncache.
9015 */ 9016 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) { 9017 tmphme = sfhmep->hme_next; 9018 if (IS_PAHME(sfhmep)) 9019 continue; 9020 hmeblkp = sfmmu_hmetohblk(sfhmep); 9021 tmphat = hblktosfmmu(hmeblkp); 9022 sfmmu_copytte(&sfhmep->hme_tte, &tte); 9023 ASSERT(TTE_IS_VALID(&tte)); 9024 if (hmeblkp->hblk_shared || tmphat == hat || 9025 hmeblkp->hblk_lckcnt) { 9026 /* 9027 * We have an uncache conflict 9028 */ 9029 SFMMU_STAT(sf_uncache_conflict); 9030 sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH, 1); 9031 return; 9032 } 9033 } 9034 9035 /* 9036 * We have an unload conflict 9037 * We have already checked for LARGE mappings, therefore 9038 * the remaining mapping(s) must be TTE8K. 9039 */ 9040 SFMMU_STAT(sf_unload_conflict); 9041 9042 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) { 9043 tmphme = sfhmep->hme_next; 9044 if (IS_PAHME(sfhmep)) 9045 continue; 9046 hmeblkp = sfmmu_hmetohblk(sfhmep); 9047 ASSERT(!hmeblkp->hblk_shared); 9048 (void) sfmmu_pageunload(pp, sfhmep, TTE8K); 9049 } 9050 9051 if (PP_ISMAPPED_KPM(pp)) 9052 sfmmu_kpm_vac_unload(pp, addr); 9053 9054 /* 9055 * Unloads only do TLB flushes so we need to flush the 9056 * cache here. 9057 */ 9058 sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp)); 9059 PP_SET_VCOLOR(pp, vcolor); 9060 } 9061 9062 /* 9063 * Whenever a mapping is unloaded and the page is in TNC state, 9064 * we see if the page can be made cacheable again. 'pp' is 9065 * the page that we just unloaded a mapping from; the size 9066 * of the mapping that was unloaded is 'ottesz'. 9067 * Remark: 9068 * The recache policy for mpss pages can cause a performance problem 9069 * under the following circumstances: 9070 * . A large page in uncached mode has just been unmapped. 9071 * . All constituent pages are TNC due to a conflicting small mapping. 9072 * . There are many other, non-conflicting, small mappings around for 9073 * a lot of the constituent pages. 9074 * . We're called w/ the "old" groupleader page and the old ottesz, 9075 * but this is irrelevant, since the page is no longer "PP_ISMAPPED_LARGE", 9076 * so we end up w/ TTE8K or npages == 1. 9077 * . We call tst_tnc w/ the old groupleader only, and if there is no 9078 * conflict, we re-cache only this page. 9079 * . All other small mappings are not checked and will be left in TNC mode. 9080 * The problem is not very serious because: 9081 * . mpss is actually only defined for heap and stack, so the probability 9082 * is not very high that a large page mapping exists in parallel to a small 9083 * one (this is possible, but seems to be bad programming style in the 9084 * application). 9085 * . The problem gets a little bit more serious when those TNC pages 9086 * have to be mapped into kernel space, e.g. for networking. 9087 * . When VAC alias conflicts occur in applications, this is regarded 9088 * as an application bug. So if kstats show them, the application should 9089 * be changed anyway. 9090 */ 9091 void 9092 conv_tnc(page_t *pp, int ottesz) 9093 { 9094 int cursz, dosz; 9095 pgcnt_t curnpgs, dopgs; 9096 pgcnt_t pg64k; 9097 page_t *pp2; 9098 9099 /* 9100 * Determine how big a range we check for TNC and find 9101 * the leader page. cursz is the size of the biggest 9102 * mapping that still exists on 'pp'.
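* * (Example, for illustration: if an uncached 512K mapping was just unloaded (ottesz == TTE512K) while only 64K mappings remain (cursz == TTE64K), the loop below walks the full 512K span, re-testing and possibly re-caching it in 64K chunks.)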
9103 */ 9104 if (PP_ISMAPPED_LARGE(pp)) { 9105 cursz = fnd_mapping_sz(pp); 9106 } else { 9107 cursz = TTE8K; 9108 } 9109 9110 if (ottesz >= cursz) { 9111 dosz = ottesz; 9112 pp2 = pp; 9113 } else { 9114 dosz = cursz; 9115 pp2 = PP_GROUPLEADER(pp, dosz); 9116 } 9117 9118 pg64k = TTEPAGES(TTE64K); 9119 dopgs = TTEPAGES(dosz); 9120 9121 ASSERT(dopgs == 1 || ((dopgs & (pg64k - 1)) == 0)); 9122 9123 while (dopgs != 0) { 9124 curnpgs = TTEPAGES(cursz); 9125 if (tst_tnc(pp2, curnpgs)) { 9126 SFMMU_STAT_ADD(sf_recache, curnpgs); 9127 sfmmu_page_cache_array(pp2, HAT_CACHE, CACHE_NO_FLUSH, 9128 curnpgs); 9129 } 9130 9131 ASSERT(dopgs >= curnpgs); 9132 dopgs -= curnpgs; 9133 9134 if (dopgs == 0) { 9135 break; 9136 } 9137 9138 pp2 = PP_PAGENEXT_N(pp2, curnpgs); 9139 if (((dopgs & (pg64k - 1)) == 0) && PP_ISMAPPED_LARGE(pp2)) { 9140 cursz = fnd_mapping_sz(pp2); 9141 } else { 9142 cursz = TTE8K; 9143 } 9144 } 9145 } 9146 9147 /* 9148 * Returns 1 if the page(s) can be converted from TNC to a cacheable 9149 * setting, and returns 0 otherwise. The check covers all npages 9150 * constituent pages starting at 'pp'. 9151 */ 9152 int 9153 tst_tnc(page_t *pp, pgcnt_t npages) 9154 { 9155 struct sf_hment *sfhme; 9156 struct hme_blk *hmeblkp; 9157 tte_t tte; 9158 caddr_t vaddr; 9159 int clr_valid = 0; 9160 int color, color1, bcolor; 9161 int i, ncolors; 9162 9163 ASSERT(pp != NULL); 9164 ASSERT(!(cache & CACHE_WRITEBACK)); 9165 9166 if (npages > 1) { 9167 ncolors = CACHE_NUM_COLOR; 9168 } 9169 9170 for (i = 0; i < npages; i++) { 9171 ASSERT(sfmmu_mlist_held(pp)); 9172 ASSERT(PP_ISTNC(pp)); 9173 ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR); 9174 9175 if (PP_ISPNC(pp)) { 9176 return (0); 9177 } 9178 9179 clr_valid = 0; 9180 if (PP_ISMAPPED_KPM(pp)) { 9181 caddr_t kpmvaddr; 9182 9183 ASSERT(kpm_enable); 9184 kpmvaddr = hat_kpm_page2va(pp, 1); 9185 ASSERT(!(npages > 1 && IS_KPM_ALIAS_RANGE(kpmvaddr))); 9186 color1 = addr_to_vcolor(kpmvaddr); 9187 clr_valid = 1; 9188 } 9189 9190 for (sfhme = pp->p_mapping; sfhme; sfhme = sfhme->hme_next) { 9191 if (IS_PAHME(sfhme)) 9192 continue; 9193 hmeblkp = sfmmu_hmetohblk(sfhme); 9194 9195 sfmmu_copytte(&sfhme->hme_tte, &tte); 9196 ASSERT(TTE_IS_VALID(&tte)); 9197 9198 vaddr = tte_to_vaddr(hmeblkp, tte); 9199 color = addr_to_vcolor(vaddr); 9200 9201 if (npages > 1) { 9202 /* 9203 * If there is a big mapping, make sure 9204 * the 8K mapping is consistent with the 9205 * big mapping.
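* * (Illustration: with a VAC of ncolors colors, constituent page i of a large page can only stay cacheable if every 8K mapping of it sits at virtual color i % ncolors, which is exactly what the bcolor check below enforces.)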
9206 */ 9207 bcolor = i % ncolors; 9208 if (color != bcolor) { 9209 return (0); 9210 } 9211 } 9212 if (!clr_valid) { 9213 clr_valid = 1; 9214 color1 = color; 9215 } 9216 9217 if (color1 != color) { 9218 return (0); 9219 } 9220 } 9221 9222 pp = PP_PAGENEXT(pp); 9223 } 9224 9225 return (1); 9226 } 9227 9228 void 9229 sfmmu_page_cache_array(page_t *pp, int flags, int cache_flush_flag, 9230 pgcnt_t npages) 9231 { 9232 kmutex_t *pmtx; 9233 int i, ncolors, bcolor; 9234 kpm_hlk_t *kpmp; 9235 cpuset_t cpuset; 9236 9237 ASSERT(pp != NULL); 9238 ASSERT(!(cache & CACHE_WRITEBACK)); 9239 9240 kpmp = sfmmu_kpm_kpmp_enter(pp, npages); 9241 pmtx = sfmmu_page_enter(pp); 9242 9243 /* 9244 * Fast path: caching a single unmapped page 9245 */ 9246 if (npages == 1 && !PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp) && 9247 flags == HAT_CACHE) { 9248 PP_CLRTNC(pp); 9249 PP_CLRPNC(pp); 9250 sfmmu_page_exit(pmtx); 9251 sfmmu_kpm_kpmp_exit(kpmp); 9252 return; 9253 } 9254 9255 /* 9256 * We need to capture all cpus in order to change cacheability 9257 * because we can't allow one cpu to access the same physical 9258 * page using a cacheable and a non-cacheable mapping at the same 9259 * time. Since we may end up walking the ism mapping list, we 9260 * have to grab its lock now, since we can't after all the 9261 * cpus have been captured. 9262 */ 9263 sfmmu_hat_lock_all(); 9264 mutex_enter(&ism_mlist_lock); 9265 kpreempt_disable(); 9266 cpuset = cpu_ready_set; 9267 xc_attention(cpuset); 9268 9269 if (npages > 1) { 9270 /* 9271 * Make sure all colors are flushed since the 9272 * sfmmu_page_cache() only flushes one color; 9273 * it does not know about big pages. 9274 */ 9275 ncolors = CACHE_NUM_COLOR; 9276 if (flags & HAT_TMPNC) { 9277 for (i = 0; i < ncolors; i++) { 9278 sfmmu_cache_flushcolor(i, pp->p_pagenum); 9279 } 9280 cache_flush_flag = CACHE_NO_FLUSH; 9281 } 9282 } 9283 9284 for (i = 0; i < npages; i++) { 9285 9286 ASSERT(sfmmu_mlist_held(pp)); 9287 9288 if (!(flags == HAT_TMPNC && PP_ISTNC(pp))) { 9289 9290 if (npages > 1) { 9291 bcolor = i % ncolors; 9292 } else { 9293 bcolor = NO_VCOLOR; 9294 } 9295 9296 sfmmu_page_cache(pp, flags, cache_flush_flag, 9297 bcolor); 9298 } 9299 9300 pp = PP_PAGENEXT(pp); 9301 } 9302 9303 xt_sync(cpuset); 9304 xc_dismissed(cpuset); 9305 mutex_exit(&ism_mlist_lock); 9306 sfmmu_hat_unlock_all(); 9307 sfmmu_page_exit(pmtx); 9308 sfmmu_kpm_kpmp_exit(kpmp); 9309 kpreempt_enable(); 9310 } 9311 9312 /* 9313 * This function changes the virtual cacheability of all mappings to a 9314 * particular page. When changing from uncached to cacheable, the mappings 9315 * will only be changed if all of them have the same virtual color. 9316 * We need to flush the cache in all cpus. It is possible that 9317 * a process referenced the page as cacheable but has since exited 9318 * and cleared the mapping list. We still need to flush it, but have no 9319 * state, so flushing on all cpus is the only alternative.
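* * (Context, restated for clarity: this routine is only called from sfmmu_page_cache_array() above with every CPU captured in xc_attention(), which is why the sfmmu_modifytte_try() below is expected never to fail.)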
9320 */ 9321 static void 9322 sfmmu_page_cache(page_t *pp, int flags, int cache_flush_flag, int bcolor) 9323 { 9324 struct sf_hment *sfhme; 9325 struct hme_blk *hmeblkp; 9326 sfmmu_t *sfmmup; 9327 tte_t tte, ttemod; 9328 caddr_t vaddr; 9329 int ret, color; 9330 pfn_t pfn; 9331 9332 color = bcolor; 9333 pfn = pp->p_pagenum; 9334 9335 for (sfhme = pp->p_mapping; sfhme; sfhme = sfhme->hme_next) { 9336 9337 if (IS_PAHME(sfhme)) 9338 continue; 9339 hmeblkp = sfmmu_hmetohblk(sfhme); 9340 9341 sfmmu_copytte(&sfhme->hme_tte, &tte); 9342 ASSERT(TTE_IS_VALID(&tte)); 9343 vaddr = tte_to_vaddr(hmeblkp, tte); 9344 color = addr_to_vcolor(vaddr); 9345 9346 #ifdef DEBUG 9347 if ((flags & HAT_CACHE) && bcolor != NO_VCOLOR) { 9348 ASSERT(color == bcolor); 9349 } 9350 #endif 9351 9352 ASSERT(flags != HAT_TMPNC || color == PP_GET_VCOLOR(pp)); 9353 9354 ttemod = tte; 9355 if (flags & (HAT_UNCACHE | HAT_TMPNC)) { 9356 TTE_CLR_VCACHEABLE(&ttemod); 9357 } else { /* flags & HAT_CACHE */ 9358 TTE_SET_VCACHEABLE(&ttemod); 9359 } 9360 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte); 9361 if (ret < 0) { 9362 /* 9363 * Since all cpus are captured modifytte should not 9364 * fail. 9365 */ 9366 panic("sfmmu_page_cache: write to tte failed"); 9367 } 9368 9369 sfmmup = hblktosfmmu(hmeblkp); 9370 if (cache_flush_flag == CACHE_FLUSH) { 9371 /* 9372 * Flush TSBs, TLBs and caches 9373 */ 9374 if (hmeblkp->hblk_shared) { 9375 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 9376 uint_t rid = hmeblkp->hblk_tag.htag_rid; 9377 sf_region_t *rgnp; 9378 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 9379 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 9380 ASSERT(srdp != NULL); 9381 rgnp = srdp->srd_hmergnp[rid]; 9382 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, 9383 srdp, rgnp, rid); 9384 (void) sfmmu_rgntlb_demap(vaddr, rgnp, 9385 hmeblkp, 0); 9386 sfmmu_cache_flush(pfn, addr_to_vcolor(vaddr)); 9387 } else if (sfmmup->sfmmu_ismhat) { 9388 if (flags & HAT_CACHE) { 9389 SFMMU_STAT(sf_ism_recache); 9390 } else { 9391 SFMMU_STAT(sf_ism_uncache); 9392 } 9393 sfmmu_ismtlbcache_demap(vaddr, sfmmup, hmeblkp, 9394 pfn, CACHE_FLUSH); 9395 } else { 9396 sfmmu_tlbcache_demap(vaddr, sfmmup, hmeblkp, 9397 pfn, 0, FLUSH_ALL_CPUS, CACHE_FLUSH, 1); 9398 } 9399 9400 /* 9401 * all cache entries belonging to this pfn are 9402 * now flushed. 9403 */ 9404 cache_flush_flag = CACHE_NO_FLUSH; 9405 } else { 9406 /* 9407 * Flush only TSBs and TLBs. 
9408 */ 9409 if (hmeblkp->hblk_shared) { 9410 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 9411 uint_t rid = hmeblkp->hblk_tag.htag_rid; 9412 sf_region_t *rgnp; 9413 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 9414 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 9415 ASSERT(srdp != NULL); 9416 rgnp = srdp->srd_hmergnp[rid]; 9417 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, 9418 srdp, rgnp, rid); 9419 (void) sfmmu_rgntlb_demap(vaddr, rgnp, 9420 hmeblkp, 0); 9421 } else if (sfmmup->sfmmu_ismhat) { 9422 if (flags & HAT_CACHE) { 9423 SFMMU_STAT(sf_ism_recache); 9424 } else { 9425 SFMMU_STAT(sf_ism_uncache); 9426 } 9427 sfmmu_ismtlbcache_demap(vaddr, sfmmup, hmeblkp, 9428 pfn, CACHE_NO_FLUSH); 9429 } else { 9430 sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 1); 9431 } 9432 } 9433 } 9434 9435 if (PP_ISMAPPED_KPM(pp)) 9436 sfmmu_kpm_page_cache(pp, flags, cache_flush_flag); 9437 9438 switch (flags) { 9439 9440 default: 9441 panic("sfmmu_pagecache: unknown flags"); 9442 break; 9443 9444 case HAT_CACHE: 9445 PP_CLRTNC(pp); 9446 PP_CLRPNC(pp); 9447 PP_SET_VCOLOR(pp, color); 9448 break; 9449 9450 case HAT_TMPNC: 9451 PP_SETTNC(pp); 9452 PP_SET_VCOLOR(pp, NO_VCOLOR); 9453 break; 9454 9455 case HAT_UNCACHE: 9456 PP_SETPNC(pp); 9457 PP_CLRTNC(pp); 9458 PP_SET_VCOLOR(pp, NO_VCOLOR); 9459 break; 9460 } 9461 } 9462 #endif /* VAC */ 9463 9464 9465 /* 9466 * Wrapper routine used to return a context. 9467 * 9468 * It's the responsibility of the caller to guarantee that the 9469 * process serializes on calls here by taking the HAT lock for 9470 * the hat. 9471 * 9472 */ 9473 static void 9474 sfmmu_get_ctx(sfmmu_t *sfmmup) 9475 { 9476 mmu_ctx_t *mmu_ctxp; 9477 uint_t pstate_save; 9478 int ret; 9479 9480 ASSERT(sfmmu_hat_lock_held(sfmmup)); 9481 ASSERT(sfmmup != ksfmmup); 9482 9483 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ALLCTX_INVALID)) { 9484 sfmmu_setup_tsbinfo(sfmmup); 9485 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ALLCTX_INVALID); 9486 } 9487 9488 kpreempt_disable(); 9489 9490 mmu_ctxp = CPU_MMU_CTXP(CPU); 9491 ASSERT(mmu_ctxp); 9492 ASSERT(mmu_ctxp->mmu_idx < max_mmu_ctxdoms); 9493 ASSERT(mmu_ctxp == mmu_ctxs_tbl[mmu_ctxp->mmu_idx]); 9494 9495 /* 9496 * Do a wrap-around if cnum reaches the max # cnum supported by an MMU. 9497 */ 9498 if (mmu_ctxp->mmu_cnum == mmu_ctxp->mmu_nctxs) 9499 sfmmu_ctx_wrap_around(mmu_ctxp, B_TRUE); 9500 9501 /* 9502 * Let the MMU set up the page sizes to use for 9503 * this context in the TLB. Don't program 2nd dtlb for ism hat. 9504 */ 9505 if ((&mmu_set_ctx_page_sizes) && (sfmmup->sfmmu_ismhat == 0)) { 9506 mmu_set_ctx_page_sizes(sfmmup); 9507 } 9508 9509 /* 9510 * sfmmu_alloc_ctx and sfmmu_load_mmustate will be performed with 9511 * interrupts disabled to prevent a race condition with wrap-around 9512 * ctx invalidation. In sun4v, ctx invalidation also involves 9513 * an HV call to set the number of TSBs to 0. If interrupts are not 9514 * disabled until after sfmmu_load_mmustate is complete, TSBs may 9515 * become assigned to INVALID_CONTEXT. This is not allowed.
9516 */ 9517 pstate_save = sfmmu_disable_intrs(); 9518 9519 if (sfmmu_alloc_ctx(sfmmup, 1, CPU, SFMMU_PRIVATE) && 9520 sfmmup->sfmmu_scdp != NULL) { 9521 sf_scd_t *scdp = sfmmup->sfmmu_scdp; 9522 sfmmu_t *scsfmmup = scdp->scd_sfmmup; 9523 ret = sfmmu_alloc_ctx(scsfmmup, 1, CPU, SFMMU_SHARED); 9524 /* debug purpose only */ 9525 ASSERT(!ret || scsfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum 9526 != INVALID_CONTEXT); 9527 } 9528 sfmmu_load_mmustate(sfmmup); 9529 9530 sfmmu_enable_intrs(pstate_save); 9531 9532 kpreempt_enable(); 9533 } 9534 9535 /* 9536 * When all cnums are used up in an MMU, cnum will wrap around to the 9537 * next generation and start from 2. 9538 */ 9539 static void 9540 sfmmu_ctx_wrap_around(mmu_ctx_t *mmu_ctxp, boolean_t reset_cnum) 9541 { 9542 9543 /* caller must have disabled the preemption */ 9544 ASSERT(curthread->t_preempt >= 1); 9545 ASSERT(mmu_ctxp != NULL); 9546 9547 /* acquire Per-MMU (PM) spin lock */ 9548 mutex_enter(&mmu_ctxp->mmu_lock); 9549 9550 /* re-check to see if wrap-around is needed */ 9551 if (mmu_ctxp->mmu_cnum < mmu_ctxp->mmu_nctxs) 9552 goto done; 9553 9554 SFMMU_MMU_STAT(mmu_wrap_around); 9555 9556 /* update gnum */ 9557 ASSERT(mmu_ctxp->mmu_gnum != 0); 9558 mmu_ctxp->mmu_gnum++; 9559 if (mmu_ctxp->mmu_gnum == 0 || 9560 mmu_ctxp->mmu_gnum > MAX_SFMMU_GNUM_VAL) { 9561 cmn_err(CE_PANIC, "mmu_gnum of mmu_ctx 0x%p is out of bounds.", 9562 (void *)mmu_ctxp); 9563 } 9564 9565 if (mmu_ctxp->mmu_ncpus > 1) { 9566 cpuset_t cpuset; 9567 9568 membar_enter(); /* make sure updated gnum visible */ 9569 9570 SFMMU_XCALL_STATS(NULL); 9571 9572 /* xcall to others on the same MMU to invalidate ctx */ 9573 cpuset = mmu_ctxp->mmu_cpuset; 9574 ASSERT(CPU_IN_SET(cpuset, CPU->cpu_id) || !reset_cnum); 9575 CPUSET_DEL(cpuset, CPU->cpu_id); 9576 CPUSET_AND(cpuset, cpu_ready_set); 9577 9578 /* 9579 * Pass in INVALID_CONTEXT as the first parameter to 9580 * sfmmu_raise_tsb_exception, which invalidates the context 9581 * of any process running on the CPUs in the MMU. 9582 */ 9583 xt_some(cpuset, sfmmu_raise_tsb_exception, 9584 INVALID_CONTEXT, INVALID_CONTEXT); 9585 xt_sync(cpuset); 9586 9587 SFMMU_MMU_STAT(mmu_tsb_raise_exception); 9588 } 9589 9590 if (sfmmu_getctx_sec() != INVALID_CONTEXT) { 9591 sfmmu_setctx_sec(INVALID_CONTEXT); 9592 sfmmu_clear_utsbinfo(); 9593 } 9594 9595 /* 9596 * No xcall is needed here. For sun4u systems all CPUs in a context 9597 * domain share a single physical MMU, therefore it's enough to flush 9598 * the TLB on the local CPU. On sun4v systems we use 1 global context 9599 * domain and flush all remote TLBs in the sfmmu_raise_tsb_exception 9600 * handler. Note that vtag_flushall_uctxs() is called 9601 * for Ultra II machines, where the equivalent flushall functionality 9602 * is implemented in SW, and only user ctx TLB entries are flushed. 9603 */ 9604 if (&vtag_flushall_uctxs != NULL) { 9605 vtag_flushall_uctxs(); 9606 } else { 9607 vtag_flushall(); 9608 } 9609 9610 /* reset mmu cnum, skips cnum 0 and 1 */ 9611 if (reset_cnum == B_TRUE) 9612 mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS; 9613 9614 done: 9615 mutex_exit(&mmu_ctxp->mmu_lock); 9616 } 9617 9618 9619 /* 9620 * For a multi-threaded process, set the process context to INVALID_CONTEXT 9621 * so that it faults and reloads the MMU state from TL=0. For a 9622 * single-threaded process, we can just load the MMU state directly without 9623 * having to set the context invalid. Caller must hold the hat lock since we 9624 * don't acquire it here.
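* * Decision summary (illustrative): * curproc with a single lwp - reload the MMU state in place below * anything else - sfmmu_invalidate_ctx(); the state is rebuilt on the next TL=0 fault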
9625 */ 9626 static void 9627 sfmmu_sync_mmustate(sfmmu_t *sfmmup) 9628 { 9629 uint_t cnum; 9630 uint_t pstate_save; 9631 9632 ASSERT(sfmmup != ksfmmup); 9633 ASSERT(sfmmu_hat_lock_held(sfmmup)); 9634 9635 kpreempt_disable(); 9636 9637 /* 9638 * We check whether the passed-in sfmmup is the same as the 9639 * current running proc. This is to make sure the current proc 9640 * stays single-threaded if it already is. 9641 */ 9642 if ((sfmmup == curthread->t_procp->p_as->a_hat) && 9643 (curthread->t_procp->p_lwpcnt == 1)) { 9644 /* single-thread */ 9645 cnum = sfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum; 9646 if (cnum != INVALID_CONTEXT) { 9647 uint_t curcnum; 9648 /* 9649 * Disable interrupts to prevent a race condition 9650 * with sfmmu_ctx_wrap_around ctx invalidation. 9651 * In sun4v, ctx invalidation involves setting 9652 * the TSB to NULL, hence, interrupts should be 9653 * disabled until after sfmmu_load_mmustate is completed. 9654 */ 9655 pstate_save = sfmmu_disable_intrs(); 9656 curcnum = sfmmu_getctx_sec(); 9657 if (curcnum == cnum) 9658 sfmmu_load_mmustate(sfmmup); 9659 sfmmu_enable_intrs(pstate_save); 9660 ASSERT(curcnum == cnum || curcnum == INVALID_CONTEXT); 9661 } 9662 } else { 9663 /* 9664 * multi-thread, 9665 * or when sfmmup is not the same as the curproc. 9666 */ 9667 sfmmu_invalidate_ctx(sfmmup); 9668 } 9669 9670 kpreempt_enable(); 9671 } 9672 9673 9674 /* 9675 * Replace the specified TSB with a new TSB. This function gets called when 9676 * we grow or shrink a TSB. When swapping in a TSB (TSB_SWAPIN), the 9677 * TSB_FORCEALLOC flag may be used to force allocation of a minimum-sized TSB 9678 * (8K). 9679 * 9680 * Caller must hold the HAT lock, but should assume any tsb_info 9681 * pointers it has are no longer valid after calling this function. 9682 * 9683 * Return values: 9684 * TSB_ALLOCFAIL Failed to allocate a TSB, due to memory constraints 9685 * TSB_LOSTRACE HAT is busy, i.e. another thread is already doing 9686 * something to this tsbinfo/TSB 9687 * TSB_SUCCESS Operation succeeded 9688 */ 9689 static tsb_replace_rc_t 9690 sfmmu_replace_tsb(sfmmu_t *sfmmup, struct tsb_info *old_tsbinfo, uint_t szc, 9691 hatlock_t *hatlockp, uint_t flags) 9692 { 9693 struct tsb_info *new_tsbinfo = NULL; 9694 struct tsb_info *curtsb, *prevtsb; 9695 uint_t tte_sz_mask; 9696 int i; 9697 9698 ASSERT(sfmmup != ksfmmup); 9699 ASSERT(sfmmup->sfmmu_ismhat == 0); 9700 ASSERT(sfmmu_hat_lock_held(sfmmup)); 9701 ASSERT(szc <= tsb_max_growsize); 9702 9703 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_BUSY)) 9704 return (TSB_LOSTRACE); 9705 9706 /* 9707 * Find the tsb_info ahead of this one in the list, and 9708 * also make sure that the tsb_info passed in really 9709 * exists! 9710 */ 9711 for (prevtsb = NULL, curtsb = sfmmup->sfmmu_tsb; 9712 curtsb != old_tsbinfo && curtsb != NULL; 9713 prevtsb = curtsb, curtsb = curtsb->tsb_next) 9714 ; 9715 ASSERT(curtsb != NULL); 9716 9717 if (!(flags & TSB_SWAPIN) && SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 9718 /* 9719 * The process is swapped out, so just set the new size 9720 * code. When it swaps back in, we'll allocate a new one 9721 * of the new chosen size. 9722 */ 9723 curtsb->tsb_szc = szc; 9724 return (TSB_SUCCESS); 9725 } 9726 SFMMU_FLAGS_SET(sfmmup, HAT_BUSY); 9727 9728 tte_sz_mask = old_tsbinfo->tsb_ttesz_mask; 9729 9730 /* 9731 * All initialization is done inside of sfmmu_tsbinfo_alloc(). 9732 * If we fail to allocate a TSB, exit. 9733 * 9734 * If the tsb grows with a new tsb size > 4M and the old tsb size < 4M, 9735 * then try a 4M slab after the initial alloc fails.
9736 * 9737 * If tsb swapin with tsb size > 4M, then try 4M after the 9738 * initial alloc fails. 9739 */ 9740 sfmmu_hat_exit(hatlockp); 9741 if (sfmmu_tsbinfo_alloc(&new_tsbinfo, szc, 9742 tte_sz_mask, flags, sfmmup) && 9743 (!(flags & (TSB_GROW | TSB_SWAPIN)) || (szc <= TSB_4M_SZCODE) || 9744 (!(flags & TSB_SWAPIN) && 9745 (old_tsbinfo->tsb_szc >= TSB_4M_SZCODE)) || 9746 sfmmu_tsbinfo_alloc(&new_tsbinfo, TSB_4M_SZCODE, 9747 tte_sz_mask, flags, sfmmup))) { 9748 (void) sfmmu_hat_enter(sfmmup); 9749 if (!(flags & TSB_SWAPIN)) 9750 SFMMU_STAT(sf_tsb_resize_failures); 9751 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY); 9752 return (TSB_ALLOCFAIL); 9753 } 9754 (void) sfmmu_hat_enter(sfmmup); 9755 9756 /* 9757 * Re-check to make sure somebody else didn't muck with us while we 9758 * didn't hold the HAT lock. If the process swapped out, fine, just 9759 * exit; this can happen if we try to shrink the TSB from the context 9760 * of another process (such as on an ISM unmap), though it is rare. 9761 */ 9762 if (!(flags & TSB_SWAPIN) && SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 9763 SFMMU_STAT(sf_tsb_resize_failures); 9764 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY); 9765 sfmmu_hat_exit(hatlockp); 9766 sfmmu_tsbinfo_free(new_tsbinfo); 9767 (void) sfmmu_hat_enter(sfmmup); 9768 return (TSB_LOSTRACE); 9769 } 9770 9771 #ifdef DEBUG 9772 /* Reverify that the tsb_info still exists - for debugging only */ 9773 for (prevtsb = NULL, curtsb = sfmmup->sfmmu_tsb; 9774 curtsb != old_tsbinfo && curtsb != NULL; 9775 prevtsb = curtsb, curtsb = curtsb->tsb_next) 9776 ; 9777 ASSERT(curtsb != NULL); 9778 #endif /* DEBUG */ 9779 9780 /* 9781 * Quiesce any CPUs running this process on their next TLB miss 9782 * so they atomically see the new tsb_info. We temporarily set the 9783 * context to invalid context so new threads that come on processor 9784 * after we do the xcall to cpusran will also serialize behind the 9785 * HAT lock on TLB miss and will see the new TSB. Since this short 9786 * race with a new thread coming on processor is relatively rare, 9787 * this synchronization mechanism should be cheaper than always 9788 * pausing all CPUs for the duration of the setup, which is what 9789 * the old implementation did. This is particularly true if we are 9790 * copying a huge chunk of memory around during that window. 9791 * 9792 * The memory barriers are to make sure things stay consistent 9793 * with resume() since it does not hold the HAT lock while 9794 * walking the list of tsb_info structures. 9795 */ 9796 if ((flags & TSB_SWAPIN) != TSB_SWAPIN) { 9797 /* The TSB is either growing or shrinking. */ 9798 sfmmu_invalidate_ctx(sfmmup); 9799 } else { 9800 /* 9801 * It is illegal to swap in TSBs from a process other 9802 * than a process being swapped in. This in turn 9803 * implies we do not have a valid MMU context here 9804 * since a process needs one to resolve translation 9805 * misses.
9806 */ 9807 ASSERT(curthread->t_procp->p_as->a_hat == sfmmup); 9808 } 9809 9810 #ifdef DEBUG 9811 ASSERT(max_mmu_ctxdoms > 0); 9812 9813 /* 9814 * Process should have INVALID_CONTEXT on all MMUs 9815 */ 9816 for (i = 0; i < max_mmu_ctxdoms; i++) { 9817 9818 ASSERT(sfmmup->sfmmu_ctxs[i].cnum == INVALID_CONTEXT); 9819 } 9820 #endif 9821 9822 new_tsbinfo->tsb_next = old_tsbinfo->tsb_next; 9823 membar_stst(); /* strict ordering required */ 9824 if (prevtsb) 9825 prevtsb->tsb_next = new_tsbinfo; 9826 else 9827 sfmmup->sfmmu_tsb = new_tsbinfo; 9828 membar_enter(); /* make sure new TSB globally visible */ 9829 9830 /* 9831 * We need to migrate TSB entries from the old TSB to the new TSB 9832 * if tsb_remap_ttes is set and the TSB is growing. 9833 */ 9834 if (tsb_remap_ttes && ((flags & TSB_GROW) == TSB_GROW)) 9835 sfmmu_copy_tsb(old_tsbinfo, new_tsbinfo); 9836 9837 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY); 9838 9839 /* 9840 * Drop the HAT lock to free our old tsb_info. 9841 */ 9842 sfmmu_hat_exit(hatlockp); 9843 9844 if ((flags & TSB_GROW) == TSB_GROW) { 9845 SFMMU_STAT(sf_tsb_grow); 9846 } else if ((flags & TSB_SHRINK) == TSB_SHRINK) { 9847 SFMMU_STAT(sf_tsb_shrink); 9848 } 9849 9850 sfmmu_tsbinfo_free(old_tsbinfo); 9851 9852 (void) sfmmu_hat_enter(sfmmup); 9853 return (TSB_SUCCESS); 9854 } 9855 9856 /* 9857 * This function will re-program hat pgsz array, and invalidate the 9858 * process' context, forcing the process to switch to another 9859 * context on the next TLB miss, and therefore start using the 9860 * TLB that is reprogrammed for the new page sizes. 9861 */ 9862 void 9863 sfmmu_reprog_pgsz_arr(sfmmu_t *sfmmup, uint8_t *tmp_pgsz) 9864 { 9865 int i; 9866 hatlock_t *hatlockp = NULL; 9867 9868 hatlockp = sfmmu_hat_enter(sfmmup); 9869 /* USIII+-IV+ optimization, requires hat lock */ 9870 if (tmp_pgsz) { 9871 for (i = 0; i < mmu_page_sizes; i++) 9872 sfmmup->sfmmu_pgsz[i] = tmp_pgsz[i]; 9873 } 9874 SFMMU_STAT(sf_tlb_reprog_pgsz); 9875 9876 sfmmu_invalidate_ctx(sfmmup); 9877 9878 sfmmu_hat_exit(hatlockp); 9879 } 9880 9881 /* 9882 * The scd_rttecnt field in the SCD must be updated to take account of the 9883 * regions which it contains. 9884 */ 9885 static void 9886 sfmmu_set_scd_rttecnt(sf_srd_t *srdp, sf_scd_t *scdp) 9887 { 9888 uint_t rid; 9889 uint_t i, j; 9890 ulong_t w; 9891 sf_region_t *rgnp; 9892 9893 ASSERT(srdp != NULL); 9894 9895 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) { 9896 if ((w = scdp->scd_region_map.bitmap[i]) == 0) { 9897 continue; 9898 } 9899 9900 j = 0; 9901 while (w) { 9902 if (!(w & 0x1)) { 9903 j++; 9904 w >>= 1; 9905 continue; 9906 } 9907 rid = (i << BT_ULSHIFT) | j; 9908 j++; 9909 w >>= 1; 9910 9911 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 9912 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 9913 rgnp = srdp->srd_hmergnp[rid]; 9914 ASSERT(rgnp->rgn_refcnt > 0); 9915 ASSERT(rgnp->rgn_id == rid); 9916 9917 scdp->scd_rttecnt[rgnp->rgn_pgszc] += 9918 rgnp->rgn_size >> TTE_PAGE_SHIFT(rgnp->rgn_pgszc); 9919 9920 /* 9921 * Maintain the tsb0 inflation cnt for the regions 9922 * in the SCD. 9923 */ 9924 if (rgnp->rgn_pgszc >= TTE4M) { 9925 scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt += 9926 rgnp->rgn_size >> 9927 (TTE_PAGE_SHIFT(TTE8K) + 2); 9928 } 9929 } 9930 } 9931 } 9932 9933 /* 9934 * This function assumes that there are either four or six supported page 9935 * sizes and at most two programmable TLBs, so we need to decide which 9936 * page sizes are most important and then tell the MMU layer so it 9937 * can adjust the TLB page sizes accordingly (if supported). 
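* * (Scale note, for illustration: when the counts are folded below, one 64K tte spans eight 8K pages and one 512K tte spans 64, hence the shifts by (MMU_PAGESHIFT64K - MMU_PAGESHIFT) and (MMU_PAGESHIFT512K - MMU_PAGESHIFT); 32M and 256M ttes fold into the 4M count the same way.)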
9938 * 9939 * If these assumptions change, this function will need to be 9940 * updated to support whatever the new limits are. 9941 * 9942 * The growing flag is nonzero if we are growing the address space, 9943 * and zero if it is shrinking. This allows us to decide whether 9944 * to grow or shrink our TSB, depending upon available memory 9945 * conditions. 9946 */ 9947 static void 9948 sfmmu_check_page_sizes(sfmmu_t *sfmmup, int growing) 9949 { 9950 uint64_t ttecnt[MMU_PAGE_SIZES]; 9951 uint64_t tte8k_cnt, tte4m_cnt; 9952 uint8_t i; 9953 int sectsb_thresh; 9954 9955 /* 9956 * Kernel threads, processes with small address spaces not using 9957 * large pages, and dummy ISM HATs need not apply. 9958 */ 9959 if (sfmmup == ksfmmup || sfmmup->sfmmu_ismhat != NULL) 9960 return; 9961 9962 if (!SFMMU_LGPGS_INUSE(sfmmup) && 9963 sfmmup->sfmmu_ttecnt[TTE8K] <= tsb_rss_factor) 9964 return; 9965 9966 for (i = 0; i < mmu_page_sizes; i++) { 9967 ttecnt[i] = sfmmup->sfmmu_ttecnt[i] + 9968 sfmmup->sfmmu_ismttecnt[i]; 9969 } 9970 9971 /* Check pagesizes in use, and possibly reprogram DTLB. */ 9972 if (&mmu_check_page_sizes) 9973 mmu_check_page_sizes(sfmmup, ttecnt); 9974 9975 /* 9976 * Calculate the number of 8k ttes to represent the span of these 9977 * pages. 9978 */ 9979 tte8k_cnt = ttecnt[TTE8K] + 9980 (ttecnt[TTE64K] << (MMU_PAGESHIFT64K - MMU_PAGESHIFT)) + 9981 (ttecnt[TTE512K] << (MMU_PAGESHIFT512K - MMU_PAGESHIFT)); 9982 if (mmu_page_sizes == max_mmu_page_sizes) { 9983 tte4m_cnt = ttecnt[TTE4M] + 9984 (ttecnt[TTE32M] << (MMU_PAGESHIFT32M - MMU_PAGESHIFT4M)) + 9985 (ttecnt[TTE256M] << (MMU_PAGESHIFT256M - MMU_PAGESHIFT4M)); 9986 } else { 9987 tte4m_cnt = ttecnt[TTE4M]; 9988 } 9989 9990 /* 9991 * Inflate tte8k_cnt to allow for region large page allocation failure. 9992 */ 9993 tte8k_cnt += sfmmup->sfmmu_tsb0_4minflcnt; 9994 9995 /* 9996 * Inflate TSB sizes by a factor of 2 if this process 9997 * uses 4M text pages to minimize extra conflict misses 9998 * in the first TSB since without counting text pages 9999 * 8K TSB may become too small. 10000 * 10001 * Also double the size of the second TSB to minimize 10002 * extra conflict misses due to competition between 4M text pages 10003 * and data pages. 10004 * 10005 * We need to adjust the second TSB allocation threshold by the 10006 * inflation factor, since there is no point in creating a second 10007 * TSB when we know all the mappings can fit in the I/D TLBs. 10008 */ 10009 sectsb_thresh = tsb_sectsb_threshold; 10010 if (sfmmup->sfmmu_flags & HAT_4MTEXT_FLAG) { 10011 tte8k_cnt <<= 1; 10012 tte4m_cnt <<= 1; 10013 sectsb_thresh <<= 1; 10014 } 10015 10016 /* 10017 * Check to see if our TSB is the right size; we may need to 10018 * grow or shrink it. If the process is small, our work is 10019 * finished at this point. 10020 */ 10021 if (tte8k_cnt <= tsb_rss_factor && tte4m_cnt <= sectsb_thresh) { 10022 return; 10023 } 10024 sfmmu_size_tsb(sfmmup, growing, tte8k_cnt, tte4m_cnt, sectsb_thresh); 10025 } 10026 10027 static void 10028 sfmmu_size_tsb(sfmmu_t *sfmmup, int growing, uint64_t tte8k_cnt, 10029 uint64_t tte4m_cnt, int sectsb_thresh) 10030 { 10031 int tsb_bits; 10032 uint_t tsb_szc; 10033 struct tsb_info *tsbinfop; 10034 hatlock_t *hatlockp = NULL; 10035 10036 hatlockp = sfmmu_hat_enter(sfmmup); 10037 ASSERT(hatlockp != NULL); 10038 tsbinfop = sfmmup->sfmmu_tsb; 10039 ASSERT(tsbinfop != NULL); 10040 10041 /* 10042 * If we're growing, select the size based on RSS. 
If we're 10043 * shrinking, leave some room so we don't have to turn around and 10044 * grow again immediately. 10045 */ 10046 if (growing) 10047 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt); 10048 else 10049 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt << 1); 10050 10051 if (!growing && (tsb_szc < tsbinfop->tsb_szc) && 10052 (tsb_szc >= default_tsb_size) && TSB_OK_SHRINK()) { 10053 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, tsb_szc, 10054 hatlockp, TSB_SHRINK); 10055 } else if (growing && tsb_szc > tsbinfop->tsb_szc && TSB_OK_GROW()) { 10056 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, tsb_szc, 10057 hatlockp, TSB_GROW); 10058 } 10059 tsbinfop = sfmmup->sfmmu_tsb; 10060 10061 /* 10062 * With the TLB and first TSB out of the way, we need to see if 10063 * we need a second TSB for 4M pages. If we managed to reprogram 10064 * the TLB page sizes above, the process will start using this new 10065 * TSB right away; otherwise, it will start using it on the next 10066 * context switch. Either way, it's no big deal so there's no 10067 * synchronization with the trap handlers here unless we grow the 10068 * TSB (in which case it's required to prevent using the old one 10069 * after it's freed). Note: second tsb is required for 32M/256M 10070 * page sizes. 10071 */ 10072 if (tte4m_cnt > sectsb_thresh) { 10073 /* 10074 * If we're growing, select the size based on RSS. If we're 10075 * shrinking, leave some room so we don't have to turn 10076 * around and grow again immediately. 10077 */ 10078 if (growing) 10079 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt); 10080 else 10081 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt << 1); 10082 if (tsbinfop->tsb_next == NULL) { 10083 struct tsb_info *newtsb; 10084 int allocflags = SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)? 10085 0 : TSB_ALLOC; 10086 10087 sfmmu_hat_exit(hatlockp); 10088 10089 /* 10090 * Try to allocate a TSB for 4[32|256]M pages. If we 10091 * can't get the size we want, retry w/a minimum sized 10092 * TSB. If that still didn't work, give up; we can 10093 * still run without one. 10094 */ 10095 tsb_bits = (mmu_page_sizes == max_mmu_page_sizes)? 10096 TSB4M|TSB32M|TSB256M:TSB4M; 10097 if ((sfmmu_tsbinfo_alloc(&newtsb, tsb_szc, tsb_bits, 10098 allocflags, sfmmup)) && 10099 (tsb_szc <= TSB_4M_SZCODE || 10100 sfmmu_tsbinfo_alloc(&newtsb, TSB_4M_SZCODE, 10101 tsb_bits, allocflags, sfmmup)) && 10102 sfmmu_tsbinfo_alloc(&newtsb, TSB_MIN_SZCODE, 10103 tsb_bits, allocflags, sfmmup)) { 10104 return; 10105 } 10106 10107 hatlockp = sfmmu_hat_enter(sfmmup); 10108 10109 sfmmu_invalidate_ctx(sfmmup); 10110 10111 if (sfmmup->sfmmu_tsb->tsb_next == NULL) { 10112 sfmmup->sfmmu_tsb->tsb_next = newtsb; 10113 SFMMU_STAT(sf_tsb_sectsb_create); 10114 sfmmu_hat_exit(hatlockp); 10115 return; 10116 } else { 10117 /* 10118 * It's annoying, but possible for us 10119 * to get here.. we dropped the HAT lock 10120 * because of locking order in the kmem 10121 * allocator, and while we were off getting 10122 * our memory, some other thread decided to 10123 * do us a favor and won the race to get a 10124 * second TSB for this process. Sigh. 10125 */ 10126 sfmmu_hat_exit(hatlockp); 10127 sfmmu_tsbinfo_free(newtsb); 10128 return; 10129 } 10130 } 10131 10132 /* 10133 * We have a second TSB, see if it's big enough. 10134 */ 10135 tsbinfop = tsbinfop->tsb_next; 10136 10137 /* 10138 * Check to see if our second TSB is the right size; 10139 * we may need to grow or shrink it. 10140 * To prevent thrashing (e.g. 
growing the TSB on a
10141 * subsequent map operation), only try to shrink if
10142 * the TSB reach exceeds twice the virtual address
10143 * space size.
10144 */
10145 if (!growing && (tsb_szc < tsbinfop->tsb_szc) &&
10146 (tsb_szc >= default_tsb_size) && TSB_OK_SHRINK()) {
10147 (void) sfmmu_replace_tsb(sfmmup, tsbinfop,
10148 tsb_szc, hatlockp, TSB_SHRINK);
10149 } else if (growing && tsb_szc > tsbinfop->tsb_szc &&
10150 TSB_OK_GROW()) {
10151 (void) sfmmu_replace_tsb(sfmmup, tsbinfop,
10152 tsb_szc, hatlockp, TSB_GROW);
10153 }
10154 }
10155
10156 sfmmu_hat_exit(hatlockp);
10157 }
10158
10159 /*
10160 * Free up a sfmmu
10161 * Since the sfmmu is currently embedded in the hat struct we simply zero
10162 * out our fields and free up the ism map blk list if any.
10163 */
10164 static void
10165 sfmmu_free_sfmmu(sfmmu_t *sfmmup)
10166 {
10167 ism_blk_t *blkp, *nx_blkp;
10168 #ifdef DEBUG
10169 ism_map_t *map;
10170 int i;
10171 #endif
10172
10173 ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0);
10174 ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0);
10175 ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0);
10176 ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0);
10177 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0);
10178 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0);
10179 ASSERT(SF_RGNMAP_ISNULL(sfmmup));
10180
10181 sfmmup->sfmmu_free = 0;
10182 sfmmup->sfmmu_ismhat = 0;
10183
10184 blkp = sfmmup->sfmmu_iblk;
10185 sfmmup->sfmmu_iblk = NULL;
10186
10187 while (blkp) {
10188 #ifdef DEBUG
10189 map = blkp->iblk_maps;
10190 for (i = 0; i < ISM_MAP_SLOTS; i++) {
10191 ASSERT(map[i].imap_seg == 0);
10192 ASSERT(map[i].imap_ismhat == NULL);
10193 ASSERT(map[i].imap_ment == NULL);
10194 }
10195 #endif
10196 nx_blkp = blkp->iblk_next;
10197 blkp->iblk_next = NULL;
10198 blkp->iblk_nextpa = (uint64_t)-1;
10199 kmem_cache_free(ism_blk_cache, blkp);
10200 blkp = nx_blkp;
10201 }
10202 }
10203
10204 /*
10205 * Locking primitives accessed by HATLOCK macros
10206 */
10207
10208 #define SFMMU_SPL_MTX (0x0)
10209 #define SFMMU_ML_MTX (0x1)
10210
10211 #define SFMMU_MLSPL_MTX(type, pg) (((type) == SFMMU_SPL_MTX) ? \
10212 SPL_HASH(pg) : MLIST_HASH(pg))
10213
10214 kmutex_t *
10215 sfmmu_page_enter(struct page *pp)
10216 {
10217 return (sfmmu_mlspl_enter(pp, SFMMU_SPL_MTX));
10218 }
10219
10220 void
10221 sfmmu_page_exit(kmutex_t *spl)
10222 {
10223 mutex_exit(spl);
10224 }
10225
10226 int
10227 sfmmu_page_spl_held(struct page *pp)
10228 {
10229 return (sfmmu_mlspl_held(pp, SFMMU_SPL_MTX));
10230 }
10231
10232 kmutex_t *
10233 sfmmu_mlist_enter(struct page *pp)
10234 {
10235 return (sfmmu_mlspl_enter(pp, SFMMU_ML_MTX));
10236 }
10237
10238 void
10239 sfmmu_mlist_exit(kmutex_t *mml)
10240 {
10241 mutex_exit(mml);
10242 }
10243
10244 int
10245 sfmmu_mlist_held(struct page *pp)
10246 {
10247
10248 return (sfmmu_mlspl_held(pp, SFMMU_ML_MTX));
10249 }
10250
10251 /*
10252 * Common code for sfmmu_mlist_enter() and sfmmu_page_enter(). For
10253 * the sfmmu_mlist_enter() case the mml_table lock array is used, and for
10254 * sfmmu_page_enter() the sfmmu_page_lock lock array is used.
10255 *
10256 * The lock is taken on a root page so that it protects an operation on all
10257 * constituent pages of a large page pp belongs to.
10258 *
10259 * The routine takes a lock from the appropriate array. The lock is determined
10260 * by hashing the root page. After taking the lock this routine checks if the
10261 * root page has the same size code that was used to determine the root (i.e.
10262 * that root hasn't changed).
If root page has the expected p_szc field we 10263 * have the right lock and it's returned to the caller. If root's p_szc 10264 * decreased we release the lock and retry from the beginning. This case can 10265 * happen due to hat_page_demote() decreasing p_szc between our load of p_szc 10266 * value and taking the lock. The number of retries due to p_szc decrease is 10267 * limited by the maximum p_szc value. If p_szc is 0 we return the lock 10268 * determined by hashing pp itself. 10269 * 10270 * If our caller doesn't hold a SE_SHARED or SE_EXCL lock on pp it's also 10271 * possible that p_szc can increase. To increase p_szc a thread has to lock 10272 * all constituent pages EXCL and do hat_pageunload() on all of them. All the 10273 * callers that don't hold a page locked recheck if hmeblk through which pp 10274 * was found still maps this pp. If it doesn't map it anymore returned lock 10275 * is immediately dropped. Therefore if sfmmu_mlspl_enter() hits the case of 10276 * p_szc increase after taking the lock it returns this lock without further 10277 * retries because in this case the caller doesn't care about which lock was 10278 * taken. The caller will drop it right away. 10279 * 10280 * After the routine returns it's guaranteed that hat_page_demote() can't 10281 * change p_szc field of any of constituent pages of a large page pp belongs 10282 * to as long as pp was either locked at least SHARED prior to this call or 10283 * the caller finds that hment that pointed to this pp still references this 10284 * pp (this also assumes that the caller holds hme hash bucket lock so that 10285 * the same pp can't be remapped into the same hmeblk after it was unmapped by 10286 * hat_pageunload()). 10287 */ 10288 static kmutex_t * 10289 sfmmu_mlspl_enter(struct page *pp, int type) 10290 { 10291 kmutex_t *mtx; 10292 uint_t prev_rszc = UINT_MAX; 10293 page_t *rootpp; 10294 uint_t szc; 10295 uint_t rszc; 10296 uint_t pszc = pp->p_szc; 10297 10298 ASSERT(pp != NULL); 10299 10300 again: 10301 if (pszc == 0) { 10302 mtx = SFMMU_MLSPL_MTX(type, pp); 10303 mutex_enter(mtx); 10304 return (mtx); 10305 } 10306 10307 /* The lock lives in the root page */ 10308 rootpp = PP_GROUPLEADER(pp, pszc); 10309 mtx = SFMMU_MLSPL_MTX(type, rootpp); 10310 mutex_enter(mtx); 10311 10312 /* 10313 * Return mml in the following 3 cases: 10314 * 10315 * 1) If pp itself is root since if its p_szc decreased before we took 10316 * the lock pp is still the root of smaller szc page. And if its p_szc 10317 * increased it doesn't matter what lock we return (see comment in 10318 * front of this routine). 10319 * 10320 * 2) If pp's not root but rootpp is the root of a rootpp->p_szc size 10321 * large page we have the right lock since any previous potential 10322 * hat_page_demote() is done demoting from greater than current root's 10323 * p_szc because hat_page_demote() changes root's p_szc last. No 10324 * further hat_page_demote() can start or be in progress since it 10325 * would need the same lock we currently hold. 10326 * 10327 * 3) If rootpp's p_szc increased since previous iteration it doesn't 10328 * matter what lock we return (see comment in front of this routine). 10329 */ 10330 if (pp == rootpp || (rszc = rootpp->p_szc) == pszc || 10331 rszc >= prev_rszc) { 10332 return (mtx); 10333 } 10334 10335 /* 10336 * hat_page_demote() could have decreased root's p_szc. 10337 * In this case pp's p_szc must also be smaller than pszc. 10338 * Retry. 
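 *
 * This is the usual lock-by-hash-and-revalidate idiom; as a
 * minimal sketch (root_of() and HASH_TO_MTX() are stand-ins
 * invented for the illustration):
 *
 *	for (;;) {
 *		root = root_of(pp);
 *		mtx = HASH_TO_MTX(root);
 *		mutex_enter(mtx);
 *		if (root_of(pp) == root)
 *			break;
 *		mutex_exit(mtx);
 *	}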
10339 */
10340 if (rszc < pszc) {
10341 szc = pp->p_szc;
10342 if (szc < pszc) {
10343 mutex_exit(mtx);
10344 pszc = szc;
10345 goto again;
10346 }
10347 /*
10348 * pp's p_szc increased after it was decreased.
10349 * page cannot be mapped. Return current lock. The caller
10350 * will drop it right away.
10351 */
10352 return (mtx);
10353 }
10354
10355 /*
10356 * root's p_szc is greater than pp's p_szc.
10357 * hat_page_demote() is not done with all pages
10358 * yet. Wait for it to complete.
10359 */
10360 mutex_exit(mtx);
10361 rootpp = PP_GROUPLEADER(rootpp, rszc);
10362 mtx = SFMMU_MLSPL_MTX(type, rootpp);
10363 mutex_enter(mtx);
10364 mutex_exit(mtx);
10365 prev_rszc = rszc;
10366 goto again;
10367 }
10368
10369 static int
10370 sfmmu_mlspl_held(struct page *pp, int type)
10371 {
10372 kmutex_t *mtx;
10373
10374 ASSERT(pp != NULL);
10375 /* The lock lives in the root page */
10376 pp = PP_PAGEROOT(pp);
10377 ASSERT(pp != NULL);
10378
10379 mtx = SFMMU_MLSPL_MTX(type, pp);
10380 return (MUTEX_HELD(mtx));
10381 }
10382
10383 static uint_t
10384 sfmmu_get_free_hblk(struct hme_blk **hmeblkpp, uint_t critical)
10385 {
10386 struct hme_blk *hblkp;
10387
10388
10389 if (freehblkp != NULL) {
10390 mutex_enter(&freehblkp_lock);
10391 if (freehblkp != NULL) {
10392 /*
10393 * If the current thread owns hblk_reserve, or this is a
10394 * critical request from sfmmu_hblk_steal(),
10395 * let it succeed even if freehblkcnt is really low.
10396 */
10397 if (freehblkcnt <= HBLK_RESERVE_MIN && !critical) {
10398 SFMMU_STAT(sf_get_free_throttle);
10399 mutex_exit(&freehblkp_lock);
10400 return (0);
10401 }
10402 freehblkcnt--;
10403 *hmeblkpp = freehblkp;
10404 hblkp = *hmeblkpp;
10405 freehblkp = hblkp->hblk_next;
10406 mutex_exit(&freehblkp_lock);
10407 hblkp->hblk_next = NULL;
10408 SFMMU_STAT(sf_get_free_success);
10409
10410 ASSERT(hblkp->hblk_hmecnt == 0);
10411 ASSERT(hblkp->hblk_vcnt == 0);
10412 ASSERT(hblkp->hblk_nextpa == va_to_pa((caddr_t)hblkp));
10413
10414 return (1);
10415 }
10416 mutex_exit(&freehblkp_lock);
10417 }
10418
10419 /* Check cpu hblk pending queues */
10420 if ((*hmeblkpp = sfmmu_check_pending_hblks(TTE8K)) != NULL) {
10421 hblkp = *hmeblkpp;
10422 hblkp->hblk_next = NULL;
10423 hblkp->hblk_nextpa = va_to_pa((caddr_t)hblkp);
10424
10425 ASSERT(hblkp->hblk_hmecnt == 0);
10426 ASSERT(hblkp->hblk_vcnt == 0);
10427
10428 return (1);
10429 }
10430
10431 SFMMU_STAT(sf_get_free_fail);
10432 return (0);
10433 }
10434
10435 static uint_t
10436 sfmmu_put_free_hblk(struct hme_blk *hmeblkp, uint_t critical)
10437 {
10438 struct hme_blk *hblkp;
10439
10440 ASSERT(hmeblkp->hblk_hmecnt == 0);
10441 ASSERT(hmeblkp->hblk_vcnt == 0);
10442 ASSERT(hmeblkp->hblk_nextpa == va_to_pa((caddr_t)hmeblkp));
10443
10444 /*
10445 * If the current thread is mapping into kernel space,
10446 * let it succeed even if freehblkcnt is at its maximum,
10447 * so that it will avoid freeing it to kmem.
10448 * This will prevent stack overflow due to
10449 * possible recursion since kmem_cache_free()
10450 * might require creation of a slab which
10451 * in turn needs an hmeblk to map that slab;
10452 * let's break this vicious chain at the first
10453 * opportunity.
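 *
 * (Note the shape shared by the get and put paths here:
 * freehblkcnt is tested once without the lock as a cheap
 * filter, and the same test is repeated under freehblkp_lock
 * before the result is trusted.)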
10454 */ 10455 if (freehblkcnt < HBLK_RESERVE_CNT || critical) { 10456 mutex_enter(&freehblkp_lock); 10457 if (freehblkcnt < HBLK_RESERVE_CNT || critical) { 10458 SFMMU_STAT(sf_put_free_success); 10459 freehblkcnt++; 10460 hmeblkp->hblk_next = freehblkp; 10461 freehblkp = hmeblkp; 10462 mutex_exit(&freehblkp_lock); 10463 return (1); 10464 } 10465 mutex_exit(&freehblkp_lock); 10466 } 10467 10468 /* 10469 * Bring down freehblkcnt to HBLK_RESERVE_CNT. We are here 10470 * only if freehblkcnt is at least HBLK_RESERVE_CNT *and* 10471 * we are not in the process of mapping into kernel space. 10472 */ 10473 ASSERT(!critical); 10474 while (freehblkcnt > HBLK_RESERVE_CNT) { 10475 mutex_enter(&freehblkp_lock); 10476 if (freehblkcnt > HBLK_RESERVE_CNT) { 10477 freehblkcnt--; 10478 hblkp = freehblkp; 10479 freehblkp = hblkp->hblk_next; 10480 mutex_exit(&freehblkp_lock); 10481 ASSERT(get_hblk_cache(hblkp) == sfmmu8_cache); 10482 kmem_cache_free(sfmmu8_cache, hblkp); 10483 continue; 10484 } 10485 mutex_exit(&freehblkp_lock); 10486 } 10487 SFMMU_STAT(sf_put_free_fail); 10488 return (0); 10489 } 10490 10491 static void 10492 sfmmu_hblk_swap(struct hme_blk *new) 10493 { 10494 struct hme_blk *old, *hblkp, *prev; 10495 uint64_t newpa; 10496 caddr_t base, vaddr, endaddr; 10497 struct hmehash_bucket *hmebp; 10498 struct sf_hment *osfhme, *nsfhme; 10499 page_t *pp; 10500 kmutex_t *pml; 10501 tte_t tte; 10502 struct hme_blk *list = NULL; 10503 10504 #ifdef DEBUG 10505 hmeblk_tag hblktag; 10506 struct hme_blk *found; 10507 #endif 10508 old = HBLK_RESERVE; 10509 ASSERT(!old->hblk_shared); 10510 10511 /* 10512 * save pa before bcopy clobbers it 10513 */ 10514 newpa = new->hblk_nextpa; 10515 10516 base = (caddr_t)get_hblk_base(old); 10517 endaddr = base + get_hblk_span(old); 10518 10519 /* 10520 * acquire hash bucket lock. 10521 */ 10522 hmebp = sfmmu_tteload_acquire_hashbucket(ksfmmup, base, TTE8K, 10523 SFMMU_INVALID_SHMERID); 10524 10525 /* 10526 * copy contents from old to new 10527 */ 10528 bcopy((void *)old, (void *)new, HME8BLK_SZ); 10529 10530 /* 10531 * add new to hash chain 10532 */ 10533 sfmmu_hblk_hash_add(hmebp, new, newpa); 10534 10535 /* 10536 * search hash chain for hblk_reserve; this needs to be performed 10537 * after adding new, otherwise prev won't correspond to the hblk which 10538 * is prior to old in hash chain when we call sfmmu_hblk_hash_rm to 10539 * remove old later. 10540 */ 10541 for (prev = NULL, 10542 hblkp = hmebp->hmeblkp; hblkp != NULL && hblkp != old; 10543 prev = hblkp, hblkp = hblkp->hblk_next) 10544 ; 10545 10546 if (hblkp != old) 10547 panic("sfmmu_hblk_swap: hblk_reserve not found"); 10548 10549 /* 10550 * p_mapping list is still pointing to hments in hblk_reserve; 10551 * fix up p_mapping list so that they point to hments in new. 10552 * 10553 * Since all these mappings are created by hblk_reserve_thread 10554 * on the way and it's using at least one of the buffers from each of 10555 * the newly minted slabs, there is no danger of any of these 10556 * mappings getting unloaded by another thread. 10557 * 10558 * tsbmiss could only modify ref/mod bits of hments in old/new. 10559 * Since all of these hments hold mappings established by segkmem 10560 * and mappings in segkmem are setup with HAT_NOSYNC, ref/mod bits 10561 * have no meaning for the mappings in hblk_reserve. hments in 10562 * old and new are identical except for ref/mod bits. 
10563 */ 10564 for (vaddr = base; vaddr < endaddr; vaddr += TTEBYTES(TTE8K)) { 10565 10566 HBLKTOHME(osfhme, old, vaddr); 10567 sfmmu_copytte(&osfhme->hme_tte, &tte); 10568 10569 if (TTE_IS_VALID(&tte)) { 10570 if ((pp = osfhme->hme_page) == NULL) 10571 panic("sfmmu_hblk_swap: page not mapped"); 10572 10573 pml = sfmmu_mlist_enter(pp); 10574 10575 if (pp != osfhme->hme_page) 10576 panic("sfmmu_hblk_swap: mapping changed"); 10577 10578 HBLKTOHME(nsfhme, new, vaddr); 10579 10580 HME_ADD(nsfhme, pp); 10581 HME_SUB(osfhme, pp); 10582 10583 sfmmu_mlist_exit(pml); 10584 } 10585 } 10586 10587 /* 10588 * remove old from hash chain 10589 */ 10590 sfmmu_hblk_hash_rm(hmebp, old, prev, &list, 1); 10591 10592 #ifdef DEBUG 10593 10594 hblktag.htag_id = ksfmmup; 10595 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 10596 hblktag.htag_bspage = HME_HASH_BSPAGE(base, HME_HASH_SHIFT(TTE8K)); 10597 hblktag.htag_rehash = HME_HASH_REHASH(TTE8K); 10598 HME_HASH_FAST_SEARCH(hmebp, hblktag, found); 10599 10600 if (found != new) 10601 panic("sfmmu_hblk_swap: new hblk not found"); 10602 #endif 10603 10604 SFMMU_HASH_UNLOCK(hmebp); 10605 10606 /* 10607 * Reset hblk_reserve 10608 */ 10609 bzero((void *)old, HME8BLK_SZ); 10610 old->hblk_nextpa = va_to_pa((caddr_t)old); 10611 } 10612 10613 /* 10614 * Grab the mlist mutex for both pages passed in. 10615 * 10616 * low and high will be returned as pointers to the mutexes for these pages. 10617 * low refers to the mutex residing in the lower bin of the mlist hash, while 10618 * high refers to the mutex residing in the higher bin of the mlist hash. This 10619 * is due to the locking order restrictions on the same thread grabbing 10620 * multiple mlist mutexes. The low lock must be acquired before the high lock. 10621 * 10622 * If both pages hash to the same mutex, only grab that single mutex, and 10623 * high will be returned as NULL 10624 * If the pages hash to different bins in the hash, grab the lower addressed 10625 * lock first and then the higher addressed lock in order to follow the locking 10626 * rules involved with the same thread grabbing multiple mlist mutexes. 10627 * low and high will both have non-NULL values. 10628 */ 10629 static void 10630 sfmmu_mlist_reloc_enter(struct page *targ, struct page *repl, 10631 kmutex_t **low, kmutex_t **high) 10632 { 10633 kmutex_t *mml_targ, *mml_repl; 10634 10635 /* 10636 * no need to do the dance around szc as in sfmmu_mlist_enter() 10637 * because this routine is only called by hat_page_relocate() and all 10638 * targ and repl pages are already locked EXCL so szc can't change. 
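 *
 * For illustration, the deadlock the low/high ordering prevents:
 * thread A holding mml_table[i] while waiting for mml_table[j],
 * and thread B holding mml_table[j] while waiting for
 * mml_table[i]. Taking the lower-addressed mutex first on every
 * path makes such a cycle impossible.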
10639 */ 10640 10641 mml_targ = MLIST_HASH(PP_PAGEROOT(targ)); 10642 mml_repl = MLIST_HASH(PP_PAGEROOT(repl)); 10643 10644 if (mml_targ == mml_repl) { 10645 *low = mml_targ; 10646 *high = NULL; 10647 } else { 10648 if (mml_targ < mml_repl) { 10649 *low = mml_targ; 10650 *high = mml_repl; 10651 } else { 10652 *low = mml_repl; 10653 *high = mml_targ; 10654 } 10655 } 10656 10657 mutex_enter(*low); 10658 if (*high) 10659 mutex_enter(*high); 10660 } 10661 10662 static void 10663 sfmmu_mlist_reloc_exit(kmutex_t *low, kmutex_t *high) 10664 { 10665 if (high) 10666 mutex_exit(high); 10667 mutex_exit(low); 10668 } 10669 10670 static hatlock_t * 10671 sfmmu_hat_enter(sfmmu_t *sfmmup) 10672 { 10673 hatlock_t *hatlockp; 10674 10675 if (sfmmup != ksfmmup) { 10676 hatlockp = TSB_HASH(sfmmup); 10677 mutex_enter(HATLOCK_MUTEXP(hatlockp)); 10678 return (hatlockp); 10679 } 10680 return (NULL); 10681 } 10682 10683 static hatlock_t * 10684 sfmmu_hat_tryenter(sfmmu_t *sfmmup) 10685 { 10686 hatlock_t *hatlockp; 10687 10688 if (sfmmup != ksfmmup) { 10689 hatlockp = TSB_HASH(sfmmup); 10690 if (mutex_tryenter(HATLOCK_MUTEXP(hatlockp)) == 0) 10691 return (NULL); 10692 return (hatlockp); 10693 } 10694 return (NULL); 10695 } 10696 10697 static void 10698 sfmmu_hat_exit(hatlock_t *hatlockp) 10699 { 10700 if (hatlockp != NULL) 10701 mutex_exit(HATLOCK_MUTEXP(hatlockp)); 10702 } 10703 10704 static void 10705 sfmmu_hat_lock_all(void) 10706 { 10707 int i; 10708 for (i = 0; i < SFMMU_NUM_LOCK; i++) 10709 mutex_enter(HATLOCK_MUTEXP(&hat_lock[i])); 10710 } 10711 10712 static void 10713 sfmmu_hat_unlock_all(void) 10714 { 10715 int i; 10716 for (i = SFMMU_NUM_LOCK - 1; i >= 0; i--) 10717 mutex_exit(HATLOCK_MUTEXP(&hat_lock[i])); 10718 } 10719 10720 int 10721 sfmmu_hat_lock_held(sfmmu_t *sfmmup) 10722 { 10723 ASSERT(sfmmup != ksfmmup); 10724 return (MUTEX_HELD(HATLOCK_MUTEXP(TSB_HASH(sfmmup)))); 10725 } 10726 10727 /* 10728 * Locking primitives to provide consistency between ISM unmap 10729 * and other operations. Since ISM unmap can take a long time, we 10730 * use HAT_ISMBUSY flag (protected by the hatlock) to avoid creating 10731 * contention on the hatlock buckets while ISM segments are being 10732 * unmapped. The tradeoff is that the flags don't prevent priority 10733 * inversion from occurring, so we must request kernel priority in 10734 * case we have to sleep to keep from getting buried while holding 10735 * the HAT_ISMBUSY flag set, which in turn could block other kernel 10736 * threads from running (for example, in sfmmu_uvatopfn()). 
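 *
 * The protocol is the standard flag-plus-condvar exclusion; in
 * outline (a sketch of the enter/exit pair below, kpri handling
 * omitted):
 *
 *	mutex_enter(m);
 *	while (flags & HAT_ISMBUSY)
 *		cv_wait(cv, m);
 *	flags |= HAT_ISMBUSY;
 *	mutex_exit(m);
 *	... long-running ISM unmap, hat lock not held ...
 *	mutex_enter(m);
 *	flags &= ~HAT_ISMBUSY;
 *	cv_broadcast(cv);
 *	mutex_exit(m);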
10737 */ 10738 static void 10739 sfmmu_ismhat_enter(sfmmu_t *sfmmup, int hatlock_held) 10740 { 10741 hatlock_t *hatlockp; 10742 10743 THREAD_KPRI_REQUEST(); 10744 if (!hatlock_held) 10745 hatlockp = sfmmu_hat_enter(sfmmup); 10746 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) 10747 cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp)); 10748 SFMMU_FLAGS_SET(sfmmup, HAT_ISMBUSY); 10749 if (!hatlock_held) 10750 sfmmu_hat_exit(hatlockp); 10751 } 10752 10753 static void 10754 sfmmu_ismhat_exit(sfmmu_t *sfmmup, int hatlock_held) 10755 { 10756 hatlock_t *hatlockp; 10757 10758 if (!hatlock_held) 10759 hatlockp = sfmmu_hat_enter(sfmmup); 10760 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 10761 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMBUSY); 10762 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 10763 if (!hatlock_held) 10764 sfmmu_hat_exit(hatlockp); 10765 THREAD_KPRI_RELEASE(); 10766 } 10767 10768 /* 10769 * 10770 * Algorithm: 10771 * 10772 * (1) if segkmem is not ready, allocate hblk from an array of pre-alloc'ed 10773 * hblks. 10774 * 10775 * (2) if we are allocating an hblk for mapping a slab in sfmmu_cache, 10776 * 10777 * (a) try to return an hblk from reserve pool of free hblks; 10778 * (b) if the reserve pool is empty, acquire hblk_reserve_lock 10779 * and return hblk_reserve. 10780 * 10781 * (3) call kmem_cache_alloc() to allocate hblk; 10782 * 10783 * (a) if hblk_reserve_lock is held by the current thread, 10784 * atomically replace hblk_reserve by the hblk that is 10785 * returned by kmem_cache_alloc; release hblk_reserve_lock 10786 * and call kmem_cache_alloc() again. 10787 * (b) if reserve pool is not full, add the hblk that is 10788 * returned by kmem_cache_alloc to reserve pool and 10789 * call kmem_cache_alloc again. 10790 * 10791 */ 10792 static struct hme_blk * 10793 sfmmu_hblk_alloc(sfmmu_t *sfmmup, caddr_t vaddr, 10794 struct hmehash_bucket *hmebp, uint_t size, hmeblk_tag hblktag, 10795 uint_t flags, uint_t rid) 10796 { 10797 struct hme_blk *hmeblkp = NULL; 10798 struct hme_blk *newhblkp; 10799 struct hme_blk *shw_hblkp = NULL; 10800 struct kmem_cache *sfmmu_cache = NULL; 10801 uint64_t hblkpa; 10802 ulong_t index; 10803 uint_t owner; /* set to 1 if using hblk_reserve */ 10804 uint_t forcefree; 10805 int sleep; 10806 sf_srd_t *srdp; 10807 sf_region_t *rgnp; 10808 10809 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 10810 ASSERT(hblktag.htag_rid == rid); 10811 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size)); 10812 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || 10813 IS_P2ALIGNED(vaddr, TTEBYTES(size))); 10814 10815 /* 10816 * If segkmem is not created yet, allocate from static hmeblks 10817 * created at the end of startup_modules(). See the block comment 10818 * in startup_modules() describing how we estimate the number of 10819 * static hmeblks that will be needed during re-map. 10820 */ 10821 if (!hblk_alloc_dynamic) { 10822 10823 ASSERT(!SFMMU_IS_SHMERID_VALID(rid)); 10824 10825 if (size == TTE8K) { 10826 index = nucleus_hblk8.index; 10827 if (index >= nucleus_hblk8.len) { 10828 /* 10829 * If we panic here, see startup_modules() to 10830 * make sure that we are calculating the 10831 * number of hblk8's that we need correctly. 10832 */ 10833 prom_panic("no nucleus hblk8 to allocate"); 10834 } 10835 hmeblkp = 10836 (struct hme_blk *)&nucleus_hblk8.list[index]; 10837 nucleus_hblk8.index++; 10838 SFMMU_STAT(sf_hblk8_nalloc); 10839 } else { 10840 index = nucleus_hblk1.index; 10841 if (nucleus_hblk1.index >= nucleus_hblk1.len) { 10842 /* 10843 * If we panic here, see startup_modules(). 
10844 * Most likely you need to update the 10845 * calculation of the number of hblk1 elements 10846 * that the kernel needs to boot. 10847 */ 10848 prom_panic("no nucleus hblk1 to allocate"); 10849 } 10850 hmeblkp = 10851 (struct hme_blk *)&nucleus_hblk1.list[index]; 10852 nucleus_hblk1.index++; 10853 SFMMU_STAT(sf_hblk1_nalloc); 10854 } 10855 10856 goto hblk_init; 10857 } 10858 10859 SFMMU_HASH_UNLOCK(hmebp); 10860 10861 if (sfmmup != KHATID && !SFMMU_IS_SHMERID_VALID(rid)) { 10862 if (mmu_page_sizes == max_mmu_page_sizes) { 10863 if (size < TTE256M) 10864 shw_hblkp = sfmmu_shadow_hcreate(sfmmup, vaddr, 10865 size, flags); 10866 } else { 10867 if (size < TTE4M) 10868 shw_hblkp = sfmmu_shadow_hcreate(sfmmup, vaddr, 10869 size, flags); 10870 } 10871 } else if (SFMMU_IS_SHMERID_VALID(rid)) { 10872 /* 10873 * Shared hmes use per region bitmaps in rgn_hmeflag 10874 * rather than shadow hmeblks to keep track of the 10875 * mapping sizes which have been allocated for the region. 10876 * Here we cleanup old invalid hmeblks with this rid, 10877 * which may be left around by pageunload(). 10878 */ 10879 int ttesz; 10880 caddr_t va; 10881 caddr_t eva = vaddr + TTEBYTES(size); 10882 10883 ASSERT(sfmmup != KHATID); 10884 10885 srdp = sfmmup->sfmmu_srdp; 10886 ASSERT(srdp != NULL && srdp->srd_refcnt != 0); 10887 rgnp = srdp->srd_hmergnp[rid]; 10888 ASSERT(rgnp != NULL && rgnp->rgn_id == rid); 10889 ASSERT(rgnp->rgn_refcnt != 0); 10890 ASSERT(size <= rgnp->rgn_pgszc); 10891 10892 ttesz = HBLK_MIN_TTESZ; 10893 do { 10894 if (!(rgnp->rgn_hmeflags & (0x1 << ttesz))) { 10895 continue; 10896 } 10897 10898 if (ttesz > size && ttesz != HBLK_MIN_TTESZ) { 10899 sfmmu_cleanup_rhblk(srdp, vaddr, rid, ttesz); 10900 } else if (ttesz < size) { 10901 for (va = vaddr; va < eva; 10902 va += TTEBYTES(ttesz)) { 10903 sfmmu_cleanup_rhblk(srdp, va, rid, 10904 ttesz); 10905 } 10906 } 10907 } while (++ttesz <= rgnp->rgn_pgszc); 10908 } 10909 10910 fill_hblk: 10911 owner = (hblk_reserve_thread == curthread) ? 1 : 0; 10912 10913 if (owner && size == TTE8K) { 10914 10915 ASSERT(!SFMMU_IS_SHMERID_VALID(rid)); 10916 /* 10917 * We are really in a tight spot. We already own 10918 * hblk_reserve and we need another hblk. In anticipation 10919 * of this kind of scenario, we specifically set aside 10920 * HBLK_RESERVE_MIN number of hblks to be used exclusively 10921 * by owner of hblk_reserve. 10922 */ 10923 SFMMU_STAT(sf_hblk_recurse_cnt); 10924 10925 if (!sfmmu_get_free_hblk(&hmeblkp, 1)) 10926 panic("sfmmu_hblk_alloc: reserve list is empty"); 10927 10928 goto hblk_verify; 10929 } 10930 10931 ASSERT(!owner); 10932 10933 if ((flags & HAT_NO_KALLOC) == 0) { 10934 10935 sfmmu_cache = ((size == TTE8K) ? sfmmu8_cache : sfmmu1_cache); 10936 sleep = ((sfmmup == KHATID) ? KM_NOSLEEP : KM_SLEEP); 10937 10938 if ((hmeblkp = kmem_cache_alloc(sfmmu_cache, sleep)) == NULL) { 10939 hmeblkp = sfmmu_hblk_steal(size); 10940 } else { 10941 /* 10942 * if we are the owner of hblk_reserve, 10943 * swap hblk_reserve with hmeblkp and 10944 * start a fresh life. Hope things go 10945 * better this time. 
10946 */ 10947 if (hblk_reserve_thread == curthread) { 10948 ASSERT(sfmmu_cache == sfmmu8_cache); 10949 sfmmu_hblk_swap(hmeblkp); 10950 hblk_reserve_thread = NULL; 10951 mutex_exit(&hblk_reserve_lock); 10952 goto fill_hblk; 10953 } 10954 /* 10955 * let's donate this hblk to our reserve list if 10956 * we are not mapping kernel range 10957 */ 10958 if (size == TTE8K && sfmmup != KHATID) { 10959 if (sfmmu_put_free_hblk(hmeblkp, 0)) 10960 goto fill_hblk; 10961 } 10962 } 10963 } else { 10964 /* 10965 * We are here to map the slab in sfmmu8_cache; let's 10966 * check if we could tap our reserve list; if successful, 10967 * this will avoid the pain of going thru sfmmu_hblk_swap 10968 */ 10969 SFMMU_STAT(sf_hblk_slab_cnt); 10970 if (!sfmmu_get_free_hblk(&hmeblkp, 0)) { 10971 /* 10972 * let's start hblk_reserve dance 10973 */ 10974 SFMMU_STAT(sf_hblk_reserve_cnt); 10975 owner = 1; 10976 mutex_enter(&hblk_reserve_lock); 10977 hmeblkp = HBLK_RESERVE; 10978 hblk_reserve_thread = curthread; 10979 } 10980 } 10981 10982 hblk_verify: 10983 ASSERT(hmeblkp != NULL); 10984 set_hblk_sz(hmeblkp, size); 10985 ASSERT(hmeblkp->hblk_nextpa == va_to_pa((caddr_t)hmeblkp)); 10986 SFMMU_HASH_LOCK(hmebp); 10987 HME_HASH_FAST_SEARCH(hmebp, hblktag, newhblkp); 10988 if (newhblkp != NULL) { 10989 SFMMU_HASH_UNLOCK(hmebp); 10990 if (hmeblkp != HBLK_RESERVE) { 10991 /* 10992 * This is really tricky! 10993 * 10994 * vmem_alloc(vmem_seg_arena) 10995 * vmem_alloc(vmem_internal_arena) 10996 * segkmem_alloc(heap_arena) 10997 * vmem_alloc(heap_arena) 10998 * page_create() 10999 * hat_memload() 11000 * kmem_cache_free() 11001 * kmem_cache_alloc() 11002 * kmem_slab_create() 11003 * vmem_alloc(kmem_internal_arena) 11004 * segkmem_alloc(heap_arena) 11005 * vmem_alloc(heap_arena) 11006 * page_create() 11007 * hat_memload() 11008 * kmem_cache_free() 11009 * ... 11010 * 11011 * Thus, hat_memload() could call kmem_cache_free 11012 * for enough number of times that we could easily 11013 * hit the bottom of the stack or run out of reserve 11014 * list of vmem_seg structs. So, we must donate 11015 * this hblk to reserve list if it's allocated 11016 * from sfmmu8_cache *and* mapping kernel range. 11017 * We don't need to worry about freeing hmeblk1's 11018 * to kmem since they don't map any kmem slabs. 11019 * 11020 * Note: When segkmem supports largepages, we must 11021 * free hmeblk1's to reserve list as well. 11022 */ 11023 forcefree = (sfmmup == KHATID) ? 1 : 0; 11024 if (size == TTE8K && 11025 sfmmu_put_free_hblk(hmeblkp, forcefree)) { 11026 goto re_verify; 11027 } 11028 ASSERT(sfmmup != KHATID); 11029 kmem_cache_free(get_hblk_cache(hmeblkp), hmeblkp); 11030 } else { 11031 /* 11032 * Hey! we don't need hblk_reserve any more. 11033 */ 11034 ASSERT(owner); 11035 hblk_reserve_thread = NULL; 11036 mutex_exit(&hblk_reserve_lock); 11037 owner = 0; 11038 } 11039 re_verify: 11040 /* 11041 * let's check if the goodies are still present 11042 */ 11043 SFMMU_HASH_LOCK(hmebp); 11044 HME_HASH_FAST_SEARCH(hmebp, hblktag, newhblkp); 11045 if (newhblkp != NULL) { 11046 /* 11047 * return newhblkp if it's not hblk_reserve; 11048 * if newhblkp is hblk_reserve, return it 11049 * _only if_ we are the owner of hblk_reserve. 
11050 */ 11051 if (newhblkp != HBLK_RESERVE || owner) { 11052 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || 11053 newhblkp->hblk_shared); 11054 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || 11055 !newhblkp->hblk_shared); 11056 return (newhblkp); 11057 } else { 11058 /* 11059 * we just hit hblk_reserve in the hash and 11060 * we are not the owner of that; 11061 * 11062 * block until hblk_reserve_thread completes 11063 * swapping hblk_reserve and try the dance 11064 * once again. 11065 */ 11066 SFMMU_HASH_UNLOCK(hmebp); 11067 mutex_enter(&hblk_reserve_lock); 11068 mutex_exit(&hblk_reserve_lock); 11069 SFMMU_STAT(sf_hblk_reserve_hit); 11070 goto fill_hblk; 11071 } 11072 } else { 11073 /* 11074 * it's no more! try the dance once again. 11075 */ 11076 SFMMU_HASH_UNLOCK(hmebp); 11077 goto fill_hblk; 11078 } 11079 } 11080 11081 hblk_init: 11082 if (SFMMU_IS_SHMERID_VALID(rid)) { 11083 uint16_t tteflag = 0x1 << 11084 ((size < HBLK_MIN_TTESZ) ? HBLK_MIN_TTESZ : size); 11085 11086 if (!(rgnp->rgn_hmeflags & tteflag)) { 11087 atomic_or_16(&rgnp->rgn_hmeflags, tteflag); 11088 } 11089 hmeblkp->hblk_shared = 1; 11090 } else { 11091 hmeblkp->hblk_shared = 0; 11092 } 11093 set_hblk_sz(hmeblkp, size); 11094 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 11095 hmeblkp->hblk_next = (struct hme_blk *)NULL; 11096 hmeblkp->hblk_tag = hblktag; 11097 hmeblkp->hblk_shadow = shw_hblkp; 11098 hblkpa = hmeblkp->hblk_nextpa; 11099 hmeblkp->hblk_nextpa = HMEBLK_ENDPA; 11100 11101 ASSERT(get_hblk_ttesz(hmeblkp) == size); 11102 ASSERT(get_hblk_span(hmeblkp) == HMEBLK_SPAN(size)); 11103 ASSERT(hmeblkp->hblk_hmecnt == 0); 11104 ASSERT(hmeblkp->hblk_vcnt == 0); 11105 ASSERT(hmeblkp->hblk_lckcnt == 0); 11106 ASSERT(hblkpa == va_to_pa((caddr_t)hmeblkp)); 11107 sfmmu_hblk_hash_add(hmebp, hmeblkp, hblkpa); 11108 return (hmeblkp); 11109 } 11110 11111 /* 11112 * This function cleans up the hme_blk and returns it to the free list. 11113 */ 11114 /* ARGSUSED */ 11115 static void 11116 sfmmu_hblk_free(struct hme_blk **listp) 11117 { 11118 struct hme_blk *hmeblkp, *next_hmeblkp; 11119 int size; 11120 uint_t critical; 11121 uint64_t hblkpa; 11122 11123 ASSERT(*listp != NULL); 11124 11125 hmeblkp = *listp; 11126 while (hmeblkp != NULL) { 11127 next_hmeblkp = hmeblkp->hblk_next; 11128 ASSERT(!hmeblkp->hblk_hmecnt); 11129 ASSERT(!hmeblkp->hblk_vcnt); 11130 ASSERT(!hmeblkp->hblk_lckcnt); 11131 ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve); 11132 ASSERT(hmeblkp->hblk_shared == 0); 11133 ASSERT(hmeblkp->hblk_shw_bit == 0); 11134 ASSERT(hmeblkp->hblk_shadow == NULL); 11135 11136 hblkpa = va_to_pa((caddr_t)hmeblkp); 11137 ASSERT(hblkpa != (uint64_t)-1); 11138 critical = (hblktosfmmu(hmeblkp) == KHATID) ? 1 : 0; 11139 11140 size = get_hblk_ttesz(hmeblkp); 11141 hmeblkp->hblk_next = NULL; 11142 hmeblkp->hblk_nextpa = hblkpa; 11143 11144 if (hmeblkp->hblk_nuc_bit == 0) { 11145 11146 if (size != TTE8K || 11147 !sfmmu_put_free_hblk(hmeblkp, critical)) 11148 kmem_cache_free(get_hblk_cache(hmeblkp), 11149 hmeblkp); 11150 } 11151 hmeblkp = next_hmeblkp; 11152 } 11153 } 11154 11155 #define BUCKETS_TO_SEARCH_BEFORE_UNLOAD 30 11156 #define SFMMU_HBLK_STEAL_THRESHOLD 5 11157 11158 static uint_t sfmmu_hblk_steal_twice; 11159 static uint_t sfmmu_hblk_steal_count, sfmmu_hblk_steal_unload_count; 11160 11161 /* 11162 * Steal a hmeblk from user or kernel hme hash lists. 11163 * For 8K tte grab one from reserve pool (freehblkp) before proceeding to 11164 * steal and if we fail to steal after SFMMU_HBLK_STEAL_THRESHOLD attempts 11165 * tap into critical reserve of freehblkp. 
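 * Each pass of the loop escalates roughly as follows: (1) reap
 * the per-cpu pending queues; (2) for 8K requests, try the
 * freehblkp pool, treating the request as critical once
 * SFMMU_HBLK_STEAL_THRESHOLD passes have failed; (3) scan the
 * user hash, preferring already-free hmeblks before unloading a
 * valid one; (4) finally scan the kernel hash for a completely
 * free hmeblk.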
11166 * Note: We remain looping in this routine until we find one. 11167 */ 11168 static struct hme_blk * 11169 sfmmu_hblk_steal(int size) 11170 { 11171 static struct hmehash_bucket *uhmehash_steal_hand = NULL; 11172 struct hmehash_bucket *hmebp; 11173 struct hme_blk *hmeblkp = NULL, *pr_hblk; 11174 uint64_t hblkpa; 11175 int i; 11176 uint_t loop_cnt = 0, critical; 11177 11178 for (;;) { 11179 /* Check cpu hblk pending queues */ 11180 if ((hmeblkp = sfmmu_check_pending_hblks(size)) != NULL) { 11181 hmeblkp->hblk_nextpa = va_to_pa((caddr_t)hmeblkp); 11182 ASSERT(hmeblkp->hblk_hmecnt == 0); 11183 ASSERT(hmeblkp->hblk_vcnt == 0); 11184 return (hmeblkp); 11185 } 11186 11187 if (size == TTE8K) { 11188 critical = 11189 (++loop_cnt > SFMMU_HBLK_STEAL_THRESHOLD) ? 1 : 0; 11190 if (sfmmu_get_free_hblk(&hmeblkp, critical)) 11191 return (hmeblkp); 11192 } 11193 11194 hmebp = (uhmehash_steal_hand == NULL) ? uhme_hash : 11195 uhmehash_steal_hand; 11196 ASSERT(hmebp >= uhme_hash && hmebp <= &uhme_hash[UHMEHASH_SZ]); 11197 11198 for (i = 0; hmeblkp == NULL && i <= UHMEHASH_SZ + 11199 BUCKETS_TO_SEARCH_BEFORE_UNLOAD; i++) { 11200 SFMMU_HASH_LOCK(hmebp); 11201 hmeblkp = hmebp->hmeblkp; 11202 hblkpa = hmebp->hmeh_nextpa; 11203 pr_hblk = NULL; 11204 while (hmeblkp) { 11205 /* 11206 * check if it is a hmeblk that is not locked 11207 * and not shared. skip shadow hmeblks with 11208 * shadow_mask set i.e valid count non zero. 11209 */ 11210 if ((get_hblk_ttesz(hmeblkp) == size) && 11211 (hmeblkp->hblk_shw_bit == 0 || 11212 hmeblkp->hblk_vcnt == 0) && 11213 (hmeblkp->hblk_lckcnt == 0)) { 11214 /* 11215 * there is a high probability that we 11216 * will find a free one. search some 11217 * buckets for a free hmeblk initially 11218 * before unloading a valid hmeblk. 11219 */ 11220 if ((hmeblkp->hblk_vcnt == 0 && 11221 hmeblkp->hblk_hmecnt == 0) || (i >= 11222 BUCKETS_TO_SEARCH_BEFORE_UNLOAD)) { 11223 if (sfmmu_steal_this_hblk(hmebp, 11224 hmeblkp, hblkpa, pr_hblk)) { 11225 /* 11226 * Hblk is unloaded 11227 * successfully 11228 */ 11229 break; 11230 } 11231 } 11232 } 11233 pr_hblk = hmeblkp; 11234 hblkpa = hmeblkp->hblk_nextpa; 11235 hmeblkp = hmeblkp->hblk_next; 11236 } 11237 11238 SFMMU_HASH_UNLOCK(hmebp); 11239 if (hmebp++ == &uhme_hash[UHMEHASH_SZ]) 11240 hmebp = uhme_hash; 11241 } 11242 uhmehash_steal_hand = hmebp; 11243 11244 if (hmeblkp != NULL) 11245 break; 11246 11247 /* 11248 * in the worst case, look for a free one in the kernel 11249 * hash table. 11250 */ 11251 for (i = 0, hmebp = khme_hash; i <= KHMEHASH_SZ; i++) { 11252 SFMMU_HASH_LOCK(hmebp); 11253 hmeblkp = hmebp->hmeblkp; 11254 hblkpa = hmebp->hmeh_nextpa; 11255 pr_hblk = NULL; 11256 while (hmeblkp) { 11257 /* 11258 * check if it is free hmeblk 11259 */ 11260 if ((get_hblk_ttesz(hmeblkp) == size) && 11261 (hmeblkp->hblk_lckcnt == 0) && 11262 (hmeblkp->hblk_vcnt == 0) && 11263 (hmeblkp->hblk_hmecnt == 0)) { 11264 if (sfmmu_steal_this_hblk(hmebp, 11265 hmeblkp, hblkpa, pr_hblk)) { 11266 break; 11267 } else { 11268 /* 11269 * Cannot fail since we have 11270 * hash lock. 
11271 */
11272 panic("fail to steal?");
11273 }
11274 }
11275
11276 pr_hblk = hmeblkp;
11277 hblkpa = hmeblkp->hblk_nextpa;
11278 hmeblkp = hmeblkp->hblk_next;
11279 }
11280
11281 SFMMU_HASH_UNLOCK(hmebp);
11282 if (hmebp++ == &khme_hash[KHMEHASH_SZ])
11283 hmebp = khme_hash;
11284 }
11285
11286 if (hmeblkp != NULL)
11287 break;
11288 sfmmu_hblk_steal_twice++;
11289 }
11290 return (hmeblkp);
11291 }
11292
11293 /*
11294 * This routine does the real work of preparing a hblk to be "stolen":
11295 * unloading the mappings, updating shadow counts, etc.
11296 * It returns 1 if the block is ready to be reused (stolen), or 0 if
11297 * the block cannot be stolen yet because pageunload is still working
11298 * on this hblk.
11299 */
11300 static int
11301 sfmmu_steal_this_hblk(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp,
11302 uint64_t hblkpa, struct hme_blk *pr_hblk)
11303 {
11304 int shw_size, vshift;
11305 struct hme_blk *shw_hblkp;
11306 caddr_t vaddr;
11307 uint_t shw_mask, newshw_mask;
11308 struct hme_blk *list = NULL;
11309
11310 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
11311
11312 /*
11313 * check if the hmeblk is free, unload if necessary
11314 */
11315 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) {
11316 sfmmu_t *sfmmup;
11317 demap_range_t dmr;
11318
11319 sfmmup = hblktosfmmu(hmeblkp);
11320 if (hmeblkp->hblk_shared || sfmmup->sfmmu_ismhat) {
11321 return (0);
11322 }
11323 DEMAP_RANGE_INIT(sfmmup, &dmr);
11324 (void) sfmmu_hblk_unload(sfmmup, hmeblkp,
11325 (caddr_t)get_hblk_base(hmeblkp),
11326 get_hblk_endaddr(hmeblkp), &dmr, HAT_UNLOAD);
11327 DEMAP_RANGE_FLUSH(&dmr);
11328 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) {
11329 /*
11330 * Pageunload is working on the same hblk.
11331 */
11332 return (0);
11333 }
11334
11335 sfmmu_hblk_steal_unload_count++;
11336 }
11337
11338 ASSERT(hmeblkp->hblk_lckcnt == 0);
11339 ASSERT(hmeblkp->hblk_vcnt == 0 && hmeblkp->hblk_hmecnt == 0);
11340
11341 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 1);
11342 hmeblkp->hblk_nextpa = hblkpa;
11343
11344 shw_hblkp = hmeblkp->hblk_shadow;
11345 if (shw_hblkp) {
11346 ASSERT(!hmeblkp->hblk_shared);
11347 shw_size = get_hblk_ttesz(shw_hblkp);
11348 vaddr = (caddr_t)get_hblk_base(hmeblkp);
11349 vshift = vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size);
11350 ASSERT(vshift < 8);
11351 /*
11352 * Atomically clear shadow mask bit
11353 */
11354 do {
11355 shw_mask = shw_hblkp->hblk_shw_mask;
11356 ASSERT(shw_mask & (1 << vshift));
11357 newshw_mask = shw_mask & ~(1 << vshift);
11358 newshw_mask = atomic_cas_32(&shw_hblkp->hblk_shw_mask,
11359 shw_mask, newshw_mask);
11360 } while (newshw_mask != shw_mask);
11361 hmeblkp->hblk_shadow = NULL;
11362 }
11363
11364 /*
11365 * remove shadow bit if we are stealing an unused shadow hmeblk.
11366 * sfmmu_hblk_alloc needs it that way, will set shadow bit later if
11367 * we are indeed allocating a shadow hmeblk.
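 *
 * (The shadow-mask clear above is the usual atomic_cas_32 retry
 * idiom; reduced to its core:
 *
 *	do {
 *		old = *maskp;
 *		new = old & ~(1 << bit);
 *	} while (atomic_cas_32(maskp, old, new) != old);
 *
 * i.e. retry until no other cpu changed the mask between our
 * load and the cas.)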
11368 */
11369 hmeblkp->hblk_shw_bit = 0;
11370
11371 if (hmeblkp->hblk_shared) {
11372 sf_srd_t *srdp;
11373 sf_region_t *rgnp;
11374 uint_t rid;
11375
11376 srdp = hblktosrd(hmeblkp);
11377 ASSERT(srdp != NULL && srdp->srd_refcnt != 0);
11378 rid = hmeblkp->hblk_tag.htag_rid;
11379 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
11380 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
11381 rgnp = srdp->srd_hmergnp[rid];
11382 ASSERT(rgnp != NULL);
11383 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid);
11384 hmeblkp->hblk_shared = 0;
11385 }
11386
11387 sfmmu_hblk_steal_count++;
11388 SFMMU_STAT(sf_steal_count);
11389
11390 return (1);
11391 }
11392
11393 struct hme_blk *
11394 sfmmu_hmetohblk(struct sf_hment *sfhme)
11395 {
11396 struct hme_blk *hmeblkp;
11397 struct sf_hment *sfhme0;
11398 struct hme_blk *hblk_dummy = 0;
11399
11400 /*
11401 * No dummy sf_hments, please.
11402 */
11403 ASSERT(sfhme->hme_tte.ll != 0);
11404
11405 sfhme0 = sfhme - sfhme->hme_tte.tte_hmenum;
11406 hmeblkp = (struct hme_blk *)((uintptr_t)sfhme0 -
11407 (uintptr_t)&hblk_dummy->hblk_hme[0]);
11408
11409 return (hmeblkp);
11410 }
11411
11412 /*
11413 * On swapin, get appropriately sized TSB(s) and clear the HAT_SWAPPED flag.
11414 * If we can't get appropriately sized TSB(s), try for 8K TSB(s) using
11415 * KM_SLEEP allocation.
11416 *
11417 * This routine returns no status; the forced 8K fallback sleeps until it succeeds.
11418 */
11419 static void
11420 sfmmu_tsb_swapin(sfmmu_t *sfmmup, hatlock_t *hatlockp)
11421 {
11422 struct tsb_info *tsbinfop, *next;
11423 tsb_replace_rc_t rc;
11424 boolean_t gotfirst = B_FALSE;
11425
11426 ASSERT(sfmmup != ksfmmup);
11427 ASSERT(sfmmu_hat_lock_held(sfmmup));
11428
11429 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPIN)) {
11430 cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp));
11431 }
11432
11433 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) {
11434 SFMMU_FLAGS_SET(sfmmup, HAT_SWAPIN);
11435 } else {
11436 return;
11437 }
11438
11439 ASSERT(sfmmup->sfmmu_tsb != NULL);
11440
11441 /*
11442 * Loop over all tsbinfos replacing them with ones that actually have
11443 * a TSB. If any of the replacements ever fail, bail out of the loop.
11444 */
11445 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; tsbinfop = next) {
11446 ASSERT(tsbinfop->tsb_flags & TSB_SWAPPED);
11447 next = tsbinfop->tsb_next;
11448 rc = sfmmu_replace_tsb(sfmmup, tsbinfop, tsbinfop->tsb_szc,
11449 hatlockp, TSB_SWAPIN);
11450 if (rc != TSB_SUCCESS) {
11451 break;
11452 }
11453 gotfirst = B_TRUE;
11454 }
11455
11456 switch (rc) {
11457 case TSB_SUCCESS:
11458 SFMMU_FLAGS_CLEAR(sfmmup, HAT_SWAPPED|HAT_SWAPIN);
11459 cv_broadcast(&sfmmup->sfmmu_tsb_cv);
11460 return;
11461 case TSB_LOSTRACE:
11462 break;
11463 case TSB_ALLOCFAIL:
11464 break;
11465 default:
11466 panic("sfmmu_replace_tsb returned unrecognized failure code "
11467 "%d", rc);
11468 }
11469
11470 /*
11471 * In this case, we failed to get one of our TSBs. If we failed to
11472 * get the first TSB, get one of minimum size (8KB). Walk the list
11473 * and throw away the tsbinfos, starting where the allocation failed;
11474 * we can get by with just one TSB as long as we don't leave the
11475 * SWAPPED tsbinfo structures lying around.
11476 */
11477 tsbinfop = sfmmup->sfmmu_tsb;
11478 next = tsbinfop->tsb_next;
11479 tsbinfop->tsb_next = NULL;
11480
11481 sfmmu_hat_exit(hatlockp);
11482 for (tsbinfop = next; tsbinfop != NULL; tsbinfop = next) {
11483 next = tsbinfop->tsb_next;
11484 sfmmu_tsbinfo_free(tsbinfop);
11485 }
11486 hatlockp = sfmmu_hat_enter(sfmmup);
11487
11488 /*
11489 * If we don't have any TSBs, get a single 8K TSB for 8K, 64K and 512K
11490 * pages.
11491 */
11492 if (!gotfirst) {
11493 tsbinfop = sfmmup->sfmmu_tsb;
11494 rc = sfmmu_replace_tsb(sfmmup, tsbinfop, TSB_MIN_SZCODE,
11495 hatlockp, TSB_SWAPIN | TSB_FORCEALLOC);
11496 ASSERT(rc == TSB_SUCCESS);
11497 }
11498
11499 SFMMU_FLAGS_CLEAR(sfmmup, HAT_SWAPPED|HAT_SWAPIN);
11500 cv_broadcast(&sfmmup->sfmmu_tsb_cv);
11501 }
11502
11503 static int
11504 sfmmu_is_rgnva(sf_srd_t *srdp, caddr_t addr, ulong_t w, ulong_t bmw)
11505 {
11506 ulong_t bix = 0;
11507 uint_t rid;
11508 sf_region_t *rgnp;
11509
11510 ASSERT(srdp != NULL);
11511 ASSERT(srdp->srd_refcnt != 0);
11512
11513 w <<= BT_ULSHIFT;
11514 while (bmw) {
11515 if (!(bmw & 0x1)) {
11516 bix++;
11517 bmw >>= 1;
11518 continue;
11519 }
11520 rid = w | bix;
11521 rgnp = srdp->srd_hmergnp[rid];
11522 ASSERT(rgnp->rgn_refcnt > 0);
11523 ASSERT(rgnp->rgn_id == rid);
11524 if (addr < rgnp->rgn_saddr ||
11525 addr >= (rgnp->rgn_saddr + rgnp->rgn_size)) {
11526 bix++;
11527 bmw >>= 1;
11528 } else {
11529 return (1);
11530 }
11531 }
11532 return (0);
11533 }
11534
11535 /*
11536 * Handle exceptions for low level tsb_handler.
11537 *
11538 * There are many scenarios that could land us here:
11539 *
11540 * If the context is invalid we land here. The context can be invalid
11541 * for three reasons: 1) we couldn't allocate a new context and now need
11542 * to perform a wrap-around operation in order to allocate a new one;
11543 * 2) the context was invalidated to change pagesize programming; 3) the
11544 * ISM or TSB configuration is changing for this process and we are
11545 * forced in here to do a synchronization operation. If the context is
11546 * valid we can be here from the window trap handler; in that case just
11547 * call trap() to handle the fault.
11548 *
11549 * Note that the process will run in INVALID_CONTEXT before
11550 * faulting into here and subsequently loading the MMU registers
11551 * (including the TSB base register) associated with this process.
11552 * For this reason, the trap handlers must all test for
11553 * INVALID_CONTEXT before attempting to access any registers other
11554 * than the context registers.
11555 */
11556 void
11557 sfmmu_tsbmiss_exception(struct regs *rp, uintptr_t tagaccess, uint_t traptype)
11558 {
11559 sfmmu_t *sfmmup, *shsfmmup;
11560 uint_t ctxtype;
11561 klwp_id_t lwp;
11562 char lwp_save_state;
11563 hatlock_t *hatlockp, *shatlockp;
11564 struct tsb_info *tsbinfop;
11565 struct tsbmiss *tsbmp;
11566 sf_scd_t *scdp;
11567
11568 SFMMU_STAT(sf_tsb_exceptions);
11569 SFMMU_MMU_STAT(mmu_tsb_exceptions);
11570 sfmmup = astosfmmu(curthread->t_procp->p_as);
11571 /*
11572 * note that in sun4u, the tagaccess register contains ctxnum
11573 * while sun4v passes ctxtype in the tagaccess register.
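 *
 * Either way the register packs the faulting virtual address
 * together with the context field, so it can be split with the
 * masks used below:
 *
 *	ctxtype = tagaccess & TAGACC_CTX_MASK;
 *	addr = (caddr_t)(tagaccess & TAGACC_VADDR_MASK);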
11574 */ 11575 ctxtype = tagaccess & TAGACC_CTX_MASK; 11576 11577 ASSERT(sfmmup != ksfmmup && ctxtype != KCONTEXT); 11578 ASSERT(sfmmup->sfmmu_ismhat == 0); 11579 ASSERT(!SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED) || 11580 ctxtype == INVALID_CONTEXT); 11581 11582 if (ctxtype != INVALID_CONTEXT && traptype != T_DATA_PROT) { 11583 /* 11584 * We may land here because shme bitmap and pagesize 11585 * flags are updated lazily in tsbmiss area on other cpus. 11586 * If we detect here that tsbmiss area is out of sync with 11587 * sfmmu update it and retry the trapped instruction. 11588 * Otherwise call trap(). 11589 */ 11590 int ret = 0; 11591 uchar_t tteflag_mask = (1 << TTE64K) | (1 << TTE8K); 11592 caddr_t addr = (caddr_t)(tagaccess & TAGACC_VADDR_MASK); 11593 11594 /* 11595 * Must set lwp state to LWP_SYS before 11596 * trying to acquire any adaptive lock 11597 */ 11598 lwp = ttolwp(curthread); 11599 ASSERT(lwp); 11600 lwp_save_state = lwp->lwp_state; 11601 lwp->lwp_state = LWP_SYS; 11602 11603 hatlockp = sfmmu_hat_enter(sfmmup); 11604 kpreempt_disable(); 11605 tsbmp = &tsbmiss_area[CPU->cpu_id]; 11606 ASSERT(sfmmup == tsbmp->usfmmup); 11607 if (((tsbmp->uhat_tteflags ^ sfmmup->sfmmu_tteflags) & 11608 ~tteflag_mask) || 11609 ((tsbmp->uhat_rtteflags ^ sfmmup->sfmmu_rtteflags) & 11610 ~tteflag_mask)) { 11611 tsbmp->uhat_tteflags = sfmmup->sfmmu_tteflags; 11612 tsbmp->uhat_rtteflags = sfmmup->sfmmu_rtteflags; 11613 ret = 1; 11614 } 11615 if (sfmmup->sfmmu_srdp != NULL) { 11616 ulong_t *sm = sfmmup->sfmmu_hmeregion_map.bitmap; 11617 ulong_t *tm = tsbmp->shmermap; 11618 ulong_t i; 11619 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) { 11620 ulong_t d = tm[i] ^ sm[i]; 11621 if (d) { 11622 if (d & sm[i]) { 11623 if (!ret && sfmmu_is_rgnva( 11624 sfmmup->sfmmu_srdp, 11625 addr, i, d & sm[i])) { 11626 ret = 1; 11627 } 11628 } 11629 tm[i] = sm[i]; 11630 } 11631 } 11632 } 11633 kpreempt_enable(); 11634 sfmmu_hat_exit(hatlockp); 11635 lwp->lwp_state = lwp_save_state; 11636 if (ret) { 11637 return; 11638 } 11639 } else if (ctxtype == INVALID_CONTEXT) { 11640 /* 11641 * First, make sure we come out of here with a valid ctx, 11642 * since if we don't get one we'll simply loop on the 11643 * faulting instruction. 11644 * 11645 * If the ISM mappings are changing, the TSB is relocated, 11646 * the process is swapped, the process is joining SCD or 11647 * leaving SCD or shared regions we serialize behind the 11648 * controlling thread with hat lock, sfmmu_flags and 11649 * sfmmu_tsb_cv condition variable. 11650 */ 11651 11652 /* 11653 * Must set lwp state to LWP_SYS before 11654 * trying to acquire any adaptive lock 11655 */ 11656 lwp = ttolwp(curthread); 11657 ASSERT(lwp); 11658 lwp_save_state = lwp->lwp_state; 11659 lwp->lwp_state = LWP_SYS; 11660 11661 hatlockp = sfmmu_hat_enter(sfmmup); 11662 retry: 11663 if ((scdp = sfmmup->sfmmu_scdp) != NULL) { 11664 shsfmmup = scdp->scd_sfmmup; 11665 ASSERT(shsfmmup != NULL); 11666 11667 for (tsbinfop = shsfmmup->sfmmu_tsb; tsbinfop != NULL; 11668 tsbinfop = tsbinfop->tsb_next) { 11669 if (tsbinfop->tsb_flags & TSB_RELOC_FLAG) { 11670 /* drop the private hat lock */ 11671 sfmmu_hat_exit(hatlockp); 11672 /* acquire the shared hat lock */ 11673 shatlockp = sfmmu_hat_enter(shsfmmup); 11674 /* 11675 * recheck to see if anything changed 11676 * after we drop the private hat lock. 
11677 */
11678 if (sfmmup->sfmmu_scdp == scdp &&
11679 shsfmmup == scdp->scd_sfmmup) {
11680 sfmmu_tsb_chk_reloc(shsfmmup,
11681 shatlockp);
11682 }
11683 sfmmu_hat_exit(shatlockp);
11684 hatlockp = sfmmu_hat_enter(sfmmup);
11685 goto retry;
11686 }
11687 }
11688 }
11689
11690 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL;
11691 tsbinfop = tsbinfop->tsb_next) {
11692 if (tsbinfop->tsb_flags & TSB_RELOC_FLAG) {
11693 cv_wait(&sfmmup->sfmmu_tsb_cv,
11694 HATLOCK_MUTEXP(hatlockp));
11695 goto retry;
11696 }
11697 }
11698
11699 /*
11700 * Wait for ISM maps to be updated.
11701 */
11702 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) {
11703 cv_wait(&sfmmup->sfmmu_tsb_cv,
11704 HATLOCK_MUTEXP(hatlockp));
11705 goto retry;
11706 }
11707
11708 /* Is this process joining an SCD? */
11709 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) {
11710 /*
11711 * Flush private TSB and setup shared TSB.
11712 * sfmmu_finish_join_scd() does not drop the
11713 * hat lock.
11714 */
11715 sfmmu_finish_join_scd(sfmmup);
11716 SFMMU_FLAGS_CLEAR(sfmmup, HAT_JOIN_SCD);
11717 }
11718
11719 /*
11720 * If we're swapping in, get TSB(s). Note that we must do
11721 * this before we get a ctx or load the MMU state. Once
11722 * we swap in we have to recheck to make sure the TSB(s) and
11723 * ISM mappings didn't change while we slept.
11724 */
11725 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) {
11726 sfmmu_tsb_swapin(sfmmup, hatlockp);
11727 goto retry;
11728 }
11729
11730 sfmmu_get_ctx(sfmmup);
11731
11732 sfmmu_hat_exit(hatlockp);
11733 /*
11734 * Must restore lwp_state if not calling
11735 * trap() for further processing. Restore
11736 * it anyway.
11737 */
11738 lwp->lwp_state = lwp_save_state;
11739 return;
11740 }
11741 trap(rp, (caddr_t)tagaccess, traptype, 0);
11742 }
11743
11744 static void
11745 sfmmu_tsb_chk_reloc(sfmmu_t *sfmmup, hatlock_t *hatlockp)
11746 {
11747 struct tsb_info *tp;
11748
11749 ASSERT(sfmmu_hat_lock_held(sfmmup));
11750
11751 for (tp = sfmmup->sfmmu_tsb; tp != NULL; tp = tp->tsb_next) {
11752 if (tp->tsb_flags & TSB_RELOC_FLAG) {
11753 cv_wait(&sfmmup->sfmmu_tsb_cv,
11754 HATLOCK_MUTEXP(hatlockp));
11755 break;
11756 }
11757 }
11758 }
11759
11760 /*
11761 * sfmmu_vatopfn_suspended is called from GET_TTE when TL=0 and the
11762 * TTE_SUSPENDED bit is set in the tte. We block on acquiring a page
11763 * lock rather than spinning, to avoid send-mondo timeouts with
11764 * interrupts enabled. When the lock is acquired it is immediately
11765 * released and we return to sfmmu_vatopfn just after
11766 * the GET_TTE call.
11767 */
11768 void
11769 sfmmu_vatopfn_suspended(caddr_t vaddr, sfmmu_t *sfmmu, tte_t *ttep)
11770 {
11771 struct page **pp;
11772
11773 (void) as_pagelock(sfmmu->sfmmu_as, &pp, vaddr, TTE_CSZ(ttep), S_WRITE);
11774 as_pageunlock(sfmmu->sfmmu_as, pp, vaddr, TTE_CSZ(ttep), S_WRITE);
11775 }
11776
11777 /*
11778 * sfmmu_tsbmiss_suspended is called from GET_TTE when TL>0 and the
11779 * TTE_SUSPENDED bit is set in the tte. We do this so that we can handle
11780 * cross traps which cannot be handled while spinning in the
11781 * trap handlers. Simply enter and exit the kpr_suspendlock spin
11782 * mutex, which is held by the holder of the suspend bit, and then
11783 * retry the trapped instruction after unwinding.
11784  */
11785 /*ARGSUSED*/
11786 void
11787 sfmmu_tsbmiss_suspended(struct regs *rp, uintptr_t tagacc, uint_t traptype)
11788 {
11789 	ASSERT(curthread != kreloc_thread);
11790 	mutex_enter(&kpr_suspendlock);
11791 	mutex_exit(&kpr_suspendlock);
11792 }
11793 
11794 /*
11795  * This routine could be optimized to reduce the number of xcalls by flushing
11796  * the entire TLB if the region reference count is above some threshold, but
11797  * the tradeoff will depend on the size of the TLB. So for now flush the
11798  * specific page one context at a time.
11799  *
11800  * If uselocks is 0 then it's called after all cpus were captured and all the
11801  * hat locks were taken. In this case don't take the region lock; rely on
11802  * the order of the region list update operations in hat_join_region(),
11803  * hat_leave_region() and hat_dup_region(). The ordering in those routines
11804  * guarantees that the list is always forward walkable and reaches active
11805  * sfmmus regardless of where xc_attention() captures a cpu.
11806  */
11807 cpuset_t
11808 sfmmu_rgntlb_demap(caddr_t addr, sf_region_t *rgnp,
11809     struct hme_blk *hmeblkp, int uselocks)
11810 {
11811 	sfmmu_t *sfmmup;
11812 	cpuset_t cpuset;
11813 	cpuset_t rcpuset;
11814 	hatlock_t *hatlockp;
11815 	uint_t rid = rgnp->rgn_id;
11816 	sf_rgn_link_t *rlink;
11817 	sf_scd_t *scdp;
11818 
11819 	ASSERT(hmeblkp->hblk_shared);
11820 	ASSERT(SFMMU_IS_SHMERID_VALID(rid));
11821 	ASSERT(rid < SFMMU_MAX_HME_REGIONS);
11822 
11823 	CPUSET_ZERO(rcpuset);
11824 	if (uselocks) {
11825 		mutex_enter(&rgnp->rgn_mutex);
11826 	}
11827 	sfmmup = rgnp->rgn_sfmmu_head;
11828 	while (sfmmup != NULL) {
11829 		if (uselocks) {
11830 			hatlockp = sfmmu_hat_enter(sfmmup);
11831 		}
11832 
11833 		/*
11834 		 * When an SCD is created the SCD hat is linked on the sfmmu
11835 		 * region lists for each hme region which is part of the
11836 		 * SCD. If we find an SCD hat, when walking these lists,
11837 		 * then we flush the shared TSBs; if we find a private hat,
11838 		 * which is part of an SCD, but where the region
11839 		 * is not part of the SCD, then we flush the private TSBs.
11840 		 */
11841 		if (!sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL &&
11842 		    !SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) {
11843 			scdp = sfmmup->sfmmu_scdp;
11844 			if (SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) {
11845 				if (uselocks) {
11846 					sfmmu_hat_exit(hatlockp);
11847 				}
11848 				goto next;
11849 			}
11850 		}
11851 
11852 		SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0);
11853 
11854 		kpreempt_disable();
11855 		cpuset = sfmmup->sfmmu_cpusran;
11856 		CPUSET_AND(cpuset, cpu_ready_set);
11857 		CPUSET_DEL(cpuset, CPU->cpu_id);
11858 		SFMMU_XCALL_STATS(sfmmup);
11859 		xt_some(cpuset, vtag_flushpage_tl1,
11860 		    (uint64_t)addr, (uint64_t)sfmmup);
11861 		vtag_flushpage(addr, (uint64_t)sfmmup);
11862 		if (uselocks) {
11863 			sfmmu_hat_exit(hatlockp);
11864 		}
11865 		kpreempt_enable();
11866 		CPUSET_OR(rcpuset, cpuset);
11867 
11868 next:
11869 		/* LINTED: constant in conditional context */
11870 		SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 0, 0);
11871 		ASSERT(rlink != NULL);
11872 		sfmmup = rlink->next;
11873 	}
11874 	if (uselocks) {
11875 		mutex_exit(&rgnp->rgn_mutex);
11876 	}
11877 	return (rcpuset);
11878 }
11879 
11880 /*
11881  * This routine takes an sfmmu pointer and the va for an address in an
11882  * ISM region as input and returns the corresponding region id in ism_rid.
11883  * The return value of 1 indicates that a region has been found and ism_rid
11884  * is valid, otherwise 0 is returned.
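 *
 * A sketch of the intended call pattern (illustrative, with the hat lock
 * already held as the ASSERT below requires):
 *
 *	uint_t ism_rid;
 *	if (find_ism_rid(sfmmup, ism_sfmmup, va, &ism_rid)) {
 *		// ism_rid names the ISM region covering va
 *	} else {
 *		// va lies in none of this hat's ISM maps
 *	}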
11885  */
11886 static int
11887 find_ism_rid(sfmmu_t *sfmmup, sfmmu_t *ism_sfmmup, caddr_t va, uint_t *ism_rid)
11888 {
11889 	ism_blk_t *ism_blkp;
11890 	int i;
11891 	ism_map_t *ism_map;
11892 #ifdef DEBUG
11893 	struct hat *ism_hatid;
11894 #endif
11895 	ASSERT(sfmmu_hat_lock_held(sfmmup));
11896 
11897 	ism_blkp = sfmmup->sfmmu_iblk;
11898 	while (ism_blkp != NULL) {
11899 		ism_map = ism_blkp->iblk_maps;
11900 		for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) {
11901 			if ((va >= ism_start(ism_map[i])) &&
11902 			    (va < ism_end(ism_map[i]))) {
11903 
11904 				*ism_rid = ism_map[i].imap_rid;
11905 #ifdef DEBUG
11906 				ism_hatid = ism_map[i].imap_ismhat;
11907 				ASSERT(ism_hatid == ism_sfmmup);
11908 				ASSERT(ism_hatid->sfmmu_ismhat);
11909 #endif
11910 				return (1);
11911 			}
11912 		}
11913 		ism_blkp = ism_blkp->iblk_next;
11914 	}
11915 	return (0);
11916 }
11917 
11918 /*
11919  * Special routine to flush out ism mappings: TSBs, TLBs and D-caches.
11920  * This routine may be called with all cpu's captured. Therefore, the
11921  * caller is responsible for holding all locks and disabling kernel
11922  * preemption.
11923  */
11924 /* ARGSUSED */
11925 static void
11926 sfmmu_ismtlbcache_demap(caddr_t addr, sfmmu_t *ism_sfmmup,
11927     struct hme_blk *hmeblkp, pfn_t pfnum, int cache_flush_flag)
11928 {
11929 	cpuset_t cpuset;
11930 	caddr_t va;
11931 	ism_ment_t *ment;
11932 	sfmmu_t *sfmmup;
11933 #ifdef VAC
11934 	int vcolor;
11935 #endif
11936 
11937 	sf_scd_t *scdp;
11938 	uint_t ism_rid;
11939 
11940 	ASSERT(!hmeblkp->hblk_shared);
11941 	/*
11942 	 * Walk the ism_hat's mapping list and flush the page
11943 	 * from every hat sharing this ism_hat. This routine
11944 	 * may be called while all cpu's have been captured.
11945 	 * Therefore we can't attempt to grab any locks. For now
11946 	 * this means we will protect the ism mapping list under
11947 	 * a single lock which will be grabbed by the caller.
11948 	 * If hat_share/unshare scalability becomes a performance
11949 	 * problem then we may need to re-think ism mapping list locking.
11950 	 */
11951 	ASSERT(ism_sfmmup->sfmmu_ismhat);
11952 	ASSERT(MUTEX_HELD(&ism_mlist_lock));
11953 	addr = addr - ISMID_STARTADDR;
11954 
11955 	for (ment = ism_sfmmup->sfmmu_iment; ment; ment = ment->iment_next) {
11956 
11957 		sfmmup = ment->iment_hat;
11958 
11959 		va = ment->iment_base_va;
11960 		va = (caddr_t)((uintptr_t)va + (uintptr_t)addr);
11961 
11962 		/*
11963 		 * When an SCD is created the SCD hat is linked on the ism
11964 		 * mapping lists for each ISM segment which is part of the
11965 		 * SCD. If we find an SCD hat, when walking these lists,
11966 		 * then we flush the shared TSBs; if we find a private hat,
11967 		 * which is part of an SCD, but where the region
11968 		 * corresponding to this va is not part of the SCD, then we
11969 		 * flush the private TSBs.
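		 *
		 * Summarized as a decision sketch (illustrative):
		 *
		 *	SCD hat on the list                     -> flush shared TSBs
		 *	private hat, va's region in its SCD     -> skip (SCD hat covers it)
		 *	private hat, va's region not in the SCD -> flush private TSBs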
11970 */ 11971 if (!sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL && 11972 !SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD) && 11973 !SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) { 11974 if (!find_ism_rid(sfmmup, ism_sfmmup, va, 11975 &ism_rid)) { 11976 cmn_err(CE_PANIC, 11977 "can't find matching ISM rid!"); 11978 } 11979 11980 scdp = sfmmup->sfmmu_scdp; 11981 if (SFMMU_IS_ISMRID_VALID(ism_rid) && 11982 SF_RGNMAP_TEST(scdp->scd_ismregion_map, 11983 ism_rid)) { 11984 continue; 11985 } 11986 } 11987 SFMMU_UNLOAD_TSB(va, sfmmup, hmeblkp, 1); 11988 11989 cpuset = sfmmup->sfmmu_cpusran; 11990 CPUSET_AND(cpuset, cpu_ready_set); 11991 CPUSET_DEL(cpuset, CPU->cpu_id); 11992 SFMMU_XCALL_STATS(sfmmup); 11993 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)va, 11994 (uint64_t)sfmmup); 11995 vtag_flushpage(va, (uint64_t)sfmmup); 11996 11997 #ifdef VAC 11998 /* 11999 * Flush D$ 12000 * When flushing D$ we must flush all 12001 * cpu's. See sfmmu_cache_flush(). 12002 */ 12003 if (cache_flush_flag == CACHE_FLUSH) { 12004 cpuset = cpu_ready_set; 12005 CPUSET_DEL(cpuset, CPU->cpu_id); 12006 12007 SFMMU_XCALL_STATS(sfmmup); 12008 vcolor = addr_to_vcolor(va); 12009 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor); 12010 vac_flushpage(pfnum, vcolor); 12011 } 12012 #endif /* VAC */ 12013 } 12014 } 12015 12016 /* 12017 * Demaps the TSB, CPU caches, and flushes all TLBs on all CPUs of 12018 * a particular virtual address and ctx. If noflush is set we do not 12019 * flush the TLB/TSB. This function may or may not be called with the 12020 * HAT lock held. 12021 */ 12022 static void 12023 sfmmu_tlbcache_demap(caddr_t addr, sfmmu_t *sfmmup, struct hme_blk *hmeblkp, 12024 pfn_t pfnum, int tlb_noflush, int cpu_flag, int cache_flush_flag, 12025 int hat_lock_held) 12026 { 12027 #ifdef VAC 12028 int vcolor; 12029 #endif 12030 cpuset_t cpuset; 12031 hatlock_t *hatlockp; 12032 12033 ASSERT(!hmeblkp->hblk_shared); 12034 12035 #if defined(lint) && !defined(VAC) 12036 pfnum = pfnum; 12037 cpu_flag = cpu_flag; 12038 cache_flush_flag = cache_flush_flag; 12039 #endif 12040 12041 /* 12042 * There is no longer a need to protect against ctx being 12043 * stolen here since we don't store the ctx in the TSB anymore. 12044 */ 12045 #ifdef VAC 12046 vcolor = addr_to_vcolor(addr); 12047 #endif 12048 12049 /* 12050 * We must hold the hat lock during the flush of TLB, 12051 * to avoid a race with sfmmu_invalidate_ctx(), where 12052 * sfmmu_cnum on a MMU could be set to INVALID_CONTEXT, 12053 * causing TLB demap routine to skip flush on that MMU. 12054 * If the context on a MMU has already been set to 12055 * INVALID_CONTEXT, we just get an extra flush on 12056 * that MMU. 12057 */ 12058 if (!hat_lock_held && !tlb_noflush) 12059 hatlockp = sfmmu_hat_enter(sfmmup); 12060 12061 kpreempt_disable(); 12062 if (!tlb_noflush) { 12063 /* 12064 * Flush the TSB and TLB. 12065 */ 12066 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0); 12067 12068 cpuset = sfmmup->sfmmu_cpusran; 12069 CPUSET_AND(cpuset, cpu_ready_set); 12070 CPUSET_DEL(cpuset, CPU->cpu_id); 12071 12072 SFMMU_XCALL_STATS(sfmmup); 12073 12074 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, 12075 (uint64_t)sfmmup); 12076 12077 vtag_flushpage(addr, (uint64_t)sfmmup); 12078 } 12079 12080 if (!hat_lock_held && !tlb_noflush) 12081 sfmmu_hat_exit(hatlockp); 12082 12083 #ifdef VAC 12084 /* 12085 * Flush the D$ 12086 * 12087 * Even if the ctx is stolen, we need to flush the 12088 * cache. Our ctx stealer only flushes the TLBs. 
12089  */
12090 	if (cache_flush_flag == CACHE_FLUSH) {
12091 		if (cpu_flag & FLUSH_ALL_CPUS) {
12092 			cpuset = cpu_ready_set;
12093 		} else {
12094 			cpuset = sfmmup->sfmmu_cpusran;
12095 			CPUSET_AND(cpuset, cpu_ready_set);
12096 		}
12097 		CPUSET_DEL(cpuset, CPU->cpu_id);
12098 		SFMMU_XCALL_STATS(sfmmup);
12099 		xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor);
12100 		vac_flushpage(pfnum, vcolor);
12101 	}
12102 #endif	/* VAC */
12103 	kpreempt_enable();
12104 }
12105 
12106 /*
12107  * Demaps the TSB and flushes all TLBs on all cpus for a particular virtual
12108  * address and ctx. If noflush is set we do not currently do anything.
12109  * This function may or may not be called with the HAT lock held.
12110  */
12111 static void
12112 sfmmu_tlb_demap(caddr_t addr, sfmmu_t *sfmmup, struct hme_blk *hmeblkp,
12113     int tlb_noflush, int hat_lock_held)
12114 {
12115 	cpuset_t cpuset;
12116 	hatlock_t *hatlockp;
12117 
12118 	ASSERT(!hmeblkp->hblk_shared);
12119 
12120 	/*
12121 	 * If the process is exiting we have nothing to do.
12122 	 */
12123 	if (tlb_noflush)
12124 		return;
12125 
12126 	/*
12127 	 * Flush TSB.
12128 	 */
12129 	if (!hat_lock_held)
12130 		hatlockp = sfmmu_hat_enter(sfmmup);
12131 	SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0);
12132 
12133 	kpreempt_disable();
12134 
12135 	cpuset = sfmmup->sfmmu_cpusran;
12136 	CPUSET_AND(cpuset, cpu_ready_set);
12137 	CPUSET_DEL(cpuset, CPU->cpu_id);
12138 
12139 	SFMMU_XCALL_STATS(sfmmup);
12140 	xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, (uint64_t)sfmmup);
12141 
12142 	vtag_flushpage(addr, (uint64_t)sfmmup);
12143 
12144 	if (!hat_lock_held)
12145 		sfmmu_hat_exit(hatlockp);
12146 
12147 	kpreempt_enable();
12148 
12149 }
12150 
12151 /*
12152  * Special case of sfmmu_tlb_demap for MMU_PAGESIZE hblks. Use the xcall
12153  * handler that can flush a range of pages to save on xcalls.
12154  */
12155 static int sfmmu_xcall_save;
12156 
12157 /*
12158  * This routine is never used for demapping addresses backed by SRD hmeblks.
12159  */
12160 static void
12161 sfmmu_tlb_range_demap(demap_range_t *dmrp)
12162 {
12163 	sfmmu_t *sfmmup = dmrp->dmr_sfmmup;
12164 	hatlock_t *hatlockp;
12165 	cpuset_t cpuset;
12166 	uint64_t sfmmu_pgcnt;
12167 	pgcnt_t pgcnt = 0;
12168 	int pgunload = 0;
12169 	int dirtypg = 0;
12170 	caddr_t addr = dmrp->dmr_addr;
12171 	caddr_t eaddr;
12172 	uint64_t bitvec = dmrp->dmr_bitvec;
12173 
12174 	ASSERT(bitvec & 1);
12175 
12176 	/*
12177 	 * Flush TSB and calculate number of pages to flush.
12178 	 */
12179 	while (bitvec != 0) {
12180 		dirtypg = 0;
12181 		/*
12182 		 * Find the first page to flush and then count how many
12183 		 * pages there are after it that also need to be flushed.
12184 		 * This way the number of TSB flushes is minimized.
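		 *
		 * Worked example (illustrative): with dmr_bitvec = 0b110011,
		 * where bit 0 is the page at dmr_addr, the loop issues two
		 * TSB range flushes instead of four single-page flushes:
		 *
		 *	pages 0-1 dirty -> one 2-page sfmmu_unload_tsb_range()
		 *	pages 2-3 clean -> skipped
		 *	pages 4-5 dirty -> one 2-page sfmmu_unload_tsb_range()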
12185 		 */
12186 		while ((bitvec & 1) == 0) {
12187 			pgcnt++;
12188 			addr += MMU_PAGESIZE;
12189 			bitvec >>= 1;
12190 		}
12191 		while (bitvec & 1) {
12192 			dirtypg++;
12193 			bitvec >>= 1;
12194 		}
12195 		eaddr = addr + ptob(dirtypg);
12196 		hatlockp = sfmmu_hat_enter(sfmmup);
12197 		sfmmu_unload_tsb_range(sfmmup, addr, eaddr, TTE8K);
12198 		sfmmu_hat_exit(hatlockp);
12199 		pgunload += dirtypg;
12200 		addr = eaddr;
12201 		pgcnt += dirtypg;
12202 	}
12203 
12204 	ASSERT((pgcnt<<MMU_PAGESHIFT) <= dmrp->dmr_endaddr - dmrp->dmr_addr);
12205 	if (sfmmup->sfmmu_free == 0) {
12206 		addr = dmrp->dmr_addr;
12207 		bitvec = dmrp->dmr_bitvec;
12208 
12209 		/*
12210 		 * make sure it has SFMMU_PGCNT_SHIFT bits only,
12211 		 * as it will be used to pack the argument for xt_some
12212 		 */
12213 		ASSERT((pgcnt > 0) &&
12214 		    (pgcnt <= (1 << SFMMU_PGCNT_SHIFT)));
12215 
12216 		/*
12217 		 * Encode pgcnt as (pgcnt - 1), and pass (pgcnt - 1) in
12218 		 * the low 6 bits of sfmmup. This is doable since pgcnt
12219 		 * is always >= 1.
12220 		 */
12221 		ASSERT(!((uint64_t)sfmmup & SFMMU_PGCNT_MASK));
12222 		sfmmu_pgcnt = (uint64_t)sfmmup |
12223 		    ((pgcnt - 1) & SFMMU_PGCNT_MASK);
12224 
12225 		/*
12226 		 * We must hold the hat lock during the flush of TLB,
12227 		 * to avoid a race with sfmmu_invalidate_ctx(), where
12228 		 * sfmmu_cnum on a MMU could be set to INVALID_CONTEXT,
12229 		 * causing TLB demap routine to skip flush on that MMU.
12230 		 * If the context on a MMU has already been set to
12231 		 * INVALID_CONTEXT, we just get an extra flush on
12232 		 * that MMU.
12233 		 */
12234 		hatlockp = sfmmu_hat_enter(sfmmup);
12235 		kpreempt_disable();
12236 
12237 		cpuset = sfmmup->sfmmu_cpusran;
12238 		CPUSET_AND(cpuset, cpu_ready_set);
12239 		CPUSET_DEL(cpuset, CPU->cpu_id);
12240 
12241 		SFMMU_XCALL_STATS(sfmmup);
12242 		xt_some(cpuset, vtag_flush_pgcnt_tl1, (uint64_t)addr,
12243 		    sfmmu_pgcnt);
12244 
12245 		for (; bitvec != 0; bitvec >>= 1) {
12246 			if (bitvec & 1)
12247 				vtag_flushpage(addr, (uint64_t)sfmmup);
12248 			addr += MMU_PAGESIZE;
12249 		}
12250 		kpreempt_enable();
12251 		sfmmu_hat_exit(hatlockp);
12252 
12253 		sfmmu_xcall_save += (pgunload-1);
12254 	}
12255 	dmrp->dmr_bitvec = 0;
12256 }
12257 
12258 /*
12259  * In cases where we need to synchronize with TLB/TSB miss trap
12260  * handlers, _and_ need to flush the TLB, it's a lot easier to
12261  * throw away the context from the process than to do a
12262  * special song and dance to keep things consistent for the
12263  * handlers.
12264  *
12265  * Since the process suddenly ends up without a context and our caller
12266  * holds the hat lock, threads that fault after this function is called
12267  * will pile up on the lock. We can then do whatever we need to
12268  * atomically from the context of the caller. The first blocked thread
12269  * to resume executing will get the process a new context, and the
12270  * process will resume executing.
12271  *
12272  * One added advantage of this approach is that on MMUs that
12273  * support a "flush all" operation, we will delay the flush until
12274  * cnum wrap-around, and then flush the TLB one time. This
12275  * is rather rare, so it's a lot less expensive than making 8000
12276  * x-calls to flush the TLB 8000 times.
12277  *
12278  * A per-process (PP) lock is used to synchronize ctx allocations in
12279  * resume() and ctx invalidations here.
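 *
 * In outline (illustrative, simplified from the function below):
 *
 *	pstate = sfmmu_disable_intrs();
 *	lock_set(&sfmmup->sfmmu_ctx_lock);		// PP lock
 *	for (i = 0; i < max_mmu_ctxdoms; i++)
 *		sfmmup->sfmmu_ctxs[i].cnum = INVALID_CONTEXT;
 *	membar_enter();				// publish before release
 *	lock_clear(&sfmmup->sfmmu_ctx_lock);
 *	sfmmu_enable_intrs(pstate);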
12280  */
12281 static void
12282 sfmmu_invalidate_ctx(sfmmu_t *sfmmup)
12283 {
12284 	cpuset_t cpuset;
12285 	int cnum, currcnum;
12286 	mmu_ctx_t *mmu_ctxp;
12287 	int i;
12288 	uint_t pstate_save;
12289 
12290 	SFMMU_STAT(sf_ctx_inv);
12291 
12292 	ASSERT(sfmmu_hat_lock_held(sfmmup));
12293 	ASSERT(sfmmup != ksfmmup);
12294 
12295 	kpreempt_disable();
12296 
12297 	mmu_ctxp = CPU_MMU_CTXP(CPU);
12298 	ASSERT(mmu_ctxp);
12299 	ASSERT(mmu_ctxp->mmu_idx < max_mmu_ctxdoms);
12300 	ASSERT(mmu_ctxp == mmu_ctxs_tbl[mmu_ctxp->mmu_idx]);
12301 
12302 	currcnum = sfmmup->sfmmu_ctxs[mmu_ctxp->mmu_idx].cnum;
12303 
12304 	pstate_save = sfmmu_disable_intrs();
12305 
12306 	lock_set(&sfmmup->sfmmu_ctx_lock);	/* acquire PP lock */
12307 	/* set HAT cnum invalid across all context domains. */
12308 	for (i = 0; i < max_mmu_ctxdoms; i++) {
12309 
12310 		cnum = sfmmup->sfmmu_ctxs[i].cnum;
12311 		if (cnum == INVALID_CONTEXT) {
12312 			continue;
12313 		}
12314 
12315 		sfmmup->sfmmu_ctxs[i].cnum = INVALID_CONTEXT;
12316 	}
12317 	membar_enter();	/* make sure globally visible to all CPUs */
12318 	lock_clear(&sfmmup->sfmmu_ctx_lock);	/* release PP lock */
12319 
12320 	sfmmu_enable_intrs(pstate_save);
12321 
12322 	cpuset = sfmmup->sfmmu_cpusran;
12323 	CPUSET_DEL(cpuset, CPU->cpu_id);
12324 	CPUSET_AND(cpuset, cpu_ready_set);
12325 	if (!CPUSET_ISNULL(cpuset)) {
12326 		SFMMU_XCALL_STATS(sfmmup);
12327 		xt_some(cpuset, sfmmu_raise_tsb_exception,
12328 		    (uint64_t)sfmmup, INVALID_CONTEXT);
12329 		xt_sync(cpuset);
12330 		SFMMU_STAT(sf_tsb_raise_exception);
12331 		SFMMU_MMU_STAT(mmu_tsb_raise_exception);
12332 	}
12333 
12334 	/*
12335 	 * If the hat to-be-invalidated is the same as the current
12336 	 * process on the local CPU we need to invalidate
12337 	 * this CPU's context as well.
12338 	 */
12339 	if ((sfmmu_getctx_sec() == currcnum) &&
12340 	    (currcnum != INVALID_CONTEXT)) {
12341 		/* sets shared context to INVALID too */
12342 		sfmmu_setctx_sec(INVALID_CONTEXT);
12343 		sfmmu_clear_utsbinfo();
12344 	}
12345 
12346 	SFMMU_FLAGS_SET(sfmmup, HAT_ALLCTX_INVALID);
12347 
12348 	kpreempt_enable();
12349 
12350 	/*
12351 	 * we hold the hat lock, so nobody should allocate a context
12352 	 * for us yet
12353 	 */
12354 	ASSERT(sfmmup->sfmmu_ctxs[mmu_ctxp->mmu_idx].cnum == INVALID_CONTEXT);
12355 }
12356 
12357 #ifdef VAC
12358 /*
12359  * We need to flush the cache on all cpus. It is possible that
12360  * a process referenced a page as cacheable but has since exited
12361  * and cleared the mapping list. We still need to flush it, but have no
12362  * state, so flushing on all cpus is the only alternative.
12363  */
12364 void
12365 sfmmu_cache_flush(pfn_t pfnum, int vcolor)
12366 {
12367 	cpuset_t cpuset;
12368 
12369 	kpreempt_disable();
12370 	cpuset = cpu_ready_set;
12371 	CPUSET_DEL(cpuset, CPU->cpu_id);
12372 	SFMMU_XCALL_STATS(NULL);	/* account to any ctx */
12373 	xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor);
12374 	xt_sync(cpuset);
12375 	vac_flushpage(pfnum, vcolor);
12376 	kpreempt_enable();
12377 }
12378 
12379 void
12380 sfmmu_cache_flushcolor(int vcolor, pfn_t pfnum)
12381 {
12382 	cpuset_t cpuset;
12383 
12384 	ASSERT(vcolor >= 0);
12385 
12386 	kpreempt_disable();
12387 	cpuset = cpu_ready_set;
12388 	CPUSET_DEL(cpuset, CPU->cpu_id);
12389 	SFMMU_XCALL_STATS(NULL);	/* account to any ctx */
12390 	xt_some(cpuset, vac_flushcolor_tl1, vcolor, pfnum);
12391 	xt_sync(cpuset);
12392 	vac_flushcolor(vcolor, pfnum);
12393 	kpreempt_enable();
12394 }
12395 #endif	/* VAC */
12396 
12397 /*
12398  * We need to prevent processes from accessing the TSB using a cached physical
12399  * address.
It's alright if they try to access the TSB via virtual address 12400 * since they will just fault on that virtual address once the mapping has 12401 * been suspended. 12402 */ 12403 #pragma weak sendmondo_in_recover 12404 12405 /* ARGSUSED */ 12406 static int 12407 sfmmu_tsb_pre_relocator(caddr_t va, uint_t tsbsz, uint_t flags, void *tsbinfo) 12408 { 12409 struct tsb_info *tsbinfop = (struct tsb_info *)tsbinfo; 12410 sfmmu_t *sfmmup = tsbinfop->tsb_sfmmu; 12411 hatlock_t *hatlockp; 12412 sf_scd_t *scdp; 12413 12414 if (flags != HAT_PRESUSPEND) 12415 return (0); 12416 12417 /* 12418 * If tsb is a shared TSB with TSB_SHAREDCTX set, sfmmup must 12419 * be a shared hat, then set SCD's tsbinfo's flag. 12420 * If tsb is not shared, sfmmup is a private hat, then set 12421 * its private tsbinfo's flag. 12422 */ 12423 hatlockp = sfmmu_hat_enter(sfmmup); 12424 tsbinfop->tsb_flags |= TSB_RELOC_FLAG; 12425 12426 if (!(tsbinfop->tsb_flags & TSB_SHAREDCTX)) { 12427 sfmmu_tsb_inv_ctx(sfmmup); 12428 sfmmu_hat_exit(hatlockp); 12429 } else { 12430 /* release lock on the shared hat */ 12431 sfmmu_hat_exit(hatlockp); 12432 /* sfmmup is a shared hat */ 12433 ASSERT(sfmmup->sfmmu_scdhat); 12434 scdp = sfmmup->sfmmu_scdp; 12435 ASSERT(scdp != NULL); 12436 /* get private hat from the scd list */ 12437 mutex_enter(&scdp->scd_mutex); 12438 sfmmup = scdp->scd_sf_list; 12439 while (sfmmup != NULL) { 12440 hatlockp = sfmmu_hat_enter(sfmmup); 12441 /* 12442 * We do not call sfmmu_tsb_inv_ctx here because 12443 * sendmondo_in_recover check is only needed for 12444 * sun4u. 12445 */ 12446 sfmmu_invalidate_ctx(sfmmup); 12447 sfmmu_hat_exit(hatlockp); 12448 sfmmup = sfmmup->sfmmu_scd_link.next; 12449 12450 } 12451 mutex_exit(&scdp->scd_mutex); 12452 } 12453 return (0); 12454 } 12455 12456 static void 12457 sfmmu_tsb_inv_ctx(sfmmu_t *sfmmup) 12458 { 12459 extern uint32_t sendmondo_in_recover; 12460 12461 ASSERT(sfmmu_hat_lock_held(sfmmup)); 12462 12463 /* 12464 * For Cheetah+ Erratum 25: 12465 * Wait for any active recovery to finish. We can't risk 12466 * relocating the TSB of the thread running mondo_recover_proc() 12467 * since, if we did that, we would deadlock. The scenario we are 12468 * trying to avoid is as follows: 12469 * 12470 * THIS CPU RECOVER CPU 12471 * -------- ----------- 12472 * Begins recovery, walking through TSB 12473 * hat_pagesuspend() TSB TTE 12474 * TLB miss on TSB TTE, spins at TL1 12475 * xt_sync() 12476 * send_mondo_timeout() 12477 * mondo_recover_proc() 12478 * ((deadlocked)) 12479 * 12480 * The second half of the workaround is that mondo_recover_proc() 12481 * checks to see if the tsb_info has the RELOC flag set, and if it 12482 * does, it skips over that TSB without ever touching tsbinfop->tsb_va 12483 * and hence avoiding the TLB miss that could result in a deadlock. 
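 *
 * Note how the check is gated below: sendmondo_in_recover is a weak
 * symbol, so on configurations where it is never defined its address
 * resolves to NULL and the wait loop vanishes. Sketch (illustrative):
 *
 *	if (&sendmondo_in_recover != NULL) {	// symbol linked in?
 *		while (sendmondo_in_recover) {
 *			drv_usecwait(1);
 *			membar_consumer();
 *		}
 *	}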
12484 */ 12485 if (&sendmondo_in_recover) { 12486 membar_enter(); /* make sure RELOC flag visible */ 12487 while (sendmondo_in_recover) { 12488 drv_usecwait(1); 12489 membar_consumer(); 12490 } 12491 } 12492 12493 sfmmu_invalidate_ctx(sfmmup); 12494 } 12495 12496 /* ARGSUSED */ 12497 static int 12498 sfmmu_tsb_post_relocator(caddr_t va, uint_t tsbsz, uint_t flags, 12499 void *tsbinfo, pfn_t newpfn) 12500 { 12501 hatlock_t *hatlockp; 12502 struct tsb_info *tsbinfop = (struct tsb_info *)tsbinfo; 12503 sfmmu_t *sfmmup = tsbinfop->tsb_sfmmu; 12504 12505 if (flags != HAT_POSTUNSUSPEND) 12506 return (0); 12507 12508 hatlockp = sfmmu_hat_enter(sfmmup); 12509 12510 SFMMU_STAT(sf_tsb_reloc); 12511 12512 /* 12513 * The process may have swapped out while we were relocating one 12514 * of its TSBs. If so, don't bother doing the setup since the 12515 * process can't be using the memory anymore. 12516 */ 12517 if ((tsbinfop->tsb_flags & TSB_SWAPPED) == 0) { 12518 ASSERT(va == tsbinfop->tsb_va); 12519 sfmmu_tsbinfo_setup_phys(tsbinfop, newpfn); 12520 12521 if (tsbinfop->tsb_flags & TSB_FLUSH_NEEDED) { 12522 sfmmu_inv_tsb(tsbinfop->tsb_va, 12523 TSB_BYTES(tsbinfop->tsb_szc)); 12524 tsbinfop->tsb_flags &= ~TSB_FLUSH_NEEDED; 12525 } 12526 } 12527 12528 membar_exit(); 12529 tsbinfop->tsb_flags &= ~TSB_RELOC_FLAG; 12530 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 12531 12532 sfmmu_hat_exit(hatlockp); 12533 12534 return (0); 12535 } 12536 12537 /* 12538 * Allocate and initialize a tsb_info structure. Note that we may or may not 12539 * allocate a TSB here, depending on the flags passed in. 12540 */ 12541 static int 12542 sfmmu_tsbinfo_alloc(struct tsb_info **tsbinfopp, int tsb_szc, int tte_sz_mask, 12543 uint_t flags, sfmmu_t *sfmmup) 12544 { 12545 int err; 12546 12547 *tsbinfopp = (struct tsb_info *)kmem_cache_alloc( 12548 sfmmu_tsbinfo_cache, KM_SLEEP); 12549 12550 if ((err = sfmmu_init_tsbinfo(*tsbinfopp, tte_sz_mask, 12551 tsb_szc, flags, sfmmup)) != 0) { 12552 kmem_cache_free(sfmmu_tsbinfo_cache, *tsbinfopp); 12553 SFMMU_STAT(sf_tsb_allocfail); 12554 *tsbinfopp = NULL; 12555 return (err); 12556 } 12557 SFMMU_STAT(sf_tsb_alloc); 12558 12559 /* 12560 * Bump the TSB size counters for this TSB size. 12561 */ 12562 (*(((int *)&sfmmu_tsbsize_stat) + tsb_szc))++; 12563 return (0); 12564 } 12565 12566 static void 12567 sfmmu_tsb_free(struct tsb_info *tsbinfo) 12568 { 12569 caddr_t tsbva = tsbinfo->tsb_va; 12570 uint_t tsb_size = TSB_BYTES(tsbinfo->tsb_szc); 12571 struct kmem_cache *kmem_cachep = tsbinfo->tsb_cache; 12572 vmem_t *vmp = tsbinfo->tsb_vmp; 12573 12574 /* 12575 * If we allocated this TSB from relocatable kernel memory, then we 12576 * need to uninstall the callback handler. 
12577 */ 12578 if (tsbinfo->tsb_cache != sfmmu_tsb8k_cache) { 12579 uintptr_t slab_mask; 12580 caddr_t slab_vaddr; 12581 page_t **ppl; 12582 int ret; 12583 12584 ASSERT(tsb_size <= MMU_PAGESIZE4M || use_bigtsb_arena); 12585 if (tsb_size > MMU_PAGESIZE4M) 12586 slab_mask = ~((uintptr_t)bigtsb_slab_mask) << PAGESHIFT; 12587 else 12588 slab_mask = ~((uintptr_t)tsb_slab_mask) << PAGESHIFT; 12589 slab_vaddr = (caddr_t)((uintptr_t)tsbva & slab_mask); 12590 12591 ret = as_pagelock(&kas, &ppl, slab_vaddr, PAGESIZE, S_WRITE); 12592 ASSERT(ret == 0); 12593 hat_delete_callback(tsbva, (uint_t)tsb_size, (void *)tsbinfo, 12594 0, NULL); 12595 as_pageunlock(&kas, ppl, slab_vaddr, PAGESIZE, S_WRITE); 12596 } 12597 12598 if (kmem_cachep != NULL) { 12599 kmem_cache_free(kmem_cachep, tsbva); 12600 } else { 12601 vmem_xfree(vmp, (void *)tsbva, tsb_size); 12602 } 12603 tsbinfo->tsb_va = (caddr_t)0xbad00bad; 12604 atomic_add_64(&tsb_alloc_bytes, -(int64_t)tsb_size); 12605 } 12606 12607 static void 12608 sfmmu_tsbinfo_free(struct tsb_info *tsbinfo) 12609 { 12610 if ((tsbinfo->tsb_flags & TSB_SWAPPED) == 0) { 12611 sfmmu_tsb_free(tsbinfo); 12612 } 12613 kmem_cache_free(sfmmu_tsbinfo_cache, tsbinfo); 12614 12615 } 12616 12617 /* 12618 * Setup all the references to physical memory for this tsbinfo. 12619 * The underlying page(s) must be locked. 12620 */ 12621 static void 12622 sfmmu_tsbinfo_setup_phys(struct tsb_info *tsbinfo, pfn_t pfn) 12623 { 12624 ASSERT(pfn != PFN_INVALID); 12625 ASSERT(pfn == va_to_pfn(tsbinfo->tsb_va)); 12626 12627 #ifndef sun4v 12628 if (tsbinfo->tsb_szc == 0) { 12629 sfmmu_memtte(&tsbinfo->tsb_tte, pfn, 12630 PROT_WRITE|PROT_READ, TTE8K); 12631 } else { 12632 /* 12633 * Round down PA and use a large mapping; the handlers will 12634 * compute the TSB pointer at the correct offset into the 12635 * big virtual page. NOTE: this assumes all TSBs larger 12636 * than 8K must come from physically contiguous slabs of 12637 * size tsb_slab_size. 12638 */ 12639 sfmmu_memtte(&tsbinfo->tsb_tte, pfn & ~tsb_slab_mask, 12640 PROT_WRITE|PROT_READ, tsb_slab_ttesz); 12641 } 12642 tsbinfo->tsb_pa = ptob(pfn); 12643 12644 TTE_SET_LOCKED(&tsbinfo->tsb_tte); /* lock the tte into dtlb */ 12645 TTE_SET_MOD(&tsbinfo->tsb_tte); /* enable writes */ 12646 12647 ASSERT(TTE_IS_PRIVILEGED(&tsbinfo->tsb_tte)); 12648 ASSERT(TTE_IS_LOCKED(&tsbinfo->tsb_tte)); 12649 #else /* sun4v */ 12650 tsbinfo->tsb_pa = ptob(pfn); 12651 #endif /* sun4v */ 12652 } 12653 12654 12655 /* 12656 * Returns zero on success, ENOMEM if over the high water mark, 12657 * or EAGAIN if the caller needs to retry with a smaller TSB 12658 * size (or specify TSB_FORCEALLOC if the allocation can't fail). 12659 * 12660 * This call cannot fail to allocate a TSB if TSB_FORCEALLOC 12661 * is specified and the TSB requested is PAGESIZE, though it 12662 * may sleep waiting for memory if sufficient memory is not 12663 * available. 
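 *
 * Caller-side sketch of that contract (illustrative only; the starting
 * size code and retry policy here are hypothetical):
 *
 *	int szc = my_preferred_szc;		// hypothetical
 *	while ((err = sfmmu_init_tsbinfo(tsbinfo, mask, szc, flags,
 *	    sfmmup)) == EAGAIN && szc > TSB_MIN_SZCODE)
 *		szc--;				// retry with a smaller TSB
 *	// err == ENOMEM means we are over tsb_alloc_hiwater; only
 *	// TSB_FORCEALLOC with a PAGESIZE TSB is guaranteed to succeed.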
12664 */ 12665 static int 12666 sfmmu_init_tsbinfo(struct tsb_info *tsbinfo, int tteszmask, 12667 int tsbcode, uint_t flags, sfmmu_t *sfmmup) 12668 { 12669 caddr_t vaddr = NULL; 12670 caddr_t slab_vaddr; 12671 uintptr_t slab_mask; 12672 int tsbbytes = TSB_BYTES(tsbcode); 12673 int lowmem = 0; 12674 struct kmem_cache *kmem_cachep = NULL; 12675 vmem_t *vmp = NULL; 12676 lgrp_id_t lgrpid = LGRP_NONE; 12677 pfn_t pfn; 12678 uint_t cbflags = HAC_SLEEP; 12679 page_t **pplist; 12680 int ret; 12681 12682 ASSERT(tsbbytes <= MMU_PAGESIZE4M || use_bigtsb_arena); 12683 if (tsbbytes > MMU_PAGESIZE4M) 12684 slab_mask = ~((uintptr_t)bigtsb_slab_mask) << PAGESHIFT; 12685 else 12686 slab_mask = ~((uintptr_t)tsb_slab_mask) << PAGESHIFT; 12687 12688 if (flags & (TSB_FORCEALLOC | TSB_SWAPIN | TSB_GROW | TSB_SHRINK)) 12689 flags |= TSB_ALLOC; 12690 12691 ASSERT((flags & TSB_FORCEALLOC) == 0 || tsbcode == TSB_MIN_SZCODE); 12692 12693 tsbinfo->tsb_sfmmu = sfmmup; 12694 12695 /* 12696 * If not allocating a TSB, set up the tsbinfo, set TSB_SWAPPED, and 12697 * return. 12698 */ 12699 if ((flags & TSB_ALLOC) == 0) { 12700 tsbinfo->tsb_szc = tsbcode; 12701 tsbinfo->tsb_ttesz_mask = tteszmask; 12702 tsbinfo->tsb_va = (caddr_t)0xbadbadbeef; 12703 tsbinfo->tsb_pa = -1; 12704 tsbinfo->tsb_tte.ll = 0; 12705 tsbinfo->tsb_next = NULL; 12706 tsbinfo->tsb_flags = TSB_SWAPPED; 12707 tsbinfo->tsb_cache = NULL; 12708 tsbinfo->tsb_vmp = NULL; 12709 return (0); 12710 } 12711 12712 #ifdef DEBUG 12713 /* 12714 * For debugging: 12715 * Randomly force allocation failures every tsb_alloc_mtbf 12716 * tries if TSB_FORCEALLOC is not specified. This will 12717 * return ENOMEM if tsb_alloc_mtbf is odd, or EAGAIN if 12718 * it is even, to allow testing of both failure paths... 12719 */ 12720 if (tsb_alloc_mtbf && ((flags & TSB_FORCEALLOC) == 0) && 12721 (tsb_alloc_count++ == tsb_alloc_mtbf)) { 12722 tsb_alloc_count = 0; 12723 tsb_alloc_fail_mtbf++; 12724 return ((tsb_alloc_mtbf & 1)? ENOMEM : EAGAIN); 12725 } 12726 #endif /* DEBUG */ 12727 12728 /* 12729 * Enforce high water mark if we are not doing a forced allocation 12730 * and are not shrinking a process' TSB. 12731 */ 12732 if ((flags & TSB_SHRINK) == 0 && 12733 (tsbbytes + tsb_alloc_bytes) > tsb_alloc_hiwater) { 12734 if ((flags & TSB_FORCEALLOC) == 0) 12735 return (ENOMEM); 12736 lowmem = 1; 12737 } 12738 12739 /* 12740 * Allocate from the correct location based upon the size of the TSB 12741 * compared to the base page size, and what memory conditions dictate. 12742 * Note we always do nonblocking allocations from the TSB arena since 12743 * we don't want memory fragmentation to cause processes to block 12744 * indefinitely waiting for memory; until the kernel algorithms that 12745 * coalesce large pages are improved this is our best option. 
12746 * 12747 * Algorithm: 12748 * If allocating a "large" TSB (>8K), allocate from the 12749 * appropriate kmem_tsb_default_arena vmem arena 12750 * else if low on memory or the TSB_FORCEALLOC flag is set or 12751 * tsb_forceheap is set 12752 * Allocate from kernel heap via sfmmu_tsb8k_cache with 12753 * KM_SLEEP (never fails) 12754 * else 12755 * Allocate from appropriate sfmmu_tsb_cache with 12756 * KM_NOSLEEP 12757 * endif 12758 */ 12759 if (tsb_lgrp_affinity) 12760 lgrpid = lgrp_home_id(curthread); 12761 if (lgrpid == LGRP_NONE) 12762 lgrpid = 0; /* use lgrp of boot CPU */ 12763 12764 if (tsbbytes > MMU_PAGESIZE) { 12765 if (tsbbytes > MMU_PAGESIZE4M) { 12766 vmp = kmem_bigtsb_default_arena[lgrpid]; 12767 vaddr = (caddr_t)vmem_xalloc(vmp, tsbbytes, tsbbytes, 12768 0, 0, NULL, NULL, VM_NOSLEEP); 12769 } else { 12770 vmp = kmem_tsb_default_arena[lgrpid]; 12771 vaddr = (caddr_t)vmem_xalloc(vmp, tsbbytes, tsbbytes, 12772 0, 0, NULL, NULL, VM_NOSLEEP); 12773 } 12774 #ifdef DEBUG 12775 } else if (lowmem || (flags & TSB_FORCEALLOC) || tsb_forceheap) { 12776 #else /* !DEBUG */ 12777 } else if (lowmem || (flags & TSB_FORCEALLOC)) { 12778 #endif /* DEBUG */ 12779 kmem_cachep = sfmmu_tsb8k_cache; 12780 vaddr = (caddr_t)kmem_cache_alloc(kmem_cachep, KM_SLEEP); 12781 ASSERT(vaddr != NULL); 12782 } else { 12783 kmem_cachep = sfmmu_tsb_cache[lgrpid]; 12784 vaddr = (caddr_t)kmem_cache_alloc(kmem_cachep, KM_NOSLEEP); 12785 } 12786 12787 tsbinfo->tsb_cache = kmem_cachep; 12788 tsbinfo->tsb_vmp = vmp; 12789 12790 if (vaddr == NULL) { 12791 return (EAGAIN); 12792 } 12793 12794 atomic_add_64(&tsb_alloc_bytes, (int64_t)tsbbytes); 12795 kmem_cachep = tsbinfo->tsb_cache; 12796 12797 /* 12798 * If we are allocating from outside the cage, then we need to 12799 * register a relocation callback handler. Note that for now 12800 * since pseudo mappings always hang off of the slab's root page, 12801 * we need only lock the first 8K of the TSB slab. This is a bit 12802 * hacky but it is good for performance. 12803 */ 12804 if (kmem_cachep != sfmmu_tsb8k_cache) { 12805 slab_vaddr = (caddr_t)((uintptr_t)vaddr & slab_mask); 12806 ret = as_pagelock(&kas, &pplist, slab_vaddr, PAGESIZE, S_WRITE); 12807 ASSERT(ret == 0); 12808 ret = hat_add_callback(sfmmu_tsb_cb_id, vaddr, (uint_t)tsbbytes, 12809 cbflags, (void *)tsbinfo, &pfn, NULL); 12810 12811 /* 12812 * Need to free up resources if we could not successfully 12813 * add the callback function and return an error condition. 12814 */ 12815 if (ret != 0) { 12816 if (kmem_cachep) { 12817 kmem_cache_free(kmem_cachep, vaddr); 12818 } else { 12819 vmem_xfree(vmp, (void *)vaddr, tsbbytes); 12820 } 12821 as_pageunlock(&kas, pplist, slab_vaddr, PAGESIZE, 12822 S_WRITE); 12823 return (EAGAIN); 12824 } 12825 } else { 12826 /* 12827 * Since allocation of 8K TSBs from heap is rare and occurs 12828 * during memory pressure we allocate them from permanent 12829 * memory rather than using callbacks to get the PFN. 
12830 */ 12831 pfn = hat_getpfnum(kas.a_hat, vaddr); 12832 } 12833 12834 tsbinfo->tsb_va = vaddr; 12835 tsbinfo->tsb_szc = tsbcode; 12836 tsbinfo->tsb_ttesz_mask = tteszmask; 12837 tsbinfo->tsb_next = NULL; 12838 tsbinfo->tsb_flags = 0; 12839 12840 sfmmu_tsbinfo_setup_phys(tsbinfo, pfn); 12841 12842 sfmmu_inv_tsb(vaddr, tsbbytes); 12843 12844 if (kmem_cachep != sfmmu_tsb8k_cache) { 12845 as_pageunlock(&kas, pplist, slab_vaddr, PAGESIZE, S_WRITE); 12846 } 12847 12848 return (0); 12849 } 12850 12851 /* 12852 * Initialize per cpu tsb and per cpu tsbmiss_area 12853 */ 12854 void 12855 sfmmu_init_tsbs(void) 12856 { 12857 int i; 12858 struct tsbmiss *tsbmissp; 12859 struct kpmtsbm *kpmtsbmp; 12860 #ifndef sun4v 12861 extern int dcache_line_mask; 12862 #endif /* sun4v */ 12863 extern uint_t vac_colors; 12864 12865 /* 12866 * Init. tsb miss area. 12867 */ 12868 tsbmissp = tsbmiss_area; 12869 12870 for (i = 0; i < NCPU; tsbmissp++, i++) { 12871 /* 12872 * initialize the tsbmiss area. 12873 * Do this for all possible CPUs as some may be added 12874 * while the system is running. There is no cost to this. 12875 */ 12876 tsbmissp->ksfmmup = ksfmmup; 12877 #ifndef sun4v 12878 tsbmissp->dcache_line_mask = (uint16_t)dcache_line_mask; 12879 #endif /* sun4v */ 12880 tsbmissp->khashstart = 12881 (struct hmehash_bucket *)va_to_pa((caddr_t)khme_hash); 12882 tsbmissp->uhashstart = 12883 (struct hmehash_bucket *)va_to_pa((caddr_t)uhme_hash); 12884 tsbmissp->khashsz = khmehash_num; 12885 tsbmissp->uhashsz = uhmehash_num; 12886 } 12887 12888 sfmmu_tsb_cb_id = hat_register_callback('T'<<16 | 'S' << 8 | 'B', 12889 sfmmu_tsb_pre_relocator, sfmmu_tsb_post_relocator, NULL, 0); 12890 12891 if (kpm_enable == 0) 12892 return; 12893 12894 /* -- Begin KPM specific init -- */ 12895 12896 if (kpm_smallpages) { 12897 /* 12898 * If we're using base pagesize pages for seg_kpm 12899 * mappings, we use the kernel TSB since we can't afford 12900 * to allocate a second huge TSB for these mappings. 12901 */ 12902 kpm_tsbbase = ktsb_phys? ktsb_pbase : (uint64_t)ktsb_base; 12903 kpm_tsbsz = ktsb_szcode; 12904 kpmsm_tsbbase = kpm_tsbbase; 12905 kpmsm_tsbsz = kpm_tsbsz; 12906 } else { 12907 /* 12908 * In VAC conflict case, just put the entries in the 12909 * kernel 8K indexed TSB for now so we can find them. 12910 * This could really be changed in the future if we feel 12911 * the need... 12912 */ 12913 kpmsm_tsbbase = ktsb_phys? ktsb_pbase : (uint64_t)ktsb_base; 12914 kpmsm_tsbsz = ktsb_szcode; 12915 kpm_tsbbase = ktsb_phys? ktsb4m_pbase : (uint64_t)ktsb4m_base; 12916 kpm_tsbsz = ktsb4m_szcode; 12917 } 12918 12919 kpmtsbmp = kpmtsbm_area; 12920 for (i = 0; i < NCPU; kpmtsbmp++, i++) { 12921 /* 12922 * Initialize the kpmtsbm area. 12923 * Do this for all possible CPUs as some may be added 12924 * while the system is running. There is no cost to this. 12925 */ 12926 kpmtsbmp->vbase = kpm_vbase; 12927 kpmtsbmp->vend = kpm_vbase + kpm_size * vac_colors; 12928 kpmtsbmp->sz_shift = kpm_size_shift; 12929 kpmtsbmp->kpmp_shift = kpmp_shift; 12930 kpmtsbmp->kpmp2pshft = (uchar_t)kpmp2pshft; 12931 if (kpm_smallpages == 0) { 12932 kpmtsbmp->kpmp_table_sz = kpmp_table_sz; 12933 kpmtsbmp->kpmp_tablepa = va_to_pa(kpmp_table); 12934 } else { 12935 kpmtsbmp->kpmp_table_sz = kpmp_stable_sz; 12936 kpmtsbmp->kpmp_tablepa = va_to_pa(kpmp_stable); 12937 } 12938 kpmtsbmp->msegphashpa = va_to_pa(memseg_phash); 12939 kpmtsbmp->flags = KPMTSBM_ENABLE_FLAG; 12940 #ifdef DEBUG 12941 kpmtsbmp->flags |= (kpm_tsbmtl) ? 
KPMTSBM_TLTSBM_FLAG : 0; 12942 #endif /* DEBUG */ 12943 if (ktsb_phys) 12944 kpmtsbmp->flags |= KPMTSBM_TSBPHYS_FLAG; 12945 } 12946 12947 /* -- End KPM specific init -- */ 12948 } 12949 12950 /* Avoid using sfmmu_tsbinfo_alloc() to avoid kmem_alloc - no real reason */ 12951 struct tsb_info ktsb_info[2]; 12952 12953 /* 12954 * Called from hat_kern_setup() to setup the tsb_info for ksfmmup. 12955 */ 12956 void 12957 sfmmu_init_ktsbinfo() 12958 { 12959 ASSERT(ksfmmup != NULL); 12960 ASSERT(ksfmmup->sfmmu_tsb == NULL); 12961 /* 12962 * Allocate tsbinfos for kernel and copy in data 12963 * to make debug easier and sun4v setup easier. 12964 */ 12965 ktsb_info[0].tsb_sfmmu = ksfmmup; 12966 ktsb_info[0].tsb_szc = ktsb_szcode; 12967 ktsb_info[0].tsb_ttesz_mask = TSB8K|TSB64K|TSB512K; 12968 ktsb_info[0].tsb_va = ktsb_base; 12969 ktsb_info[0].tsb_pa = ktsb_pbase; 12970 ktsb_info[0].tsb_flags = 0; 12971 ktsb_info[0].tsb_tte.ll = 0; 12972 ktsb_info[0].tsb_cache = NULL; 12973 12974 ktsb_info[1].tsb_sfmmu = ksfmmup; 12975 ktsb_info[1].tsb_szc = ktsb4m_szcode; 12976 ktsb_info[1].tsb_ttesz_mask = TSB4M; 12977 ktsb_info[1].tsb_va = ktsb4m_base; 12978 ktsb_info[1].tsb_pa = ktsb4m_pbase; 12979 ktsb_info[1].tsb_flags = 0; 12980 ktsb_info[1].tsb_tte.ll = 0; 12981 ktsb_info[1].tsb_cache = NULL; 12982 12983 /* Link them into ksfmmup. */ 12984 ktsb_info[0].tsb_next = &ktsb_info[1]; 12985 ktsb_info[1].tsb_next = NULL; 12986 ksfmmup->sfmmu_tsb = &ktsb_info[0]; 12987 12988 sfmmu_setup_tsbinfo(ksfmmup); 12989 } 12990 12991 /* 12992 * Cache the last value returned from va_to_pa(). If the VA specified 12993 * in the current call to cached_va_to_pa() maps to the same Page (as the 12994 * previous call to cached_va_to_pa()), then compute the PA using 12995 * cached info, else call va_to_pa(). 12996 * 12997 * Note: this function is neither MT-safe nor consistent in the presence 12998 * of multiple, interleaved threads. This function was created to enable 12999 * an optimization used during boot (at a point when there's only one thread 13000 * executing on the "boot CPU", and before startup_vm() has been called). 13001 */ 13002 static uint64_t 13003 cached_va_to_pa(void *vaddr) 13004 { 13005 static uint64_t prev_vaddr_base = 0; 13006 static uint64_t prev_pfn = 0; 13007 13008 if ((((uint64_t)vaddr) & MMU_PAGEMASK) == prev_vaddr_base) { 13009 return (prev_pfn | ((uint64_t)vaddr & MMU_PAGEOFFSET)); 13010 } else { 13011 uint64_t pa = va_to_pa(vaddr); 13012 13013 if (pa != ((uint64_t)-1)) { 13014 /* 13015 * Computed physical address is valid. Cache its 13016 * related info for the next cached_va_to_pa() call. 13017 */ 13018 prev_pfn = pa & MMU_PAGEMASK; 13019 prev_vaddr_base = ((uint64_t)vaddr) & MMU_PAGEMASK; 13020 } 13021 13022 return (pa); 13023 } 13024 } 13025 13026 /* 13027 * Carve up our nucleus hblk region. We may allocate more hblks than 13028 * asked due to rounding errors but we are guaranteed to have at least 13029 * enough space to allocate the requested number of hblk8's and hblk1's. 
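 *
 * Worked example (illustrative numbers only, not real structure sizes):
 * with size = 1000, hme8blk_sz = 100, hme1blk_sz = 50 and nhblk1 = 4,
 * hblk8_bound = 1000 - 4*50 - 100 = 700, so the first loop carves eight
 * hblk8's (800 bytes) even if nhblk8 was only, say, 7; the second loop
 * then carves exactly the four requested hblk1's from the remaining
 * 200 bytes.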
13030  */
13031 void
13032 sfmmu_init_nucleus_hblks(caddr_t addr, size_t size, int nhblk8, int nhblk1)
13033 {
13034 	struct hme_blk *hmeblkp;
13035 	size_t hme8blk_sz, hme1blk_sz;
13036 	size_t i;
13037 	size_t hblk8_bound;
13038 	ulong_t j = 0, k = 0;
13039 
13040 	ASSERT(addr != NULL && size != 0);
13041 
13042 	/* Need to use proper structure alignment */
13043 	hme8blk_sz = roundup(HME8BLK_SZ, sizeof (int64_t));
13044 	hme1blk_sz = roundup(HME1BLK_SZ, sizeof (int64_t));
13045 
13046 	nucleus_hblk8.list = (void *)addr;
13047 	nucleus_hblk8.index = 0;
13048 
13049 	/*
13050 	 * Use as much memory as possible for hblk8's since we
13051 	 * expect all bop_alloc'ed memory to be allocated in 8k chunks.
13052 	 * We need to hold back enough space for the hblk1's which
13053 	 * we'll allocate next.
13054 	 */
13055 	hblk8_bound = size - (nhblk1 * hme1blk_sz) - hme8blk_sz;
13056 	for (i = 0; i <= hblk8_bound; i += hme8blk_sz, j++) {
13057 		hmeblkp = (struct hme_blk *)addr;
13058 		addr += hme8blk_sz;
13059 		hmeblkp->hblk_nuc_bit = 1;
13060 		hmeblkp->hblk_nextpa = cached_va_to_pa((caddr_t)hmeblkp);
13061 	}
13062 	nucleus_hblk8.len = j;
13063 	ASSERT(j >= nhblk8);
13064 	SFMMU_STAT_ADD(sf_hblk8_ncreate, j);
13065 
13066 	nucleus_hblk1.list = (void *)addr;
13067 	nucleus_hblk1.index = 0;
13068 	for (; i <= (size - hme1blk_sz); i += hme1blk_sz, k++) {
13069 		hmeblkp = (struct hme_blk *)addr;
13070 		addr += hme1blk_sz;
13071 		hmeblkp->hblk_nuc_bit = 1;
13072 		hmeblkp->hblk_nextpa = cached_va_to_pa((caddr_t)hmeblkp);
13073 	}
13074 	ASSERT(k >= nhblk1);
13075 	nucleus_hblk1.len = k;
13076 	SFMMU_STAT_ADD(sf_hblk1_ncreate, k);
13077 }
13078 
13079 /*
13080  * This function is currently not supported on this platform. For what
13081  * it's supposed to do, see hat.c and hat_srmmu.c
13082  */
13083 /* ARGSUSED */
13084 faultcode_t
13085 hat_softlock(struct hat *hat, caddr_t addr, size_t *lenp, page_t **ppp,
13086     uint_t flags)
13087 {
13088 	return (FC_NOSUPPORT);
13089 }
13090 
13091 /*
13092  * Searches the mapping list of the page for a mapping of the same size.
13093  * If none is found, the corresponding bit is cleared in the p_index field.
13094  * When large pages are more prevalent in the system, we can maintain the
13095  * mapping list in order and we won't have to traverse the list each time.
13096  * Just check the next and prev entries; if both are of a different size, we clear the bit.
13097  */
13098 static void
13099 sfmmu_rm_large_mappings(page_t *pp, int ttesz)
13100 {
13101 	struct sf_hment *sfhmep;
13102 	struct hme_blk *hmeblkp;
13103 	int index;
13104 	pgcnt_t npgs;
13105 
13106 	ASSERT(ttesz > TTE8K);
13107 
13108 	ASSERT(sfmmu_mlist_held(pp));
13109 
13110 	ASSERT(PP_ISMAPPED_LARGE(pp));
13111 
13112 	/*
13113 	 * Traverse the mapping list looking for another mapping of the same
13114 	 * size, since we only want to clear the index field if all mappings
13115 	 * of that size are gone.
13116 	 */
13117 
13118 	for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
13119 		if (IS_PAHME(sfhmep))
13120 			continue;
13121 		hmeblkp = sfmmu_hmetohblk(sfhmep);
13122 		if (hme_size(sfhmep) == ttesz) {
13123 			/*
13124 			 * Another mapping of the same size; don't clear the index.
13125 			 */
13126 			return;
13127 		}
13128 	}
13129 
13130 	/*
13131 	 * Clear the p_index bit for large page.
13132 */ 13133 index = PAGESZ_TO_INDEX(ttesz); 13134 npgs = TTEPAGES(ttesz); 13135 while (npgs-- > 0) { 13136 ASSERT(pp->p_index & index); 13137 pp->p_index &= ~index; 13138 pp = PP_PAGENEXT(pp); 13139 } 13140 } 13141 13142 /* 13143 * return supported features 13144 */ 13145 /* ARGSUSED */ 13146 int 13147 hat_supported(enum hat_features feature, void *arg) 13148 { 13149 switch (feature) { 13150 case HAT_SHARED_PT: 13151 case HAT_DYNAMIC_ISM_UNMAP: 13152 case HAT_VMODSORT: 13153 return (1); 13154 case HAT_SHARED_REGIONS: 13155 if (shctx_on) 13156 return (1); 13157 else 13158 return (0); 13159 default: 13160 return (0); 13161 } 13162 } 13163 13164 void 13165 hat_enter(struct hat *hat) 13166 { 13167 hatlock_t *hatlockp; 13168 13169 if (hat != ksfmmup) { 13170 hatlockp = TSB_HASH(hat); 13171 mutex_enter(HATLOCK_MUTEXP(hatlockp)); 13172 } 13173 } 13174 13175 void 13176 hat_exit(struct hat *hat) 13177 { 13178 hatlock_t *hatlockp; 13179 13180 if (hat != ksfmmup) { 13181 hatlockp = TSB_HASH(hat); 13182 mutex_exit(HATLOCK_MUTEXP(hatlockp)); 13183 } 13184 } 13185 13186 /*ARGSUSED*/ 13187 void 13188 hat_reserve(struct as *as, caddr_t addr, size_t len) 13189 { 13190 } 13191 13192 static void 13193 hat_kstat_init(void) 13194 { 13195 kstat_t *ksp; 13196 13197 ksp = kstat_create("unix", 0, "sfmmu_global_stat", "hat", 13198 KSTAT_TYPE_RAW, sizeof (struct sfmmu_global_stat), 13199 KSTAT_FLAG_VIRTUAL); 13200 if (ksp) { 13201 ksp->ks_data = (void *) &sfmmu_global_stat; 13202 kstat_install(ksp); 13203 } 13204 ksp = kstat_create("unix", 0, "sfmmu_tsbsize_stat", "hat", 13205 KSTAT_TYPE_RAW, sizeof (struct sfmmu_tsbsize_stat), 13206 KSTAT_FLAG_VIRTUAL); 13207 if (ksp) { 13208 ksp->ks_data = (void *) &sfmmu_tsbsize_stat; 13209 kstat_install(ksp); 13210 } 13211 ksp = kstat_create("unix", 0, "sfmmu_percpu_stat", "hat", 13212 KSTAT_TYPE_RAW, sizeof (struct sfmmu_percpu_stat) * NCPU, 13213 KSTAT_FLAG_WRITABLE); 13214 if (ksp) { 13215 ksp->ks_update = sfmmu_kstat_percpu_update; 13216 kstat_install(ksp); 13217 } 13218 } 13219 13220 /* ARGSUSED */ 13221 static int 13222 sfmmu_kstat_percpu_update(kstat_t *ksp, int rw) 13223 { 13224 struct sfmmu_percpu_stat *cpu_kstat = ksp->ks_data; 13225 struct tsbmiss *tsbm = tsbmiss_area; 13226 struct kpmtsbm *kpmtsbm = kpmtsbm_area; 13227 int i; 13228 13229 ASSERT(cpu_kstat); 13230 if (rw == KSTAT_READ) { 13231 for (i = 0; i < NCPU; cpu_kstat++, tsbm++, kpmtsbm++, i++) { 13232 cpu_kstat->sf_itlb_misses = 0; 13233 cpu_kstat->sf_dtlb_misses = 0; 13234 cpu_kstat->sf_utsb_misses = tsbm->utsb_misses - 13235 tsbm->uprot_traps; 13236 cpu_kstat->sf_ktsb_misses = tsbm->ktsb_misses + 13237 kpmtsbm->kpm_tsb_misses - tsbm->kprot_traps; 13238 cpu_kstat->sf_tsb_hits = 0; 13239 cpu_kstat->sf_umod_faults = tsbm->uprot_traps; 13240 cpu_kstat->sf_kmod_faults = tsbm->kprot_traps; 13241 } 13242 } else { 13243 /* KSTAT_WRITE is used to clear stats */ 13244 for (i = 0; i < NCPU; tsbm++, kpmtsbm++, i++) { 13245 tsbm->utsb_misses = 0; 13246 tsbm->ktsb_misses = 0; 13247 tsbm->uprot_traps = 0; 13248 tsbm->kprot_traps = 0; 13249 kpmtsbm->kpm_dtlb_misses = 0; 13250 kpmtsbm->kpm_tsb_misses = 0; 13251 } 13252 } 13253 return (0); 13254 } 13255 13256 #ifdef DEBUG 13257 13258 tte_t *gorig[NCPU], *gcur[NCPU], *gnew[NCPU]; 13259 13260 /* 13261 * A tte checker. *orig_old is the value we read before cas. 13262 * *cur is the value returned by cas. 13263 * *new is the desired value when we do the cas. 13264 * 13265 * *hmeblkp is currently unused. 
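 *
 * Sketched usage (illustrative; "cas_tte" stands in for whatever
 * compare-and-swap primitive the caller actually uses):
 *
 *	tte_t orig_old, cur, new;
 *	orig_old = *ttep;		// value read before the cas
 *	new = orig_old;			// ... then modify new as desired
 *	cur = cas_tte(ttep, orig_old, new);	// value the cas returned
 *	chk_tte(&orig_old, &cur, &new, hmeblkp);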
13266 */ 13267 13268 /* ARGSUSED */ 13269 void 13270 chk_tte(tte_t *orig_old, tte_t *cur, tte_t *new, struct hme_blk *hmeblkp) 13271 { 13272 pfn_t i, j, k; 13273 int cpuid = CPU->cpu_id; 13274 13275 gorig[cpuid] = orig_old; 13276 gcur[cpuid] = cur; 13277 gnew[cpuid] = new; 13278 13279 #ifdef lint 13280 hmeblkp = hmeblkp; 13281 #endif 13282 13283 if (TTE_IS_VALID(orig_old)) { 13284 if (TTE_IS_VALID(cur)) { 13285 i = TTE_TO_TTEPFN(orig_old); 13286 j = TTE_TO_TTEPFN(cur); 13287 k = TTE_TO_TTEPFN(new); 13288 if (i != j) { 13289 /* remap error? */ 13290 panic("chk_tte: bad pfn, 0x%lx, 0x%lx", i, j); 13291 } 13292 13293 if (i != k) { 13294 /* remap error? */ 13295 panic("chk_tte: bad pfn2, 0x%lx, 0x%lx", i, k); 13296 } 13297 } else { 13298 if (TTE_IS_VALID(new)) { 13299 panic("chk_tte: invalid cur? "); 13300 } 13301 13302 i = TTE_TO_TTEPFN(orig_old); 13303 k = TTE_TO_TTEPFN(new); 13304 if (i != k) { 13305 panic("chk_tte: bad pfn3, 0x%lx, 0x%lx", i, k); 13306 } 13307 } 13308 } else { 13309 if (TTE_IS_VALID(cur)) { 13310 j = TTE_TO_TTEPFN(cur); 13311 if (TTE_IS_VALID(new)) { 13312 k = TTE_TO_TTEPFN(new); 13313 if (j != k) { 13314 panic("chk_tte: bad pfn4, 0x%lx, 0x%lx", 13315 j, k); 13316 } 13317 } else { 13318 panic("chk_tte: why here?"); 13319 } 13320 } else { 13321 if (!TTE_IS_VALID(new)) { 13322 panic("chk_tte: why here2 ?"); 13323 } 13324 } 13325 } 13326 } 13327 13328 #endif /* DEBUG */ 13329 13330 extern void prefetch_tsbe_read(struct tsbe *); 13331 extern void prefetch_tsbe_write(struct tsbe *); 13332 13333 13334 /* 13335 * We want to prefetch 7 cache lines ahead for our read prefetch. This gives 13336 * us optimal performance on Cheetah+. You can only have 8 outstanding 13337 * prefetches at any one time, so we opted for 7 read prefetches and 1 write 13338 * prefetch to make the most utilization of the prefetch capability. 13339 */ 13340 #define TSBE_PREFETCH_STRIDE (7) 13341 13342 void 13343 sfmmu_copy_tsb(struct tsb_info *old_tsbinfo, struct tsb_info *new_tsbinfo) 13344 { 13345 int old_bytes = TSB_BYTES(old_tsbinfo->tsb_szc); 13346 int new_bytes = TSB_BYTES(new_tsbinfo->tsb_szc); 13347 int old_entries = TSB_ENTRIES(old_tsbinfo->tsb_szc); 13348 int new_entries = TSB_ENTRIES(new_tsbinfo->tsb_szc); 13349 struct tsbe *old; 13350 struct tsbe *new; 13351 struct tsbe *new_base = (struct tsbe *)new_tsbinfo->tsb_va; 13352 uint64_t va; 13353 int new_offset; 13354 int i; 13355 int vpshift; 13356 int last_prefetch; 13357 13358 if (old_bytes == new_bytes) { 13359 bcopy(old_tsbinfo->tsb_va, new_tsbinfo->tsb_va, new_bytes); 13360 } else { 13361 13362 /* 13363 * A TSBE is 16 bytes which means there are four TSBE's per 13364 * P$ line (64 bytes), thus every 4 TSBE's we prefetch. 13365 */ 13366 old = (struct tsbe *)old_tsbinfo->tsb_va; 13367 last_prefetch = old_entries - (4*(TSBE_PREFETCH_STRIDE+1)); 13368 for (i = 0; i < old_entries; i++, old++) { 13369 if (((i & (4-1)) == 0) && (i < last_prefetch)) 13370 prefetch_tsbe_read(old); 13371 if (!old->tte_tag.tag_invalid) { 13372 /* 13373 * We have a valid TTE to remap. Check the 13374 * size. We won't remap 64K or 512K TTEs 13375 * because they span more than one TSB entry 13376 * and are indexed using an 8K virt. page. 13377 * Ditto for 32M and 256M TTEs. 
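			 *
			 * For example, a 64K TTE spans eight 8K virtual
			 * pages, and since the TSB is indexed by 8K virtual
			 * page number the same TTE may have been inserted at
			 * any of eight slots; after a resize there is no
			 * single "new" slot to copy it to, so it is cheaper
			 * to let it refault.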
13378 */ 13379 if (TTE_CSZ(&old->tte_data) == TTE64K || 13380 TTE_CSZ(&old->tte_data) == TTE512K) 13381 continue; 13382 if (mmu_page_sizes == max_mmu_page_sizes) { 13383 if (TTE_CSZ(&old->tte_data) == TTE32M || 13384 TTE_CSZ(&old->tte_data) == TTE256M) 13385 continue; 13386 } 13387 13388 /* clear the lower 22 bits of the va */ 13389 va = *(uint64_t *)old << 22; 13390 /* turn va into a virtual pfn */ 13391 va >>= 22 - TSB_START_SIZE; 13392 /* 13393 * or in bits from the offset in the tsb 13394 * to get the real virtual pfn. These 13395 * correspond to bits [21:13] in the va 13396 */ 13397 vpshift = 13398 TTE_BSZS_SHIFT(TTE_CSZ(&old->tte_data)) & 13399 0x1ff; 13400 va |= (i << vpshift); 13401 va >>= vpshift; 13402 new_offset = va & (new_entries - 1); 13403 new = new_base + new_offset; 13404 prefetch_tsbe_write(new); 13405 *new = *old; 13406 } 13407 } 13408 } 13409 } 13410 13411 /* 13412 * unused in sfmmu 13413 */ 13414 void 13415 hat_dump(void) 13416 { 13417 } 13418 13419 /* 13420 * Called when a thread is exiting and we have switched to the kernel address 13421 * space. Perform the same VM initialization resume() uses when switching 13422 * processes. 13423 * 13424 * Note that sfmmu_load_mmustate() is currently a no-op for kernel threads, but 13425 * we call it anyway in case the semantics change in the future. 13426 */ 13427 /*ARGSUSED*/ 13428 void 13429 hat_thread_exit(kthread_t *thd) 13430 { 13431 uint_t pgsz_cnum; 13432 uint_t pstate_save; 13433 13434 ASSERT(thd->t_procp->p_as == &kas); 13435 13436 pgsz_cnum = KCONTEXT; 13437 #ifdef sun4u 13438 pgsz_cnum |= (ksfmmup->sfmmu_cext << CTXREG_EXT_SHIFT); 13439 #endif 13440 13441 /* 13442 * Note that sfmmu_load_mmustate() is currently a no-op for 13443 * kernel threads. We need to disable interrupts here, 13444 * simply because otherwise sfmmu_load_mmustate() would panic 13445 * if the caller does not disable interrupts. 13446 */ 13447 pstate_save = sfmmu_disable_intrs(); 13448 13449 /* Compatibility Note: hw takes care of MMU_SCONTEXT1 */ 13450 sfmmu_setctx_sec(pgsz_cnum); 13451 sfmmu_load_mmustate(ksfmmup); 13452 sfmmu_enable_intrs(pstate_save); 13453 } 13454 13455 13456 /* 13457 * SRD support 13458 */ 13459 #define SRD_HASH_FUNCTION(vp) (((((uintptr_t)(vp)) >> 4) ^ \ 13460 (((uintptr_t)(vp)) >> 11)) & \ 13461 srd_hashmask) 13462 13463 /* 13464 * Attach the process to the srd struct associated with the exec vnode 13465 * from which the process is started. 
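 *
 * The body uses the usual lookup / allocate / recheck idiom; in outline
 * (illustrative):
 *
 *	search the hash bucket under srdb_lock; if found, bump srd_refcnt
 *	drop the lock and kmem_cache_alloc() a new srd
 *	retake srdb_lock and search again (another thread may have won)
 *	if found now, free our new srd and take a reference on theirs
 *	otherwise link the new srd into the bucket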
13466 */ 13467 void 13468 hat_join_srd(struct hat *sfmmup, vnode_t *evp) 13469 { 13470 uint_t hash = SRD_HASH_FUNCTION(evp); 13471 sf_srd_t *srdp; 13472 sf_srd_t *newsrdp; 13473 13474 ASSERT(sfmmup != ksfmmup); 13475 ASSERT(sfmmup->sfmmu_srdp == NULL); 13476 13477 if (!shctx_on) { 13478 return; 13479 } 13480 13481 VN_HOLD(evp); 13482 13483 if (srd_buckets[hash].srdb_srdp != NULL) { 13484 mutex_enter(&srd_buckets[hash].srdb_lock); 13485 for (srdp = srd_buckets[hash].srdb_srdp; srdp != NULL; 13486 srdp = srdp->srd_hash) { 13487 if (srdp->srd_evp == evp) { 13488 ASSERT(srdp->srd_refcnt >= 0); 13489 sfmmup->sfmmu_srdp = srdp; 13490 atomic_inc_32( 13491 (volatile uint_t *)&srdp->srd_refcnt); 13492 mutex_exit(&srd_buckets[hash].srdb_lock); 13493 return; 13494 } 13495 } 13496 mutex_exit(&srd_buckets[hash].srdb_lock); 13497 } 13498 newsrdp = kmem_cache_alloc(srd_cache, KM_SLEEP); 13499 ASSERT(newsrdp->srd_next_ismrid == 0 && newsrdp->srd_next_hmerid == 0); 13500 13501 newsrdp->srd_evp = evp; 13502 newsrdp->srd_refcnt = 1; 13503 newsrdp->srd_hmergnfree = NULL; 13504 newsrdp->srd_ismrgnfree = NULL; 13505 13506 mutex_enter(&srd_buckets[hash].srdb_lock); 13507 for (srdp = srd_buckets[hash].srdb_srdp; srdp != NULL; 13508 srdp = srdp->srd_hash) { 13509 if (srdp->srd_evp == evp) { 13510 ASSERT(srdp->srd_refcnt >= 0); 13511 sfmmup->sfmmu_srdp = srdp; 13512 atomic_inc_32((volatile uint_t *)&srdp->srd_refcnt); 13513 mutex_exit(&srd_buckets[hash].srdb_lock); 13514 kmem_cache_free(srd_cache, newsrdp); 13515 return; 13516 } 13517 } 13518 newsrdp->srd_hash = srd_buckets[hash].srdb_srdp; 13519 srd_buckets[hash].srdb_srdp = newsrdp; 13520 sfmmup->sfmmu_srdp = newsrdp; 13521 13522 mutex_exit(&srd_buckets[hash].srdb_lock); 13523 13524 } 13525 13526 static void 13527 sfmmu_leave_srd(sfmmu_t *sfmmup) 13528 { 13529 vnode_t *evp; 13530 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 13531 uint_t hash; 13532 sf_srd_t **prev_srdpp; 13533 sf_region_t *rgnp; 13534 sf_region_t *nrgnp; 13535 #ifdef DEBUG 13536 int rgns = 0; 13537 #endif 13538 int i; 13539 13540 ASSERT(sfmmup != ksfmmup); 13541 ASSERT(srdp != NULL); 13542 ASSERT(srdp->srd_refcnt > 0); 13543 ASSERT(sfmmup->sfmmu_scdp == NULL); 13544 ASSERT(sfmmup->sfmmu_free == 1); 13545 13546 sfmmup->sfmmu_srdp = NULL; 13547 evp = srdp->srd_evp; 13548 ASSERT(evp != NULL); 13549 if (atomic_dec_32_nv((volatile uint_t *)&srdp->srd_refcnt)) { 13550 VN_RELE(evp); 13551 return; 13552 } 13553 13554 hash = SRD_HASH_FUNCTION(evp); 13555 mutex_enter(&srd_buckets[hash].srdb_lock); 13556 for (prev_srdpp = &srd_buckets[hash].srdb_srdp; 13557 (srdp = *prev_srdpp) != NULL; prev_srdpp = &srdp->srd_hash) { 13558 if (srdp->srd_evp == evp) { 13559 break; 13560 } 13561 } 13562 if (srdp == NULL || srdp->srd_refcnt) { 13563 mutex_exit(&srd_buckets[hash].srdb_lock); 13564 VN_RELE(evp); 13565 return; 13566 } 13567 *prev_srdpp = srdp->srd_hash; 13568 mutex_exit(&srd_buckets[hash].srdb_lock); 13569 13570 ASSERT(srdp->srd_refcnt == 0); 13571 VN_RELE(evp); 13572 13573 #ifdef DEBUG 13574 for (i = 0; i < SFMMU_MAX_REGION_BUCKETS; i++) { 13575 ASSERT(srdp->srd_rgnhash[i] == NULL); 13576 } 13577 #endif /* DEBUG */ 13578 13579 /* free each hme regions in the srd */ 13580 for (rgnp = srdp->srd_hmergnfree; rgnp != NULL; rgnp = nrgnp) { 13581 nrgnp = rgnp->rgn_next; 13582 ASSERT(rgnp->rgn_id < srdp->srd_next_hmerid); 13583 ASSERT(rgnp->rgn_refcnt == 0); 13584 ASSERT(rgnp->rgn_sfmmu_head == NULL); 13585 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE); 13586 ASSERT(rgnp->rgn_hmeflags == 0); 13587 
ASSERT(srdp->srd_hmergnp[rgnp->rgn_id] == rgnp); 13588 #ifdef DEBUG 13589 for (i = 0; i < MMU_PAGE_SIZES; i++) { 13590 ASSERT(rgnp->rgn_ttecnt[i] == 0); 13591 } 13592 rgns++; 13593 #endif /* DEBUG */ 13594 kmem_cache_free(region_cache, rgnp); 13595 } 13596 ASSERT(rgns == srdp->srd_next_hmerid); 13597 13598 #ifdef DEBUG 13599 rgns = 0; 13600 #endif 13601 /* free each ism rgns in the srd */ 13602 for (rgnp = srdp->srd_ismrgnfree; rgnp != NULL; rgnp = nrgnp) { 13603 nrgnp = rgnp->rgn_next; 13604 ASSERT(rgnp->rgn_id < srdp->srd_next_ismrid); 13605 ASSERT(rgnp->rgn_refcnt == 0); 13606 ASSERT(rgnp->rgn_sfmmu_head == NULL); 13607 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE); 13608 ASSERT(srdp->srd_ismrgnp[rgnp->rgn_id] == rgnp); 13609 #ifdef DEBUG 13610 for (i = 0; i < MMU_PAGE_SIZES; i++) { 13611 ASSERT(rgnp->rgn_ttecnt[i] == 0); 13612 } 13613 rgns++; 13614 #endif /* DEBUG */ 13615 kmem_cache_free(region_cache, rgnp); 13616 } 13617 ASSERT(rgns == srdp->srd_next_ismrid); 13618 ASSERT(srdp->srd_ismbusyrgns == 0); 13619 ASSERT(srdp->srd_hmebusyrgns == 0); 13620 13621 srdp->srd_next_ismrid = 0; 13622 srdp->srd_next_hmerid = 0; 13623 13624 bzero((void *)srdp->srd_ismrgnp, 13625 sizeof (sf_region_t *) * SFMMU_MAX_ISM_REGIONS); 13626 bzero((void *)srdp->srd_hmergnp, 13627 sizeof (sf_region_t *) * SFMMU_MAX_HME_REGIONS); 13628 13629 ASSERT(srdp->srd_scdp == NULL); 13630 kmem_cache_free(srd_cache, srdp); 13631 } 13632 13633 /* ARGSUSED */ 13634 static int 13635 sfmmu_srdcache_constructor(void *buf, void *cdrarg, int kmflags) 13636 { 13637 sf_srd_t *srdp = (sf_srd_t *)buf; 13638 bzero(buf, sizeof (*srdp)); 13639 13640 mutex_init(&srdp->srd_mutex, NULL, MUTEX_DEFAULT, NULL); 13641 mutex_init(&srdp->srd_scd_mutex, NULL, MUTEX_DEFAULT, NULL); 13642 return (0); 13643 } 13644 13645 /* ARGSUSED */ 13646 static void 13647 sfmmu_srdcache_destructor(void *buf, void *cdrarg) 13648 { 13649 sf_srd_t *srdp = (sf_srd_t *)buf; 13650 13651 mutex_destroy(&srdp->srd_mutex); 13652 mutex_destroy(&srdp->srd_scd_mutex); 13653 } 13654 13655 /* 13656 * The caller makes sure hat_join_region()/hat_leave_region() can't be called 13657 * at the same time for the same process and address range. This is ensured by 13658 * the fact that address space is locked as writer when a process joins the 13659 * regions. Therefore there's no need to hold an srd lock during the entire 13660 * execution of hat_join_region()/hat_leave_region(). 13661 */ 13662 13663 #define RGN_HASH_FUNCTION(obj) (((((uintptr_t)(obj)) >> 4) ^ \ 13664 (((uintptr_t)(obj)) >> 11)) & \ 13665 srd_rgn_hashmask) 13666 /* 13667 * This routine implements the shared context functionality required when 13668 * attaching a segment to an address space. It must be called from 13669 * hat_share() for D(ISM) segments and from segvn_create() for segments 13670 * with the MAP_PRIVATE and MAP_TEXT flags set. It returns a region_cookie 13671 * which is saved in the private segment data for hme segments and 13672 * the ism_map structure for ism segments. 
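 *
 * Caller-side sketch (illustrative; the fields shown are hypothetical):
 *
 *	cookie = hat_join_region(hat, seg->s_base, seg->s_size, r_obj,
 *	    r_objoff, r_perm, r_pgszc, NULL, HAT_REGION_TEXT);
 *	if (cookie == HAT_INVALID_REGION_COOKIE) {
 *		// no shared region available: fall back to ordinary
 *		// per-process mappings
 *	} else {
 *		svd->rcookie = cookie;	// hypothetical segment field
 *	}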
13673 */ 13674 hat_region_cookie_t 13675 hat_join_region(struct hat *sfmmup, 13676 caddr_t r_saddr, 13677 size_t r_size, 13678 void *r_obj, 13679 u_offset_t r_objoff, 13680 uchar_t r_perm, 13681 uchar_t r_pgszc, 13682 hat_rgn_cb_func_t r_cb_function, 13683 uint_t flags) 13684 { 13685 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 13686 uint_t rhash; 13687 uint_t rid; 13688 hatlock_t *hatlockp; 13689 sf_region_t *rgnp; 13690 sf_region_t *new_rgnp = NULL; 13691 int i; 13692 uint16_t *nextidp; 13693 sf_region_t **freelistp; 13694 int maxids; 13695 sf_region_t **rarrp; 13696 uint16_t *busyrgnsp; 13697 ulong_t rttecnt; 13698 uchar_t tteflag; 13699 uchar_t r_type = flags & HAT_REGION_TYPE_MASK; 13700 int text = (r_type == HAT_REGION_TEXT); 13701 13702 if (srdp == NULL || r_size == 0) { 13703 return (HAT_INVALID_REGION_COOKIE); 13704 } 13705 13706 ASSERT(sfmmup != ksfmmup); 13707 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 13708 ASSERT(srdp->srd_refcnt > 0); 13709 ASSERT(!(flags & ~HAT_REGION_TYPE_MASK)); 13710 ASSERT(flags == HAT_REGION_TEXT || flags == HAT_REGION_ISM); 13711 ASSERT(r_pgszc < mmu_page_sizes); 13712 if (!IS_P2ALIGNED(r_saddr, TTEBYTES(r_pgszc)) || 13713 !IS_P2ALIGNED(r_size, TTEBYTES(r_pgszc))) { 13714 panic("hat_join_region: region addr or size is not aligned\n"); 13715 } 13716 13717 13718 r_type = (r_type == HAT_REGION_ISM) ? SFMMU_REGION_ISM : 13719 SFMMU_REGION_HME; 13720 /* 13721 * Currently only support shared hmes for the read only main text 13722 * region. 13723 */ 13724 if (r_type == SFMMU_REGION_HME && ((r_obj != srdp->srd_evp) || 13725 (r_perm & PROT_WRITE))) { 13726 return (HAT_INVALID_REGION_COOKIE); 13727 } 13728 13729 rhash = RGN_HASH_FUNCTION(r_obj); 13730 13731 if (r_type == SFMMU_REGION_ISM) { 13732 nextidp = &srdp->srd_next_ismrid; 13733 freelistp = &srdp->srd_ismrgnfree; 13734 maxids = SFMMU_MAX_ISM_REGIONS; 13735 rarrp = srdp->srd_ismrgnp; 13736 busyrgnsp = &srdp->srd_ismbusyrgns; 13737 } else { 13738 nextidp = &srdp->srd_next_hmerid; 13739 freelistp = &srdp->srd_hmergnfree; 13740 maxids = SFMMU_MAX_HME_REGIONS; 13741 rarrp = srdp->srd_hmergnp; 13742 busyrgnsp = &srdp->srd_hmebusyrgns; 13743 } 13744 13745 mutex_enter(&srdp->srd_mutex); 13746 13747 for (rgnp = srdp->srd_rgnhash[rhash]; rgnp != NULL; 13748 rgnp = rgnp->rgn_hash) { 13749 if (rgnp->rgn_saddr == r_saddr && rgnp->rgn_size == r_size && 13750 rgnp->rgn_obj == r_obj && rgnp->rgn_objoff == r_objoff && 13751 rgnp->rgn_perm == r_perm && rgnp->rgn_pgszc == r_pgszc) { 13752 break; 13753 } 13754 } 13755 13756 rfound: 13757 if (rgnp != NULL) { 13758 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type); 13759 ASSERT(rgnp->rgn_cb_function == r_cb_function); 13760 ASSERT(rgnp->rgn_refcnt >= 0); 13761 rid = rgnp->rgn_id; 13762 ASSERT(rid < maxids); 13763 ASSERT(rarrp[rid] == rgnp); 13764 ASSERT(rid < *nextidp); 13765 atomic_inc_32((volatile uint_t *)&rgnp->rgn_refcnt); 13766 mutex_exit(&srdp->srd_mutex); 13767 if (new_rgnp != NULL) { 13768 kmem_cache_free(region_cache, new_rgnp); 13769 } 13770 if (r_type == SFMMU_REGION_HME) { 13771 int myjoin = 13772 (sfmmup == astosfmmu(curthread->t_procp->p_as)); 13773 13774 sfmmu_link_to_hmeregion(sfmmup, rgnp); 13775 /* 13776 * bitmap should be updated after linking sfmmu on 13777 * region list so that pageunload() doesn't skip 13778 * TSB/TLB flush. As soon as bitmap is updated another 13779 * thread in this process can already start accessing 13780 * this region. 
13781 */ 13782 /* 13783 * Normally ttecnt accounting is done as part of 13784 * pagefault handling. But a process may not take any 13785 * pagefaults on shared hmeblks created by some other 13786 * process. To compensate for this assume that the 13787 * entire region will end up faulted in using 13788 * the region's pagesize. 13789 * 13790 */ 13791 if (r_pgszc > TTE8K) { 13792 tteflag = 1 << r_pgszc; 13793 if (disable_large_pages & tteflag) { 13794 tteflag = 0; 13795 } 13796 } else { 13797 tteflag = 0; 13798 } 13799 if (tteflag && !(sfmmup->sfmmu_rtteflags & tteflag)) { 13800 hatlockp = sfmmu_hat_enter(sfmmup); 13801 sfmmup->sfmmu_rtteflags |= tteflag; 13802 sfmmu_hat_exit(hatlockp); 13803 } 13804 hatlockp = sfmmu_hat_enter(sfmmup); 13805 13806 /* 13807 * Preallocate 1/4 of ttecnt's in 8K TSB for >= 4M 13808 * region to allow for large page allocation failure. 13809 */ 13810 if (r_pgszc >= TTE4M) { 13811 sfmmup->sfmmu_tsb0_4minflcnt += 13812 r_size >> (TTE_PAGE_SHIFT(TTE8K) + 2); 13813 } 13814 13815 /* update sfmmu_ttecnt with the shme rgn ttecnt */ 13816 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc); 13817 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc], 13818 rttecnt); 13819 13820 if (text && r_pgszc >= TTE4M && 13821 (tteflag || ((disable_large_pages >> TTE4M) & 13822 ((1 << (r_pgszc - TTE4M + 1)) - 1))) && 13823 !SFMMU_FLAGS_ISSET(sfmmup, HAT_4MTEXT_FLAG)) { 13824 SFMMU_FLAGS_SET(sfmmup, HAT_4MTEXT_FLAG); 13825 } 13826 13827 sfmmu_hat_exit(hatlockp); 13828 /* 13829 * On Panther we need to make sure TLB is programmed 13830 * to accept 32M/256M pages. Call 13831 * sfmmu_check_page_sizes() now to make sure TLB is 13832 * setup before making hmeregions visible to other 13833 * threads. 13834 */ 13835 sfmmu_check_page_sizes(sfmmup, 1); 13836 hatlockp = sfmmu_hat_enter(sfmmup); 13837 SF_RGNMAP_ADD(sfmmup->sfmmu_hmeregion_map, rid); 13838 13839 /* 13840 * if context is invalid tsb miss exception code will 13841 * call sfmmu_check_page_sizes() and update tsbmiss 13842 * area later. 13843 */ 13844 kpreempt_disable(); 13845 if (myjoin && 13846 (sfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum 13847 != INVALID_CONTEXT)) { 13848 struct tsbmiss *tsbmp; 13849 13850 tsbmp = &tsbmiss_area[CPU->cpu_id]; 13851 ASSERT(sfmmup == tsbmp->usfmmup); 13852 BT_SET(tsbmp->shmermap, rid); 13853 if (r_pgszc > TTE64K) { 13854 tsbmp->uhat_rtteflags |= tteflag; 13855 } 13856 13857 } 13858 kpreempt_enable(); 13859 13860 sfmmu_hat_exit(hatlockp); 13861 ASSERT((hat_region_cookie_t)((uint64_t)rid) != 13862 HAT_INVALID_REGION_COOKIE); 13863 } else { 13864 hatlockp = sfmmu_hat_enter(sfmmup); 13865 SF_RGNMAP_ADD(sfmmup->sfmmu_ismregion_map, rid); 13866 sfmmu_hat_exit(hatlockp); 13867 } 13868 ASSERT(rid < maxids); 13869 13870 if (r_type == SFMMU_REGION_ISM) { 13871 sfmmu_find_scd(sfmmup); 13872 } 13873 return ((hat_region_cookie_t)((uint64_t)rid)); 13874 } 13875 13876 ASSERT(new_rgnp == NULL); 13877 13878 if (*busyrgnsp >= maxids) { 13879 mutex_exit(&srdp->srd_mutex); 13880 return (HAT_INVALID_REGION_COOKIE); 13881 } 13882 13883 ASSERT(MUTEX_HELD(&srdp->srd_mutex)); 13884 if (*freelistp != NULL) { 13885 rgnp = *freelistp; 13886 *freelistp = rgnp->rgn_next; 13887 ASSERT(rgnp->rgn_id < *nextidp); 13888 ASSERT(rgnp->rgn_id < maxids); 13889 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE); 13890 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) 13891 == r_type); 13892 ASSERT(rarrp[rgnp->rgn_id] == rgnp); 13893 ASSERT(rgnp->rgn_hmeflags == 0); 13894 } else { 13895 /* 13896 * release local locks before memory allocation. 
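 * kmem_cache_alloc(..., KM_SLEEP) may block, so srd_mutex cannot be
 * held across the allocation.  Because the mutex is dropped, another
 * thread may insert an identical region in the meantime; the hash
 * chain is therefore searched again after reacquiring the mutex and,
 * on a hit, control jumps back to rfound, where the freshly allocated
 * region structure is freed.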
13897 */ 13898 mutex_exit(&srdp->srd_mutex); 13899 13900 new_rgnp = kmem_cache_alloc(region_cache, KM_SLEEP); 13901 13902 mutex_enter(&srdp->srd_mutex); 13903 for (rgnp = srdp->srd_rgnhash[rhash]; rgnp != NULL; 13904 rgnp = rgnp->rgn_hash) { 13905 if (rgnp->rgn_saddr == r_saddr && 13906 rgnp->rgn_size == r_size && 13907 rgnp->rgn_obj == r_obj && 13908 rgnp->rgn_objoff == r_objoff && 13909 rgnp->rgn_perm == r_perm && 13910 rgnp->rgn_pgszc == r_pgszc) { 13911 break; 13912 } 13913 } 13914 if (rgnp != NULL) { 13915 goto rfound; 13916 } 13917 13918 if (*nextidp >= maxids) { 13919 mutex_exit(&srdp->srd_mutex); 13920 goto fail; 13921 } 13922 rgnp = new_rgnp; 13923 new_rgnp = NULL; 13924 rgnp->rgn_id = (*nextidp)++; 13925 ASSERT(rgnp->rgn_id < maxids); 13926 ASSERT(rarrp[rgnp->rgn_id] == NULL); 13927 rarrp[rgnp->rgn_id] = rgnp; 13928 } 13929 13930 ASSERT(rgnp->rgn_sfmmu_head == NULL); 13931 ASSERT(rgnp->rgn_hmeflags == 0); 13932 #ifdef DEBUG 13933 for (i = 0; i < MMU_PAGE_SIZES; i++) { 13934 ASSERT(rgnp->rgn_ttecnt[i] == 0); 13935 } 13936 #endif 13937 rgnp->rgn_saddr = r_saddr; 13938 rgnp->rgn_size = r_size; 13939 rgnp->rgn_obj = r_obj; 13940 rgnp->rgn_objoff = r_objoff; 13941 rgnp->rgn_perm = r_perm; 13942 rgnp->rgn_pgszc = r_pgszc; 13943 rgnp->rgn_flags = r_type; 13944 rgnp->rgn_refcnt = 0; 13945 rgnp->rgn_cb_function = r_cb_function; 13946 rgnp->rgn_hash = srdp->srd_rgnhash[rhash]; 13947 srdp->srd_rgnhash[rhash] = rgnp; 13948 (*busyrgnsp)++; 13949 ASSERT(*busyrgnsp <= maxids); 13950 goto rfound; 13951 13952 fail: 13953 ASSERT(new_rgnp != NULL); 13954 kmem_cache_free(region_cache, new_rgnp); 13955 return (HAT_INVALID_REGION_COOKIE); 13956 } 13957 13958 /* 13959 * This function implements the shared context functionality required 13960 * when detaching a segment from an address space. It must be called 13961 * from hat_unshare() for all D(ISM) segments and from segvn_unmap(), 13962 * for segments with a valid region_cookie. 13963 * It will also be called from all seg_vn routines which change a 13964 * segment's attributes such as segvn_setprot(), segvn_setpagesize(), 13965 * segvn_clrszc() & segvn_advise(), as well as in the case of COW fault 13966 * from segvn_fault(). 13967 */ 13968 void 13969 hat_leave_region(struct hat *sfmmup, hat_region_cookie_t rcookie, uint_t flags) 13970 { 13971 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 13972 sf_scd_t *scdp; 13973 uint_t rhash; 13974 uint_t rid = (uint_t)((uint64_t)rcookie); 13975 hatlock_t *hatlockp = NULL; 13976 sf_region_t *rgnp; 13977 sf_region_t **prev_rgnpp; 13978 sf_region_t *cur_rgnp; 13979 void *r_obj; 13980 int i; 13981 caddr_t r_saddr; 13982 caddr_t r_eaddr; 13983 size_t r_size; 13984 uchar_t r_pgszc; 13985 uchar_t r_type = flags & HAT_REGION_TYPE_MASK; 13986 13987 ASSERT(sfmmup != ksfmmup); 13988 ASSERT(srdp != NULL); 13989 ASSERT(srdp->srd_refcnt > 0); 13990 ASSERT(!(flags & ~HAT_REGION_TYPE_MASK)); 13991 ASSERT(flags == HAT_REGION_TEXT || flags == HAT_REGION_ISM); 13992 ASSERT(!sfmmup->sfmmu_free || sfmmup->sfmmu_scdp == NULL); 13993 13994 r_type = (r_type == HAT_REGION_ISM) ? 
SFMMU_REGION_ISM : 13995 SFMMU_REGION_HME; 13996 13997 if (r_type == SFMMU_REGION_ISM) { 13998 ASSERT(SFMMU_IS_ISMRID_VALID(rid)); 13999 ASSERT(rid < SFMMU_MAX_ISM_REGIONS); 14000 rgnp = srdp->srd_ismrgnp[rid]; 14001 } else { 14002 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 14003 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 14004 rgnp = srdp->srd_hmergnp[rid]; 14005 } 14006 ASSERT(rgnp != NULL); 14007 ASSERT(rgnp->rgn_id == rid); 14008 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type); 14009 ASSERT(!(rgnp->rgn_flags & SFMMU_REGION_FREE)); 14010 ASSERT(AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 14011 14012 if (sfmmup->sfmmu_free) { 14013 ulong_t rttecnt; 14014 r_pgszc = rgnp->rgn_pgszc; 14015 r_size = rgnp->rgn_size; 14016 14017 ASSERT(sfmmup->sfmmu_scdp == NULL); 14018 if (r_type == SFMMU_REGION_ISM) { 14019 SF_RGNMAP_DEL(sfmmup->sfmmu_ismregion_map, rid); 14020 } else { 14021 /* update shme rgns ttecnt in sfmmu_ttecnt */ 14022 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc); 14023 ASSERT(sfmmup->sfmmu_ttecnt[r_pgszc] >= rttecnt); 14024 14025 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc], 14026 -rttecnt); 14027 14028 SF_RGNMAP_DEL(sfmmup->sfmmu_hmeregion_map, rid); 14029 } 14030 } else if (r_type == SFMMU_REGION_ISM) { 14031 hatlockp = sfmmu_hat_enter(sfmmup); 14032 ASSERT(rid < srdp->srd_next_ismrid); 14033 SF_RGNMAP_DEL(sfmmup->sfmmu_ismregion_map, rid); 14034 scdp = sfmmup->sfmmu_scdp; 14035 if (scdp != NULL && 14036 SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid)) { 14037 sfmmu_leave_scd(sfmmup, r_type); 14038 ASSERT(sfmmu_hat_lock_held(sfmmup)); 14039 } 14040 sfmmu_hat_exit(hatlockp); 14041 } else { 14042 ulong_t rttecnt; 14043 r_pgszc = rgnp->rgn_pgszc; 14044 r_saddr = rgnp->rgn_saddr; 14045 r_size = rgnp->rgn_size; 14046 r_eaddr = r_saddr + r_size; 14047 14048 ASSERT(r_type == SFMMU_REGION_HME); 14049 hatlockp = sfmmu_hat_enter(sfmmup); 14050 ASSERT(rid < srdp->srd_next_hmerid); 14051 SF_RGNMAP_DEL(sfmmup->sfmmu_hmeregion_map, rid); 14052 14053 /* 14054 * If region is part of an SCD call sfmmu_leave_scd(). 14055 * Otherwise if process is not exiting and has valid context 14056 * just drop the context on the floor to lose stale TLB 14057 * entries and force the update of tsb miss area to reflect 14058 * the new region map. After that clean our TSB entries. 14059 */ 14060 scdp = sfmmup->sfmmu_scdp; 14061 if (scdp != NULL && 14062 SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) { 14063 sfmmu_leave_scd(sfmmup, r_type); 14064 ASSERT(sfmmu_hat_lock_held(sfmmup)); 14065 } 14066 sfmmu_invalidate_ctx(sfmmup); 14067 14068 i = TTE8K; 14069 while (i < mmu_page_sizes) { 14070 if (rgnp->rgn_ttecnt[i] != 0) { 14071 sfmmu_unload_tsb_range(sfmmup, r_saddr, 14072 r_eaddr, i); 14073 if (i < TTE4M) { 14074 i = TTE4M; 14075 continue; 14076 } else { 14077 break; 14078 } 14079 } 14080 i++; 14081 } 14082 /* Remove the preallocated 1/4 8k ttecnt for 4M regions. 
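 * The amount removed mirrors the preallocation done at join time:
 *
 *	rttecnt = r_size >> (TTE_PAGE_SHIFT(TTE8K) + 2)
 *
 * i.e. one quarter of the region's 8K page count.  For example, a 4M
 * region spans 512 8K pages, so 128 entries were reserved in tsb0.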
*/ 14083 if (r_pgszc >= TTE4M) { 14084 rttecnt = r_size >> (TTE_PAGE_SHIFT(TTE8K) + 2); 14085 ASSERT(sfmmup->sfmmu_tsb0_4minflcnt >= 14086 rttecnt); 14087 sfmmup->sfmmu_tsb0_4minflcnt -= rttecnt; 14088 } 14089 14090 /* update shme rgns ttecnt in sfmmu_ttecnt */ 14091 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc); 14092 ASSERT(sfmmup->sfmmu_ttecnt[r_pgszc] >= rttecnt); 14093 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc], -rttecnt); 14094 14095 sfmmu_hat_exit(hatlockp); 14096 if (scdp != NULL && sfmmup->sfmmu_scdp == NULL) { 14097 /* sfmmup left the scd, grow private tsb */ 14098 sfmmu_check_page_sizes(sfmmup, 1); 14099 } else { 14100 sfmmu_check_page_sizes(sfmmup, 0); 14101 } 14102 } 14103 14104 if (r_type == SFMMU_REGION_HME) { 14105 sfmmu_unlink_from_hmeregion(sfmmup, rgnp); 14106 } 14107 14108 r_obj = rgnp->rgn_obj; 14109 if (atomic_dec_32_nv((volatile uint_t *)&rgnp->rgn_refcnt)) { 14110 return; 14111 } 14112 14113 /* 14114 * looks like nobody uses this region anymore. Free it. 14115 */ 14116 rhash = RGN_HASH_FUNCTION(r_obj); 14117 mutex_enter(&srdp->srd_mutex); 14118 for (prev_rgnpp = &srdp->srd_rgnhash[rhash]; 14119 (cur_rgnp = *prev_rgnpp) != NULL; 14120 prev_rgnpp = &cur_rgnp->rgn_hash) { 14121 if (cur_rgnp == rgnp && cur_rgnp->rgn_refcnt == 0) { 14122 break; 14123 } 14124 } 14125 14126 if (cur_rgnp == NULL) { 14127 mutex_exit(&srdp->srd_mutex); 14128 return; 14129 } 14130 14131 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type); 14132 *prev_rgnpp = rgnp->rgn_hash; 14133 if (r_type == SFMMU_REGION_ISM) { 14134 rgnp->rgn_flags |= SFMMU_REGION_FREE; 14135 ASSERT(rid < srdp->srd_next_ismrid); 14136 rgnp->rgn_next = srdp->srd_ismrgnfree; 14137 srdp->srd_ismrgnfree = rgnp; 14138 ASSERT(srdp->srd_ismbusyrgns > 0); 14139 srdp->srd_ismbusyrgns--; 14140 mutex_exit(&srdp->srd_mutex); 14141 return; 14142 } 14143 mutex_exit(&srdp->srd_mutex); 14144 14145 /* 14146 * Destroy region's hmeblks. 14147 */ 14148 sfmmu_unload_hmeregion(srdp, rgnp); 14149 14150 rgnp->rgn_hmeflags = 0; 14151 14152 ASSERT(rgnp->rgn_sfmmu_head == NULL); 14153 ASSERT(rgnp->rgn_id == rid); 14154 for (i = 0; i < MMU_PAGE_SIZES; i++) { 14155 rgnp->rgn_ttecnt[i] = 0; 14156 } 14157 rgnp->rgn_flags |= SFMMU_REGION_FREE; 14158 mutex_enter(&srdp->srd_mutex); 14159 ASSERT(rid < srdp->srd_next_hmerid); 14160 rgnp->rgn_next = srdp->srd_hmergnfree; 14161 srdp->srd_hmergnfree = rgnp; 14162 ASSERT(srdp->srd_hmebusyrgns > 0); 14163 srdp->srd_hmebusyrgns--; 14164 mutex_exit(&srdp->srd_mutex); 14165 } 14166 14167 /* 14168 * For now only called for hmeblk regions and not for ISM regions. 
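 *
 * A duplicating caller (sketch only; typically a segment driver's dup
 * routine during fork, with newas/svd as hypothetical locals) would do:
 *
 *	if (svd->rcookie != HAT_INVALID_REGION_COOKIE)
 *		hat_dup_region(newas->a_hat, svd->rcookie);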
14169 */ 14170 void 14171 hat_dup_region(struct hat *sfmmup, hat_region_cookie_t rcookie) 14172 { 14173 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 14174 uint_t rid = (uint_t)((uint64_t)rcookie); 14175 sf_region_t *rgnp; 14176 sf_rgn_link_t *rlink; 14177 sf_rgn_link_t *hrlink; 14178 ulong_t rttecnt; 14179 14180 ASSERT(sfmmup != ksfmmup); 14181 ASSERT(srdp != NULL); 14182 ASSERT(srdp->srd_refcnt > 0); 14183 14184 ASSERT(rid < srdp->srd_next_hmerid); 14185 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 14186 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 14187 14188 rgnp = srdp->srd_hmergnp[rid]; 14189 ASSERT(rgnp->rgn_refcnt > 0); 14190 ASSERT(rgnp->rgn_id == rid); 14191 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == SFMMU_REGION_HME); 14192 ASSERT(!(rgnp->rgn_flags & SFMMU_REGION_FREE)); 14193 14194 atomic_inc_32((volatile uint_t *)&rgnp->rgn_refcnt); 14195 14196 /* LINTED: constant in conditional context */ 14197 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 1, 0); 14198 ASSERT(rlink != NULL); 14199 mutex_enter(&rgnp->rgn_mutex); 14200 ASSERT(rgnp->rgn_sfmmu_head != NULL); 14201 /* LINTED: constant in conditional context */ 14202 SFMMU_HMERID2RLINKP(rgnp->rgn_sfmmu_head, rid, hrlink, 0, 0); 14203 ASSERT(hrlink != NULL); 14204 ASSERT(hrlink->prev == NULL); 14205 rlink->next = rgnp->rgn_sfmmu_head; 14206 rlink->prev = NULL; 14207 hrlink->prev = sfmmup; 14208 /* 14209 * make sure rlink's next field is correct 14210 * before making this link visible. 14211 */ 14212 membar_stst(); 14213 rgnp->rgn_sfmmu_head = sfmmup; 14214 mutex_exit(&rgnp->rgn_mutex); 14215 14216 /* update sfmmu_ttecnt with the shme rgn ttecnt */ 14217 rttecnt = rgnp->rgn_size >> TTE_PAGE_SHIFT(rgnp->rgn_pgszc); 14218 atomic_add_long(&sfmmup->sfmmu_ttecnt[rgnp->rgn_pgszc], rttecnt); 14219 /* update tsb0 inflation count */ 14220 if (rgnp->rgn_pgszc >= TTE4M) { 14221 sfmmup->sfmmu_tsb0_4minflcnt += 14222 rgnp->rgn_size >> (TTE_PAGE_SHIFT(TTE8K) + 2); 14223 } 14224 /* 14225 * Update regionid bitmask without hat lock since no other thread 14226 * can update this region bitmask right now. 
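 * (The assumption is that this is only reached while the duplicated
 * address space is still under construction, so no other thread can
 * yet be operating on this hat.)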
14227 */ 14228 SF_RGNMAP_ADD(sfmmup->sfmmu_hmeregion_map, rid); 14229 } 14230 14231 /* ARGSUSED */ 14232 static int 14233 sfmmu_rgncache_constructor(void *buf, void *cdrarg, int kmflags) 14234 { 14235 sf_region_t *rgnp = (sf_region_t *)buf; 14236 bzero(buf, sizeof (*rgnp)); 14237 14238 mutex_init(&rgnp->rgn_mutex, NULL, MUTEX_DEFAULT, NULL); 14239 14240 return (0); 14241 } 14242 14243 /* ARGSUSED */ 14244 static void 14245 sfmmu_rgncache_destructor(void *buf, void *cdrarg) 14246 { 14247 sf_region_t *rgnp = (sf_region_t *)buf; 14248 mutex_destroy(&rgnp->rgn_mutex); 14249 } 14250 14251 static int 14252 sfrgnmap_isnull(sf_region_map_t *map) 14253 { 14254 int i; 14255 14256 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) { 14257 if (map->bitmap[i] != 0) { 14258 return (0); 14259 } 14260 } 14261 return (1); 14262 } 14263 14264 static int 14265 sfhmergnmap_isnull(sf_hmeregion_map_t *map) 14266 { 14267 int i; 14268 14269 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) { 14270 if (map->bitmap[i] != 0) { 14271 return (0); 14272 } 14273 } 14274 return (1); 14275 } 14276 14277 #ifdef DEBUG 14278 static void 14279 check_scd_sfmmu_list(sfmmu_t **headp, sfmmu_t *sfmmup, int onlist) 14280 { 14281 sfmmu_t *sp; 14282 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 14283 14284 for (sp = *headp; sp != NULL; sp = sp->sfmmu_scd_link.next) { 14285 ASSERT(srdp == sp->sfmmu_srdp); 14286 if (sp == sfmmup) { 14287 if (onlist) { 14288 return; 14289 } else { 14290 panic("shctx: sfmmu 0x%p found on scd" 14291 "list 0x%p", (void *)sfmmup, 14292 (void *)*headp); 14293 } 14294 } 14295 } 14296 if (onlist) { 14297 panic("shctx: sfmmu 0x%p not found on scd list 0x%p", 14298 (void *)sfmmup, (void *)*headp); 14299 } else { 14300 return; 14301 } 14302 } 14303 #else /* DEBUG */ 14304 #define check_scd_sfmmu_list(headp, sfmmup, onlist) 14305 #endif /* DEBUG */ 14306 14307 /* 14308 * Removes an sfmmu from the SCD sfmmu list. 14309 */ 14310 static void 14311 sfmmu_from_scd_list(sfmmu_t **headp, sfmmu_t *sfmmup) 14312 { 14313 ASSERT(sfmmup->sfmmu_srdp != NULL); 14314 check_scd_sfmmu_list(headp, sfmmup, 1); 14315 if (sfmmup->sfmmu_scd_link.prev != NULL) { 14316 ASSERT(*headp != sfmmup); 14317 sfmmup->sfmmu_scd_link.prev->sfmmu_scd_link.next = 14318 sfmmup->sfmmu_scd_link.next; 14319 } else { 14320 ASSERT(*headp == sfmmup); 14321 *headp = sfmmup->sfmmu_scd_link.next; 14322 } 14323 if (sfmmup->sfmmu_scd_link.next != NULL) { 14324 sfmmup->sfmmu_scd_link.next->sfmmu_scd_link.prev = 14325 sfmmup->sfmmu_scd_link.prev; 14326 } 14327 } 14328 14329 14330 /* 14331 * Adds an sfmmu to the start of the queue. 14332 */ 14333 static void 14334 sfmmu_to_scd_list(sfmmu_t **headp, sfmmu_t *sfmmup) 14335 { 14336 check_scd_sfmmu_list(headp, sfmmup, 0); 14337 sfmmup->sfmmu_scd_link.prev = NULL; 14338 sfmmup->sfmmu_scd_link.next = *headp; 14339 if (*headp != NULL) 14340 (*headp)->sfmmu_scd_link.prev = sfmmup; 14341 *headp = sfmmup; 14342 } 14343 14344 /* 14345 * Remove an scd from the start of the queue. 14346 */ 14347 static void 14348 sfmmu_remove_scd(sf_scd_t **headp, sf_scd_t *scdp) 14349 { 14350 if (scdp->scd_prev != NULL) { 14351 ASSERT(*headp != scdp); 14352 scdp->scd_prev->scd_next = scdp->scd_next; 14353 } else { 14354 ASSERT(*headp == scdp); 14355 *headp = scdp->scd_next; 14356 } 14357 14358 if (scdp->scd_next != NULL) { 14359 scdp->scd_next->scd_prev = scdp->scd_prev; 14360 } 14361 } 14362 14363 /* 14364 * Add an scd to the start of the queue. 
14365 */ 14366 static void 14367 sfmmu_add_scd(sf_scd_t **headp, sf_scd_t *scdp) 14368 { 14369 scdp->scd_prev = NULL; 14370 scdp->scd_next = *headp; 14371 if (*headp != NULL) { 14372 (*headp)->scd_prev = scdp; 14373 } 14374 *headp = scdp; 14375 } 14376 14377 static int 14378 sfmmu_alloc_scd_tsbs(sf_srd_t *srdp, sf_scd_t *scdp) 14379 { 14380 uint_t rid; 14381 uint_t i; 14382 uint_t j; 14383 ulong_t w; 14384 sf_region_t *rgnp; 14385 ulong_t tte8k_cnt = 0; 14386 ulong_t tte4m_cnt = 0; 14387 uint_t tsb_szc; 14388 sfmmu_t *scsfmmup = scdp->scd_sfmmup; 14389 sfmmu_t *ism_hatid; 14390 struct tsb_info *newtsb; 14391 int szc; 14392 14393 ASSERT(srdp != NULL); 14394 14395 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) { 14396 if ((w = scdp->scd_region_map.bitmap[i]) == 0) { 14397 continue; 14398 } 14399 j = 0; 14400 while (w) { 14401 if (!(w & 0x1)) { 14402 j++; 14403 w >>= 1; 14404 continue; 14405 } 14406 rid = (i << BT_ULSHIFT) | j; 14407 j++; 14408 w >>= 1; 14409 14410 if (rid < SFMMU_MAX_HME_REGIONS) { 14411 rgnp = srdp->srd_hmergnp[rid]; 14412 ASSERT(rgnp->rgn_id == rid); 14413 ASSERT(rgnp->rgn_refcnt > 0); 14414 14415 if (rgnp->rgn_pgszc < TTE4M) { 14416 tte8k_cnt += rgnp->rgn_size >> 14417 TTE_PAGE_SHIFT(TTE8K); 14418 } else { 14419 ASSERT(rgnp->rgn_pgszc >= TTE4M); 14420 tte4m_cnt += rgnp->rgn_size >> 14421 TTE_PAGE_SHIFT(TTE4M); 14422 /* 14423 * Inflate SCD tsb0 by preallocating 14424 * 1/4 8k ttecnt for 4M regions to 14425 * allow for lgpg alloc failure. 14426 */ 14427 tte8k_cnt += rgnp->rgn_size >> 14428 (TTE_PAGE_SHIFT(TTE8K) + 2); 14429 } 14430 } else { 14431 rid -= SFMMU_MAX_HME_REGIONS; 14432 rgnp = srdp->srd_ismrgnp[rid]; 14433 ASSERT(rgnp->rgn_id == rid); 14434 ASSERT(rgnp->rgn_refcnt > 0); 14435 14436 ism_hatid = (sfmmu_t *)rgnp->rgn_obj; 14437 ASSERT(ism_hatid->sfmmu_ismhat); 14438 14439 for (szc = 0; szc < TTE4M; szc++) { 14440 tte8k_cnt += 14441 ism_hatid->sfmmu_ttecnt[szc] << 14442 TTE_BSZS_SHIFT(szc); 14443 } 14444 14445 ASSERT(rgnp->rgn_pgszc >= TTE4M); 14446 if (rgnp->rgn_pgszc >= TTE4M) { 14447 tte4m_cnt += rgnp->rgn_size >> 14448 TTE_PAGE_SHIFT(TTE4M); 14449 } 14450 } 14451 } 14452 } 14453 14454 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt); 14455 14456 /* Allocate both the SCD TSBs here. */ 14457 if (sfmmu_tsbinfo_alloc(&scsfmmup->sfmmu_tsb, 14458 tsb_szc, TSB8K|TSB64K|TSB512K, TSB_ALLOC, scsfmmup) && 14459 (tsb_szc <= TSB_4M_SZCODE || 14460 sfmmu_tsbinfo_alloc(&scsfmmup->sfmmu_tsb, 14461 TSB_4M_SZCODE, TSB8K|TSB64K|TSB512K, 14462 TSB_ALLOC, scsfmmup))) { 14463 14464 SFMMU_STAT(sf_scd_1sttsb_allocfail); 14465 return (TSB_ALLOCFAIL); 14466 } else { 14467 scsfmmup->sfmmu_tsb->tsb_flags |= TSB_SHAREDCTX; 14468 14469 if (tte4m_cnt) { 14470 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt); 14471 if (sfmmu_tsbinfo_alloc(&newtsb, tsb_szc, 14472 TSB4M|TSB32M|TSB256M, TSB_ALLOC, scsfmmup) && 14473 (tsb_szc <= TSB_4M_SZCODE || 14474 sfmmu_tsbinfo_alloc(&newtsb, TSB_4M_SZCODE, 14475 TSB4M|TSB32M|TSB256M, 14476 TSB_ALLOC, scsfmmup))) { 14477 /* 14478 * If we fail to allocate the 2nd shared tsb, 14479 * just free the 1st tsb, return failure. 
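 * (sfmmu_tsbinfo_alloc() returns nonzero on failure, so this branch is
 * reached only when the preferred size code failed and either no
 * TSB_4M_SZCODE fallback applies, because tsb_szc was already at most
 * TSB_4M_SZCODE, or the fallback allocation failed as well.)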
14480 */ 14481 sfmmu_tsbinfo_free(scsfmmup->sfmmu_tsb); 14482 SFMMU_STAT(sf_scd_2ndtsb_allocfail); 14483 return (TSB_ALLOCFAIL); 14484 } else { 14485 ASSERT(scsfmmup->sfmmu_tsb->tsb_next == NULL); 14486 newtsb->tsb_flags |= TSB_SHAREDCTX; 14487 scsfmmup->sfmmu_tsb->tsb_next = newtsb; 14488 SFMMU_STAT(sf_scd_2ndtsb_alloc); 14489 } 14490 } 14491 SFMMU_STAT(sf_scd_1sttsb_alloc); 14492 } 14493 return (TSB_SUCCESS); 14494 } 14495 14496 static void 14497 sfmmu_free_scd_tsbs(sfmmu_t *scd_sfmmu) 14498 { 14499 while (scd_sfmmu->sfmmu_tsb != NULL) { 14500 struct tsb_info *next = scd_sfmmu->sfmmu_tsb->tsb_next; 14501 sfmmu_tsbinfo_free(scd_sfmmu->sfmmu_tsb); 14502 scd_sfmmu->sfmmu_tsb = next; 14503 } 14504 } 14505 14506 /* 14507 * Link the sfmmu onto the hme region list. 14508 */ 14509 void 14510 sfmmu_link_to_hmeregion(sfmmu_t *sfmmup, sf_region_t *rgnp) 14511 { 14512 uint_t rid; 14513 sf_rgn_link_t *rlink; 14514 sfmmu_t *head; 14515 sf_rgn_link_t *hrlink; 14516 14517 rid = rgnp->rgn_id; 14518 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 14519 14520 /* LINTED: constant in conditional context */ 14521 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 1, 1); 14522 ASSERT(rlink != NULL); 14523 mutex_enter(&rgnp->rgn_mutex); 14524 if ((head = rgnp->rgn_sfmmu_head) == NULL) { 14525 rlink->next = NULL; 14526 rlink->prev = NULL; 14527 /* 14528 * make sure rlink's next field is NULL 14529 * before making this link visible. 14530 */ 14531 membar_stst(); 14532 rgnp->rgn_sfmmu_head = sfmmup; 14533 } else { 14534 /* LINTED: constant in conditional context */ 14535 SFMMU_HMERID2RLINKP(head, rid, hrlink, 0, 0); 14536 ASSERT(hrlink != NULL); 14537 ASSERT(hrlink->prev == NULL); 14538 rlink->next = head; 14539 rlink->prev = NULL; 14540 hrlink->prev = sfmmup; 14541 /* 14542 * make sure rlink's next field is correct 14543 * before making this link visible. 14544 */ 14545 membar_stst(); 14546 rgnp->rgn_sfmmu_head = sfmmup; 14547 } 14548 mutex_exit(&rgnp->rgn_mutex); 14549 } 14550 14551 /* 14552 * Unlink the sfmmu from the hme region list. 14553 */ 14554 void 14555 sfmmu_unlink_from_hmeregion(sfmmu_t *sfmmup, sf_region_t *rgnp) 14556 { 14557 uint_t rid; 14558 sf_rgn_link_t *rlink; 14559 14560 rid = rgnp->rgn_id; 14561 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 14562 14563 /* LINTED: constant in conditional context */ 14564 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 0, 0); 14565 ASSERT(rlink != NULL); 14566 mutex_enter(&rgnp->rgn_mutex); 14567 if (rgnp->rgn_sfmmu_head == sfmmup) { 14568 sfmmu_t *next = rlink->next; 14569 rgnp->rgn_sfmmu_head = next; 14570 /* 14571 * if we are stopped by xc_attention() after this 14572 * point the forward link walking in 14573 * sfmmu_rgntlb_demap() will work correctly since the 14574 * head correctly points to the next element. 
14575 */ 14576 membar_stst(); 14577 rlink->next = NULL; 14578 ASSERT(rlink->prev == NULL); 14579 if (next != NULL) { 14580 sf_rgn_link_t *nrlink; 14581 /* LINTED: constant in conditional context */ 14582 SFMMU_HMERID2RLINKP(next, rid, nrlink, 0, 0); 14583 ASSERT(nrlink != NULL); 14584 ASSERT(nrlink->prev == sfmmup); 14585 nrlink->prev = NULL; 14586 } 14587 } else { 14588 sfmmu_t *next = rlink->next; 14589 sfmmu_t *prev = rlink->prev; 14590 sf_rgn_link_t *prlink; 14591 14592 ASSERT(prev != NULL); 14593 /* LINTED: constant in conditional context */ 14594 SFMMU_HMERID2RLINKP(prev, rid, prlink, 0, 0); 14595 ASSERT(prlink != NULL); 14596 ASSERT(prlink->next == sfmmup); 14597 prlink->next = next; 14598 /* 14599 * if we are stopped by xc_attention() 14600 * after this point the forward link walking 14601 * will work correctly since the prev element 14602 * correctly points to the next element. 14603 */ 14604 membar_stst(); 14605 rlink->next = NULL; 14606 rlink->prev = NULL; 14607 if (next != NULL) { 14608 sf_rgn_link_t *nrlink; 14609 /* LINTED: constant in conditional context */ 14610 SFMMU_HMERID2RLINKP(next, rid, nrlink, 0, 0); 14611 ASSERT(nrlink != NULL); 14612 ASSERT(nrlink->prev == sfmmup); 14613 nrlink->prev = prev; 14614 } 14615 } 14616 mutex_exit(&rgnp->rgn_mutex); 14617 } 14618 14619 /* 14620 * Link scd sfmmu onto ism or hme region list for each region in the 14621 * scd region map. 14622 */ 14623 void 14624 sfmmu_link_scd_to_regions(sf_srd_t *srdp, sf_scd_t *scdp) 14625 { 14626 uint_t rid; 14627 uint_t i; 14628 uint_t j; 14629 ulong_t w; 14630 sf_region_t *rgnp; 14631 sfmmu_t *scsfmmup; 14632 14633 scsfmmup = scdp->scd_sfmmup; 14634 ASSERT(scsfmmup->sfmmu_scdhat); 14635 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) { 14636 if ((w = scdp->scd_region_map.bitmap[i]) == 0) { 14637 continue; 14638 } 14639 j = 0; 14640 while (w) { 14641 if (!(w & 0x1)) { 14642 j++; 14643 w >>= 1; 14644 continue; 14645 } 14646 rid = (i << BT_ULSHIFT) | j; 14647 j++; 14648 w >>= 1; 14649 14650 if (rid < SFMMU_MAX_HME_REGIONS) { 14651 rgnp = srdp->srd_hmergnp[rid]; 14652 ASSERT(rgnp->rgn_id == rid); 14653 ASSERT(rgnp->rgn_refcnt > 0); 14654 sfmmu_link_to_hmeregion(scsfmmup, rgnp); 14655 } else { 14656 sfmmu_t *ism_hatid = NULL; 14657 ism_ment_t *ism_ment; 14658 rid -= SFMMU_MAX_HME_REGIONS; 14659 rgnp = srdp->srd_ismrgnp[rid]; 14660 ASSERT(rgnp->rgn_id == rid); 14661 ASSERT(rgnp->rgn_refcnt > 0); 14662 14663 ism_hatid = (sfmmu_t *)rgnp->rgn_obj; 14664 ASSERT(ism_hatid->sfmmu_ismhat); 14665 ism_ment = &scdp->scd_ism_links[rid]; 14666 ism_ment->iment_hat = scsfmmup; 14667 ism_ment->iment_base_va = rgnp->rgn_saddr; 14668 mutex_enter(&ism_mlist_lock); 14669 iment_add(ism_ment, ism_hatid); 14670 mutex_exit(&ism_mlist_lock); 14671 14672 } 14673 } 14674 } 14675 } 14676 /* 14677 * Unlink scd sfmmu from ism or hme region list for each region in the 14678 * scd region map. 
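 *
 * Both this routine and sfmmu_link_scd_to_regions() above decode the
 * map one word at a time; a set bit j within word i is converted back
 * to a region id as
 *
 *	rid = (i << BT_ULSHIFT) | j;
 *
 * so with 64-bit map words, word 1 / bit 6 yields rid 70.  Ids below
 * SFMMU_MAX_HME_REGIONS denote hme regions; larger ids are ISM region
 * ids offset by SFMMU_MAX_HME_REGIONS.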
14679 */ 14680 void 14681 sfmmu_unlink_scd_from_regions(sf_srd_t *srdp, sf_scd_t *scdp) 14682 { 14683 uint_t rid; 14684 uint_t i; 14685 uint_t j; 14686 ulong_t w; 14687 sf_region_t *rgnp; 14688 sfmmu_t *scsfmmup; 14689 14690 scsfmmup = scdp->scd_sfmmup; 14691 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) { 14692 if ((w = scdp->scd_region_map.bitmap[i]) == 0) { 14693 continue; 14694 } 14695 j = 0; 14696 while (w) { 14697 if (!(w & 0x1)) { 14698 j++; 14699 w >>= 1; 14700 continue; 14701 } 14702 rid = (i << BT_ULSHIFT) | j; 14703 j++; 14704 w >>= 1; 14705 14706 if (rid < SFMMU_MAX_HME_REGIONS) { 14707 rgnp = srdp->srd_hmergnp[rid]; 14708 ASSERT(rgnp->rgn_id == rid); 14709 ASSERT(rgnp->rgn_refcnt > 0); 14710 sfmmu_unlink_from_hmeregion(scsfmmup, 14711 rgnp); 14712 14713 } else { 14714 sfmmu_t *ism_hatid = NULL; 14715 ism_ment_t *ism_ment; 14716 rid -= SFMMU_MAX_HME_REGIONS; 14717 rgnp = srdp->srd_ismrgnp[rid]; 14718 ASSERT(rgnp->rgn_id == rid); 14719 ASSERT(rgnp->rgn_refcnt > 0); 14720 14721 ism_hatid = (sfmmu_t *)rgnp->rgn_obj; 14722 ASSERT(ism_hatid->sfmmu_ismhat); 14723 ism_ment = &scdp->scd_ism_links[rid]; 14724 ASSERT(ism_ment->iment_hat == scdp->scd_sfmmup); 14725 ASSERT(ism_ment->iment_base_va == 14726 rgnp->rgn_saddr); 14727 mutex_enter(&ism_mlist_lock); 14728 iment_sub(ism_ment, ism_hatid); 14729 mutex_exit(&ism_mlist_lock); 14730 14731 } 14732 } 14733 } 14734 } 14735 /* 14736 * Allocates and initialises a new SCD structure, this is called with 14737 * the srd_scd_mutex held and returns with the reference count 14738 * initialised to 1. 14739 */ 14740 static sf_scd_t * 14741 sfmmu_alloc_scd(sf_srd_t *srdp, sf_region_map_t *new_map) 14742 { 14743 sf_scd_t *new_scdp; 14744 sfmmu_t *scsfmmup; 14745 int i; 14746 14747 ASSERT(MUTEX_HELD(&srdp->srd_scd_mutex)); 14748 new_scdp = kmem_cache_alloc(scd_cache, KM_SLEEP); 14749 14750 scsfmmup = kmem_cache_alloc(sfmmuid_cache, KM_SLEEP); 14751 new_scdp->scd_sfmmup = scsfmmup; 14752 scsfmmup->sfmmu_srdp = srdp; 14753 scsfmmup->sfmmu_scdp = new_scdp; 14754 scsfmmup->sfmmu_tsb0_4minflcnt = 0; 14755 scsfmmup->sfmmu_scdhat = 1; 14756 CPUSET_ALL(scsfmmup->sfmmu_cpusran); 14757 bzero(scsfmmup->sfmmu_hmeregion_links, SFMMU_L1_HMERLINKS_SIZE); 14758 14759 ASSERT(max_mmu_ctxdoms > 0); 14760 for (i = 0; i < max_mmu_ctxdoms; i++) { 14761 scsfmmup->sfmmu_ctxs[i].cnum = INVALID_CONTEXT; 14762 scsfmmup->sfmmu_ctxs[i].gnum = 0; 14763 } 14764 14765 for (i = 0; i < MMU_PAGE_SIZES; i++) { 14766 new_scdp->scd_rttecnt[i] = 0; 14767 } 14768 14769 new_scdp->scd_region_map = *new_map; 14770 new_scdp->scd_refcnt = 1; 14771 if (sfmmu_alloc_scd_tsbs(srdp, new_scdp) != TSB_SUCCESS) { 14772 kmem_cache_free(scd_cache, new_scdp); 14773 kmem_cache_free(sfmmuid_cache, scsfmmup); 14774 return (NULL); 14775 } 14776 if (&mmu_init_scd) { 14777 mmu_init_scd(new_scdp); 14778 } 14779 return (new_scdp); 14780 } 14781 14782 /* 14783 * The first phase of a process joining an SCD. The hat structure is 14784 * linked to the SCD queue and then the HAT_JOIN_SCD sfmmu flag is set 14785 * and a cross-call with context invalidation is used to cause the 14786 * remaining work to be carried out in the sfmmu_tsbmiss_exception() 14787 * routine. 
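 *
 * In outline the join protocol below is:
 *
 *	sfmmu_to_scd_list(&scdp->scd_sf_list, sfmmup);
 *	SF_SCD_INCR_REF(scdp);
 *	SFMMU_FLAGS_SET(sfmmup, HAT_JOIN_SCD);
 *	sfmmu_invalidate_ctx(sfmmup);	-- next tsbmiss finishes the join
 *
 * with the ttecnt and tsb0 inflation counts moved between the hat and
 * the SCD under the hat lock.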
14788 */ 14789 static void 14790 sfmmu_join_scd(sf_scd_t *scdp, sfmmu_t *sfmmup) 14791 { 14792 hatlock_t *hatlockp; 14793 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 14794 int i; 14795 sf_scd_t *old_scdp; 14796 14797 ASSERT(srdp != NULL); 14798 ASSERT(scdp != NULL); 14799 ASSERT(scdp->scd_refcnt > 0); 14800 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 14801 14802 if ((old_scdp = sfmmup->sfmmu_scdp) != NULL) { 14803 ASSERT(old_scdp != scdp); 14804 14805 mutex_enter(&old_scdp->scd_mutex); 14806 sfmmu_from_scd_list(&old_scdp->scd_sf_list, sfmmup); 14807 mutex_exit(&old_scdp->scd_mutex); 14808 /* 14809 * sfmmup leaves the old scd. Update sfmmu_ttecnt to 14810 * include the shme rgn ttecnt for rgns that 14811 * were in the old SCD 14812 */ 14813 for (i = 0; i < mmu_page_sizes; i++) { 14814 ASSERT(sfmmup->sfmmu_scdrttecnt[i] == 14815 old_scdp->scd_rttecnt[i]); 14816 atomic_add_long(&sfmmup->sfmmu_ttecnt[i], 14817 sfmmup->sfmmu_scdrttecnt[i]); 14818 } 14819 } 14820 14821 /* 14822 * Move sfmmu to the scd lists. 14823 */ 14824 mutex_enter(&scdp->scd_mutex); 14825 sfmmu_to_scd_list(&scdp->scd_sf_list, sfmmup); 14826 mutex_exit(&scdp->scd_mutex); 14827 SF_SCD_INCR_REF(scdp); 14828 14829 hatlockp = sfmmu_hat_enter(sfmmup); 14830 /* 14831 * For a multi-thread process, we must stop 14832 * all the other threads before joining the scd. 14833 */ 14834 14835 SFMMU_FLAGS_SET(sfmmup, HAT_JOIN_SCD); 14836 14837 sfmmu_invalidate_ctx(sfmmup); 14838 sfmmup->sfmmu_scdp = scdp; 14839 14840 /* 14841 * Copy scd_rttecnt into sfmmup's sfmmu_scdrttecnt, and update 14842 * sfmmu_ttecnt to not include the rgn ttecnt just joined in SCD. 14843 */ 14844 for (i = 0; i < mmu_page_sizes; i++) { 14845 sfmmup->sfmmu_scdrttecnt[i] = scdp->scd_rttecnt[i]; 14846 ASSERT(sfmmup->sfmmu_ttecnt[i] >= scdp->scd_rttecnt[i]); 14847 atomic_add_long(&sfmmup->sfmmu_ttecnt[i], 14848 -sfmmup->sfmmu_scdrttecnt[i]); 14849 } 14850 /* update tsb0 inflation count */ 14851 if (old_scdp != NULL) { 14852 sfmmup->sfmmu_tsb0_4minflcnt += 14853 old_scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt; 14854 } 14855 ASSERT(sfmmup->sfmmu_tsb0_4minflcnt >= 14856 scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt); 14857 sfmmup->sfmmu_tsb0_4minflcnt -= scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt; 14858 14859 sfmmu_hat_exit(hatlockp); 14860 14861 if (old_scdp != NULL) { 14862 SF_SCD_DECR_REF(srdp, old_scdp); 14863 } 14864 14865 } 14866 14867 /* 14868 * This routine is called by a process to become part of an SCD. It is called 14869 * from sfmmu_tsbmiss_exception() once most of the initial work has been 14870 * done by sfmmu_join_scd(). This routine must not drop the hat lock. 
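 * It invalidates the process's private TSBs, so that stale private
 * entries for the now-shared regions cannot be fetched, and sets
 * HAT_CTX1_FLAG on the ISM maps covered by the SCD.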
14871 */ 14872 static void 14873 sfmmu_finish_join_scd(sfmmu_t *sfmmup) 14874 { 14875 struct tsb_info *tsbinfop; 14876 14877 ASSERT(sfmmu_hat_lock_held(sfmmup)); 14878 ASSERT(sfmmup->sfmmu_scdp != NULL); 14879 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)); 14880 ASSERT(!SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 14881 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ALLCTX_INVALID)); 14882 14883 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; 14884 tsbinfop = tsbinfop->tsb_next) { 14885 if (tsbinfop->tsb_flags & TSB_SWAPPED) { 14886 continue; 14887 } 14888 ASSERT(!(tsbinfop->tsb_flags & TSB_RELOC_FLAG)); 14889 14890 sfmmu_inv_tsb(tsbinfop->tsb_va, 14891 TSB_BYTES(tsbinfop->tsb_szc)); 14892 } 14893 14894 /* Set HAT_CTX1_FLAG for all SCD ISMs */ 14895 sfmmu_ism_hatflags(sfmmup, 1); 14896 14897 SFMMU_STAT(sf_join_scd); 14898 } 14899 14900 /* 14901 * This routine is called to check whether there is an SCD which matches 14902 * the process's region map; if not, a new SCD may be created. 14903 */ 14904 static void 14905 sfmmu_find_scd(sfmmu_t *sfmmup) 14906 { 14907 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 14908 sf_scd_t *scdp, *new_scdp; 14909 int ret; 14910 14911 ASSERT(srdp != NULL); 14912 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 14913 14914 mutex_enter(&srdp->srd_scd_mutex); 14915 for (scdp = srdp->srd_scdp; scdp != NULL; 14916 scdp = scdp->scd_next) { 14917 SF_RGNMAP_EQUAL(&scdp->scd_region_map, 14918 &sfmmup->sfmmu_region_map, ret); 14919 if (ret == 1) { 14920 SF_SCD_INCR_REF(scdp); 14921 mutex_exit(&srdp->srd_scd_mutex); 14922 sfmmu_join_scd(scdp, sfmmup); 14923 ASSERT(scdp->scd_refcnt >= 2); 14924 atomic_dec_32((volatile uint32_t *)&scdp->scd_refcnt); 14925 return; 14926 } else { 14927 /* 14928 * If the sfmmu region map is a subset of the scd 14929 * region map, then the assumption is that this process 14930 * will continue attaching to ISM segments until the 14931 * region maps are equal. 14932 */ 14933 SF_RGNMAP_IS_SUBSET(&scdp->scd_region_map, 14934 &sfmmup->sfmmu_region_map, ret); 14935 if (ret == 1) { 14936 mutex_exit(&srdp->srd_scd_mutex); 14937 return; 14938 } 14939 } 14940 } 14941 14942 ASSERT(scdp == NULL); 14943 /* 14944 * No matching SCD has been found; create a new one. 14945 */ 14946 if ((new_scdp = sfmmu_alloc_scd(srdp, &sfmmup->sfmmu_region_map)) == 14947 NULL) { 14948 mutex_exit(&srdp->srd_scd_mutex); 14949 return; 14950 } 14951 14952 /* 14953 * sfmmu_alloc_scd() returns with a ref count of 1 on the scd. 14954 */ 14955 14956 /* Set scd_rttecnt for shme rgns in SCD */ 14957 sfmmu_set_scd_rttecnt(srdp, new_scdp); 14958 14959 /* 14960 * Link scd onto srd_scdp list and scd sfmmu onto region/iment lists. 14961 */ 14962 sfmmu_link_scd_to_regions(srdp, new_scdp); 14963 sfmmu_add_scd(&srdp->srd_scdp, new_scdp); 14964 SFMMU_STAT_ADD(sf_create_scd, 1); 14965 14966 mutex_exit(&srdp->srd_scd_mutex); 14967 sfmmu_join_scd(new_scdp, sfmmup); 14968 ASSERT(new_scdp->scd_refcnt >= 2); 14969 atomic_dec_32((volatile uint32_t *)&new_scdp->scd_refcnt); 14970 } 14971 14972 /* 14973 * This routine is called by a process to remove itself from an SCD. It is 14974 * either called when the process has detached from a segment or from 14975 * hat_free_start() as a result of calling exit.
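 * In the exit path (sfmmu_free is set) only the list and refcount
 * bookkeeping is needed; otherwise the context is invalidated so that
 * every thread of the process stops using the shared context before
 * the hat is detached from the SCD.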
14976 */ 14977 static void 14978 sfmmu_leave_scd(sfmmu_t *sfmmup, uchar_t r_type) 14979 { 14980 sf_scd_t *scdp = sfmmup->sfmmu_scdp; 14981 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 14982 hatlock_t *hatlockp = TSB_HASH(sfmmup); 14983 int i; 14984 14985 ASSERT(scdp != NULL); 14986 ASSERT(srdp != NULL); 14987 14988 if (sfmmup->sfmmu_free) { 14989 /* 14990 * If the process is part of an SCD the sfmmu is unlinked 14991 * from scd_sf_list. 14992 */ 14993 mutex_enter(&scdp->scd_mutex); 14994 sfmmu_from_scd_list(&scdp->scd_sf_list, sfmmup); 14995 mutex_exit(&scdp->scd_mutex); 14996 /* 14997 * Update sfmmu_ttecnt to include the rgn ttecnt for rgns that 14998 * are about to leave the SCD 14999 */ 15000 for (i = 0; i < mmu_page_sizes; i++) { 15001 ASSERT(sfmmup->sfmmu_scdrttecnt[i] == 15002 scdp->scd_rttecnt[i]); 15003 atomic_add_long(&sfmmup->sfmmu_ttecnt[i], 15004 sfmmup->sfmmu_scdrttecnt[i]); 15005 sfmmup->sfmmu_scdrttecnt[i] = 0; 15006 } 15007 sfmmup->sfmmu_scdp = NULL; 15008 15009 SF_SCD_DECR_REF(srdp, scdp); 15010 return; 15011 } 15012 15013 ASSERT(r_type != SFMMU_REGION_ISM || 15014 SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 15015 ASSERT(scdp->scd_refcnt); 15016 ASSERT(!sfmmup->sfmmu_free); 15017 ASSERT(sfmmu_hat_lock_held(sfmmup)); 15018 ASSERT(AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 15019 15020 /* 15021 * Wait for ISM maps to be updated. 15022 */ 15023 if (r_type != SFMMU_REGION_ISM) { 15024 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY) && 15025 sfmmup->sfmmu_scdp != NULL) { 15026 cv_wait(&sfmmup->sfmmu_tsb_cv, 15027 HATLOCK_MUTEXP(hatlockp)); 15028 } 15029 15030 if (sfmmup->sfmmu_scdp == NULL) { 15031 sfmmu_hat_exit(hatlockp); 15032 return; 15033 } 15034 SFMMU_FLAGS_SET(sfmmup, HAT_ISMBUSY); 15035 } 15036 15037 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) { 15038 SFMMU_FLAGS_CLEAR(sfmmup, HAT_JOIN_SCD); 15039 /* 15040 * Since HAT_JOIN_SCD was set our context 15041 * is still invalid. 15042 */ 15043 } else { 15044 /* 15045 * For a multi-thread process, we must stop 15046 * all the other threads before leaving the scd. 15047 */ 15048 15049 sfmmu_invalidate_ctx(sfmmup); 15050 } 15051 15052 /* Clear all the rid's for ISM, delete flags, etc */ 15053 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 15054 sfmmu_ism_hatflags(sfmmup, 0); 15055 15056 /* 15057 * Update sfmmu_ttecnt to include the rgn ttecnt for rgns that 15058 * are in SCD before this sfmmup leaves the SCD. 15059 */ 15060 for (i = 0; i < mmu_page_sizes; i++) { 15061 ASSERT(sfmmup->sfmmu_scdrttecnt[i] == 15062 scdp->scd_rttecnt[i]); 15063 atomic_add_long(&sfmmup->sfmmu_ttecnt[i], 15064 sfmmup->sfmmu_scdrttecnt[i]); 15065 sfmmup->sfmmu_scdrttecnt[i] = 0; 15066 /* update ismttecnt to include SCD ism before hat leaves SCD */ 15067 sfmmup->sfmmu_ismttecnt[i] += sfmmup->sfmmu_scdismttecnt[i]; 15068 sfmmup->sfmmu_scdismttecnt[i] = 0; 15069 } 15070 /* update tsb0 inflation count */ 15071 sfmmup->sfmmu_tsb0_4minflcnt += scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt; 15072 15073 if (r_type != SFMMU_REGION_ISM) { 15074 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMBUSY); 15075 } 15076 sfmmup->sfmmu_scdp = NULL; 15077 15078 sfmmu_hat_exit(hatlockp); 15079 15080 /* 15081 * Unlink sfmmu from scd_sf_list this can be done without holding 15082 * the hat lock as we hold the sfmmu_as lock which prevents 15083 * hat_join_region from adding this thread to the scd again. Other 15084 * threads check if sfmmu_scdp is NULL under hat lock and if it's NULL 15085 * they won't get here, since sfmmu_leave_scd() clears sfmmu_scdp 15086 * while holding the hat lock. 
15087 */ 15088 mutex_enter(&scdp->scd_mutex); 15089 sfmmu_from_scd_list(&scdp->scd_sf_list, sfmmup); 15090 mutex_exit(&scdp->scd_mutex); 15091 SFMMU_STAT(sf_leave_scd); 15092 15093 SF_SCD_DECR_REF(srdp, scdp); 15094 hatlockp = sfmmu_hat_enter(sfmmup); 15095 15096 } 15097 15098 /* 15099 * Unlink and free up an SCD structure with a reference count of 0. 15100 */ 15101 static void 15102 sfmmu_destroy_scd(sf_srd_t *srdp, sf_scd_t *scdp, sf_region_map_t *scd_rmap) 15103 { 15104 sfmmu_t *scsfmmup; 15105 sf_scd_t *sp; 15106 hatlock_t *shatlockp; 15107 int i, ret; 15108 15109 mutex_enter(&srdp->srd_scd_mutex); 15110 for (sp = srdp->srd_scdp; sp != NULL; sp = sp->scd_next) { 15111 if (sp == scdp) 15112 break; 15113 } 15114 if (sp == NULL || sp->scd_refcnt) { 15115 mutex_exit(&srdp->srd_scd_mutex); 15116 return; 15117 } 15118 15119 /* 15120 * It is possible that the scd has been freed and reallocated with a 15121 * different region map while we've been waiting for the srd_scd_mutex. 15122 */ 15123 SF_RGNMAP_EQUAL(scd_rmap, &sp->scd_region_map, ret); 15124 if (ret != 1) { 15125 mutex_exit(&srdp->srd_scd_mutex); 15126 return; 15127 } 15128 15129 ASSERT(scdp->scd_sf_list == NULL); 15130 /* 15131 * Unlink scd from srd_scdp list. 15132 */ 15133 sfmmu_remove_scd(&srdp->srd_scdp, scdp); 15134 mutex_exit(&srdp->srd_scd_mutex); 15135 15136 sfmmu_unlink_scd_from_regions(srdp, scdp); 15137 15138 /* Clear shared context tsb and release ctx */ 15139 scsfmmup = scdp->scd_sfmmup; 15140 15141 /* 15142 * create a barrier so that scd will not be destroyed 15143 * if other thread still holds the same shared hat lock. 15144 * E.g., sfmmu_tsbmiss_exception() needs to acquire the 15145 * shared hat lock before checking the shared tsb reloc flag. 15146 */ 15147 shatlockp = sfmmu_hat_enter(scsfmmup); 15148 sfmmu_hat_exit(shatlockp); 15149 15150 sfmmu_free_scd_tsbs(scsfmmup); 15151 15152 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) { 15153 if (scsfmmup->sfmmu_hmeregion_links[i] != NULL) { 15154 kmem_free(scsfmmup->sfmmu_hmeregion_links[i], 15155 SFMMU_L2_HMERLINKS_SIZE); 15156 scsfmmup->sfmmu_hmeregion_links[i] = NULL; 15157 } 15158 } 15159 kmem_cache_free(sfmmuid_cache, scsfmmup); 15160 kmem_cache_free(scd_cache, scdp); 15161 SFMMU_STAT(sf_destroy_scd); 15162 } 15163 15164 /* 15165 * Modifies the HAT_CTX1_FLAG for each of the ISM segments which correspond to 15166 * bits which are set in the ism_region_map parameter. This flag indicates to 15167 * the tsbmiss handler that mapping for these segments should be loaded using 15168 * the shared context. 15169 */ 15170 static void 15171 sfmmu_ism_hatflags(sfmmu_t *sfmmup, int addflag) 15172 { 15173 sf_scd_t *scdp = sfmmup->sfmmu_scdp; 15174 ism_blk_t *ism_blkp; 15175 ism_map_t *ism_map; 15176 int i, rid; 15177 15178 ASSERT(sfmmup->sfmmu_iblk != NULL); 15179 ASSERT(scdp != NULL); 15180 /* 15181 * Note that the caller either set HAT_ISMBUSY flag or checked 15182 * under hat lock that HAT_ISMBUSY was not set by another thread. 
15183 */ 15184 ASSERT(sfmmu_hat_lock_held(sfmmup)); 15185 15186 ism_blkp = sfmmup->sfmmu_iblk; 15187 while (ism_blkp != NULL) { 15188 ism_map = ism_blkp->iblk_maps; 15189 for (i = 0; ism_map[i].imap_ismhat && i < ISM_MAP_SLOTS; i++) { 15190 rid = ism_map[i].imap_rid; 15191 if (rid == SFMMU_INVALID_ISMRID) { 15192 continue; 15193 } 15194 ASSERT(rid >= 0 && rid < SFMMU_MAX_ISM_REGIONS); 15195 if (SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid) && 15196 addflag) { 15197 ism_map[i].imap_hatflags |= 15198 HAT_CTX1_FLAG; 15199 } else { 15200 ism_map[i].imap_hatflags &= 15201 ~HAT_CTX1_FLAG; 15202 } 15203 } 15204 ism_blkp = ism_blkp->iblk_next; 15205 } 15206 } 15207 15208 static int 15209 sfmmu_srd_lock_held(sf_srd_t *srdp) 15210 { 15211 return (MUTEX_HELD(&srdp->srd_mutex)); 15212 } 15213 15214 /* ARGSUSED */ 15215 static int 15216 sfmmu_scdcache_constructor(void *buf, void *cdrarg, int kmflags) 15217 { 15218 sf_scd_t *scdp = (sf_scd_t *)buf; 15219 15220 bzero(buf, sizeof (sf_scd_t)); 15221 mutex_init(&scdp->scd_mutex, NULL, MUTEX_DEFAULT, NULL); 15222 return (0); 15223 } 15224 15225 /* ARGSUSED */ 15226 static void 15227 sfmmu_scdcache_destructor(void *buf, void *cdrarg) 15228 { 15229 sf_scd_t *scdp = (sf_scd_t *)buf; 15230 15231 mutex_destroy(&scdp->scd_mutex); 15232 } 15233 15234 /* 15235 * The listp parameter is a pointer to a list of hmeblks which are partially 15236 * freed as a result of calling sfmmu_hblk_hash_rm(); the last phase of the 15237 * freeing process is to cross-call all cpus to ensure that there are no 15238 * remaining cached references. 15239 * 15240 * If the local generation number is less than the global then we can free 15241 * hmeblks which are already on the pending queue as another cpu has completed 15242 * the cross-call. 15243 * 15244 * We cross-call to make sure that there are no threads on other cpus accessing 15245 * these hmeblks and then complete the process of freeing them under the 15246 * following conditions: 15247 * The total number of pending hmeblks is greater than the threshold 15248 * The reserve list has fewer than HBLK_RESERVE_CNT hmeblks 15249 * It is at least 1 second since the last time we cross-called 15250 * 15251 * Otherwise, we add the hmeblks to the per-cpu pending queue.
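 *
 * The test in the code below thus reduces to the following sketch:
 *
 *	if (!dontfree && (freehblkcnt < HBLK_RESERVE_CNT ||
 *	    pending + count > cpu_hme_pend_thresh || one_second_expired))
 *		cross-call all other cpus, then sfmmu_hblk_free(listp)
 *	else
 *		append *listp to this cpu's pending queue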
15252 */ 15253 static void 15254 sfmmu_hblks_list_purge(struct hme_blk **listp, int dontfree) 15255 { 15256 struct hme_blk *hblkp, *pr_hblkp = NULL; 15257 int count = 0; 15258 cpuset_t cpuset = cpu_ready_set; 15259 cpu_hme_pend_t *cpuhp; 15260 timestruc_t now; 15261 int one_second_expired = 0; 15262 15263 gethrestime_lasttick(&now); 15264 15265 for (hblkp = *listp; hblkp != NULL; hblkp = hblkp->hblk_next) { 15266 ASSERT(hblkp->hblk_shw_bit == 0); 15267 ASSERT(hblkp->hblk_shared == 0); 15268 count++; 15269 pr_hblkp = hblkp; 15270 } 15271 15272 cpuhp = &cpu_hme_pend[CPU->cpu_seqid]; 15273 mutex_enter(&cpuhp->chp_mutex); 15274 15275 if ((cpuhp->chp_count + count) == 0) { 15276 mutex_exit(&cpuhp->chp_mutex); 15277 return; 15278 } 15279 15280 if ((now.tv_sec - cpuhp->chp_timestamp) > 1) { 15281 one_second_expired = 1; 15282 } 15283 15284 if (!dontfree && (freehblkcnt < HBLK_RESERVE_CNT || 15285 (cpuhp->chp_count + count) > cpu_hme_pend_thresh || 15286 one_second_expired)) { 15287 /* Append global list to local */ 15288 if (pr_hblkp == NULL) { 15289 *listp = cpuhp->chp_listp; 15290 } else { 15291 pr_hblkp->hblk_next = cpuhp->chp_listp; 15292 } 15293 cpuhp->chp_listp = NULL; 15294 cpuhp->chp_count = 0; 15295 cpuhp->chp_timestamp = now.tv_sec; 15296 mutex_exit(&cpuhp->chp_mutex); 15297 15298 kpreempt_disable(); 15299 CPUSET_DEL(cpuset, CPU->cpu_id); 15300 xt_sync(cpuset); 15301 xt_sync(cpuset); 15302 kpreempt_enable(); 15303 15304 /* 15305 * At this stage we know that no trap handlers on other 15306 * cpus can have references to hmeblks on the list. 15307 */ 15308 sfmmu_hblk_free(listp); 15309 } else if (*listp != NULL) { 15310 pr_hblkp->hblk_next = cpuhp->chp_listp; 15311 cpuhp->chp_listp = *listp; 15312 cpuhp->chp_count += count; 15313 *listp = NULL; 15314 mutex_exit(&cpuhp->chp_mutex); 15315 } else { 15316 mutex_exit(&cpuhp->chp_mutex); 15317 } 15318 } 15319 15320 /* 15321 * Add an hmeblk to the hash list. 15322 */ 15323 void 15324 sfmmu_hblk_hash_add(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp, 15325 uint64_t hblkpa) 15326 { 15327 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 15328 #ifdef DEBUG 15329 if (hmebp->hmeblkp == NULL) { 15330 ASSERT(hmebp->hmeh_nextpa == HMEBLK_ENDPA); 15331 } 15332 #endif /* DEBUG */ 15333 15334 hmeblkp->hblk_nextpa = hmebp->hmeh_nextpa; 15335 /* 15336 * Since the TSB miss handler now does not lock the hash chain before 15337 * walking it, make sure that the hmeblk's nextpa is globally visible 15338 * before we make the hmeblk globally visible by updating the chain root 15339 * pointer in the hash bucket. 15340 */ 15341 membar_producer(); 15342 hmebp->hmeh_nextpa = hblkpa; 15343 hmeblkp->hblk_next = hmebp->hmeblkp; 15344 hmebp->hmeblkp = hmeblkp; 15345 15346 } 15347 15348 /* 15349 * This function is the first part of a two-part process to remove an hmeblk 15350 * from the hash chain. In this phase we unlink the hmeblk from the hash chain 15351 * but leave the next physical pointer unchanged. The hmeblk is then linked onto 15352 * a per-cpu pending list using the virtual address pointer. 15353 * 15354 * TSB miss trap handlers that start after this phase will no longer see 15355 * this hmeblk. TSB miss handlers that still cache this hmeblk in a register 15356 * can still use it for further chain traversal because we haven't yet modified 15357 * the next physical pointer or freed it. 15358 * 15359 * In the second phase of hmeblk removal we'll issue a barrier xcall before 15360 * we reuse or free this hmeblk.
This will make sure all lingering references to 15361 * the hmeblk after first phase disappear before we finally reclaim it. 15362 * This scheme eliminates the need for TSB miss handlers to lock hmeblk chains 15363 * during their traversal. 15364 * 15365 * The hmehash_mutex must be held when calling this function. 15366 * 15367 * Input: 15368 * hmebp - hme hash bucket pointer 15369 * hmeblkp - address of hmeblk to be removed 15370 * pr_hblk - virtual address of previous hmeblkp 15371 * listp - pointer to list of hmeblks linked by virtual address 15372 * free_now flag - indicates that a complete removal from the hash chains 15373 * is necessary. 15374 * 15375 * It is inefficient to use the free_now flag as a cross-call is required to 15376 * remove a single hmeblk from the hash chain but is necessary when hmeblks are 15377 * in short supply. 15378 */ 15379 void 15380 sfmmu_hblk_hash_rm(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp, 15381 struct hme_blk *pr_hblk, struct hme_blk **listp, 15382 int free_now) 15383 { 15384 int shw_size, vshift; 15385 struct hme_blk *shw_hblkp; 15386 uint_t shw_mask, newshw_mask; 15387 caddr_t vaddr; 15388 int size; 15389 cpuset_t cpuset = cpu_ready_set; 15390 15391 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 15392 15393 if (hmebp->hmeblkp == hmeblkp) { 15394 hmebp->hmeh_nextpa = hmeblkp->hblk_nextpa; 15395 hmebp->hmeblkp = hmeblkp->hblk_next; 15396 } else { 15397 pr_hblk->hblk_nextpa = hmeblkp->hblk_nextpa; 15398 pr_hblk->hblk_next = hmeblkp->hblk_next; 15399 } 15400 15401 size = get_hblk_ttesz(hmeblkp); 15402 shw_hblkp = hmeblkp->hblk_shadow; 15403 if (shw_hblkp) { 15404 ASSERT(hblktosfmmu(hmeblkp) != KHATID); 15405 ASSERT(!hmeblkp->hblk_shared); 15406 #ifdef DEBUG 15407 if (mmu_page_sizes == max_mmu_page_sizes) { 15408 ASSERT(size < TTE256M); 15409 } else { 15410 ASSERT(size < TTE4M); 15411 } 15412 #endif /* DEBUG */ 15413 15414 shw_size = get_hblk_ttesz(shw_hblkp); 15415 vaddr = (caddr_t)get_hblk_base(hmeblkp); 15416 vshift = vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size); 15417 ASSERT(vshift < 8); 15418 /* 15419 * Atomically clear shadow mask bit 15420 */ 15421 do { 15422 shw_mask = shw_hblkp->hblk_shw_mask; 15423 ASSERT(shw_mask & (1 << vshift)); 15424 newshw_mask = shw_mask & ~(1 << vshift); 15425 newshw_mask = atomic_cas_32(&shw_hblkp->hblk_shw_mask, 15426 shw_mask, newshw_mask); 15427 } while (newshw_mask != shw_mask); 15428 hmeblkp->hblk_shadow = NULL; 15429 } 15430 hmeblkp->hblk_shw_bit = 0; 15431 15432 if (hmeblkp->hblk_shared) { 15433 #ifdef DEBUG 15434 sf_srd_t *srdp; 15435 sf_region_t *rgnp; 15436 uint_t rid; 15437 15438 srdp = hblktosrd(hmeblkp); 15439 ASSERT(srdp != NULL && srdp->srd_refcnt != 0); 15440 rid = hmeblkp->hblk_tag.htag_rid; 15441 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 15442 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 15443 rgnp = srdp->srd_hmergnp[rid]; 15444 ASSERT(rgnp != NULL); 15445 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid); 15446 #endif /* DEBUG */ 15447 hmeblkp->hblk_shared = 0; 15448 } 15449 if (free_now) { 15450 kpreempt_disable(); 15451 CPUSET_DEL(cpuset, CPU->cpu_id); 15452 xt_sync(cpuset); 15453 xt_sync(cpuset); 15454 kpreempt_enable(); 15455 15456 hmeblkp->hblk_nextpa = HMEBLK_ENDPA; 15457 hmeblkp->hblk_next = NULL; 15458 } else { 15459 /* Append hmeblkp to listp for processing later. 
*/ 15460 hmeblkp->hblk_next = *listp; 15461 *listp = hmeblkp; 15462 } 15463 } 15464 15465 /* 15466 * This routine is called when memory is in short supply and returns a free 15467 * hmeblk of the requested size from the cpu pending lists. 15468 */ 15469 static struct hme_blk * 15470 sfmmu_check_pending_hblks(int size) 15471 { 15472 int i; 15473 struct hme_blk *hmeblkp = NULL, *last_hmeblkp; 15474 int found_hmeblk; 15475 cpuset_t cpuset = cpu_ready_set; 15476 cpu_hme_pend_t *cpuhp; 15477 15478 /* Flush cpu hblk pending queues */ 15479 for (i = 0; i < NCPU; i++) { 15480 cpuhp = &cpu_hme_pend[i]; 15481 if (cpuhp->chp_listp != NULL) { 15482 mutex_enter(&cpuhp->chp_mutex); 15483 if (cpuhp->chp_listp == NULL) { 15484 mutex_exit(&cpuhp->chp_mutex); 15485 continue; 15486 } 15487 found_hmeblk = 0; 15488 last_hmeblkp = NULL; 15489 for (hmeblkp = cpuhp->chp_listp; hmeblkp != NULL; 15490 hmeblkp = hmeblkp->hblk_next) { 15491 if (get_hblk_ttesz(hmeblkp) == size) { 15492 if (last_hmeblkp == NULL) { 15493 cpuhp->chp_listp = 15494 hmeblkp->hblk_next; 15495 } else { 15496 last_hmeblkp->hblk_next = 15497 hmeblkp->hblk_next; 15498 } 15499 ASSERT(cpuhp->chp_count > 0); 15500 cpuhp->chp_count--; 15501 found_hmeblk = 1; 15502 break; 15503 } else { 15504 last_hmeblkp = hmeblkp; 15505 } 15506 } 15507 mutex_exit(&cpuhp->chp_mutex); 15508 15509 if (found_hmeblk) { 15510 kpreempt_disable(); 15511 CPUSET_DEL(cpuset, CPU->cpu_id); 15512 xt_sync(cpuset); 15513 xt_sync(cpuset); 15514 kpreempt_enable(); 15515 return (hmeblkp); 15516 } 15517 } 15518 } 15519 return (NULL); 15520 }
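
#if 0
/*
 * Illustrative sketch only, never compiled: the expected pairing of the
 * two-phase hmeblk teardown described above.  A caller unlinks hmeblks
 * from a hash chain under the hash lock, accumulating them on a local
 * list, and purges that list after dropping the lock, which is when the
 * xt_sync() barrier pair may run.
 */
static void
example_hblk_teardown(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp,
    struct hme_blk *pr_hblk)
{
	struct hme_blk *list = NULL;

	SFMMU_HASH_LOCK(hmebp);
	sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 0);
	SFMMU_HASH_UNLOCK(hmebp);
	sfmmu_hblks_list_purge(&list, 0);
}
#endif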