/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - shared or copy-on-write from a vnode/anonymous memory.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/debug.h>
#include <sys/cred.h>
#include <sys/vmsystm.h>
#include <sys/tuneable.h>
#include <sys/bitmap.h>
#include <sys/swap.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/callb.h>
#include <sys/vm.h>
#include <sys/dumphdr.h>
#include <sys/lgrp.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/pvn.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <vm/vpage.h>
#include <sys/proc.h>
#include <sys/task.h>
#include <sys/project.h>
#include <sys/zone.h>
#include <sys/shm_impl.h>

/*
 * segvn_fault needs a temporary page list array.  To avoid calling kmem all
 * the time, it creates a small (PVN_GETPAGE_NUM entry) array and uses it if
 * it can.  In the rare case when this page list is not large enough, it
 * goes and gets a large enough array from kmem.
 *
 * This small page list array covers either 8 pages or 64kB worth of pages -
 * whichever is smaller.
 */
#define	PVN_MAX_GETPAGE_SZ	0x10000
#define	PVN_MAX_GETPAGE_NUM	0x8

#if PVN_MAX_GETPAGE_SZ > PVN_MAX_GETPAGE_NUM * PAGESIZE
#define	PVN_GETPAGE_SZ	ptob(PVN_MAX_GETPAGE_NUM)
#define	PVN_GETPAGE_NUM	PVN_MAX_GETPAGE_NUM
#else
#define	PVN_GETPAGE_SZ	PVN_MAX_GETPAGE_SZ
#define	PVN_GETPAGE_NUM	btop(PVN_MAX_GETPAGE_SZ)
#endif
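/*
 * For example, with 4K pages ptob(8) is 32K, below the 64K cap, so the #if
 * arm yields an 8 entry, 32K array; with 8K pages ptob(8) is exactly 64K,
 * so the #else arm yields the same 8 entries capped at 64K.
 */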
/*
 * Private seg op routines.
 */
static int	segvn_dup(struct seg *seg, struct seg *newseg);
static int	segvn_unmap(struct seg *seg, caddr_t addr, size_t len);
static void	segvn_free(struct seg *seg);
static faultcode_t segvn_fault(struct hat *hat, struct seg *seg,
		    caddr_t addr, size_t len, enum fault_type type,
		    enum seg_rw rw);
static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr);
static int	segvn_setprot(struct seg *seg, caddr_t addr,
		    size_t len, uint_t prot);
static int	segvn_checkprot(struct seg *seg, caddr_t addr,
		    size_t len, uint_t prot);
static int	segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
static size_t	segvn_swapout(struct seg *seg);
static int	segvn_sync(struct seg *seg, caddr_t addr, size_t len,
		    int attr, uint_t flags);
static size_t	segvn_incore(struct seg *seg, caddr_t addr, size_t len,
		    char *vec);
static int	segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
		    int attr, int op, ulong_t *lockmap, size_t pos);
static int	segvn_getprot(struct seg *seg, caddr_t addr, size_t len,
		    uint_t *protv);
static u_offset_t	segvn_getoffset(struct seg *seg, caddr_t addr);
static int	segvn_gettype(struct seg *seg, caddr_t addr);
static int	segvn_getvp(struct seg *seg, caddr_t addr,
		    struct vnode **vpp);
static int	segvn_advise(struct seg *seg, caddr_t addr, size_t len,
		    uint_t behav);
static void	segvn_dump(struct seg *seg);
static int	segvn_pagelock(struct seg *seg, caddr_t addr, size_t len,
		    struct page ***ppp, enum lock_type type, enum seg_rw rw);
static int	segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len,
		    uint_t szc);
static int	segvn_getmemid(struct seg *seg, caddr_t addr,
		    memid_t *memidp);
static lgrp_mem_policy_info_t	*segvn_getpolicy(struct seg *, caddr_t);
static int	segvn_capable(struct seg *seg, segcapability_t capable);

struct seg_ops segvn_ops = {
	segvn_dup,
	segvn_unmap,
	segvn_free,
	segvn_fault,
	segvn_faulta,
	segvn_setprot,
	segvn_checkprot,
	segvn_kluster,
	segvn_swapout,
	segvn_sync,
	segvn_incore,
	segvn_lockop,
	segvn_getprot,
	segvn_getoffset,
	segvn_gettype,
	segvn_getvp,
	segvn_advise,
	segvn_dump,
	segvn_pagelock,
	segvn_setpagesize,
	segvn_getmemid,
	segvn_getpolicy,
	segvn_capable,
};

/*
 * Common zfod structures, provided as a shorthand for others to use.
 */
static segvn_crargs_t zfod_segvn_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
static segvn_crargs_t kzfod_segvn_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER,
	PROT_ALL & ~PROT_USER);
static segvn_crargs_t stack_noexec_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL);

caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs;	/* user zfod argsp */
caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs;	/* kernel zfod argsp */
caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs;	/* executable stack */
caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */
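/*
 * For example (a sketch of typical caller usage), an anonymous zero-fill
 * mapping is created by handing zfod_argsp to as_map():
 *
 *	error = as_map(as, addr, len, segvn_create, zfod_argsp);
 */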
#define	vpgtob(n)	((n) * sizeof (struct vpage))	/* For brevity */

size_t	segvn_comb_thrshld = UINT_MAX;	/* patchable -- see 1196681 */

size_t	segvn_pglock_comb_thrshld = (1UL << 16);	/* 64K */
size_t	segvn_pglock_comb_balign = (1UL << 16);		/* 64K */
uint_t	segvn_pglock_comb_bshift;
size_t	segvn_pglock_comb_palign;

static int	segvn_concat(struct seg *, struct seg *, int);
static int	segvn_extend_prev(struct seg *, struct seg *,
		    struct segvn_crargs *, size_t);
static int	segvn_extend_next(struct seg *, struct seg *,
		    struct segvn_crargs *, size_t);
static void	segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw);
static void	segvn_pagelist_rele(page_t **);
static void	segvn_setvnode_mpss(vnode_t *);
static void	segvn_relocate_pages(page_t **, page_t *);
static int	segvn_full_szcpages(page_t **, uint_t, int *, uint_t *);
static int	segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t,
		    uint_t, page_t **, page_t **, uint_t *, int *);
static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t,
		    caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t,
		    int);
static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t,
		    caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t,
		    int);
static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t,
		    u_offset_t, struct vpage *, page_t **, uint_t,
		    enum fault_type, enum seg_rw, int);
static void	segvn_vpage(struct seg *);
static size_t	segvn_count_swap_by_vpages(struct seg *);

static void segvn_purge(struct seg *seg);
static int segvn_reclaim(void *, caddr_t, size_t, struct page **,
    enum seg_rw, int);
static int shamp_reclaim(void *, caddr_t, size_t, struct page **,
    enum seg_rw, int);

static int sameprot(struct seg *, caddr_t, size_t);

static int segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t);
static int segvn_clrszc(struct seg *);
static struct seg *segvn_split_seg(struct seg *, caddr_t);
static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t,
    ulong_t, uint_t);

static void segvn_hat_rgn_unload_callback(caddr_t, caddr_t, caddr_t,
    size_t, void *, u_offset_t);

static struct kmem_cache *segvn_cache;
static struct kmem_cache **segvn_szc_cache;

#ifdef VM_STATS
static struct segvnvmstats_str {
	ulong_t	fill_vp_pages[31];
	ulong_t fltvnpages[49];
	ulong_t	fullszcpages[10];
	ulong_t	relocatepages[3];
	ulong_t	fltanpages[17];
	ulong_t	pagelock[2];
	ulong_t	demoterange[3];
} segvnvmstats;
#endif /* VM_STATS */

#define	SDR_RANGE	1		/* demote entire range */
#define	SDR_END		2		/* demote non aligned ends only */
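/*
 * CALC_LPG_REGION computes the large page aligned region [lpgaddr, lpgeaddr)
 * enclosing [addr, addr + len): addr is rounded down and addr + len is
 * rounded up to the given pgsz boundary.  A zero len yields
 * lpgaddr == lpgeaddr == addr.
 */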
#define	CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) {	\
	if ((len) != 0) { 						\
		lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz); 	\
		ASSERT(lpgaddr >= (seg)->s_base); 			\
		lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) + 	\
		    (len)), pgsz); 					\
		ASSERT(lpgeaddr > lpgaddr); 				\
		ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size); 	\
	} else { 							\
		lpgeaddr = lpgaddr = (addr); 				\
	} 								\
}

/*ARGSUSED*/
static int
segvn_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct segvn_data *svd = buf;

	rw_init(&svd->lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&svd->segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);
	svd->svn_trnext = svd->svn_trprev = NULL;
	return (0);
}

/*ARGSUSED1*/
static void
segvn_cache_destructor(void *buf, void *cdrarg)
{
	struct segvn_data *svd = buf;

	rw_destroy(&svd->lock);
	mutex_destroy(&svd->segfree_syncmtx);
}

/*ARGSUSED*/
static int
svntr_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	bzero(buf, sizeof (svntr_t));
	return (0);
}

/*
 * Patching this variable to non-zero allows the system to run with
 * stacks marked as "not executable".  It's a bit of a kludge, but is
 * provided as a tweakable for platforms that export those ABIs
 * (e.g. sparc V8) that have executable stacks enabled by default.
 * There are also some restrictions for platforms that don't actually
 * implement 'noexec' protections.
 *
 * Once enabled, the system is (therefore) unable to provide a fully
 * ABI-compliant execution environment, though practically speaking,
 * most everything works.  The exceptions are generally some interpreters
 * and debuggers that create executable code on the stack and jump
 * into it (without explicitly mprotecting the address range to include
 * PROT_EXEC).
 *
 * One important class of applications that are disabled are those
 * that have been transformed into malicious agents using one of the
 * numerous "buffer overflow" attacks.  See 4007890.
 */
int noexec_user_stack = 0;
int noexec_user_stack_log = 1;
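/*
 * For example, administrators typically enable this from /etc/system rather
 * than patching a live kernel:
 *
 *	set noexec_user_stack = 1
 *	set noexec_user_stack_log = 1
 */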
int segvn_lpg_disable = 0;
uint_t segvn_maxpgszc = 0;

ulong_t segvn_vmpss_clrszc_cnt;
ulong_t segvn_vmpss_clrszc_err;
ulong_t segvn_fltvnpages_clrszc_cnt;
ulong_t segvn_fltvnpages_clrszc_err;
ulong_t segvn_setpgsz_align_err;
ulong_t segvn_setpgsz_anon_align_err;
ulong_t segvn_setpgsz_getattr_err;
ulong_t segvn_setpgsz_eof_err;
ulong_t segvn_faultvnmpss_align_err1;
ulong_t segvn_faultvnmpss_align_err2;
ulong_t segvn_faultvnmpss_align_err3;
ulong_t segvn_faultvnmpss_align_err4;
ulong_t segvn_faultvnmpss_align_err5;
ulong_t	segvn_vmpss_pageio_deadlk_err;

int segvn_use_regions = 1;

/*
 * Segvn supports text replication optimization for NUMA platforms.  Text
 * replicas are represented by anon maps (amp).  There's one amp per text
 * file region per lgroup.  A process chooses the amp for each of its text
 * mappings based on the lgroup assignment of its main thread (t_tid = 1).
 * All processes that want a replica on a particular lgroup for the same
 * text file mapping share the same amp.  amps are looked up in the
 * svntr_hashtab hash table with vp,off,size,szc used as a key.  Text
 * replication segments are read only MAP_PRIVATE|MAP_TEXT segments that map
 * a vnode.  Replication is achieved by forcing COW faults from the vnode to
 * the amp and mapping amp pages instead of vnode pages.  The replication
 * amp is assigned to a segment when it gets its first pagefault.  To handle
 * main thread lgroup rehoming, segvn_trasync_thread periodically rechecks
 * whether the process still maps an amp local to the main thread.  If not,
 * the async thread forces the process to remap to an amp in the new home
 * lgroup of the main thread.  The current text replication implementation
 * only benefits workloads that do most of their work in the main thread of
 * a process, or whose threads all run in the same lgroup.  To extend the
 * text replication benefit to other types of multithreaded workloads,
 * further work would be needed in the hat layer to allow the same virtual
 * address in the same hat to simultaneously map different physical
 * addresses (i.e. page table replication would be needed for x86).
 *
 * amp pages are used instead of vnode pages as long as the segment has a
 * very simple life cycle: it's created via segvn_create(), handles S_EXEC
 * (S_READ) pagefaults and is fully unmapped.  If anything more complicated
 * happens, such as a protection change, a real COW fault, a pagesize
 * change, an MC_LOCK request or a partial unmap, we turn off text
 * replication by converting the segment back to a vnode only segment
 * (unmap the segment's address range and set svd->amp to NULL).
 *
 * The original file can be changed after an amp is inserted into
 * svntr_hashtab.  Processes that are launched after the file has changed
 * can't use the replicas created prior to the file change.  To implement
 * this functionality hash entries are timestamped.  A replica can only be
 * used if the current file modification time is the same as the timestamp
 * saved when the hash entry was created.  However, timestamps alone are not
 * sufficient to detect file modification via mmap(MAP_SHARED) mappings, so
 * we deal with file changes via MAP_SHARED mappings differently.  When
 * writable MAP_SHARED mappings are created to vnodes marked as executable,
 * we mark all existing replicas for this vnode as not usable for future
 * text mappings.  And we don't create new replicas for files that currently
 * have potentially writable MAP_SHARED mappings (i.e.
 * vn_is_mapped(V_WRITE) is true).
 */

#define	SEGVN_TEXTREPL_MAXBYTES_FACTOR	(20)
size_t	segvn_textrepl_max_bytes_factor = SEGVN_TEXTREPL_MAXBYTES_FACTOR;
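/*
 * segvn_init() sizes the text replication budget from this factor:
 * segvn_textrepl_max_bytes = ptob(physmem) / segvn_textrepl_max_bytes_factor,
 * i.e. 5% of physical memory for the default factor of 20.
 */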
static ulong_t		svntr_hashtab_sz = 512;
static svntr_bucket_t	*svntr_hashtab = NULL;
static struct kmem_cache *svntr_cache;
static svntr_stats_t	*segvn_textrepl_stats;
static ksema_t		segvn_trasync_sem;

int	segvn_disable_textrepl = 1;
size_t	textrepl_size_thresh = (size_t)-1;
size_t	segvn_textrepl_bytes = 0;
size_t	segvn_textrepl_max_bytes = 0;
clock_t	segvn_update_textrepl_interval = 0;
int	segvn_update_tr_time = 10;
int	segvn_disable_textrepl_update = 0;

static void segvn_textrepl(struct seg *);
static void segvn_textunrepl(struct seg *, int);
static void segvn_inval_trcache(vnode_t *);
static void segvn_trasync_thread(void);
static void segvn_trupdate_wakeup(void *);
static void segvn_trupdate(void);
static void segvn_trupdate_seg(struct seg *, segvn_data_t *, svntr_t *,
    ulong_t);

/*
 * Initialize segvn data structures
 */
void
segvn_init(void)
{
	uint_t maxszc;
	uint_t szc;
	size_t pgsz;

	segvn_cache = kmem_cache_create("segvn_cache",
	    sizeof (struct segvn_data), 0,
	    segvn_cache_constructor, segvn_cache_destructor, NULL,
	    NULL, NULL, 0);

	if (segvn_lpg_disable == 0) {
		szc = maxszc = page_num_pagesizes() - 1;
		if (szc == 0) {
			segvn_lpg_disable = 1;
		}
		if (page_get_pagesize(0) != PAGESIZE) {
			panic("segvn_init: bad szc 0");
			/*NOTREACHED*/
		}
		while (szc != 0) {
			pgsz = page_get_pagesize(szc);
			if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) {
				panic("segvn_init: bad szc %d", szc);
				/*NOTREACHED*/
			}
			szc--;
		}
		if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc)
			segvn_maxpgszc = maxszc;
	}

	if (segvn_maxpgszc) {
		segvn_szc_cache = (struct kmem_cache **)kmem_alloc(
		    (segvn_maxpgszc + 1) * sizeof (struct kmem_cache *),
		    KM_SLEEP);
	}

	for (szc = 1; szc <= segvn_maxpgszc; szc++) {
		char	str[32];

		(void) sprintf(str, "segvn_szc_cache%d", szc);
		segvn_szc_cache[szc] = kmem_cache_create(str,
		    page_get_pagecnt(szc) * sizeof (page_t *), 0,
		    NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
	}

	if (segvn_use_regions && !hat_supported(HAT_SHARED_REGIONS, NULL))
		segvn_use_regions = 0;

	/*
	 * For now shared regions and text replication segvn support
	 * are mutually exclusive. This is acceptable because
	 * currently significant benefit from text replication was
	 * only observed on AMD64 NUMA platforms (due to relatively
	 * small L2$ size) and currently we don't support shared
	 * regions on x86.
	 */
	if (segvn_use_regions && !segvn_disable_textrepl) {
		segvn_disable_textrepl = 1;
	}

#if defined(_LP64)
	if (lgrp_optimizations() && textrepl_size_thresh != (size_t)-1 &&
	    !segvn_disable_textrepl) {
		ulong_t i;
		size_t hsz = svntr_hashtab_sz * sizeof (svntr_bucket_t);

		svntr_cache = kmem_cache_create("svntr_cache",
		    sizeof (svntr_t), 0, svntr_cache_constructor, NULL,
		    NULL, NULL, NULL, 0);
		svntr_hashtab = kmem_zalloc(hsz, KM_SLEEP);
		for (i = 0; i < svntr_hashtab_sz; i++) {
			mutex_init(&svntr_hashtab[i].tr_lock, NULL,
			    MUTEX_DEFAULT, NULL);
		}
		segvn_textrepl_max_bytes = ptob(physmem) /
		    segvn_textrepl_max_bytes_factor;
		segvn_textrepl_stats = kmem_zalloc(NCPU *
		    sizeof (svntr_stats_t), KM_SLEEP);
		sema_init(&segvn_trasync_sem, 0, NULL, SEMA_DEFAULT, NULL);
		(void) thread_create(NULL, 0, segvn_trasync_thread,
		    NULL, 0, &p0, TS_RUN, minclsyspri);
	}
#endif

	if (!ISP2(segvn_pglock_comb_balign) ||
	    segvn_pglock_comb_balign < PAGESIZE) {
		segvn_pglock_comb_balign = 1UL << 16; /* 64K */
	}
	segvn_pglock_comb_bshift = highbit(segvn_pglock_comb_balign) - 1;
	segvn_pglock_comb_palign = btop(segvn_pglock_comb_balign);
}

#define	SEGVN_PAGEIO	((void *)0x1)
#define	SEGVN_NOPAGEIO	((void *)0x2)
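/*
 * v_mpssdata caches, per vnode, whether the underlying filesystem supports
 * VOP_PAGEIO(): segvn_setvnode_mpss() probes with a degenerate zero-length
 * request, which a filesystem that implements pageio is expected to reject
 * with EINVAL; any other result (e.g. ENOSYS) marks the vnode
 * SEGVN_NOPAGEIO.
 */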
static void
segvn_setvnode_mpss(vnode_t *vp)
{
	int err;

	ASSERT(vp->v_mpssdata == NULL ||
	    vp->v_mpssdata == SEGVN_PAGEIO ||
	    vp->v_mpssdata == SEGVN_NOPAGEIO);

	if (vp->v_mpssdata == NULL) {
		if (vn_vmpss_usepageio(vp)) {
			err = VOP_PAGEIO(vp, (page_t *)NULL,
			    (u_offset_t)0, 0, 0, CRED(), NULL);
		} else {
			err = ENOSYS;
		}
		/*
		 * set v_mpssdata just once per vnode life
		 * so that it never changes.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_mpssdata == NULL) {
			if (err == EINVAL) {
				vp->v_mpssdata = SEGVN_PAGEIO;
			} else {
				vp->v_mpssdata = SEGVN_NOPAGEIO;
			}
		}
		mutex_exit(&vp->v_lock);
	}
}
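/*
 * Create a segvn segment for the address range covered by "seg", using the
 * segvn_crargs pointed to by "argsp" (e.g. one of the zfod shorthands
 * above).  Returns 0 on success or an errno value on failure.
 */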
int
segvn_create(struct seg *seg, void *argsp)
{
	struct segvn_crargs *a = (struct segvn_crargs *)argsp;
	struct segvn_data *svd;
	size_t swresv = 0;
	struct cred *cred;
	struct anon_map *amp;
	int error = 0;
	size_t pgsz;
	lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT;
	int use_rgn = 0;
	int trok = 0;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) {
		panic("segvn_create type");
		/*NOTREACHED*/
	}

	/*
	 * Check arguments.  If a shared anon structure is given then
	 * it is illegal to also specify a vp.
	 */
	if (a->amp != NULL && a->vp != NULL) {
		panic("segvn_create anon_map");
		/*NOTREACHED*/
	}

	if (a->type == MAP_PRIVATE && (a->flags & MAP_TEXT) &&
	    a->vp != NULL && a->prot == (PROT_USER | PROT_READ | PROT_EXEC) &&
	    segvn_use_regions) {
		use_rgn = 1;
	}

	/* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */
	if (a->type == MAP_SHARED)
		a->flags &= ~MAP_NORESERVE;

	if (a->szc != 0) {
		if (segvn_lpg_disable != 0 || (a->szc == AS_MAP_NO_LPOOB) ||
		    (a->amp != NULL && a->type == MAP_PRIVATE) ||
		    (a->flags & MAP_NORESERVE) || seg->s_as == &kas) {
			a->szc = 0;
		} else {
			if (a->szc > segvn_maxpgszc)
				a->szc = segvn_maxpgszc;
			pgsz = page_get_pagesize(a->szc);
			if (!IS_P2ALIGNED(seg->s_base, pgsz) ||
			    !IS_P2ALIGNED(seg->s_size, pgsz)) {
				a->szc = 0;
			} else if (a->vp != NULL) {
				if (IS_SWAPFSVP(a->vp) || VN_ISKAS(a->vp)) {
					/*
					 * paranoid check.
					 * hat_page_demote() is not supported
					 * on swapfs pages.
					 */
					a->szc = 0;
				} else if (map_addr_vacalign_check(seg->s_base,
				    a->offset & PAGEMASK)) {
					a->szc = 0;
				}
			} else if (a->amp != NULL) {
				pgcnt_t anum = btopr(a->offset);
				pgcnt_t pgcnt = page_get_pagecnt(a->szc);
				if (!IS_P2ALIGNED(anum, pgcnt)) {
					a->szc = 0;
				}
			}
		}
	}

	/*
	 * If segment may need private pages, reserve them now.
	 */
	if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) ||
	    (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) {
		if (anon_resv_zone(seg->s_size,
		    seg->s_as->a_proc->p_zone) == 0)
			return (EAGAIN);
		swresv = seg->s_size;
		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
		    seg, swresv, 1);
	}

	/*
	 * Reserve any mapping structures that may be required.
	 *
	 * Don't do it for segments that may use regions. It's currently a
	 * noop in the hat implementations anyway.
	 */
	if (!use_rgn) {
		hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP);
	}

	if (a->cred) {
		cred = a->cred;
		crhold(cred);
	} else {
		crhold(cred = CRED());
	}

	/* Inform the vnode of the new mapping */
	if (a->vp != NULL) {
		error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK,
		    seg->s_as, seg->s_base, seg->s_size, a->prot,
		    a->maxprot, a->type, cred, NULL);
		if (error) {
			if (swresv != 0) {
				anon_unresv_zone(swresv,
				    seg->s_as->a_proc->p_zone);
				TRACE_3(TR_FAC_VM, TR_ANON_PROC,
				    "anon proc:%p %lu %u", seg, swresv, 0);
			}
			crfree(cred);
			if (!use_rgn) {
				hat_unload(seg->s_as->a_hat, seg->s_base,
				    seg->s_size, HAT_UNLOAD_UNMAP);
			}
			return (error);
		}
		/*
		 * svntr_hashtab will be NULL if we support shared regions.
		 */
		trok = ((a->flags & MAP_TEXT) &&
		    (seg->s_size > textrepl_size_thresh ||
		    (a->flags & _MAP_TEXTREPL)) &&
		    lgrp_optimizations() && svntr_hashtab != NULL &&
		    a->type == MAP_PRIVATE && swresv == 0 &&
		    !(a->flags & MAP_NORESERVE) &&
		    seg->s_as != &kas && a->vp->v_type == VREG);

		ASSERT(!trok || !use_rgn);
	}

	/*
	 * MAP_NORESERVE mappings don't count towards the VSZ of a process
	 * until we fault the pages in.
	 */
	if ((a->vp == NULL || a->vp->v_type != VREG) &&
	    a->flags & MAP_NORESERVE) {
		seg->s_as->a_resvsize -= seg->s_size;
	}

	/*
	 * If more than one segment in the address space, and they're adjacent
	 * virtually, try to concatenate them.  Don't concatenate if an
	 * explicit anon_map structure was supplied (e.g., SystemV shared
	 * memory) or if we'll use text replication for this segment.
	 */
	if (a->amp == NULL && !use_rgn && !trok) {
		struct seg *pseg, *nseg;
		struct segvn_data *psvd, *nsvd;
		lgrp_mem_policy_t ppolicy, npolicy;
		uint_t	lgrp_mem_policy_flags = 0;
		extern lgrp_mem_policy_t lgrp_mem_default_policy;

		/*
		 * Memory policy flags (lgrp_mem_policy_flags) is valid when
		 * extending stack/heap segments.
		 */
		if ((a->vp == NULL) && (a->type == MAP_PRIVATE) &&
		    !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) {
			lgrp_mem_policy_flags = a->lgrp_mem_policy_flags;
		} else {
			/*
			 * Get policy when not extending it from another
			 * segment
			 */
			mpolicy = lgrp_mem_policy_default(seg->s_size,
			    a->type);
		}

		/*
		 * First, try to concatenate the previous and new segments
		 */
		pseg = AS_SEGPREV(seg->s_as, seg);
		if (pseg != NULL &&
		    pseg->s_base + pseg->s_size == seg->s_base &&
		    pseg->s_ops == &segvn_ops) {
			/*
			 * Get memory allocation policy from previous segment.
			 * When extension is specified (e.g. for heap) apply
			 * this policy to the new segment regardless of the
			 * outcome of segment concatenation.  Extension occurs
			 * for non-default policy otherwise default policy is
			 * used and is based on extended segment size.
			 */
			psvd = (struct segvn_data *)pseg->s_data;
			ppolicy = psvd->policy_info.mem_policy;
			if (lgrp_mem_policy_flags ==
			    LGRP_MP_FLAG_EXTEND_UP) {
				if (ppolicy != lgrp_mem_default_policy) {
					mpolicy = ppolicy;
				} else {
					mpolicy = lgrp_mem_policy_default(
					    pseg->s_size + seg->s_size,
					    a->type);
				}
			}

			if (mpolicy == ppolicy &&
			    (pseg->s_size + seg->s_size <=
			    segvn_comb_thrshld || psvd->amp == NULL) &&
			    segvn_extend_prev(pseg, seg, a, swresv) == 0) {
				/*
				 * success! now try to concatenate
				 * with following seg
				 */
				crfree(cred);
				nseg = AS_SEGNEXT(pseg->s_as, pseg);
				if (nseg != NULL &&
				    nseg != pseg &&
				    nseg->s_ops == &segvn_ops &&
				    pseg->s_base + pseg->s_size ==
				    nseg->s_base)
					(void) segvn_concat(pseg, nseg, 0);
				ASSERT(pseg->s_szc == 0 ||
				    (a->szc == pseg->s_szc &&
				    IS_P2ALIGNED(pseg->s_base, pgsz) &&
				    IS_P2ALIGNED(pseg->s_size, pgsz)));
				return (0);
			}
		}

		/*
		 * Failed, so try to concatenate with following seg
		 */
		nseg = AS_SEGNEXT(seg->s_as, seg);
		if (nseg != NULL &&
		    seg->s_base + seg->s_size == nseg->s_base &&
		    nseg->s_ops == &segvn_ops) {
			/*
			 * Get memory allocation policy from next segment.
			 * When extension is specified (e.g. for stack) apply
			 * this policy to the new segment regardless of the
			 * outcome of segment concatenation.  Extension occurs
			 * for non-default policy otherwise default policy is
			 * used and is based on extended segment size.
			 */
			nsvd = (struct segvn_data *)nseg->s_data;
			npolicy = nsvd->policy_info.mem_policy;
			if (lgrp_mem_policy_flags ==
			    LGRP_MP_FLAG_EXTEND_DOWN) {
				if (npolicy != lgrp_mem_default_policy) {
					mpolicy = npolicy;
				} else {
					mpolicy = lgrp_mem_policy_default(
					    nseg->s_size + seg->s_size,
					    a->type);
				}
			}

			if (mpolicy == npolicy &&
			    segvn_extend_next(seg, nseg, a, swresv) == 0) {
				crfree(cred);
				ASSERT(nseg->s_szc == 0 ||
				    (a->szc == nseg->s_szc &&
				    IS_P2ALIGNED(nseg->s_base, pgsz) &&
				    IS_P2ALIGNED(nseg->s_size, pgsz)));
				return (0);
			}
		}
	}

	if (a->vp != NULL) {
		VN_HOLD(a->vp);
		if (a->type == MAP_SHARED)
			lgrp_shm_policy_init(NULL, a->vp);
	}
	svd = kmem_cache_alloc(segvn_cache, KM_SLEEP);

	seg->s_ops = &segvn_ops;
	seg->s_data = (void *)svd;
	seg->s_szc = a->szc;

	svd->seg = seg;
	svd->vp = a->vp;
	/*
	 * Anonymous mappings have no backing file so the offset is
	 * meaningless.
	 */
	svd->offset = a->vp ? (a->offset & PAGEMASK) : 0;
	svd->prot = a->prot;
	svd->maxprot = a->maxprot;
	svd->pageprot = 0;
	svd->type = a->type;
	svd->vpage = NULL;
	svd->cred = cred;
	svd->advice = MADV_NORMAL;
	svd->pageadvice = 0;
	svd->flags = (ushort_t)a->flags;
	svd->softlockcnt = 0;
	svd->softlockcnt_sbase = 0;
	svd->softlockcnt_send = 0;
	svd->rcookie = HAT_INVALID_REGION_COOKIE;
	svd->pageswap = 0;

	if (a->szc != 0 && a->vp != NULL) {
		segvn_setvnode_mpss(a->vp);
	}
	if (svd->type == MAP_SHARED && svd->vp != NULL &&
	    (svd->vp->v_flag & VVMEXEC) && (svd->prot & PROT_WRITE)) {
		ASSERT(vn_is_mapped(svd->vp, V_WRITE));
		segvn_inval_trcache(svd->vp);
	}

	amp = a->amp;
	if ((svd->amp = amp) == NULL) {
		svd->anon_index = 0;
		if (svd->type == MAP_SHARED) {
			svd->swresv = 0;
			/*
			 * Shared mappings to a vp need no other setup.
			 * If we have a shared mapping to an anon_map object
			 * which hasn't been allocated yet,  allocate the
			 * struct now so that it will be properly shared
			 * by remembering the swap reservation there.
			 */
			if (a->vp == NULL) {
				svd->amp = anonmap_alloc(seg->s_size, swresv,
				    ANON_SLEEP);
				svd->amp->a_szc = seg->s_szc;
			}
		} else {
			/*
			 * Private mapping (with or without a vp).
			 * Allocate anon_map when needed.
			 */
			svd->swresv = swresv;
		}
	} else {
		pgcnt_t anon_num;

		/*
		 * Mapping to an existing anon_map structure without a vp.
		 * For now we will ensure that the segment size isn't larger
		 * than the size - offset gives us.  Later on we may wish to
		 * have the anon array dynamically allocated itself so that
		 * we don't always have to allocate all the anon pointer slots.
		 * This of course involves adding extra code to check that we
		 * aren't trying to use an anon pointer slot beyond the end
		 * of the currently allocated anon array.
		 */
		if ((amp->size - a->offset) < seg->s_size) {
			panic("segvn_create anon_map size");
			/*NOTREACHED*/
		}

		anon_num = btopr(a->offset);

		if (a->type == MAP_SHARED) {
			/*
			 * SHARED mapping to a given anon_map.
			 */
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			amp->refcnt++;
			if (a->szc > amp->a_szc) {
				amp->a_szc = a->szc;
			}
			ANON_LOCK_EXIT(&amp->a_rwlock);
			svd->anon_index = anon_num;
			svd->swresv = 0;
		} else {
			/*
			 * PRIVATE mapping to a given anon_map.
			 * Make sure that all the needed anon
			 * structures are created (so that we will
			 * share the underlying pages if nothing
			 * is written by this mapping) and then
			 * duplicate the anon array as is done
			 * when a privately mapped segment is dup'ed.
			 */
			struct anon *ap;
			caddr_t addr;
			caddr_t eaddr;
			ulong_t anon_idx;
			int hat_flag = HAT_LOAD;

			if (svd->flags & MAP_TEXT) {
				hat_flag |= HAT_LOAD_TEXT;
			}

			svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
			svd->amp->a_szc = seg->s_szc;
			svd->anon_index = 0;
			svd->swresv = swresv;

			/*
			 * Prevent 2 threads from allocating anon
			 * slots simultaneously.
			 */
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			eaddr = seg->s_base + seg->s_size;

			for (anon_idx = anon_num, addr = seg->s_base;
			    addr < eaddr; addr += PAGESIZE, anon_idx++) {
				page_t *pp;

				if ((ap = anon_get_ptr(amp->ahp,
				    anon_idx)) != NULL)
					continue;

				/*
				 * Allocate the anon struct now.
				 * Might as well load up translation
				 * to the page while we're at it...
				 */
				pp = anon_zero(seg, addr, &ap, cred);
				if (ap == NULL || pp == NULL) {
					panic("segvn_create anon_zero");
					/*NOTREACHED*/
				}

				/*
				 * Re-acquire the anon_map lock and
				 * initialize the anon array entry.
				 */
				ASSERT(anon_get_ptr(amp->ahp,
				    anon_idx) == NULL);
				(void) anon_set_ptr(amp->ahp, anon_idx, ap,
				    ANON_SLEEP);

				ASSERT(seg->s_szc == 0);
				ASSERT(!IS_VMODSORT(pp->p_vnode));

				ASSERT(use_rgn == 0);
				hat_memload(seg->s_as->a_hat, addr, pp,
				    svd->prot & ~PROT_WRITE, hat_flag);

				page_unlock(pp);
			}
			ASSERT(seg->s_szc == 0);
			anon_dup(amp->ahp, anon_num, svd->amp->ahp,
			    0, seg->s_size);
			ANON_LOCK_EXIT(&amp->a_rwlock);
		}
	}

	/*
	 * Set default memory allocation policy for segment
	 *
	 * Always set policy for private memory at least for initialization
	 * even if this is a shared memory segment
	 */
	(void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size);

	if (svd->type == MAP_SHARED)
		(void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index,
		    svd->vp, svd->offset, seg->s_size);

	if (use_rgn) {
		ASSERT(!trok);
		ASSERT(svd->amp == NULL);
		svd->rcookie = hat_join_region(seg->s_as->a_hat, seg->s_base,
		    seg->s_size, (void *)svd->vp, svd->offset, svd->prot,
		    (uchar_t)seg->s_szc, segvn_hat_rgn_unload_callback,
		    HAT_REGION_TEXT);
	}

	ASSERT(!trok || !(svd->prot & PROT_WRITE));
	svd->tr_state = trok ? SEGVN_TR_INIT : SEGVN_TR_OFF;

	return (0);
}

/*
 * Concatenate two existing segments, if possible.
 * Returns 0 on success, -1 if the two segments are not compatible,
 * or -2 on memory allocation failure.
 * If amp_cat == 1 then try to concatenate segments with anon maps.
 */
static int
segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat)
{
	struct segvn_data *svd1 = seg1->s_data;
	struct segvn_data *svd2 = seg2->s_data;
	struct anon_map *amp1 = svd1->amp;
	struct anon_map *amp2 = svd2->amp;
	struct vpage *vpage1 = svd1->vpage;
	struct vpage *vpage2 = svd2->vpage, *nvpage = NULL;
	size_t size, nvpsize;
	pgcnt_t npages1, npages2;

	ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as);
	ASSERT(AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock));
	ASSERT(seg1->s_ops == seg2->s_ops);

	if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie) ||
	    HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) {
		return (-1);
	}

	/* both segments exist, try to merge them */
#define	incompat(x)	(svd1->x != svd2->x)
	if (incompat(vp) || incompat(maxprot) ||
	    (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) ||
	    (!svd1->pageprot && !svd2->pageprot && incompat(prot)) ||
	    incompat(type) || incompat(cred) || incompat(flags) ||
	    seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) ||
	    (svd2->softlockcnt > 0) || svd1->softlockcnt_send > 0)
		return (-1);
#undef incompat

	/*
	 * vp == NULL implies zfod, offset doesn't matter
	 */
	if (svd1->vp != NULL &&
	    svd1->offset + seg1->s_size != svd2->offset) {
		return (-1);
	}

	/*
	 * Don't concatenate if either segment uses text replication.
	 */
	if (svd1->tr_state != SEGVN_TR_OFF || svd2->tr_state != SEGVN_TR_OFF) {
		return (-1);
	}

	/*
	 * Fail early if we're not supposed to concatenate
	 * segments with non NULL amp.
	 */
	if (amp_cat == 0 && (amp1 != NULL || amp2 != NULL)) {
		return (-1);
	}

	if (svd1->vp == NULL && svd1->type == MAP_SHARED) {
		if (amp1 != amp2) {
			return (-1);
		}
		if (amp1 != NULL && svd1->anon_index + btop(seg1->s_size) !=
		    svd2->anon_index) {
			return (-1);
		}
		ASSERT(amp1 == NULL || amp1->refcnt >= 2);
	}

	/*
	 * If either seg has vpages, create a new merged vpage array.
	 */
	if (vpage1 != NULL || vpage2 != NULL) {
		struct vpage *vp, *evp;

		npages1 = seg_pages(seg1);
		npages2 = seg_pages(seg2);
		nvpsize = vpgtob(npages1 + npages2);

		if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) {
			return (-2);
		}

		if (vpage1 != NULL) {
			bcopy(vpage1, nvpage, vpgtob(npages1));
		} else {
			evp = nvpage + npages1;
			for (vp = nvpage; vp < evp; vp++) {
				VPP_SETPROT(vp, svd1->prot);
				VPP_SETADVICE(vp, svd1->advice);
			}
		}

		if (vpage2 != NULL) {
			bcopy(vpage2, nvpage + npages1, vpgtob(npages2));
		} else {
			evp = nvpage + npages1 + npages2;
			for (vp = nvpage + npages1; vp < evp; vp++) {
				VPP_SETPROT(vp, svd2->prot);
				VPP_SETADVICE(vp, svd2->advice);
			}
		}

		if (svd2->pageswap && (!svd1->pageswap && svd1->swresv)) {
			ASSERT(svd1->swresv == seg1->s_size);
			ASSERT(!(svd1->flags & MAP_NORESERVE));
			ASSERT(!(svd2->flags & MAP_NORESERVE));
			evp = nvpage + npages1;
			for (vp = nvpage; vp < evp; vp++) {
				VPP_SETSWAPRES(vp);
			}
		}

		if (svd1->pageswap && (!svd2->pageswap && svd2->swresv)) {
			ASSERT(svd2->swresv == seg2->s_size);
			ASSERT(!(svd1->flags & MAP_NORESERVE));
			ASSERT(!(svd2->flags & MAP_NORESERVE));
			vp = nvpage + npages1;
			evp = vp + npages2;
			for (; vp < evp; vp++) {
				VPP_SETSWAPRES(vp);
			}
		}
	}
	ASSERT((vpage1 != NULL || vpage2 != NULL) ||
	    (svd1->pageswap == 0 && svd2->pageswap == 0));

	/*
	 * If either segment has private pages, create a new merged anon
	 * array.  If merging shared anon segments just decrement the anon
	 * map's refcnt.
	 */
	if (amp1 != NULL && svd1->type == MAP_SHARED) {
		ASSERT(amp1 == amp2 && svd1->vp == NULL);
		ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
		ASSERT(amp1->refcnt >= 2);
		amp1->refcnt--;
		ANON_LOCK_EXIT(&amp1->a_rwlock);
		svd2->amp = NULL;
	} else if (amp1 != NULL || amp2 != NULL) {
		struct anon_hdr *nahp;
		struct anon_map *namp = NULL;
		size_t asize;

		ASSERT(svd1->type == MAP_PRIVATE);

		asize = seg1->s_size + seg2->s_size;
		if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) {
			if (nvpage != NULL) {
				kmem_free(nvpage, nvpsize);
			}
			return (-2);
		}
		if (amp1 != NULL) {
			/*
			 * XXX anon rwlock is not really needed because
			 * this is a private segment and we are writers.
			 */
			ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
			ASSERT(amp1->refcnt == 1);
			if (anon_copy_ptr(amp1->ahp, svd1->anon_index,
			    nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) {
				anon_release(nahp, btop(asize));
				ANON_LOCK_EXIT(&amp1->a_rwlock);
				if (nvpage != NULL) {
					kmem_free(nvpage, nvpsize);
				}
				return (-2);
			}
		}
		if (amp2 != NULL) {
			ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
			ASSERT(amp2->refcnt == 1);
			if (anon_copy_ptr(amp2->ahp, svd2->anon_index,
			    nahp, btop(seg1->s_size), btop(seg2->s_size),
			    ANON_NOSLEEP)) {
				anon_release(nahp, btop(asize));
				ANON_LOCK_EXIT(&amp2->a_rwlock);
				if (amp1 != NULL) {
					ANON_LOCK_EXIT(&amp1->a_rwlock);
				}
				if (nvpage != NULL) {
					kmem_free(nvpage, nvpsize);
				}
				return (-2);
			}
		}
		if (amp1 != NULL) {
			namp = amp1;
			anon_release(amp1->ahp, btop(amp1->size));
		}
		if (amp2 != NULL) {
			if (namp == NULL) {
				ASSERT(amp1 == NULL);
				namp = amp2;
				anon_release(amp2->ahp, btop(amp2->size));
			} else {
				amp2->refcnt--;
				ANON_LOCK_EXIT(&amp2->a_rwlock);
				anonmap_free(amp2);
			}
			svd2->amp = NULL; /* needed for seg_free */
		}
		namp->ahp = nahp;
		namp->size = asize;
		svd1->amp = namp;
		svd1->anon_index = 0;
		ANON_LOCK_EXIT(&namp->a_rwlock);
	}
	/*
	 * Now free the old vpage structures.
	 */
	if (nvpage != NULL) {
		if (vpage1 != NULL) {
			kmem_free(vpage1, vpgtob(npages1));
		}
		if (vpage2 != NULL) {
			svd2->vpage = NULL;
			kmem_free(vpage2, vpgtob(npages2));
		}
		if (svd2->pageprot) {
			svd1->pageprot = 1;
		}
		if (svd2->pageadvice) {
			svd1->pageadvice = 1;
		}
		if (svd2->pageswap) {
			svd1->pageswap = 1;
		}
		svd1->vpage = nvpage;
	}

	/* all looks ok, merge segments */
	svd1->swresv += svd2->swresv;
	svd2->swresv = 0;	/* so seg_free doesn't release swap space */
	size = seg2->s_size;
	seg_free(seg2);
	seg1->s_size += size;
	return (0);
}

/*
 * Extend the previous segment (seg1) to include the
 * new segment (seg2 + a), if possible.
 * Return 0 on success.
 */
static int
segvn_extend_prev(struct seg *seg1, struct seg *seg2, struct segvn_crargs *a,
    size_t swresv)
{
	struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data;
	size_t size;
	struct anon_map *amp1;
	struct vpage *new_vpage;

	/*
	 * We don't need any segment level locks for "segvn" data
	 * since the address space is "write" locked.
	 */
	ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock));

	if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie)) {
		return (-1);
	}

	/* second segment is new, try to extend first */
	/* XXX - should also check cred */
	if (svd1->vp != a->vp || svd1->maxprot != a->maxprot ||
	    (!svd1->pageprot && (svd1->prot != a->prot)) ||
	    svd1->type != a->type || svd1->flags != a->flags ||
	    seg1->s_szc != a->szc || svd1->softlockcnt_send > 0)
		return (-1);

	/* vp == NULL implies zfod, offset doesn't matter */
	if (svd1->vp != NULL &&
	    svd1->offset + seg1->s_size != (a->offset & PAGEMASK))
		return (-1);

	if (svd1->tr_state != SEGVN_TR_OFF) {
		return (-1);
	}

	amp1 = svd1->amp;
	if (amp1) {
		pgcnt_t newpgs;

		/*
		 * Segment has private pages, can data structures
		 * be expanded?
		 *
		 * Acquire the anon_map lock to prevent it from changing,
		 * if it is shared.  This ensures that the anon_map
		 * will not change while a thread which has a read/write
		 * lock on an address space references it.
		 * XXX - Don't need the anon_map lock at all if "refcnt"
		 * is 1.
		 *
		 * Can't grow a MAP_SHARED segment with an anonmap because
		 * there may be existing anon slots where we want to extend
		 * the segment and we wouldn't know what to do with them
		 * (e.g., for tmpfs right thing is to just leave them there,
		 * for /dev/zero they should be cleared out).
		 */
		if (svd1->type == MAP_SHARED)
			return (-1);

		ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
		if (amp1->refcnt > 1) {
			ANON_LOCK_EXIT(&amp1->a_rwlock);
			return (-1);
		}
		newpgs = anon_grow(amp1->ahp, &svd1->anon_index,
		    btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP);

		if (newpgs == 0) {
			ANON_LOCK_EXIT(&amp1->a_rwlock);
			return (-1);
		}
		amp1->size = ptob(newpgs);
		ANON_LOCK_EXIT(&amp1->a_rwlock);
	}
	if (svd1->vpage != NULL) {
		struct vpage *vp, *evp;
		new_vpage =
		    kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
		    KM_NOSLEEP);
		if (new_vpage == NULL)
			return (-1);
		bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1)));
		kmem_free(svd1->vpage, vpgtob(seg_pages(seg1)));
		svd1->vpage = new_vpage;

		vp = new_vpage + seg_pages(seg1);
		evp = vp + seg_pages(seg2);
		for (; vp < evp; vp++)
			VPP_SETPROT(vp, a->prot);
		if (svd1->pageswap && swresv) {
			ASSERT(!(svd1->flags & MAP_NORESERVE));
			ASSERT(swresv == seg2->s_size);
			vp = new_vpage + seg_pages(seg1);
			for (; vp < evp; vp++) {
				VPP_SETSWAPRES(vp);
			}
		}
	}
	ASSERT(svd1->vpage != NULL || svd1->pageswap == 0);
	size = seg2->s_size;
	seg_free(seg2);
	seg1->s_size += size;
	svd1->swresv += swresv;
	if (svd1->pageprot && (a->prot & PROT_WRITE) &&
	    svd1->type == MAP_SHARED && svd1->vp != NULL &&
	    (svd1->vp->v_flag & VVMEXEC)) {
		ASSERT(vn_is_mapped(svd1->vp, V_WRITE));
		segvn_inval_trcache(svd1->vp);
	}
	return (0);
}

/*
 * Extend the next segment (seg2) to include the
 * new segment (seg1 + a), if possible.
 * Return 0 on success.
 */
static int
segvn_extend_next(
	struct seg *seg1,
	struct seg *seg2,
	struct segvn_crargs *a,
	size_t swresv)
{
	struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data;
	size_t size;
	struct anon_map *amp2;
	struct vpage *new_vpage;

	/*
	 * We don't need any segment level locks for "segvn" data
	 * since the address space is "write" locked.
	 */
	ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as, &seg2->s_as->a_lock));

	if (HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) {
		return (-1);
	}

	/* first segment is new, try to extend second */
	/* XXX - should also check cred */
	if (svd2->vp != a->vp || svd2->maxprot != a->maxprot ||
	    (!svd2->pageprot && (svd2->prot != a->prot)) ||
	    svd2->type != a->type || svd2->flags != a->flags ||
	    seg2->s_szc != a->szc || svd2->softlockcnt_sbase > 0)
		return (-1);
	/* vp == NULL implies zfod, offset doesn't matter */
	if (svd2->vp != NULL &&
	    (a->offset & PAGEMASK) + seg1->s_size != svd2->offset)
		return (-1);

	if (svd2->tr_state != SEGVN_TR_OFF) {
		return (-1);
	}

	amp2 = svd2->amp;
	if (amp2) {
		pgcnt_t newpgs;

		/*
		 * Segment has private pages, can data structures
		 * be expanded?
		 *
		 * Acquire the anon_map lock to prevent it from changing,
		 * if it is shared.  This ensures that the anon_map
		 * will not change while a thread which has a read/write
		 * lock on an address space references it.
		 *
		 * XXX - Don't need the anon_map lock at all if "refcnt"
		 * is 1.
		 */
		if (svd2->type == MAP_SHARED)
			return (-1);

		ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
		if (amp2->refcnt > 1) {
			ANON_LOCK_EXIT(&amp2->a_rwlock);
			return (-1);
		}
		newpgs = anon_grow(amp2->ahp, &svd2->anon_index,
		    btop(seg2->s_size), btop(seg1->s_size),
		    ANON_NOSLEEP | ANON_GROWDOWN);

		if (newpgs == 0) {
			ANON_LOCK_EXIT(&amp2->a_rwlock);
			return (-1);
		}
		amp2->size = ptob(newpgs);
		ANON_LOCK_EXIT(&amp2->a_rwlock);
	}
	if (svd2->vpage != NULL) {
		struct vpage *vp, *evp;
		new_vpage =
		    kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
		    KM_NOSLEEP);
		if (new_vpage == NULL) {
			/* Not merging segments so adjust anon_index back */
			if (amp2)
				svd2->anon_index += seg_pages(seg1);
			return (-1);
		}
		bcopy(svd2->vpage, new_vpage + seg_pages(seg1),
		    vpgtob(seg_pages(seg2)));
		kmem_free(svd2->vpage, vpgtob(seg_pages(seg2)));
		svd2->vpage = new_vpage;

		vp = new_vpage;
		evp = vp + seg_pages(seg1);
		for (; vp < evp; vp++)
			VPP_SETPROT(vp, a->prot);
		if (svd2->pageswap && swresv) {
			ASSERT(!(svd2->flags & MAP_NORESERVE));
			ASSERT(swresv == seg1->s_size);
			vp = new_vpage;
			for (; vp < evp; vp++) {
				VPP_SETSWAPRES(vp);
			}
		}
	}
	ASSERT(svd2->vpage != NULL || svd2->pageswap == 0);
	size = seg1->s_size;
	seg_free(seg1);
	seg2->s_size += size;
	seg2->s_base -= size;
	svd2->offset -= size;
	svd2->swresv += swresv;
	if (svd2->pageprot && (a->prot & PROT_WRITE) &&
	    svd2->type == MAP_SHARED && svd2->vp != NULL &&
	    (svd2->vp->v_flag & VVMEXEC)) {
		ASSERT(vn_is_mapped(svd2->vp, V_WRITE));
		segvn_inval_trcache(svd2->vp);
	}
	return (0);
}
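/*
 * Duplicate a segment into a child's address space during fork().  The
 * parent's address space is write locked and the child's is not yet
 * associated with a running process, so no segment level locking is needed
 * (see the asserts below).
 */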
static int
segvn_dup(struct seg *seg, struct seg *newseg)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct segvn_data *newsvd;
	pgcnt_t npages = seg_pages(seg);
	int error = 0;
	uint_t prot;
	size_t len;
	struct anon_map *amp;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
	ASSERT(newseg->s_as->a_proc->p_parent == curproc);

	/*
	 * If segment has anon reserved, reserve more for the new seg.
	 * For a MAP_NORESERVE segment swresv will be a count of all the
	 * allocated anon slots; thus we reserve for the child as many slots
	 * as the parent has allocated.  This semantic prevents the child or
	 * parent from dying during a copy-on-write fault caused by trying
	 * to write a shared pre-existing anon page.
	 */
	if ((len = svd->swresv) != 0) {
		if (anon_resv(svd->swresv) == 0)
			return (ENOMEM);

		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
		    seg, len, 0);
	}

	newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);

	newseg->s_ops = &segvn_ops;
	newseg->s_data = (void *)newsvd;
	newseg->s_szc = seg->s_szc;

	newsvd->seg = newseg;
	if ((newsvd->vp = svd->vp) != NULL) {
		VN_HOLD(svd->vp);
		if (svd->type == MAP_SHARED)
			lgrp_shm_policy_init(NULL, svd->vp);
	}
	newsvd->offset = svd->offset;
	newsvd->prot = svd->prot;
	newsvd->maxprot = svd->maxprot;
	newsvd->pageprot = svd->pageprot;
	newsvd->type = svd->type;
	newsvd->cred = svd->cred;
	crhold(newsvd->cred);
	newsvd->advice = svd->advice;
	newsvd->pageadvice = svd->pageadvice;
	newsvd->swresv = svd->swresv;
	newsvd->pageswap = svd->pageswap;
	newsvd->flags = svd->flags;
	newsvd->softlockcnt = 0;
	newsvd->softlockcnt_sbase = 0;
	newsvd->softlockcnt_send = 0;
	newsvd->policy_info = svd->policy_info;
	newsvd->rcookie = HAT_INVALID_REGION_COOKIE;

	if ((amp = svd->amp) == NULL || svd->tr_state == SEGVN_TR_ON) {
		/*
		 * Not attaching to a shared anon object.
		 */
		ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie) ||
		    svd->tr_state == SEGVN_TR_OFF);
		if (svd->tr_state == SEGVN_TR_ON) {
			ASSERT(newsvd->vp != NULL && amp != NULL);
			newsvd->tr_state = SEGVN_TR_INIT;
		} else {
			newsvd->tr_state = svd->tr_state;
		}
		newsvd->amp = NULL;
		newsvd->anon_index = 0;
	} else {
		/* regions for now are only used on pure vnode segments */
		ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
		ASSERT(svd->tr_state == SEGVN_TR_OFF);
		newsvd->tr_state = SEGVN_TR_OFF;
		if (svd->type == MAP_SHARED) {
			newsvd->amp = amp;
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			amp->refcnt++;
			ANON_LOCK_EXIT(&amp->a_rwlock);
			newsvd->anon_index = svd->anon_index;
		} else {
			int reclaim = 1;

			/*
			 * Allocate and initialize new anon_map structure.
			 */
			newsvd->amp = anonmap_alloc(newseg->s_size, 0,
			    ANON_SLEEP);
			newsvd->amp->a_szc = newseg->s_szc;
			newsvd->anon_index = 0;

			/*
			 * We don't have to acquire the anon_map lock
			 * for the new segment (since it belongs to an
			 * address space that is still not associated
			 * with any process), or the segment in the old
			 * address space (since all threads in it
			 * are stopped while duplicating the address space).
			 */

			/*
			 * The goal of the following code is to make sure that
			 * softlocked pages do not end up as copy on write
			 * pages.  This would cause problems where one
			 * thread writes to a page that is COW and a different
			 * thread in the same process has softlocked it.  The
			 * softlock lock would move away from this process
			 * because the write would cause this process to get
			 * a copy (without the softlock).
			 *
			 * The strategy here is to just break the
			 * sharing on pages that could possibly be
			 * softlocked.
			 */
retry:
			if (svd->softlockcnt) {
				struct anon *ap, *newap;
				size_t i;
				uint_t vpprot;
				page_t *anon_pl[1+1], *pp;
				caddr_t addr;
				ulong_t old_idx = svd->anon_index;
				ulong_t new_idx = 0;

				/*
				 * The softlock count might be nonzero
				 * because some pages are still stuck in the
				 * cache for lazy reclaim.  Flush the cache
				 * now.  This should drop the count to zero.
				 * [or there is really I/O going on to these
				 * pages].  Note, we have the writers lock so
				 * nothing gets inserted during the flush.
				 */
				if (reclaim == 1) {
					segvn_purge(seg);
					reclaim = 0;
					goto retry;
				}
				i = btopr(seg->s_size);
				addr = seg->s_base;
				/*
				 * XXX break cow sharing using PAGESIZE
				 * pages.  They will be relocated into larger
				 * pages at fault time.
				 */
				while (i-- > 0) {
					if (ap = anon_get_ptr(amp->ahp,
					    old_idx)) {
						error = anon_getpage(&ap,
						    &vpprot, anon_pl, PAGESIZE,
						    seg, addr, S_READ,
						    svd->cred);
						if (error) {
							newsvd->vpage = NULL;
							goto out;
						}
						/*
						 * prot need not be computed
						 * below 'cause anon_private is
						 * going to ignore it anyway
						 * as child doesn't inherit
						 * pagelock from parent.
						 */
						prot = svd->pageprot ?
						    VPP_PROT(
						    &svd->vpage[
						    seg_page(seg, addr)])
						    : svd->prot;
						pp = anon_private(&newap,
						    newseg, addr, prot,
						    anon_pl[0], 0,
						    newsvd->cred);
						if (pp == NULL) {
							/* no mem abort */
							newsvd->vpage = NULL;
							error = ENOMEM;
							goto out;
						}
						(void) anon_set_ptr(
						    newsvd->amp->ahp, new_idx,
						    newap, ANON_SLEEP);
						page_unlock(pp);
					}
					addr += PAGESIZE;
					old_idx++;
					new_idx++;
				}
			} else {	/* common case */
				if (seg->s_szc != 0) {
					/*
					 * If at least one of anon slots of a
					 * large page exists then make sure
					 * all anon slots of a large page
					 * exist to avoid partial cow sharing
					 * of a large page in the future.
					 */
					anon_dup_fill_holes(amp->ahp,
					    svd->anon_index, newsvd->amp->ahp,
					    0, seg->s_size, seg->s_szc,
					    svd->vp != NULL);
				} else {
					anon_dup(amp->ahp, svd->anon_index,
					    newsvd->amp->ahp, 0, seg->s_size);
				}

				hat_clrattr(seg->s_as->a_hat, seg->s_base,
				    seg->s_size, PROT_WRITE);
			}
		}
	}
	/*
	 * If necessary, create a vpage structure for the new segment.
	 * Do not copy any page lock indications.
	 */
	if (svd->vpage != NULL) {
		uint_t i;
		struct vpage *ovp = svd->vpage;
		struct vpage *nvp;

		nvp = newsvd->vpage =
		    kmem_alloc(vpgtob(npages), KM_SLEEP);
		for (i = 0; i < npages; i++) {
			*nvp = *ovp++;
			VPP_CLRPPLOCK(nvp++);
		}
	} else
		newsvd->vpage = NULL;

	/* Inform the vnode of the new mapping */
	if (newsvd->vp != NULL) {
		error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset,
		    newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot,
		    newsvd->maxprot, newsvd->type, newsvd->cred, NULL);
	}
out:
	if (error == 0 && HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
		ASSERT(newsvd->amp == NULL);
		ASSERT(newsvd->tr_state == SEGVN_TR_OFF);
		newsvd->rcookie = svd->rcookie;
		hat_dup_region(newseg->s_as->a_hat, newsvd->rcookie);
	}
	return (error);
}


/*
 * Callback function to invoke free_vp_pages() for only those pages actually
 * processed by the HAT when a shared region is destroyed.
 */
extern int free_pages;

static void
segvn_hat_rgn_unload_callback(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr,
    size_t r_size, void *r_obj, u_offset_t r_objoff)
{
	u_offset_t off;
	size_t len;
	vnode_t *vp = (vnode_t *)r_obj;

	ASSERT(eaddr > saddr);
	ASSERT(saddr >= r_saddr);
	ASSERT(saddr < r_saddr + r_size);
	ASSERT(eaddr > r_saddr);
	ASSERT(eaddr <= r_saddr + r_size);
	ASSERT(vp != NULL);

	if (!free_pages) {
		return;
	}

	len = eaddr - saddr;
	off = (saddr - r_saddr) + r_objoff;
	free_vp_pages(vp, off, len);
}

/*
 * Callback function used by segvn_unmap to invoke free_vp_pages() for only
 * those pages actually processed by the HAT.
 */
static void
segvn_hat_unload_callback(hat_callback_t *cb)
{
	struct seg *seg = cb->hcb_data;
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	size_t len;
	u_offset_t off;

	ASSERT(svd->vp != NULL);
	ASSERT(cb->hcb_end_addr > cb->hcb_start_addr);
	ASSERT(cb->hcb_start_addr >= seg->s_base);

	len = cb->hcb_end_addr - cb->hcb_start_addr;
	off = cb->hcb_start_addr - seg->s_base;
	free_vp_pages(svd->vp, svd->offset + off, len);
}

/*
 * This function determines the number of bytes of swap reserved by
 * a segment for which per-page accounting is present.  It is used to
 * calculate the correct value of a segvn_data's swresv.
 */
static size_t
segvn_count_swap_by_vpages(struct seg *seg)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct vpage *vp, *evp;
	size_t nswappages = 0;

	ASSERT(svd->pageswap);
	ASSERT(svd->vpage != NULL);

	evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)];

	for (vp = svd->vpage; vp < evp; vp++) {
		if (VPP_ISSWAPRES(vp))
			nswappages++;
	}

	return (nswappages << PAGESHIFT);
}

static int
segvn_unmap(struct seg *seg, caddr_t addr, size_t len)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct segvn_data *nsvd;
	struct seg *nseg;
	struct anon_map *amp;
	pgcnt_t	opages;		/* old segment size in pages */
	pgcnt_t	npages;		/* new segment size in pages */
	pgcnt_t	dpages;		/* pages being deleted (unmapped) */
	hat_callback_t callback;	/* used for free_vp_pages() */
	hat_callback_t *cbp = NULL;
	caddr_t nbase;
	size_t nsize;
	size_t oswresv;
	int reclaim = 1;

	/*
	 * We don't need any segment level locks for "segvn" data
	 * since the address space is "write" locked.
	 */
	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * Fail the unmap if pages are SOFTLOCKed through this mapping.
	 * softlockcnt is protected from change by the as write lock.
	 */
retry:
	if (svd->softlockcnt > 0) {
		ASSERT(svd->tr_state == SEGVN_TR_OFF);

		/*
		 * If this is a shared segment, a nonzero softlockcnt means
		 * locked pages are still in use.
		 */
		if (svd->type == MAP_SHARED) {
			return (EAGAIN);
		}

		/*
		 * Since we hold the writers lock nobody can fill
		 * the cache during the purge.  The flush either succeeds
		 * or we still have pending I/Os.
		 */
		if (reclaim == 1) {
			segvn_purge(seg);
			reclaim = 0;
			goto retry;
		}
		return (EAGAIN);
	}

	/*
	 * Check for bad sizes
	 */
	if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size ||
	    (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) {
		panic("segvn_unmap");
		/*NOTREACHED*/
	}

	if (seg->s_szc != 0) {
		size_t pgsz = page_get_pagesize(seg->s_szc);
		int err;
		if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
			ASSERT(seg->s_base != addr || seg->s_size != len);
			if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
				ASSERT(svd->amp == NULL);
				ASSERT(svd->tr_state == SEGVN_TR_OFF);
				hat_leave_region(seg->s_as->a_hat,
				    svd->rcookie, HAT_REGION_TEXT);
				svd->rcookie = HAT_INVALID_REGION_COOKIE;
				/*
				 * could pass a flag to segvn_demote_range()
				 * below to tell it not to do any unloads but
				 * this case is rare enough to not bother for
				 * now.
				 */
			} else if (svd->tr_state == SEGVN_TR_INIT) {
				svd->tr_state = SEGVN_TR_OFF;
			} else if (svd->tr_state == SEGVN_TR_ON) {
				ASSERT(svd->amp != NULL);
				segvn_textunrepl(seg, 1);
				ASSERT(svd->amp == NULL);
				ASSERT(svd->tr_state == SEGVN_TR_OFF);
			}
			VM_STAT_ADD(segvnvmstats.demoterange[0]);
			err = segvn_demote_range(seg, addr, len, SDR_END, 0);
			if (err == 0) {
				return (IE_RETRY);
			}
			return (err);
		}
	}

	/* Inform the vnode of the unmapping. */
*/ 1911 if (svd->vp) { 1912 int error; 1913 1914 error = VOP_DELMAP(svd->vp, 1915 (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base), 1916 seg->s_as, addr, len, svd->prot, svd->maxprot, 1917 svd->type, svd->cred, NULL); 1918 1919 if (error == EAGAIN) 1920 return (error); 1921 } 1922 1923 /* 1924 * Remove any page locks set through this mapping. 1925 * If text replication is not off, no page locks could have been 1926 * established via this mapping. 1927 */ 1928 if (svd->tr_state == SEGVN_TR_OFF) { 1929 (void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0); 1930 } 1931 1932 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 1933 ASSERT(svd->amp == NULL); 1934 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1935 ASSERT(svd->type == MAP_PRIVATE); 1936 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 1937 HAT_REGION_TEXT); 1938 svd->rcookie = HAT_INVALID_REGION_COOKIE; 1939 } else if (svd->tr_state == SEGVN_TR_ON) { 1940 ASSERT(svd->amp != NULL); 1941 ASSERT(svd->pageprot == 0 && !(svd->prot & PROT_WRITE)); 1942 segvn_textunrepl(seg, 1); 1943 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 1944 } else { 1945 if (svd->tr_state != SEGVN_TR_OFF) { 1946 ASSERT(svd->tr_state == SEGVN_TR_INIT); 1947 svd->tr_state = SEGVN_TR_OFF; 1948 } 1949 /* 1950 * Unload any hardware translations in the range to be taken 1951 * out. Use a callback to invoke free_vp_pages() effectively. 1952 */ 1953 if (svd->vp != NULL && free_pages != 0) { 1954 callback.hcb_data = seg; 1955 callback.hcb_function = segvn_hat_unload_callback; 1956 cbp = &callback; 1957 } 1958 hat_unload_callback(seg->s_as->a_hat, addr, len, 1959 HAT_UNLOAD_UNMAP, cbp); 1960 1961 if (svd->type == MAP_SHARED && svd->vp != NULL && 1962 (svd->vp->v_flag & VVMEXEC) && 1963 ((svd->prot & PROT_WRITE) || svd->pageprot)) { 1964 segvn_inval_trcache(svd->vp); 1965 } 1966 } 1967 1968 /* 1969 * Check for entire segment 1970 */ 1971 if (addr == seg->s_base && len == seg->s_size) { 1972 seg_free(seg); 1973 return (0); 1974 } 1975 1976 opages = seg_pages(seg); 1977 dpages = btop(len); 1978 npages = opages - dpages; 1979 amp = svd->amp; 1980 ASSERT(amp == NULL || amp->a_szc >= seg->s_szc); 1981 1982 /* 1983 * Check for beginning of segment 1984 */ 1985 if (addr == seg->s_base) { 1986 if (svd->vpage != NULL) { 1987 size_t nbytes; 1988 struct vpage *ovpage; 1989 1990 ovpage = svd->vpage; /* keep pointer to vpage */ 1991 1992 nbytes = vpgtob(npages); 1993 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1994 bcopy(&ovpage[dpages], svd->vpage, nbytes); 1995 1996 /* free up old vpage */ 1997 kmem_free(ovpage, vpgtob(opages)); 1998 } 1999 if (amp != NULL) { 2000 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER); 2001 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 2002 /* 2003 * Shared anon map is no longer in use. Before 2004 * freeing its pages purge all entries from 2005 * pcache that belong to this amp. 2006 */ 2007 if (svd->type == MAP_SHARED) { 2008 ASSERT(amp->refcnt == 1); 2009 ASSERT(svd->softlockcnt == 0); 2010 anonmap_purge(amp); 2011 } 2012 /* 2013 * Free up now unused parts of anon_map array.
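* (Below, anon_free_pages() is used for large-page segments, anon_free() otherwise, and anon_shmap_free_pages() when a shared amp's a_szc exceeds the segment's szc.)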
2014 */ 2015 if (amp->a_szc == seg->s_szc) { 2016 if (seg->s_szc != 0) { 2017 anon_free_pages(amp->ahp, 2018 svd->anon_index, len, 2019 seg->s_szc); 2020 } else { 2021 anon_free(amp->ahp, 2022 svd->anon_index, 2023 len); 2024 } 2025 } else { 2026 ASSERT(svd->type == MAP_SHARED); 2027 ASSERT(amp->a_szc > seg->s_szc); 2028 anon_shmap_free_pages(amp, 2029 svd->anon_index, len); 2030 } 2031 2032 /* 2033 * Unreserve swap space for the 2034 * unmapped chunk of this segment in 2035 * case it's MAP_SHARED 2036 */ 2037 if (svd->type == MAP_SHARED) { 2038 anon_unresv_zone(len, 2039 seg->s_as->a_proc->p_zone); 2040 amp->swresv -= len; 2041 } 2042 } 2043 ANON_LOCK_EXIT(&amp->a_rwlock); 2044 svd->anon_index += dpages; 2045 } 2046 if (svd->vp != NULL) 2047 svd->offset += len; 2048 2049 seg->s_base += len; 2050 seg->s_size -= len; 2051 2052 if (svd->swresv) { 2053 if (svd->flags & MAP_NORESERVE) { 2054 ASSERT(amp); 2055 oswresv = svd->swresv; 2056 2057 svd->swresv = ptob(anon_pages(amp->ahp, 2058 svd->anon_index, npages)); 2059 anon_unresv_zone(oswresv - svd->swresv, 2060 seg->s_as->a_proc->p_zone); 2061 if (SEG_IS_PARTIAL_RESV(seg)) 2062 seg->s_as->a_resvsize -= oswresv - 2063 svd->swresv; 2064 } else { 2065 size_t unlen; 2066 2067 if (svd->pageswap) { 2068 oswresv = svd->swresv; 2069 svd->swresv = 2070 segvn_count_swap_by_vpages(seg); 2071 ASSERT(oswresv >= svd->swresv); 2072 unlen = oswresv - svd->swresv; 2073 } else { 2074 svd->swresv -= len; 2075 ASSERT(svd->swresv == seg->s_size); 2076 unlen = len; 2077 } 2078 anon_unresv_zone(unlen, 2079 seg->s_as->a_proc->p_zone); 2080 } 2081 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 2082 seg, len, 0); 2083 } 2084 2085 return (0); 2086 } 2087 2088 /* 2089 * Check for end of segment 2090 */ 2091 if (addr + len == seg->s_base + seg->s_size) { 2092 if (svd->vpage != NULL) { 2093 size_t nbytes; 2094 struct vpage *ovpage; 2095 2096 ovpage = svd->vpage; /* keep pointer to vpage */ 2097 2098 nbytes = vpgtob(npages); 2099 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 2100 bcopy(ovpage, svd->vpage, nbytes); 2101 2102 /* free up old vpage */ 2103 kmem_free(ovpage, vpgtob(opages)); 2104 2105 } 2106 if (amp != NULL) { 2107 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER); 2108 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 2109 /* 2110 * Free up now unused parts of anon_map array. 2111 */ 2112 ulong_t an_idx = svd->anon_index + npages; 2113 2114 /* 2115 * Shared anon map is no longer in use. Before 2116 * freeing its pages purge all entries from 2117 * pcache that belong to this amp.
2118 */ 2119 if (svd->type == MAP_SHARED) { 2120 ASSERT(amp->refcnt == 1); 2121 ASSERT(svd->softlockcnt == 0); 2122 anonmap_purge(amp); 2123 } 2124 2125 if (amp->a_szc == seg->s_szc) { 2126 if (seg->s_szc != 0) { 2127 anon_free_pages(amp->ahp, 2128 an_idx, len, 2129 seg->s_szc); 2130 } else { 2131 anon_free(amp->ahp, an_idx, 2132 len); 2133 } 2134 } else { 2135 ASSERT(svd->type == MAP_SHARED); 2136 ASSERT(amp->a_szc > seg->s_szc); 2137 anon_shmap_free_pages(amp, 2138 an_idx, len); 2139 } 2140 2141 /* 2142 * Unreserve swap space for the 2143 * unmapped chunk of this segment in 2144 * case it's MAP_SHARED 2145 */ 2146 if (svd->type == MAP_SHARED) { 2147 anon_unresv_zone(len, 2148 seg->s_as->a_proc->p_zone); 2149 amp->swresv -= len; 2150 } 2151 } 2152 ANON_LOCK_EXIT(&amp->a_rwlock); 2153 } 2154 2155 seg->s_size -= len; 2156 2157 if (svd->swresv) { 2158 if (svd->flags & MAP_NORESERVE) { 2159 ASSERT(amp); 2160 oswresv = svd->swresv; 2161 svd->swresv = ptob(anon_pages(amp->ahp, 2162 svd->anon_index, npages)); 2163 anon_unresv_zone(oswresv - svd->swresv, 2164 seg->s_as->a_proc->p_zone); 2165 if (SEG_IS_PARTIAL_RESV(seg)) 2166 seg->s_as->a_resvsize -= oswresv - 2167 svd->swresv; 2168 } else { 2169 size_t unlen; 2170 2171 if (svd->pageswap) { 2172 oswresv = svd->swresv; 2173 svd->swresv = 2174 segvn_count_swap_by_vpages(seg); 2175 ASSERT(oswresv >= svd->swresv); 2176 unlen = oswresv - svd->swresv; 2177 } else { 2178 svd->swresv -= len; 2179 ASSERT(svd->swresv == seg->s_size); 2180 unlen = len; 2181 } 2182 anon_unresv_zone(unlen, 2183 seg->s_as->a_proc->p_zone); 2184 } 2185 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 2186 "anon proc:%p %lu %u", seg, len, 0); 2187 } 2188 2189 return (0); 2190 } 2191 2192 /* 2193 * The section to go is in the middle of the segment, 2194 * have to make it into two segments. nseg is made for 2195 * the high end while seg is cut down at the low end.
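* A hypothetical layout (added for illustration): unmapping [addr, addr + len) from [s_base, s_base + s_size) leaves seg = [s_base, addr) and nseg = [addr + len, s_base + s_size).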
2196 */ 2197 nbase = addr + len; /* new seg base */ 2198 nsize = (seg->s_base + seg->s_size) - nbase; /* new seg size */ 2199 seg->s_size = addr - seg->s_base; /* shrink old seg */ 2200 nseg = seg_alloc(seg->s_as, nbase, nsize); 2201 if (nseg == NULL) { 2202 panic("segvn_unmap seg_alloc"); 2203 /*NOTREACHED*/ 2204 } 2205 nseg->s_ops = seg->s_ops; 2206 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 2207 nseg->s_data = (void *)nsvd; 2208 nseg->s_szc = seg->s_szc; 2209 *nsvd = *svd; 2210 nsvd->seg = nseg; 2211 nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base); 2212 nsvd->swresv = 0; 2213 nsvd->softlockcnt = 0; 2214 nsvd->softlockcnt_sbase = 0; 2215 nsvd->softlockcnt_send = 0; 2216 ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE); 2217 2218 if (svd->vp != NULL) { 2219 VN_HOLD(nsvd->vp); 2220 if (nsvd->type == MAP_SHARED) 2221 lgrp_shm_policy_init(NULL, nsvd->vp); 2222 } 2223 crhold(svd->cred); 2224 2225 if (svd->vpage == NULL) { 2226 nsvd->vpage = NULL; 2227 } else { 2228 /* need to split vpage into two arrays */ 2229 size_t nbytes; 2230 struct vpage *ovpage; 2231 2232 ovpage = svd->vpage; /* keep pointer to vpage */ 2233 2234 npages = seg_pages(seg); /* seg has shrunk */ 2235 nbytes = vpgtob(npages); 2236 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 2237 2238 bcopy(ovpage, svd->vpage, nbytes); 2239 2240 npages = seg_pages(nseg); 2241 nbytes = vpgtob(npages); 2242 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 2243 2244 bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes); 2245 2246 /* free up old vpage */ 2247 kmem_free(ovpage, vpgtob(opages)); 2248 } 2249 2250 if (amp == NULL) { 2251 nsvd->amp = NULL; 2252 nsvd->anon_index = 0; 2253 } else { 2254 /* 2255 * Need to create a new anon map for the new segment. 2256 * We'll also allocate a new smaller array for the old 2257 * smaller segment to save space. 2258 */ 2259 opages = btop((uintptr_t)(addr - seg->s_base)); 2260 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER); 2261 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 2262 /* 2263 * Free up now unused parts of anon_map array. 2264 */ 2265 ulong_t an_idx = svd->anon_index + opages; 2266 2267 /* 2268 * Shared anon map is no longer in use. Before 2269 * freeing its pages purge all entries from 2270 * pcache that belong to this amp.
2271 */ 2272 if (svd->type == MAP_SHARED) { 2273 ASSERT(amp->refcnt == 1); 2274 ASSERT(svd->softlockcnt == 0); 2275 anonmap_purge(amp); 2276 } 2277 2278 if (amp->a_szc == seg->s_szc) { 2279 if (seg->s_szc != 0) { 2280 anon_free_pages(amp->ahp, an_idx, len, 2281 seg->s_szc); 2282 } else { 2283 anon_free(amp->ahp, an_idx, 2284 len); 2285 } 2286 } else { 2287 ASSERT(svd->type == MAP_SHARED); 2288 ASSERT(amp->a_szc > seg->s_szc); 2289 anon_shmap_free_pages(amp, an_idx, len); 2290 } 2291 2292 /* 2293 * Unreserve swap space for the 2294 * unmapped chunk of this segment in 2295 * case it's MAP_SHARED 2296 */ 2297 if (svd->type == MAP_SHARED) { 2298 anon_unresv_zone(len, 2299 seg->s_as->a_proc->p_zone); 2300 amp->swresv -= len; 2301 } 2302 } 2303 nsvd->anon_index = svd->anon_index + 2304 btop((uintptr_t)(nseg->s_base - seg->s_base)); 2305 if (svd->type == MAP_SHARED) { 2306 amp->refcnt++; 2307 nsvd->amp = amp; 2308 } else { 2309 struct anon_map *namp; 2310 struct anon_hdr *nahp; 2311 2312 ASSERT(svd->type == MAP_PRIVATE); 2313 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 2314 namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP); 2315 namp->a_szc = seg->s_szc; 2316 (void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp, 2317 0, btop(seg->s_size), ANON_SLEEP); 2318 (void) anon_copy_ptr(amp->ahp, nsvd->anon_index, 2319 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 2320 anon_release(amp->ahp, btop(amp->size)); 2321 svd->anon_index = 0; 2322 nsvd->anon_index = 0; 2323 amp->ahp = nahp; 2324 amp->size = seg->s_size; 2325 nsvd->amp = namp; 2326 } 2327 ANON_LOCK_EXIT(&amp->a_rwlock); 2328 } 2329 if (svd->swresv) { 2330 if (svd->flags & MAP_NORESERVE) { 2331 ASSERT(amp); 2332 oswresv = svd->swresv; 2333 svd->swresv = ptob(anon_pages(amp->ahp, 2334 svd->anon_index, btop(seg->s_size))); 2335 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 2336 nsvd->anon_index, btop(nseg->s_size))); 2337 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 2338 anon_unresv_zone(oswresv - (svd->swresv + nsvd->swresv), 2339 seg->s_as->a_proc->p_zone); 2340 if (SEG_IS_PARTIAL_RESV(seg)) 2341 seg->s_as->a_resvsize -= oswresv - 2342 (svd->swresv + nsvd->swresv); 2343 } else { 2344 size_t unlen; 2345 2346 if (svd->pageswap) { 2347 oswresv = svd->swresv; 2348 svd->swresv = segvn_count_swap_by_vpages(seg); 2349 nsvd->swresv = segvn_count_swap_by_vpages(nseg); 2350 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 2351 unlen = oswresv - (svd->swresv + nsvd->swresv); 2352 } else { 2353 if (seg->s_size + nseg->s_size + len != 2354 svd->swresv) { 2355 panic("segvn_unmap: cannot split " 2356 "swap reservation"); 2357 /*NOTREACHED*/ 2358 } 2359 svd->swresv = seg->s_size; 2360 nsvd->swresv = nseg->s_size; 2361 unlen = len; 2362 } 2363 anon_unresv_zone(unlen, 2364 seg->s_as->a_proc->p_zone); 2365 } 2366 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 2367 seg, len, 0); 2368 } 2369 2370 return (0); /* I'm glad that's all over with! */ 2371 } 2372 2373 static void 2374 segvn_free(struct seg *seg) 2375 { 2376 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2377 pgcnt_t npages = seg_pages(seg); 2378 struct anon_map *amp; 2379 size_t len; 2380 2381 /* 2382 * We don't need any segment level locks for "segvn" data 2383 * since the address space is "write" locked. 2384 */ 2385 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 2386 ASSERT(svd->tr_state == SEGVN_TR_OFF); 2387 2388 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 2389 2390 /* 2391 * Be sure to unlock pages.
XXX Why do things get free'ed instead 2392 * of unmapped? XXX 2393 */ 2394 (void) segvn_lockop(seg, seg->s_base, seg->s_size, 2395 0, MC_UNLOCK, NULL, 0); 2396 2397 /* 2398 * Deallocate the vpage and anon pointers if necessary and possible. 2399 */ 2400 if (svd->vpage != NULL) { 2401 kmem_free(svd->vpage, vpgtob(npages)); 2402 svd->vpage = NULL; 2403 } 2404 if ((amp = svd->amp) != NULL) { 2405 /* 2406 * If there are no more references to this anon_map 2407 * structure, then deallocate the structure after freeing 2408 * up all the anon slot pointers that we can. 2409 */ 2410 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER); 2411 ASSERT(amp->a_szc >= seg->s_szc); 2412 if (--amp->refcnt == 0) { 2413 if (svd->type == MAP_PRIVATE) { 2414 /* 2415 * Private - we only need to anon_free 2416 * the part that this segment refers to. 2417 */ 2418 if (seg->s_szc != 0) { 2419 anon_free_pages(amp->ahp, 2420 svd->anon_index, seg->s_size, 2421 seg->s_szc); 2422 } else { 2423 anon_free(amp->ahp, svd->anon_index, 2424 seg->s_size); 2425 } 2426 } else { 2427 2428 /* 2429 * Shared anon map is no longer in use. Before 2430 * freeing its pages purge all entries from 2431 * pcache that belong to this amp. 2432 */ 2433 ASSERT(svd->softlockcnt == 0); 2434 anonmap_purge(amp); 2435 2436 /* 2437 * Shared - anon_free the entire 2438 * anon_map's worth of stuff and 2439 * release any swap reservation. 2440 */ 2441 if (amp->a_szc != 0) { 2442 anon_shmap_free_pages(amp, 0, 2443 amp->size); 2444 } else { 2445 anon_free(amp->ahp, 0, amp->size); 2446 } 2447 if ((len = amp->swresv) != 0) { 2448 anon_unresv_zone(len, 2449 seg->s_as->a_proc->p_zone); 2450 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 2451 "anon proc:%p %lu %u", seg, len, 0); 2452 } 2453 } 2454 svd->amp = NULL; 2455 ANON_LOCK_EXIT(&amp->a_rwlock); 2456 anonmap_free(amp); 2457 } else if (svd->type == MAP_PRIVATE) { 2458 /* 2459 * We had a private mapping which still has 2460 * a held anon_map so just free up all the 2461 * anon slot pointers that we were using. 2462 */ 2463 if (seg->s_szc != 0) { 2464 anon_free_pages(amp->ahp, svd->anon_index, 2465 seg->s_size, seg->s_szc); 2466 } else { 2467 anon_free(amp->ahp, svd->anon_index, 2468 seg->s_size); 2469 } 2470 ANON_LOCK_EXIT(&amp->a_rwlock); 2471 } else { 2472 ANON_LOCK_EXIT(&amp->a_rwlock); 2473 } 2474 } 2475 2476 /* 2477 * Release swap reservation. 2478 */ 2479 if ((len = svd->swresv) != 0) { 2480 anon_unresv_zone(svd->swresv, 2481 seg->s_as->a_proc->p_zone); 2482 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 2483 seg, len, 0); 2484 if (SEG_IS_PARTIAL_RESV(seg)) 2485 seg->s_as->a_resvsize -= svd->swresv; 2486 svd->swresv = 0; 2487 } 2488 /* 2489 * Release claim on vnode, credentials, and finally free the 2490 * private data. 2491 */ 2492 if (svd->vp != NULL) { 2493 if (svd->type == MAP_SHARED) 2494 lgrp_shm_policy_fini(NULL, svd->vp); 2495 VN_RELE(svd->vp); 2496 svd->vp = NULL; 2497 } 2498 crfree(svd->cred); 2499 svd->pageprot = 0; 2500 svd->pageadvice = 0; 2501 svd->pageswap = 0; 2502 svd->cred = NULL; 2503 2504 /* 2505 * Take segfree_syncmtx lock to let segvn_reclaim() finish if it's 2506 * still working with this segment without holding as lock (in case 2507 * it's called by pcache async thread). 2508 */ 2509 ASSERT(svd->softlockcnt == 0); 2510 mutex_enter(&svd->segfree_syncmtx); 2511 mutex_exit(&svd->segfree_syncmtx); 2512 2513 seg->s_data = NULL; 2514 kmem_cache_free(segvn_cache, svd); 2515 } 2516 2517 /* 2518 * Do a F_SOFTUNLOCK call over the range requested. The range must have 2519 * already been F_SOFTLOCK'ed.
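* Each F_SOFTLOCK fault bumped svd->softlockcnt once per page; the atomic decrement by btop(len) below is what eventually wakes waiting unmappers.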
2520 * Caller must always match addr and len of a softunlock with a previous 2521 * softlock with exactly the same addr and len. 2522 */ 2523 static void 2524 segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw) 2525 { 2526 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2527 page_t *pp; 2528 caddr_t adr; 2529 struct vnode *vp; 2530 u_offset_t offset; 2531 ulong_t anon_index; 2532 struct anon_map *amp; 2533 struct anon *ap = NULL; 2534 2535 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2536 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 2537 2538 if ((amp = svd->amp) != NULL) 2539 anon_index = svd->anon_index + seg_page(seg, addr); 2540 2541 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 2542 ASSERT(svd->tr_state == SEGVN_TR_OFF); 2543 hat_unlock_region(seg->s_as->a_hat, addr, len, svd->rcookie); 2544 } else { 2545 hat_unlock(seg->s_as->a_hat, addr, len); 2546 } 2547 for (adr = addr; adr < addr + len; adr += PAGESIZE) { 2548 if (amp != NULL) { 2549 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 2550 if ((ap = anon_get_ptr(amp->ahp, anon_index++)) 2551 != NULL) { 2552 swap_xlate(ap, &vp, &offset); 2553 } else { 2554 vp = svd->vp; 2555 offset = svd->offset + 2556 (uintptr_t)(adr - seg->s_base); 2557 } 2558 ANON_LOCK_EXIT(&amp->a_rwlock); 2559 } else { 2560 vp = svd->vp; 2561 offset = svd->offset + 2562 (uintptr_t)(adr - seg->s_base); 2563 } 2564 2565 /* 2566 * Use page_find() instead of page_lookup() to 2567 * find the page since we know that it is locked. 2568 */ 2569 pp = page_find(vp, offset); 2570 if (pp == NULL) { 2571 panic( 2572 "segvn_softunlock: addr %p, ap %p, vp %p, off %llx", 2573 (void *)adr, (void *)ap, (void *)vp, offset); 2574 /*NOTREACHED*/ 2575 } 2576 2577 if (rw == S_WRITE) { 2578 hat_setrefmod(pp); 2579 if (seg->s_as->a_vbits) 2580 hat_setstat(seg->s_as, adr, PAGESIZE, 2581 P_REF | P_MOD); 2582 } else if (rw != S_OTHER) { 2583 hat_setref(pp); 2584 if (seg->s_as->a_vbits) 2585 hat_setstat(seg->s_as, adr, PAGESIZE, P_REF); 2586 } 2587 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2588 "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset); 2589 page_unlock(pp); 2590 } 2591 ASSERT(svd->softlockcnt >= btop(len)); 2592 if (!atomic_add_long_nv((ulong_t *)&svd->softlockcnt, -btop(len))) { 2593 /* 2594 * All SOFTLOCKS are gone. Wakeup any waiting 2595 * unmappers so they can try again to unmap. 2596 * Check for waiters first without the mutex 2597 * held so we don't always grab the mutex on 2598 * softunlocks. 2599 */ 2600 if (AS_ISUNMAPWAIT(seg->s_as)) { 2601 mutex_enter(&seg->s_as->a_contents); 2602 if (AS_ISUNMAPWAIT(seg->s_as)) { 2603 AS_CLRUNMAPWAIT(seg->s_as); 2604 cv_broadcast(&seg->s_as->a_cv); 2605 } 2606 mutex_exit(&seg->s_as->a_contents); 2607 } 2608 } 2609 } 2610 2611 #define PAGE_HANDLED ((page_t *)-1) 2612 2613 /* 2614 * Release all the pages in the NULL terminated ppp list 2615 * which haven't already been converted to PAGE_HANDLED. 2616 */ 2617 static void 2618 segvn_pagelist_rele(page_t **ppp) 2619 { 2620 for (; *ppp != NULL; ppp++) { 2621 if (*ppp != PAGE_HANDLED) 2622 page_unlock(*ppp); 2623 } 2624 } 2625 2626 static int stealcow = 1; 2627 2628 /* 2629 * Workaround for viking chip bug. See bug id 1220902. 2630 * To fix this down in pagefault() would require importing so 2631 * much of the as layer and segvn code as to be unmaintainable. 2632 */ 2633 int enable_mbit_wa = 0; 2634 2635 /* 2636 * Handles all the dirty work of getting the right 2637 * anonymous pages and loading up the translations.
2638 * This routine is called only from segvn_fault() 2639 * when looping over the range of addresses requested. 2640 * 2641 * The basic algorithm here is: 2642 * If this is an anon_zero case 2643 * Call anon_zero to allocate page 2644 * Load up translation 2645 * Return 2646 * endif 2647 * If this is an anon page 2648 * Use anon_getpage to get the page 2649 * else 2650 * Find page in pl[] list passed in 2651 * endif 2652 * If not a cow 2653 * Load up the translation to the page 2654 * return 2655 * endif 2656 * Call anon_private to handle cow 2657 * Load up (writable) translation to new page 2658 */ 2659 static faultcode_t 2660 segvn_faultpage( 2661 struct hat *hat, /* the hat to use for mapping */ 2662 struct seg *seg, /* seg_vn of interest */ 2663 caddr_t addr, /* address in as */ 2664 u_offset_t off, /* offset in vp */ 2665 struct vpage *vpage, /* pointer to vpage for vp, off */ 2666 page_t *pl[], /* object source page pointer */ 2667 uint_t vpprot, /* access allowed to object pages */ 2668 enum fault_type type, /* type of fault */ 2669 enum seg_rw rw, /* type of access at fault */ 2670 int brkcow) /* we may need to break cow */ 2671 { 2672 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2673 page_t *pp, **ppp; 2674 uint_t pageflags = 0; 2675 page_t *anon_pl[1 + 1]; 2676 page_t *opp = NULL; /* original page */ 2677 uint_t prot; 2678 int err; 2679 int cow; 2680 int claim; 2681 int steal = 0; 2682 ulong_t anon_index; 2683 struct anon *ap, *oldap; 2684 struct anon_map *amp; 2685 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 2686 int anon_lock = 0; 2687 anon_sync_obj_t cookie; 2688 2689 if (svd->flags & MAP_TEXT) { 2690 hat_flag |= HAT_LOAD_TEXT; 2691 } 2692 2693 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 2694 ASSERT(seg->s_szc == 0); 2695 ASSERT(svd->tr_state != SEGVN_TR_INIT); 2696 2697 /* 2698 * Initialize protection value for this page. 2699 * If we have per page protection values check it now. 2700 */ 2701 if (svd->pageprot) { 2702 uint_t protchk; 2703 2704 switch (rw) { 2705 case S_READ: 2706 protchk = PROT_READ; 2707 break; 2708 case S_WRITE: 2709 protchk = PROT_WRITE; 2710 break; 2711 case S_EXEC: 2712 protchk = PROT_EXEC; 2713 break; 2714 case S_OTHER: 2715 default: 2716 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 2717 break; 2718 } 2719 2720 prot = VPP_PROT(vpage); 2721 if ((prot & protchk) == 0) 2722 return (FC_PROT); /* illegal access type */ 2723 } else { 2724 prot = svd->prot; 2725 } 2726 2727 if (type == F_SOFTLOCK) { 2728 atomic_inc_ulong((ulong_t *)&svd->softlockcnt); 2729 } 2730 2731 /* 2732 * Always acquire the anon array lock to prevent 2 threads from 2733 * allocating separate anon slots for the same "addr". 2734 */ 2735 2736 if ((amp = svd->amp) != NULL) { 2737 ASSERT(RW_READ_HELD(&amp->a_rwlock)); 2738 anon_index = svd->anon_index + seg_page(seg, addr); 2739 anon_array_enter(amp, anon_index, &cookie); 2740 anon_lock = 1; 2741 } 2742 2743 if (svd->vp == NULL && amp != NULL) { 2744 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) { 2745 /* 2746 * Allocate a (normally) writable anonymous page of 2747 * zeroes. If no advance reservations, reserve now.
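* (For MAP_NORESERVE this reserves swap one page at a time, ptob(1), as pages are first touched; a failed reservation surfaces as ENOMEM.)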
2748 */ 2749 if (svd->flags & MAP_NORESERVE) { 2750 if (anon_resv_zone(ptob(1), 2751 seg->s_as->a_proc->p_zone)) { 2752 atomic_add_long(&svd->swresv, ptob(1)); 2753 atomic_add_long(&seg->s_as->a_resvsize, 2754 ptob(1)); 2755 } else { 2756 err = ENOMEM; 2757 goto out; 2758 } 2759 } 2760 if ((pp = anon_zero(seg, addr, &ap, 2761 svd->cred)) == NULL) { 2762 err = ENOMEM; 2763 goto out; /* out of swap space */ 2764 } 2765 /* 2766 * Re-acquire the anon_map lock and 2767 * initialize the anon array entry. 2768 */ 2769 (void) anon_set_ptr(amp->ahp, anon_index, ap, 2770 ANON_SLEEP); 2771 2772 ASSERT(pp->p_szc == 0); 2773 2774 /* 2775 * Handle pages that have been marked for migration 2776 */ 2777 if (lgrp_optimizations()) 2778 page_migrate(seg, addr, &pp, 1); 2779 2780 if (enable_mbit_wa) { 2781 if (rw == S_WRITE) 2782 hat_setmod(pp); 2783 else if (!hat_ismod(pp)) 2784 prot &= ~PROT_WRITE; 2785 } 2786 /* 2787 * If AS_PAGLCK is set in a_flags (via memcntl(2) 2788 * with MC_LOCKAS, MCL_FUTURE) and this is a 2789 * MAP_NORESERVE segment, we may need to 2790 * permanently lock the page as it is being faulted 2791 * for the first time. The following text applies 2792 * only to MAP_NORESERVE segments: 2793 * 2794 * As per memcntl(2), if this segment was created 2795 * after MCL_FUTURE was applied (a "future" 2796 * segment), its pages must be locked. If this 2797 * segment existed at MCL_FUTURE application (a 2798 * "past" segment), the interface is unclear. 2799 * 2800 * We decide to lock only if vpage is present: 2801 * 2802 * - "future" segments will have a vpage array (see 2803 * as_map), and so will be locked as required 2804 * 2805 * - "past" segments may not have a vpage array, 2806 * depending on whether events (such as 2807 * mprotect) have occurred. Locking if vpage 2808 * exists will preserve legacy behavior. Not 2809 * locking if vpage is absent, will not break 2810 * the interface or legacy behavior. Note that 2811 * allocating vpage here if it's absent requires 2812 * upgrading the segvn reader lock, the cost of 2813 * which does not seem worthwhile. 2814 * 2815 * Usually testing and setting VPP_ISPPLOCK and 2816 * VPP_SETPPLOCK requires holding the segvn lock as 2817 * writer, but in this case all readers are 2818 * serializing on the anon array lock. 2819 */ 2820 if (AS_ISPGLCK(seg->s_as) && vpage != NULL && 2821 (svd->flags & MAP_NORESERVE) && 2822 !VPP_ISPPLOCK(vpage)) { 2823 proc_t *p = seg->s_as->a_proc; 2824 ASSERT(svd->type == MAP_PRIVATE); 2825 mutex_enter(&p->p_lock); 2826 if (rctl_incr_locked_mem(p, NULL, PAGESIZE, 2827 1) == 0) { 2828 claim = VPP_PROT(vpage) & PROT_WRITE; 2829 if (page_pp_lock(pp, claim, 0)) { 2830 VPP_SETPPLOCK(vpage); 2831 } else { 2832 rctl_decr_locked_mem(p, NULL, 2833 PAGESIZE, 1); 2834 } 2835 } 2836 mutex_exit(&p->p_lock); 2837 } 2838 2839 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 2840 hat_memload(hat, addr, pp, prot, hat_flag); 2841 2842 if (!(hat_flag & HAT_LOAD_LOCK)) 2843 page_unlock(pp); 2844 2845 anon_array_exit(&cookie); 2846 return (0); 2847 } 2848 } 2849 2850 /* 2851 * Obtain the page structure via anon_getpage() if it is 2852 * a private copy of an object (the result of a previous 2853 * copy-on-write). 
2854 */ 2855 if (amp != NULL) { 2856 if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) { 2857 err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE, 2858 seg, addr, rw, svd->cred); 2859 if (err) 2860 goto out; 2861 2862 if (svd->type == MAP_SHARED) { 2863 /* 2864 * If this is a shared mapping to an 2865 * anon_map, then ignore the write 2866 * permissions returned by anon_getpage(). 2867 * They apply to the private mappings 2868 * of this anon_map. 2869 */ 2870 vpprot |= PROT_WRITE; 2871 } 2872 opp = anon_pl[0]; 2873 } 2874 } 2875 2876 /* 2877 * Search the pl[] list passed in if it is from the 2878 * original object (i.e., not a private copy). 2879 */ 2880 if (opp == NULL) { 2881 /* 2882 * Find original page. We must be bringing it in 2883 * from the list in pl[]. 2884 */ 2885 for (ppp = pl; (opp = *ppp) != NULL; ppp++) { 2886 if (opp == PAGE_HANDLED) 2887 continue; 2888 ASSERT(opp->p_vnode == svd->vp); /* XXX */ 2889 if (opp->p_offset == off) 2890 break; 2891 } 2892 if (opp == NULL) { 2893 panic("segvn_faultpage not found"); 2894 /*NOTREACHED*/ 2895 } 2896 *ppp = PAGE_HANDLED; 2897 2898 } 2899 2900 ASSERT(PAGE_LOCKED(opp)); 2901 2902 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2903 "segvn_fault:pp %p vp %p offset %llx", opp, NULL, 0); 2904 2905 /* 2906 * The fault is treated as a copy-on-write fault if a 2907 * write occurs on a private segment and the object 2908 * page (i.e., mapping) is write protected. We assume 2909 * that fatal protection checks have already been made. 2910 */ 2911 2912 if (brkcow) { 2913 ASSERT(svd->tr_state == SEGVN_TR_OFF); 2914 cow = !(vpprot & PROT_WRITE); 2915 } else if (svd->tr_state == SEGVN_TR_ON) { 2916 /* 2917 * If we are doing text replication COW on first touch. 2918 */ 2919 ASSERT(amp != NULL); 2920 ASSERT(svd->vp != NULL); 2921 ASSERT(rw != S_WRITE); 2922 cow = (ap == NULL); 2923 } else { 2924 cow = 0; 2925 } 2926 2927 /* 2928 * If not a copy-on-write case load the translation 2929 * and return. 2930 */ 2931 if (cow == 0) { 2932 2933 /* 2934 * Handle pages that have been marked for migration 2935 */ 2936 if (lgrp_optimizations()) 2937 page_migrate(seg, addr, &opp, 1); 2938 2939 if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) { 2940 if (rw == S_WRITE) 2941 hat_setmod(opp); 2942 else if (rw != S_OTHER && !hat_ismod(opp)) 2943 prot &= ~PROT_WRITE; 2944 } 2945 2946 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE || 2947 (!svd->pageprot && svd->prot == (prot & vpprot))); 2948 ASSERT(amp == NULL || 2949 svd->rcookie == HAT_INVALID_REGION_COOKIE); 2950 hat_memload_region(hat, addr, opp, prot & vpprot, hat_flag, 2951 svd->rcookie); 2952 2953 if (!(hat_flag & HAT_LOAD_LOCK)) 2954 page_unlock(opp); 2955 2956 if (anon_lock) { 2957 anon_array_exit(&cookie); 2958 } 2959 return (0); 2960 } 2961 2962 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 2963 2964 hat_setref(opp); 2965 2966 ASSERT(amp != NULL && anon_lock); 2967 2968 /* 2969 * Steal the page only if it isn't a private page 2970 * since stealing a private page is not worth the effort. 2971 */ 2972 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) 2973 steal = 1; 2974 2975 /* 2976 * Steal the original page if the following conditions are true: 2977 * 2978 * We are low on memory, the page is not private, page is not large, 2979 * not shared, not modified, not `locked' or if we have it `locked' 2980 * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies 2981 * that the page is not shared) and if it doesn't have any 2982 * translations. 
page_struct_lock isn't needed to look at p_cowcnt 2983 * and p_lckcnt because we first get exclusive lock on page. 2984 */ 2985 (void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); 2986 2987 if (stealcow && freemem < minfree && steal && opp->p_szc == 0 && 2988 page_tryupgrade(opp) && !hat_ismod(opp) && 2989 ((opp->p_lckcnt == 0 && opp->p_cowcnt == 0) || 2990 (opp->p_lckcnt == 0 && opp->p_cowcnt == 1 && 2991 vpage != NULL && VPP_ISPPLOCK(vpage)))) { 2992 /* 2993 * Check if this page has other translations 2994 * after unloading our translation. 2995 */ 2996 if (hat_page_is_mapped(opp)) { 2997 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 2998 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, 2999 HAT_UNLOAD); 3000 } 3001 3002 /* 3003 * hat_unload() might sync back someone else's recent 3004 * modification, so check again. 3005 */ 3006 if (!hat_ismod(opp) && !hat_page_is_mapped(opp)) 3007 pageflags |= STEAL_PAGE; 3008 } 3009 3010 /* 3011 * If we have a vpage pointer, see if it indicates that we have 3012 * ``locked'' the page we map -- if so, tell anon_private to 3013 * transfer the locking resource to the new page. 3014 * 3015 * See Statement at the beginning of segvn_lockop regarding 3016 * the way lockcnts/cowcnts are handled during COW. 3017 * 3018 */ 3019 if (vpage != NULL && VPP_ISPPLOCK(vpage)) 3020 pageflags |= LOCK_PAGE; 3021 3022 /* 3023 * Allocate a private page and perform the copy. 3024 * For MAP_NORESERVE reserve swap space now, unless this 3025 * is a cow fault on an existing anon page in which case 3026 * MAP_NORESERVE will have made advance reservations. 3027 */ 3028 if ((svd->flags & MAP_NORESERVE) && (ap == NULL)) { 3029 if (anon_resv_zone(ptob(1), seg->s_as->a_proc->p_zone)) { 3030 atomic_add_long(&svd->swresv, ptob(1)); 3031 atomic_add_long(&seg->s_as->a_resvsize, ptob(1)); 3032 } else { 3033 page_unlock(opp); 3034 err = ENOMEM; 3035 goto out; 3036 } 3037 } 3038 oldap = ap; 3039 pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred); 3040 if (pp == NULL) { 3041 err = ENOMEM; /* out of swap space */ 3042 goto out; 3043 } 3044 3045 /* 3046 * If we copied away from an anonymous page, then 3047 * we are one step closer to freeing up an anon slot. 3048 * 3049 * NOTE: The original anon slot must be released while 3050 * holding the "anon_map" lock. This is necessary to prevent 3051 * other threads from obtaining a pointer to the anon slot 3052 * which may be freed if its "refcnt" is 1. 3053 */ 3054 if (oldap != NULL) 3055 anon_decref(oldap); 3056 3057 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); 3058 3059 /* 3060 * Handle pages that have been marked for migration 3061 */ 3062 if (lgrp_optimizations()) 3063 page_migrate(seg, addr, &pp, 1); 3064 3065 ASSERT(pp->p_szc == 0); 3066 3067 ASSERT(!IS_VMODSORT(pp->p_vnode)); 3068 if (enable_mbit_wa) { 3069 if (rw == S_WRITE) 3070 hat_setmod(pp); 3071 else if (!hat_ismod(pp)) 3072 prot &= ~PROT_WRITE; 3073 } 3074 3075 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 3076 hat_memload(hat, addr, pp, prot, hat_flag); 3077 3078 if (!(hat_flag & HAT_LOAD_LOCK)) 3079 page_unlock(pp); 3080 3081 ASSERT(anon_lock); 3082 anon_array_exit(&cookie); 3083 return (0); 3084 out: 3085 if (anon_lock) 3086 anon_array_exit(&cookie); 3087 3088 if (type == F_SOFTLOCK) { 3089 atomic_dec_ulong((ulong_t *)&svd->softlockcnt); 3090 } 3091 return (FC_MAKE_ERR(err)); 3092 } 3093 3094 /* 3095 * relocate a bunch of smaller targ pages into one large repl page. 
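(For instance, with a hypothetical 64K replacement page, eight 8K constituent pages would be relocated into the one 64K page.)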
all targ 3096 * pages must be complete pages smaller than replacement pages. 3097 * it's assumed that no page's szc can change since they are all PAGESIZE or 3098 * complete large pages locked SHARED. 3099 */ 3100 static void 3101 segvn_relocate_pages(page_t **targ, page_t *replacement) 3102 { 3103 page_t *pp; 3104 pgcnt_t repl_npgs, curnpgs; 3105 pgcnt_t i; 3106 uint_t repl_szc = replacement->p_szc; 3107 page_t *first_repl = replacement; 3108 page_t *repl; 3109 spgcnt_t npgs; 3110 3111 VM_STAT_ADD(segvnvmstats.relocatepages[0]); 3112 3113 ASSERT(repl_szc != 0); 3114 npgs = repl_npgs = page_get_pagecnt(repl_szc); 3115 3116 i = 0; 3117 while (repl_npgs) { 3118 spgcnt_t nreloc; 3119 int err; 3120 ASSERT(replacement != NULL); 3121 pp = targ[i]; 3122 ASSERT(pp->p_szc < repl_szc); 3123 ASSERT(PAGE_EXCL(pp)); 3124 ASSERT(!PP_ISFREE(pp)); 3125 curnpgs = page_get_pagecnt(pp->p_szc); 3126 if (curnpgs == 1) { 3127 VM_STAT_ADD(segvnvmstats.relocatepages[1]); 3128 repl = replacement; 3129 page_sub(&replacement, repl); 3130 ASSERT(PAGE_EXCL(repl)); 3131 ASSERT(!PP_ISFREE(repl)); 3132 ASSERT(repl->p_szc == repl_szc); 3133 } else { 3134 page_t *repl_savepp; 3135 int j; 3136 VM_STAT_ADD(segvnvmstats.relocatepages[2]); 3137 repl_savepp = replacement; 3138 for (j = 0; j < curnpgs; j++) { 3139 repl = replacement; 3140 page_sub(&replacement, repl); 3141 ASSERT(PAGE_EXCL(repl)); 3142 ASSERT(!PP_ISFREE(repl)); 3143 ASSERT(repl->p_szc == repl_szc); 3144 ASSERT(page_pptonum(targ[i + j]) == 3145 page_pptonum(targ[i]) + j); 3146 } 3147 repl = repl_savepp; 3148 ASSERT(IS_P2ALIGNED(page_pptonum(repl), curnpgs)); 3149 } 3150 err = page_relocate(&pp, &repl, 0, 1, &nreloc, NULL); 3151 if (err || nreloc != curnpgs) { 3152 panic("segvn_relocate_pages: " 3153 "page_relocate failed err=%d curnpgs=%ld " 3154 "nreloc=%ld", err, curnpgs, nreloc); 3155 } 3156 ASSERT(curnpgs <= repl_npgs); 3157 repl_npgs -= curnpgs; 3158 i += curnpgs; 3159 } 3160 ASSERT(replacement == NULL); 3161 3162 repl = first_repl; 3163 repl_npgs = npgs; 3164 for (i = 0; i < repl_npgs; i++) { 3165 ASSERT(PAGE_EXCL(repl)); 3166 ASSERT(!PP_ISFREE(repl)); 3167 targ[i] = repl; 3168 page_downgrade(targ[i]); 3169 repl++; 3170 } 3171 } 3172 3173 /* 3174 * Check if all pages in ppa array are complete smaller than szc pages and 3175 * their roots will still be aligned relative to their current size if the 3176 * entire ppa array is relocated into one szc page. If these conditions are 3177 * not met return 0. 3178 * 3179 * If all pages are properly aligned attempt to upgrade their locks 3180 * to exclusive mode. If it fails set *upgrdfail to 1 and return 0. 3181 * upgrdfail was set to 0 by caller. 3182 * 3183 * Return 1 if all pages are aligned and locked exclusively. 3184 * 3185 * If all pages in ppa array happen to be physically contiguous to make one 3186 * szc page and all exclusive locks are successfully obtained promote the page 3187 * size to szc and set *pszc to szc. Return 1 with pages locked shared. 
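* A reader's summary of the contract (not in the original comment): return 0 with *upgrdfail == 0 when pages are misaligned or p_szc changed; return 0 with *upgrdfail == 1 when a lock upgrade failed (*pszc is set); return 1 on success, with pages locked EXCL, or SHARED after the physically-contiguous fast path promotes p_szc to szc.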
3188 */ 3189 static int 3190 segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc) 3191 { 3192 page_t *pp; 3193 pfn_t pfn; 3194 pgcnt_t totnpgs = page_get_pagecnt(szc); 3195 pfn_t first_pfn; 3196 int contig = 1; 3197 pgcnt_t i; 3198 pgcnt_t j; 3199 uint_t curszc; 3200 pgcnt_t curnpgs; 3201 int root = 0; 3202 3203 ASSERT(szc > 0); 3204 3205 VM_STAT_ADD(segvnvmstats.fullszcpages[0]); 3206 3207 for (i = 0; i < totnpgs; i++) { 3208 pp = ppa[i]; 3209 ASSERT(PAGE_SHARED(pp)); 3210 ASSERT(!PP_ISFREE(pp)); 3211 pfn = page_pptonum(pp); 3212 if (i == 0) { 3213 if (!IS_P2ALIGNED(pfn, totnpgs)) { 3214 contig = 0; 3215 } else { 3216 first_pfn = pfn; 3217 } 3218 } else if (contig && pfn != first_pfn + i) { 3219 contig = 0; 3220 } 3221 if (pp->p_szc == 0) { 3222 if (root) { 3223 VM_STAT_ADD(segvnvmstats.fullszcpages[1]); 3224 return (0); 3225 } 3226 } else if (!root) { 3227 if ((curszc = pp->p_szc) >= szc) { 3228 VM_STAT_ADD(segvnvmstats.fullszcpages[2]); 3229 return (0); 3230 } 3231 if (curszc == 0) { 3232 /* 3233 * p_szc changed means we don't have all pages 3234 * locked. return failure. 3235 */ 3236 VM_STAT_ADD(segvnvmstats.fullszcpages[3]); 3237 return (0); 3238 } 3239 curnpgs = page_get_pagecnt(curszc); 3240 if (!IS_P2ALIGNED(pfn, curnpgs) || 3241 !IS_P2ALIGNED(i, curnpgs)) { 3242 VM_STAT_ADD(segvnvmstats.fullszcpages[4]); 3243 return (0); 3244 } 3245 root = 1; 3246 } else { 3247 ASSERT(i > 0); 3248 VM_STAT_ADD(segvnvmstats.fullszcpages[5]); 3249 if (pp->p_szc != curszc) { 3250 VM_STAT_ADD(segvnvmstats.fullszcpages[6]); 3251 return (0); 3252 } 3253 if (pfn - 1 != page_pptonum(ppa[i - 1])) { 3254 panic("segvn_full_szcpages: " 3255 "large page not physically contiguous"); 3256 } 3257 if (P2PHASE(pfn, curnpgs) == curnpgs - 1) { 3258 root = 0; 3259 } 3260 } 3261 } 3262 3263 for (i = 0; i < totnpgs; i++) { 3264 ASSERT(ppa[i]->p_szc < szc); 3265 if (!page_tryupgrade(ppa[i])) { 3266 for (j = 0; j < i; j++) { 3267 page_downgrade(ppa[j]); 3268 } 3269 *pszc = ppa[i]->p_szc; 3270 *upgrdfail = 1; 3271 VM_STAT_ADD(segvnvmstats.fullszcpages[7]); 3272 return (0); 3273 } 3274 } 3275 3276 /* 3277 * When a page is put on a free cachelist its szc is set to 0. If the 3278 * file system reclaimed pages from the cachelist, targ pages will be 3279 * physically contiguous with 0 p_szc. In this case just upgrade the 3280 * szc of the targ pages without any relocations. 3281 * To avoid any hat issues with previous small mappings 3282 * hat_pageunload() the target pages first. 3283 */ 3284 if (contig) { 3285 VM_STAT_ADD(segvnvmstats.fullszcpages[8]); 3286 for (i = 0; i < totnpgs; i++) { 3287 (void) hat_pageunload(ppa[i], HAT_FORCE_PGUNLOAD); 3288 } 3289 for (i = 0; i < totnpgs; i++) { 3290 ppa[i]->p_szc = szc; 3291 } 3292 for (i = 0; i < totnpgs; i++) { 3293 ASSERT(PAGE_EXCL(ppa[i])); 3294 page_downgrade(ppa[i]); 3295 } 3296 if (pszc != NULL) { 3297 *pszc = szc; 3298 } 3299 } 3300 VM_STAT_ADD(segvnvmstats.fullszcpages[9]); 3301 return (1); 3302 } 3303 3304 /* 3305 * Create physically contiguous pages for [vp, off] - [vp, off + 3306 * page_size(szc)) range and for private segment return them in ppa array. 3307 * Pages are created either via IO or relocations. 3308 * 3309 * Return 1 on success and 0 on failure. 3310 * 3311 * If physically contiguous pages already exist for this range return 1 without 3312 * filling ppa array. Caller initializes ppa[0] as NULL to detect that ppa 3313 * array wasn't filled. In this case caller fills ppa array via VOP_GETPAGE().
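* A sketch of the expected calling pattern (illustrative, not the verbatim caller): ppa[0] = NULL; if (segvn_fill_vp_pages(svd, vp, off, szc, ppa, &pplist, &pszc, &downsize) && ppa[0] == NULL) ierr = VOP_GETPAGE(vp, (offset_t)off, pgsz, &vpprot, ppa, pgsz, seg, a, arw, svd->cred, NULL);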
3314 */ 3315 3316 static int 3317 segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, u_offset_t off, 3318 uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc, 3319 int *downsize) 3320 3321 { 3322 page_t *pplist = *ppplist; 3323 size_t pgsz = page_get_pagesize(szc); 3324 pgcnt_t pages = btop(pgsz); 3325 ulong_t start_off = off; 3326 u_offset_t eoff = off + pgsz; 3327 spgcnt_t nreloc; 3328 u_offset_t io_off = off; 3329 size_t io_len; 3330 page_t *io_pplist = NULL; 3331 page_t *done_pplist = NULL; 3332 pgcnt_t pgidx = 0; 3333 page_t *pp; 3334 page_t *newpp; 3335 page_t *targpp; 3336 int io_err = 0; 3337 int i; 3338 pfn_t pfn; 3339 ulong_t ppages; 3340 page_t *targ_pplist = NULL; 3341 page_t *repl_pplist = NULL; 3342 page_t *tmp_pplist; 3343 int nios = 0; 3344 uint_t pszc; 3345 struct vattr va; 3346 3347 VM_STAT_ADD(segvnvmstats.fill_vp_pages[0]); 3348 3349 ASSERT(szc != 0); 3350 ASSERT(pplist->p_szc == szc); 3351 3352 /* 3353 * downsize will be set to 1 only if we fail to lock pages. this will 3354 * allow subsequent faults to try to relocate the page again. If we 3355 * fail due to misalignment don't downsize and let the caller map the 3356 * whole region with small mappings to avoid more faults into the area 3357 * where we can't get large pages anyway. 3358 */ 3359 *downsize = 0; 3360 3361 while (off < eoff) { 3362 newpp = pplist; 3363 ASSERT(newpp != NULL); 3364 ASSERT(PAGE_EXCL(newpp)); 3365 ASSERT(!PP_ISFREE(newpp)); 3366 /* 3367 * we pass NULL for nrelocp to page_lookup_create() 3368 * so that it doesn't relocate. We relocate here 3369 * later only after we make sure we can lock all 3370 * pages in the range we handle and they are all 3371 * aligned. 3372 */ 3373 pp = page_lookup_create(vp, off, SE_SHARED, newpp, NULL, 0); 3374 ASSERT(pp != NULL); 3375 ASSERT(!PP_ISFREE(pp)); 3376 ASSERT(pp->p_vnode == vp); 3377 ASSERT(pp->p_offset == off); 3378 if (pp == newpp) { 3379 VM_STAT_ADD(segvnvmstats.fill_vp_pages[1]); 3380 page_sub(&pplist, pp); 3381 ASSERT(PAGE_EXCL(pp)); 3382 ASSERT(page_iolock_assert(pp)); 3383 page_list_concat(&io_pplist, &pp); 3384 off += PAGESIZE; 3385 continue; 3386 } 3387 VM_STAT_ADD(segvnvmstats.fill_vp_pages[2]); 3388 pfn = page_pptonum(pp); 3389 pszc = pp->p_szc; 3390 if (pszc >= szc && targ_pplist == NULL && io_pplist == NULL && 3391 IS_P2ALIGNED(pfn, pages)) { 3392 ASSERT(repl_pplist == NULL); 3393 ASSERT(done_pplist == NULL); 3394 ASSERT(pplist == *ppplist); 3395 page_unlock(pp); 3396 page_free_replacement_page(pplist); 3397 page_create_putback(pages); 3398 *ppplist = NULL; 3399 VM_STAT_ADD(segvnvmstats.fill_vp_pages[3]); 3400 return (1); 3401 } 3402 if (pszc >= szc) { 3403 page_unlock(pp); 3404 segvn_faultvnmpss_align_err1++; 3405 goto out; 3406 } 3407 ppages = page_get_pagecnt(pszc); 3408 if (!IS_P2ALIGNED(pfn, ppages)) { 3409 ASSERT(pszc > 0); 3410 /* 3411 * sizing down to pszc won't help. 3412 */ 3413 page_unlock(pp); 3414 segvn_faultvnmpss_align_err2++; 3415 goto out; 3416 } 3417 pfn = page_pptonum(newpp); 3418 if (!IS_P2ALIGNED(pfn, ppages)) { 3419 ASSERT(pszc > 0); 3420 /* 3421 * sizing down to pszc won't help. 
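* (this time because the replacement page newpp is itself misaligned for pszc)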
3422 */ 3423 page_unlock(pp); 3424 segvn_faultvnmpss_align_err3++; 3425 goto out; 3426 } 3427 if (!PAGE_EXCL(pp)) { 3428 VM_STAT_ADD(segvnvmstats.fill_vp_pages[4]); 3429 page_unlock(pp); 3430 *downsize = 1; 3431 *ret_pszc = pp->p_szc; 3432 goto out; 3433 } 3434 targpp = pp; 3435 if (io_pplist != NULL) { 3436 VM_STAT_ADD(segvnvmstats.fill_vp_pages[5]); 3437 io_len = off - io_off; 3438 /* 3439 * Some file systems like NFS don't check EOF 3440 * conditions in VOP_PAGEIO(). Check it here 3441 * now that pages are locked SE_EXCL. Any file 3442 * truncation will wait until the pages are 3443 * unlocked so no need to worry that file will 3444 * be truncated after we check its size here. 3445 * XXX fix NFS to remove this check. 3446 */ 3447 va.va_mask = AT_SIZE; 3448 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred, NULL)) { 3449 VM_STAT_ADD(segvnvmstats.fill_vp_pages[6]); 3450 page_unlock(targpp); 3451 goto out; 3452 } 3453 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3454 VM_STAT_ADD(segvnvmstats.fill_vp_pages[7]); 3455 *downsize = 1; 3456 *ret_pszc = 0; 3457 page_unlock(targpp); 3458 goto out; 3459 } 3460 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3461 B_READ, svd->cred, NULL); 3462 if (io_err) { 3463 VM_STAT_ADD(segvnvmstats.fill_vp_pages[8]); 3464 page_unlock(targpp); 3465 if (io_err == EDEADLK) { 3466 segvn_vmpss_pageio_deadlk_err++; 3467 } 3468 goto out; 3469 } 3470 nios++; 3471 VM_STAT_ADD(segvnvmstats.fill_vp_pages[9]); 3472 while (io_pplist != NULL) { 3473 pp = io_pplist; 3474 page_sub(&io_pplist, pp); 3475 ASSERT(page_iolock_assert(pp)); 3476 page_io_unlock(pp); 3477 pgidx = (pp->p_offset - start_off) >> 3478 PAGESHIFT; 3479 ASSERT(pgidx < pages); 3480 ppa[pgidx] = pp; 3481 page_list_concat(&done_pplist, &pp); 3482 } 3483 } 3484 pp = targpp; 3485 ASSERT(PAGE_EXCL(pp)); 3486 ASSERT(pp->p_szc <= pszc); 3487 if (pszc != 0 && !group_page_trylock(pp, SE_EXCL)) { 3488 VM_STAT_ADD(segvnvmstats.fill_vp_pages[10]); 3489 page_unlock(pp); 3490 *downsize = 1; 3491 *ret_pszc = pp->p_szc; 3492 goto out; 3493 } 3494 VM_STAT_ADD(segvnvmstats.fill_vp_pages[11]); 3495 /* 3496 * page szc could have changed before the entire group was 3497 * locked. reread page szc.
3498 */ 3499 pszc = pp->p_szc; 3500 ppages = page_get_pagecnt(pszc); 3501 3502 /* link just the roots */ 3503 page_list_concat(&targ_pplist, &pp); 3504 page_sub(&pplist, newpp); 3505 page_list_concat(&repl_pplist, &newpp); 3506 off += PAGESIZE; 3507 while (--ppages != 0) { 3508 newpp = pplist; 3509 page_sub(&pplist, newpp); 3510 off += PAGESIZE; 3511 } 3512 io_off = off; 3513 } 3514 if (io_pplist != NULL) { 3515 VM_STAT_ADD(segvnvmstats.fill_vp_pages[12]); 3516 io_len = eoff - io_off; 3517 va.va_mask = AT_SIZE; 3518 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred, NULL) != 0) { 3519 VM_STAT_ADD(segvnvmstats.fill_vp_pages[13]); 3520 goto out; 3521 } 3522 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3523 VM_STAT_ADD(segvnvmstats.fill_vp_pages[14]); 3524 *downsize = 1; 3525 *ret_pszc = 0; 3526 goto out; 3527 } 3528 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3529 B_READ, svd->cred, NULL); 3530 if (io_err) { 3531 VM_STAT_ADD(segvnvmstats.fill_vp_pages[15]); 3532 if (io_err == EDEADLK) { 3533 segvn_vmpss_pageio_deadlk_err++; 3534 } 3535 goto out; 3536 } 3537 nios++; 3538 while (io_pplist != NULL) { 3539 pp = io_pplist; 3540 page_sub(&io_pplist, pp); 3541 ASSERT(page_iolock_assert(pp)); 3542 page_io_unlock(pp); 3543 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3544 ASSERT(pgidx < pages); 3545 ppa[pgidx] = pp; 3546 } 3547 } 3548 /* 3549 * we're now bound to succeed or panic. 3550 * remove pages from done_pplist. it's not needed anymore. 3551 */ 3552 while (done_pplist != NULL) { 3553 pp = done_pplist; 3554 page_sub(&done_pplist, pp); 3555 } 3556 VM_STAT_ADD(segvnvmstats.fill_vp_pages[16]); 3557 ASSERT(pplist == NULL); 3558 *ppplist = NULL; 3559 while (targ_pplist != NULL) { 3560 int ret; 3561 VM_STAT_ADD(segvnvmstats.fill_vp_pages[17]); 3562 ASSERT(repl_pplist); 3563 pp = targ_pplist; 3564 page_sub(&targ_pplist, pp); 3565 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3566 newpp = repl_pplist; 3567 page_sub(&repl_pplist, newpp); 3568 #ifdef DEBUG 3569 pfn = page_pptonum(pp); 3570 pszc = pp->p_szc; 3571 ppages = page_get_pagecnt(pszc); 3572 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3573 pfn = page_pptonum(newpp); 3574 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3575 ASSERT(P2PHASE(pfn, pages) == pgidx); 3576 #endif 3577 nreloc = 0; 3578 ret = page_relocate(&pp, &newpp, 0, 1, &nreloc, NULL); 3579 if (ret != 0 || nreloc == 0) { 3580 panic("segvn_fill_vp_pages: " 3581 "page_relocate failed"); 3582 } 3583 pp = newpp; 3584 while (nreloc-- != 0) { 3585 ASSERT(PAGE_EXCL(pp)); 3586 ASSERT(pp->p_vnode == vp); 3587 ASSERT(pgidx == 3588 ((pp->p_offset - start_off) >> PAGESHIFT)); 3589 ppa[pgidx++] = pp; 3590 pp++; 3591 } 3592 } 3593 3594 if (svd->type == MAP_PRIVATE) { 3595 VM_STAT_ADD(segvnvmstats.fill_vp_pages[18]); 3596 for (i = 0; i < pages; i++) { 3597 ASSERT(ppa[i] != NULL); 3598 ASSERT(PAGE_EXCL(ppa[i])); 3599 ASSERT(ppa[i]->p_vnode == vp); 3600 ASSERT(ppa[i]->p_offset == 3601 start_off + (i << PAGESHIFT)); 3602 page_downgrade(ppa[i]); 3603 } 3604 ppa[pages] = NULL; 3605 } else { 3606 VM_STAT_ADD(segvnvmstats.fill_vp_pages[19]); 3607 /* 3608 * the caller will still call VOP_GETPAGE() for shared segments 3609 * to check FS write permissions. For private segments we map 3610 * file read only anyway. so no VOP_GETPAGE is needed. 
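* (Hence the shared path below unlocks everything and leaves ppa[0] == NULL, while the private path returns a filled, SE_SHARED-locked ppa array.)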
3611 */ 3612 for (i = 0; i < pages; i++) { 3613 ASSERT(ppa[i] != NULL); 3614 ASSERT(PAGE_EXCL(ppa[i])); 3615 ASSERT(ppa[i]->p_vnode == vp); 3616 ASSERT(ppa[i]->p_offset == 3617 start_off + (i << PAGESHIFT)); 3618 page_unlock(ppa[i]); 3619 } 3620 ppa[0] = NULL; 3621 } 3622 3623 return (1); 3624 out: 3625 /* 3626 * Do the cleanup. Unlock target pages we didn't relocate. They are 3627 * linked on targ_pplist by root pages. reassemble unused replacement 3628 * and io pages back to pplist. 3629 */ 3630 if (io_pplist != NULL) { 3631 VM_STAT_ADD(segvnvmstats.fill_vp_pages[20]); 3632 pp = io_pplist; 3633 do { 3634 ASSERT(pp->p_vnode == vp); 3635 ASSERT(pp->p_offset == io_off); 3636 ASSERT(page_iolock_assert(pp)); 3637 page_io_unlock(pp); 3638 page_hashout(pp, NULL); 3639 io_off += PAGESIZE; 3640 } while ((pp = pp->p_next) != io_pplist); 3641 page_list_concat(&io_pplist, &pplist); 3642 pplist = io_pplist; 3643 } 3644 tmp_pplist = NULL; 3645 while (targ_pplist != NULL) { 3646 VM_STAT_ADD(segvnvmstats.fill_vp_pages[21]); 3647 pp = targ_pplist; 3648 ASSERT(PAGE_EXCL(pp)); 3649 page_sub(&targ_pplist, pp); 3650 3651 pszc = pp->p_szc; 3652 ppages = page_get_pagecnt(pszc); 3653 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3654 3655 if (pszc != 0) { 3656 group_page_unlock(pp); 3657 } 3658 page_unlock(pp); 3659 3660 pp = repl_pplist; 3661 ASSERT(pp != NULL); 3662 ASSERT(PAGE_EXCL(pp)); 3663 ASSERT(pp->p_szc == szc); 3664 page_sub(&repl_pplist, pp); 3665 3666 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3667 3668 /* relink replacement page */ 3669 page_list_concat(&tmp_pplist, &pp); 3670 while (--ppages != 0) { 3671 VM_STAT_ADD(segvnvmstats.fill_vp_pages[22]); 3672 pp++; 3673 ASSERT(PAGE_EXCL(pp)); 3674 ASSERT(pp->p_szc == szc); 3675 page_list_concat(&tmp_pplist, &pp); 3676 } 3677 } 3678 if (tmp_pplist != NULL) { 3679 VM_STAT_ADD(segvnvmstats.fill_vp_pages[23]); 3680 page_list_concat(&tmp_pplist, &pplist); 3681 pplist = tmp_pplist; 3682 } 3683 /* 3684 * at this point all pages are either on done_pplist or 3685 * pplist. They can't be all on done_pplist otherwise 3686 * we'd've been done. 3687 */ 3688 ASSERT(pplist != NULL); 3689 if (nios != 0) { 3690 VM_STAT_ADD(segvnvmstats.fill_vp_pages[24]); 3691 pp = pplist; 3692 do { 3693 VM_STAT_ADD(segvnvmstats.fill_vp_pages[25]); 3694 ASSERT(pp->p_szc == szc); 3695 ASSERT(PAGE_EXCL(pp)); 3696 ASSERT(pp->p_vnode != vp); 3697 pp->p_szc = 0; 3698 } while ((pp = pp->p_next) != pplist); 3699 3700 pp = done_pplist; 3701 do { 3702 VM_STAT_ADD(segvnvmstats.fill_vp_pages[26]); 3703 ASSERT(pp->p_szc == szc); 3704 ASSERT(PAGE_EXCL(pp)); 3705 ASSERT(pp->p_vnode == vp); 3706 pp->p_szc = 0; 3707 } while ((pp = pp->p_next) != done_pplist); 3708 3709 while (pplist != NULL) { 3710 VM_STAT_ADD(segvnvmstats.fill_vp_pages[27]); 3711 pp = pplist; 3712 page_sub(&pplist, pp); 3713 page_free(pp, 0); 3714 } 3715 3716 while (done_pplist != NULL) { 3717 VM_STAT_ADD(segvnvmstats.fill_vp_pages[28]); 3718 pp = done_pplist; 3719 page_sub(&done_pplist, pp); 3720 page_unlock(pp); 3721 } 3722 *ppplist = NULL; 3723 return (0); 3724 } 3725 ASSERT(pplist == *ppplist); 3726 if (io_err) { 3727 VM_STAT_ADD(segvnvmstats.fill_vp_pages[29]); 3728 /* 3729 * don't downsize on io error. 3730 * see if vop_getpage succeeds. 3731 * pplist may still be used in this case 3732 * for relocations. 
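* (so, unlike the no-I/O failure path below, the replacement pages are not handed back via page_free_replacement_page())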
3733 */ 3734 return (0); 3735 } 3736 VM_STAT_ADD(segvnvmstats.fill_vp_pages[30]); 3737 page_free_replacement_page(pplist); 3738 page_create_putback(pages); 3739 *ppplist = NULL; 3740 return (0); 3741 } 3742 3743 int segvn_anypgsz = 0; 3744 3745 #define SEGVN_RESTORE_SOFTLOCK_VP(type, pages) \ 3746 if ((type) == F_SOFTLOCK) { \ 3747 atomic_add_long((ulong_t *)&(svd)->softlockcnt, \ 3748 -(pages)); \ 3749 } 3750 3751 #define SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot) \ 3752 if (IS_VMODSORT((ppa)[0]->p_vnode)) { \ 3753 if ((rw) == S_WRITE) { \ 3754 for (i = 0; i < (pages); i++) { \ 3755 ASSERT((ppa)[i]->p_vnode == \ 3756 (ppa)[0]->p_vnode); \ 3757 hat_setmod((ppa)[i]); \ 3758 } \ 3759 } else if ((rw) != S_OTHER && \ 3760 ((prot) & (vpprot) & PROT_WRITE)) { \ 3761 for (i = 0; i < (pages); i++) { \ 3762 ASSERT((ppa)[i]->p_vnode == \ 3763 (ppa)[0]->p_vnode); \ 3764 if (!hat_ismod((ppa)[i])) { \ 3765 prot &= ~PROT_WRITE; \ 3766 break; \ 3767 } \ 3768 } \ 3769 } \ 3770 } 3771 3772 #ifdef VM_STATS 3773 3774 #define SEGVN_VMSTAT_FLTVNPAGES(idx) \ 3775 VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]); 3776 3777 #else /* VM_STATS */ 3778 3779 #define SEGVN_VMSTAT_FLTVNPAGES(idx) 3780 3781 #endif 3782 3783 static faultcode_t 3784 segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 3785 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 3786 caddr_t eaddr, int brkcow) 3787 { 3788 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 3789 struct anon_map *amp = svd->amp; 3790 uchar_t segtype = svd->type; 3791 uint_t szc = seg->s_szc; 3792 size_t pgsz = page_get_pagesize(szc); 3793 size_t maxpgsz = pgsz; 3794 pgcnt_t pages = btop(pgsz); 3795 pgcnt_t maxpages = pages; 3796 size_t ppasize = (pages + 1) * sizeof (page_t *); 3797 caddr_t a = lpgaddr; 3798 caddr_t maxlpgeaddr = lpgeaddr; 3799 u_offset_t off = svd->offset + (uintptr_t)(a - seg->s_base); 3800 ulong_t aindx = svd->anon_index + seg_page(seg, a); 3801 struct vpage *vpage = (svd->vpage != NULL) ? 3802 &svd->vpage[seg_page(seg, a)] : NULL; 3803 vnode_t *vp = svd->vp; 3804 page_t **ppa; 3805 uint_t pszc; 3806 size_t ppgsz; 3807 pgcnt_t ppages; 3808 faultcode_t err = 0; 3809 int ierr; 3810 int vop_size_err = 0; 3811 uint_t protchk, prot, vpprot; 3812 ulong_t i; 3813 int hat_flag = (type == F_SOFTLOCK) ? 
HAT_LOAD_LOCK : HAT_LOAD; 3814 anon_sync_obj_t an_cookie; 3815 enum seg_rw arw; 3816 int alloc_failed = 0; 3817 int adjszc_chk; 3818 struct vattr va; 3819 int xhat = 0; 3820 page_t *pplist; 3821 pfn_t pfn; 3822 int physcontig; 3823 int upgrdfail; 3824 int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */ 3825 int tron = (svd->tr_state == SEGVN_TR_ON); 3826 3827 ASSERT(szc != 0); 3828 ASSERT(vp != NULL); 3829 ASSERT(brkcow == 0 || amp != NULL); 3830 ASSERT(tron == 0 || amp != NULL); 3831 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 3832 ASSERT(!(svd->flags & MAP_NORESERVE)); 3833 ASSERT(type != F_SOFTUNLOCK); 3834 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3835 ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages)); 3836 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 3837 ASSERT(seg->s_szc < NBBY * sizeof (int)); 3838 ASSERT(type != F_SOFTLOCK || lpgeaddr - a == maxpgsz); 3839 ASSERT(svd->tr_state != SEGVN_TR_INIT); 3840 3841 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltvnpages[0]); 3842 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltvnpages[1]); 3843 3844 if (svd->flags & MAP_TEXT) { 3845 hat_flag |= HAT_LOAD_TEXT; 3846 } 3847 3848 if (svd->pageprot) { 3849 switch (rw) { 3850 case S_READ: 3851 protchk = PROT_READ; 3852 break; 3853 case S_WRITE: 3854 protchk = PROT_WRITE; 3855 break; 3856 case S_EXEC: 3857 protchk = PROT_EXEC; 3858 break; 3859 case S_OTHER: 3860 default: 3861 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 3862 break; 3863 } 3864 } else { 3865 prot = svd->prot; 3866 /* caller has already done segment level protection check. */ 3867 } 3868 3869 if (seg->s_as->a_hat != hat) { 3870 xhat = 1; 3871 } 3872 3873 if (rw == S_WRITE && segtype == MAP_PRIVATE) { 3874 SEGVN_VMSTAT_FLTVNPAGES(2); 3875 arw = S_READ; 3876 } else { 3877 arw = rw; 3878 } 3879 3880 ppa = kmem_alloc(ppasize, KM_SLEEP); 3881 3882 VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]); 3883 3884 for (;;) { 3885 adjszc_chk = 0; 3886 for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) { 3887 if (adjszc_chk) { 3888 while (szc < seg->s_szc) { 3889 uintptr_t e; 3890 uint_t tszc; 3891 tszc = segvn_anypgsz_vnode ? 
szc + 1 : 3892 seg->s_szc; 3893 ppgsz = page_get_pagesize(tszc); 3894 if (!IS_P2ALIGNED(a, ppgsz) || 3895 ((alloc_failed >> tszc) & 0x1)) { 3896 break; 3897 } 3898 SEGVN_VMSTAT_FLTVNPAGES(4); 3899 szc = tszc; 3900 pgsz = ppgsz; 3901 pages = btop(pgsz); 3902 e = P2ROUNDUP((uintptr_t)eaddr, pgsz); 3903 lpgeaddr = (caddr_t)e; 3904 } 3905 } 3906 3907 again: 3908 if (IS_P2ALIGNED(a, maxpgsz) && amp != NULL) { 3909 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 3910 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 3911 anon_array_enter(amp, aindx, &an_cookie); 3912 if (anon_get_ptr(amp->ahp, aindx) != NULL) { 3913 SEGVN_VMSTAT_FLTVNPAGES(5); 3914 ASSERT(anon_pages(amp->ahp, aindx, 3915 maxpages) == maxpages); 3916 anon_array_exit(&an_cookie); 3917 ANON_LOCK_EXIT(&amp->a_rwlock); 3918 err = segvn_fault_anonpages(hat, seg, 3919 a, a + maxpgsz, type, rw, 3920 MAX(a, addr), 3921 MIN(a + maxpgsz, eaddr), brkcow); 3922 if (err != 0) { 3923 SEGVN_VMSTAT_FLTVNPAGES(6); 3924 goto out; 3925 } 3926 if (szc < seg->s_szc) { 3927 szc = seg->s_szc; 3928 pgsz = maxpgsz; 3929 pages = maxpages; 3930 lpgeaddr = maxlpgeaddr; 3931 } 3932 goto next; 3933 } else { 3934 ASSERT(anon_pages(amp->ahp, aindx, 3935 maxpages) == 0); 3936 SEGVN_VMSTAT_FLTVNPAGES(7); 3937 anon_array_exit(&an_cookie); 3938 ANON_LOCK_EXIT(&amp->a_rwlock); 3939 } 3940 } 3941 ASSERT(!brkcow || IS_P2ALIGNED(a, maxpgsz)); 3942 ASSERT(!tron || IS_P2ALIGNED(a, maxpgsz)); 3943 3944 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 3945 ASSERT(vpage != NULL); 3946 prot = VPP_PROT(vpage); 3947 ASSERT(sameprot(seg, a, maxpgsz)); 3948 if ((prot & protchk) == 0) { 3949 SEGVN_VMSTAT_FLTVNPAGES(8); 3950 err = FC_PROT; 3951 goto out; 3952 } 3953 } 3954 if (type == F_SOFTLOCK) { 3955 atomic_add_long((ulong_t *)&svd->softlockcnt, 3956 pages); 3957 } 3958 3959 pplist = NULL; 3960 physcontig = 0; 3961 ppa[0] = NULL; 3962 if (!brkcow && !tron && szc && 3963 !page_exists_physcontig(vp, off, szc, 3964 segtype == MAP_PRIVATE ?
ppa : NULL)) {
3965 SEGVN_VMSTAT_FLTVNPAGES(9);
3966 if (page_alloc_pages(vp, seg, a, &pplist, NULL,
3967 szc, 0, 0) && type != F_SOFTLOCK) {
3968 SEGVN_VMSTAT_FLTVNPAGES(10);
3969 pszc = 0;
3970 ierr = -1;
3971 alloc_failed |= (1 << szc);
3972 break;
3973 }
3974 if (pplist != NULL &&
3975 vp->v_mpssdata == SEGVN_PAGEIO) {
3976 int downsize;
3977 SEGVN_VMSTAT_FLTVNPAGES(11);
3978 physcontig = segvn_fill_vp_pages(svd,
3979 vp, off, szc, ppa, &pplist,
3980 &pszc, &downsize);
3981 ASSERT(!physcontig || pplist == NULL);
3982 if (!physcontig && downsize &&
3983 type != F_SOFTLOCK) {
3984 ASSERT(pplist == NULL);
3985 SEGVN_VMSTAT_FLTVNPAGES(12);
3986 ierr = -1;
3987 break;
3988 }
3989 ASSERT(!physcontig ||
3990 segtype == MAP_PRIVATE ||
3991 ppa[0] == NULL);
3992 if (physcontig && ppa[0] == NULL) {
3993 physcontig = 0;
3994 }
3995 }
3996 } else if (!brkcow && !tron && szc && ppa[0] != NULL) {
3997 SEGVN_VMSTAT_FLTVNPAGES(13);
3998 ASSERT(segtype == MAP_PRIVATE);
3999 physcontig = 1;
4000 }
4001
4002 if (!physcontig) {
4003 SEGVN_VMSTAT_FLTVNPAGES(14);
4004 ppa[0] = NULL;
4005 ierr = VOP_GETPAGE(vp, (offset_t)off, pgsz,
4006 &vpprot, ppa, pgsz, seg, a, arw,
4007 svd->cred, NULL);
4008 #ifdef DEBUG
4009 if (ierr == 0) {
4010 for (i = 0; i < pages; i++) {
4011 ASSERT(PAGE_LOCKED(ppa[i]));
4012 ASSERT(!PP_ISFREE(ppa[i]));
4013 ASSERT(ppa[i]->p_vnode == vp);
4014 ASSERT(ppa[i]->p_offset ==
4015 off + (i << PAGESHIFT));
4016 }
4017 }
4018 #endif /* DEBUG */
4019 if (segtype == MAP_PRIVATE) {
4020 SEGVN_VMSTAT_FLTVNPAGES(15);
4021 vpprot &= ~PROT_WRITE;
4022 }
4023 } else {
4024 ASSERT(segtype == MAP_PRIVATE);
4025 SEGVN_VMSTAT_FLTVNPAGES(16);
4026 vpprot = PROT_ALL & ~PROT_WRITE;
4027 ierr = 0;
4028 }
4029
4030 if (ierr != 0) {
4031 SEGVN_VMSTAT_FLTVNPAGES(17);
4032 if (pplist != NULL) {
4033 SEGVN_VMSTAT_FLTVNPAGES(18);
4034 page_free_replacement_page(pplist);
4035 page_create_putback(pages);
4036 }
4037 SEGVN_RESTORE_SOFTLOCK_VP(type, pages);
4038 if (a + pgsz <= eaddr) {
4039 SEGVN_VMSTAT_FLTVNPAGES(19);
4040 err = FC_MAKE_ERR(ierr);
4041 goto out;
4042 }
4043 va.va_mask = AT_SIZE;
4044 if (VOP_GETATTR(vp, &va, 0, svd->cred, NULL)) {
4045 SEGVN_VMSTAT_FLTVNPAGES(20);
4046 err = FC_MAKE_ERR(EIO);
4047 goto out;
4048 }
4049 if (btopr(va.va_size) >= btopr(off + pgsz)) {
4050 SEGVN_VMSTAT_FLTVNPAGES(21);
4051 err = FC_MAKE_ERR(ierr);
4052 goto out;
4053 }
4054 if (btopr(va.va_size) <
4055 btopr(off + (eaddr - a))) {
4056 SEGVN_VMSTAT_FLTVNPAGES(22);
4057 err = FC_MAKE_ERR(ierr);
4058 goto out;
4059 }
4060 if (brkcow || tron || type == F_SOFTLOCK) {
4061 /* can't reduce map area */
4062 SEGVN_VMSTAT_FLTVNPAGES(23);
4063 vop_size_err = 1;
4064 goto out;
4065 }
4066 SEGVN_VMSTAT_FLTVNPAGES(24);
4067 ASSERT(szc != 0);
4068 pszc = 0;
4069 ierr = -1;
4070 break;
4071 }
4072
4073 if (amp != NULL) {
4074 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
4075 anon_array_enter(amp, aindx, &an_cookie);
4076 }
4077 if (amp != NULL &&
4078 anon_get_ptr(amp->ahp, aindx) != NULL) {
4079 ulong_t taindx = P2ALIGN(aindx, maxpages);
4080
4081 SEGVN_VMSTAT_FLTVNPAGES(25);
4082 ASSERT(anon_pages(amp->ahp, taindx,
4083 maxpages) == maxpages);
4084 for (i = 0; i < pages; i++) {
4085 page_unlock(ppa[i]);
4086 }
4087 anon_array_exit(&an_cookie);
4088 ANON_LOCK_EXIT(&amp->a_rwlock);
4089 if (pplist != NULL) {
4090 page_free_replacement_page(pplist);
4091 page_create_putback(pages);
4092 }
4093 SEGVN_RESTORE_SOFTLOCK_VP(type, pages);
4094 if (szc < seg->s_szc) {
4095 SEGVN_VMSTAT_FLTVNPAGES(26);
4096 /*
4097 * For private segments SOFTLOCK
4098 * either always breaks cow (any rw
4099 * type except S_READ_NOCOW) or the
4100 * address space is locked as writer
4101 * (S_READ_NOCOW case) and anon slots
4102 * can't show up on a second check.
4103 * Therefore if we are here for the
4104 * SOFTLOCK case it must be a cow
4105 * break, but a cow break never reduces
4106 * szc. Text replication (tron) in
4107 * this case works as a cow break.
4108 * Thus the assert below.
4109 */
4110 ASSERT(!brkcow && !tron &&
4111 type != F_SOFTLOCK);
4112 pszc = seg->s_szc;
4113 ierr = -2;
4114 break;
4115 }
4116 ASSERT(IS_P2ALIGNED(a, maxpgsz));
4117 goto again;
4118 }
4119 #ifdef DEBUG
4120 if (amp != NULL) {
4121 ulong_t taindx = P2ALIGN(aindx, maxpages);
4122 ASSERT(!anon_pages(amp->ahp, taindx, maxpages));
4123 }
4124 #endif /* DEBUG */
4125
4126 if (brkcow || tron) {
4127 ASSERT(amp != NULL);
4128 ASSERT(pplist == NULL);
4129 ASSERT(szc == seg->s_szc);
4130 ASSERT(IS_P2ALIGNED(a, maxpgsz));
4131 ASSERT(IS_P2ALIGNED(aindx, maxpages));
4132 SEGVN_VMSTAT_FLTVNPAGES(27);
4133 ierr = anon_map_privatepages(amp, aindx, szc,
4134 seg, a, prot, ppa, vpage, segvn_anypgsz,
4135 tron ? PG_LOCAL : 0, svd->cred);
4136 if (ierr != 0) {
4137 SEGVN_VMSTAT_FLTVNPAGES(28);
4138 anon_array_exit(&an_cookie);
4139 ANON_LOCK_EXIT(&amp->a_rwlock);
4140 SEGVN_RESTORE_SOFTLOCK_VP(type, pages);
4141 err = FC_MAKE_ERR(ierr);
4142 goto out;
4143 }
4144
4145 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode));
4146 /*
4147 * p_szc can't be changed for locked
4148 * swapfs pages.
4149 */
4150 ASSERT(svd->rcookie ==
4151 HAT_INVALID_REGION_COOKIE);
4152 hat_memload_array(hat, a, pgsz, ppa, prot,
4153 hat_flag);
4154
4155 if (!(hat_flag & HAT_LOAD_LOCK)) {
4156 SEGVN_VMSTAT_FLTVNPAGES(29);
4157 for (i = 0; i < pages; i++) {
4158 page_unlock(ppa[i]);
4159 }
4160 }
4161 anon_array_exit(&an_cookie);
4162 ANON_LOCK_EXIT(&amp->a_rwlock);
4163 goto next;
4164 }
4165
4166 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE ||
4167 (!svd->pageprot && svd->prot == (prot & vpprot)));
4168
4169 pfn = page_pptonum(ppa[0]);
4170 /*
4171 * hat_page_demote() needs an SE_EXCL lock on one of
4172 * the constituent page_t's and it decreases the root's
4173 * p_szc last. This means that if the root's p_szc is
4174 * equal to szc and all its constituent pages are locked,
4175 * any hat_page_demote() that could have changed p_szc
4176 * to szc has already completed and no new
4177 * hat_page_demote() can start for this large page.
4178 */
4179
4180 /*
4181 * We need to make sure the same mapping size is used
4182 * for the same address range if there's a possibility
4183 * the address is already mapped, because the hat layer
4184 * panics when a translation is loaded for a range
4185 * already mapped with a different page size. We achieve
4186 * this by always using the largest page size possible
4187 * subject to the constraints of page size, segment page
4188 * size and page alignment. Since mappings are
4189 * invalidated whenever those constraints change, making
4190 * it impossible to reuse a previously used mapping
4191 * size, no mapping size conflicts should happen.
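 *
 * Illustrative example (sizes hypothetical, e.g. 4K/2M on x86):
 * if seg->s_szc is 1 (2M) and one thread maps a 2M-aligned range
 * with a 2M translation, a later thread that can only get 4K
 * pages for the same range will first see the 2M mapping
 * invalidated before its 4K loads, so the hat never holds two
 * differently sized translations for one va range.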
4192 */
4193
4194 chkszc:
4195 if ((pszc = ppa[0]->p_szc) == szc &&
4196 IS_P2ALIGNED(pfn, pages)) {
4197
4198 SEGVN_VMSTAT_FLTVNPAGES(30);
4199 #ifdef DEBUG
4200 for (i = 0; i < pages; i++) {
4201 ASSERT(PAGE_LOCKED(ppa[i]));
4202 ASSERT(!PP_ISFREE(ppa[i]));
4203 ASSERT(page_pptonum(ppa[i]) ==
4204 pfn + i);
4205 ASSERT(ppa[i]->p_szc == szc);
4206 ASSERT(ppa[i]->p_vnode == vp);
4207 ASSERT(ppa[i]->p_offset ==
4208 off + (i << PAGESHIFT));
4209 }
4210 #endif /* DEBUG */
4211 /*
4212 * All pages are of the szc we need and they are
4213 * all locked so they can't change szc. Load
4214 * the translations.
4215 *
4216 * If the page got promoted since the last check
4217 * we don't need pplist.
4218 */
4219 if (pplist != NULL) {
4220 page_free_replacement_page(pplist);
4221 page_create_putback(pages);
4222 }
4223 if (PP_ISMIGRATE(ppa[0])) {
4224 page_migrate(seg, a, ppa, pages);
4225 }
4226 SEGVN_UPDATE_MODBITS(ppa, pages, rw,
4227 prot, vpprot);
4228 if (!xhat) {
4229 hat_memload_array_region(hat, a, pgsz,
4230 ppa, prot & vpprot, hat_flag,
4231 svd->rcookie);
4232 } else {
4233 /*
4234 * Avoid large xhat mappings to FS
4235 * pages so that hat_page_demote()
4236 * doesn't need to check for xhat
4237 * large mappings.
4238 * Don't use regions with xhats.
4239 */
4240 for (i = 0; i < pages; i++) {
4241 hat_memload(hat,
4242 a + (i << PAGESHIFT),
4243 ppa[i], prot & vpprot,
4244 hat_flag);
4245 }
4246 }
4247
4248 if (!(hat_flag & HAT_LOAD_LOCK)) {
4249 for (i = 0; i < pages; i++) {
4250 page_unlock(ppa[i]);
4251 }
4252 }
4253 if (amp != NULL) {
4254 anon_array_exit(&an_cookie);
4255 ANON_LOCK_EXIT(&amp->a_rwlock);
4256 }
4257 goto next;
4258 }
4259
4260 /*
4261 * See if upsize is possible.
4262 */
4263 if (pszc > szc && szc < seg->s_szc &&
4264 (segvn_anypgsz_vnode || pszc >= seg->s_szc)) {
4265 pgcnt_t aphase;
4266 uint_t pszc1 = MIN(pszc, seg->s_szc);
4267 ppgsz = page_get_pagesize(pszc1);
4268 ppages = btop(ppgsz);
4269 aphase = btop(P2PHASE((uintptr_t)a, ppgsz));
4270
4271 ASSERT(type != F_SOFTLOCK);
4272
4273 SEGVN_VMSTAT_FLTVNPAGES(31);
4274 if (aphase != P2PHASE(pfn, ppages)) {
4275 segvn_faultvnmpss_align_err4++;
4276 } else {
4277 SEGVN_VMSTAT_FLTVNPAGES(32);
4278 if (pplist != NULL) {
4279 page_t *pl = pplist;
4280 page_free_replacement_page(pl);
4281 page_create_putback(pages);
4282 }
4283 for (i = 0; i < pages; i++) {
4284 page_unlock(ppa[i]);
4285 }
4286 if (amp != NULL) {
4287 anon_array_exit(&an_cookie);
4288 ANON_LOCK_EXIT(&amp->a_rwlock);
4289 }
4290 pszc = pszc1;
4291 ierr = -2;
4292 break;
4293 }
4294 }
4295
4296 /*
4297 * Check if we should use the smallest mapping size.
4298 */
4299 upgrdfail = 0;
4300 if (szc == 0 || xhat ||
4301 (pszc >= szc &&
4302 !IS_P2ALIGNED(pfn, pages)) ||
4303 (pszc < szc &&
4304 !segvn_full_szcpages(ppa, szc, &upgrdfail,
4305 &pszc))) {
4306
4307 if (upgrdfail && type != F_SOFTLOCK) {
4308 /*
4309 * segvn_full_szcpages failed to lock
4310 * all pages EXCL. Size down.
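 * (For example, another thread may hold one constituent
 * page SE_SHARED, so the SE_EXCL upgrade needed to promote
 * p_szc cannot proceed; for non-SOFTLOCK faults we just
 * retry the loop at a smaller szc.)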
4311 */
4312 ASSERT(pszc < szc);
4313
4314 SEGVN_VMSTAT_FLTVNPAGES(33);
4315
4316 if (pplist != NULL) {
4317 page_t *pl = pplist;
4318 page_free_replacement_page(pl);
4319 page_create_putback(pages);
4320 }
4321
4322 for (i = 0; i < pages; i++) {
4323 page_unlock(ppa[i]);
4324 }
4325 if (amp != NULL) {
4326 anon_array_exit(&an_cookie);
4327 ANON_LOCK_EXIT(&amp->a_rwlock);
4328 }
4329 ierr = -1;
4330 break;
4331 }
4332 if (szc != 0 && !xhat && !upgrdfail) {
4333 segvn_faultvnmpss_align_err5++;
4334 }
4335 SEGVN_VMSTAT_FLTVNPAGES(34);
4336 if (pplist != NULL) {
4337 page_free_replacement_page(pplist);
4338 page_create_putback(pages);
4339 }
4340 SEGVN_UPDATE_MODBITS(ppa, pages, rw,
4341 prot, vpprot);
4342 if (upgrdfail && segvn_anypgsz_vnode) {
4343 /* SOFTLOCK case */
4344 hat_memload_array_region(hat, a, pgsz,
4345 ppa, prot & vpprot, hat_flag,
4346 svd->rcookie);
4347 } else {
4348 for (i = 0; i < pages; i++) {
4349 hat_memload_region(hat,
4350 a + (i << PAGESHIFT),
4351 ppa[i], prot & vpprot,
4352 hat_flag, svd->rcookie);
4353 }
4354 }
4355 if (!(hat_flag & HAT_LOAD_LOCK)) {
4356 for (i = 0; i < pages; i++) {
4357 page_unlock(ppa[i]);
4358 }
4359 }
4360 if (amp != NULL) {
4361 anon_array_exit(&an_cookie);
4362 ANON_LOCK_EXIT(&amp->a_rwlock);
4363 }
4364 goto next;
4365 }
4366
4367 if (pszc == szc) {
4368 /*
4369 * segvn_full_szcpages() upgraded the pages' szc.
4370 */
4371 ASSERT(pszc == ppa[0]->p_szc);
4372 ASSERT(IS_P2ALIGNED(pfn, pages));
4373 goto chkszc;
4374 }
4375
4376 if (pszc > szc) {
4377 kmutex_t *szcmtx;
4378 SEGVN_VMSTAT_FLTVNPAGES(35);
4379 /*
4380 * p_szc of ppa[0] can change since we haven't
4381 * locked all constituent pages. Call
4382 * page_szc_lock() to prevent szc changes.
4383 * This should be a rare case that happens when
4384 * multiple segments use a different page size
4385 * to map the same file offsets.
4386 */
4387 szcmtx = page_szc_lock(ppa[0]);
4388 pszc = ppa[0]->p_szc;
4389 ASSERT(szcmtx != NULL || pszc == 0);
4390 ASSERT(ppa[0]->p_szc <= pszc);
4391 if (pszc <= szc) {
4392 SEGVN_VMSTAT_FLTVNPAGES(36);
4393 if (szcmtx != NULL) {
4394 mutex_exit(szcmtx);
4395 }
4396 goto chkszc;
4397 }
4398 if (pplist != NULL) {
4399 /*
4400 * The page got promoted since the last
4401 * check. We don't need the preallocated
4402 * large page.
4403 */
4404 SEGVN_VMSTAT_FLTVNPAGES(37);
4405 page_free_replacement_page(pplist);
4406 page_create_putback(pages);
4407 }
4408 SEGVN_UPDATE_MODBITS(ppa, pages, rw,
4409 prot, vpprot);
4410 hat_memload_array_region(hat, a, pgsz, ppa,
4411 prot & vpprot, hat_flag, svd->rcookie);
4412 mutex_exit(szcmtx);
4413 if (!(hat_flag & HAT_LOAD_LOCK)) {
4414 for (i = 0; i < pages; i++) {
4415 page_unlock(ppa[i]);
4416 }
4417 }
4418 if (amp != NULL) {
4419 anon_array_exit(&an_cookie);
4420 ANON_LOCK_EXIT(&amp->a_rwlock);
4421 }
4422 goto next;
4423 }
4424
4425 /*
4426 * If the page got demoted since the last check
4427 * we may not have allocated a large page earlier.
4428 * Allocate one now.
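 * (Most likely the earlier page_exists_physcontig() check
 * saw an existing large page, so no replacement list was
 * preallocated; after the demotion we need one to relocate
 * the constituent pages into.)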
4429 */
4430 if (pplist == NULL &&
4431 page_alloc_pages(vp, seg, a, &pplist, NULL,
4432 szc, 0, 0) && type != F_SOFTLOCK) {
4433 SEGVN_VMSTAT_FLTVNPAGES(38);
4434 for (i = 0; i < pages; i++) {
4435 page_unlock(ppa[i]);
4436 }
4437 if (amp != NULL) {
4438 anon_array_exit(&an_cookie);
4439 ANON_LOCK_EXIT(&amp->a_rwlock);
4440 }
4441 ierr = -1;
4442 alloc_failed |= (1 << szc);
4443 break;
4444 }
4445
4446 SEGVN_VMSTAT_FLTVNPAGES(39);
4447
4448 if (pplist != NULL) {
4449 segvn_relocate_pages(ppa, pplist);
4450 #ifdef DEBUG
4451 } else {
4452 ASSERT(type == F_SOFTLOCK);
4453 SEGVN_VMSTAT_FLTVNPAGES(40);
4454 #endif /* DEBUG */
4455 }
4456
4457 SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot);
4458
4459 if (pplist == NULL && segvn_anypgsz_vnode == 0) {
4460 ASSERT(type == F_SOFTLOCK);
4461 for (i = 0; i < pages; i++) {
4462 ASSERT(ppa[i]->p_szc < szc);
4463 hat_memload_region(hat,
4464 a + (i << PAGESHIFT),
4465 ppa[i], prot & vpprot, hat_flag,
4466 svd->rcookie);
4467 }
4468 } else {
4469 ASSERT(pplist != NULL || type == F_SOFTLOCK);
4470 hat_memload_array_region(hat, a, pgsz, ppa,
4471 prot & vpprot, hat_flag, svd->rcookie);
4472 }
4473 if (!(hat_flag & HAT_LOAD_LOCK)) {
4474 for (i = 0; i < pages; i++) {
4475 ASSERT(PAGE_SHARED(ppa[i]));
4476 page_unlock(ppa[i]);
4477 }
4478 }
4479 if (amp != NULL) {
4480 anon_array_exit(&an_cookie);
4481 ANON_LOCK_EXIT(&amp->a_rwlock);
4482 }
4483
4484 next:
4485 if (vpage != NULL) {
4486 vpage += pages;
4487 }
4488 adjszc_chk = 1;
4489 }
4490 if (a == lpgeaddr)
4491 break;
4492 ASSERT(a < lpgeaddr);
4493
4494 ASSERT(!brkcow && !tron && type != F_SOFTLOCK);
4495
4496 /*
4497 * ierr == -1 means we failed to map with a large page
4498 * (either due to allocation/relocation failures or
4499 * misalignment with other mappings to this file).
4500 *
4501 * ierr == -2 means some other thread allocated a large page
4502 * after we gave up trying to map with a large page. Retry
4503 * with a larger mapping.
4504 */
4505 ASSERT(ierr == -1 || ierr == -2);
4506 ASSERT(ierr == -2 || szc != 0);
4507 ASSERT(ierr == -1 || szc < seg->s_szc);
4508 if (ierr == -2) {
4509 SEGVN_VMSTAT_FLTVNPAGES(41);
4510 ASSERT(pszc > szc && pszc <= seg->s_szc);
4511 szc = pszc;
4512 } else if (segvn_anypgsz_vnode) {
4513 SEGVN_VMSTAT_FLTVNPAGES(42);
4514 szc--;
4515 } else {
4516 SEGVN_VMSTAT_FLTVNPAGES(43);
4517 ASSERT(pszc < szc);
4518 /*
4519 * Another process created a pszc large page,
4520 * but we still have to drop to szc 0.
4521 */
4522 szc = 0;
4523 }
4524
4525 pgsz = page_get_pagesize(szc);
4526 pages = btop(pgsz);
4527 if (ierr == -2) {
4528 /*
4529 * Size up case. Note lpgaddr may only be needed for
4530 * the softlock case so we don't adjust it here.
4531 */
4532 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz);
4533 ASSERT(a >= lpgaddr);
4534 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
4535 off = svd->offset + (uintptr_t)(a - seg->s_base);
4536 aindx = svd->anon_index + seg_page(seg, a);
4537 vpage = (svd->vpage != NULL) ?
4538 &svd->vpage[seg_page(seg, a)] : NULL;
4539 } else {
4540 /*
4541 * Size down case. Note lpgaddr may only be needed for
4542 * the softlock case so we don't adjust it here.
4543 */
4544 ASSERT(IS_P2ALIGNED(a, pgsz));
4545 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz));
4546 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
4547 ASSERT(a < lpgeaddr);
4548 if (a < addr) {
4549 SEGVN_VMSTAT_FLTVNPAGES(44);
4550 /*
4551 * The beginning of the large page region can
4552 * be pulled to the right to make a smaller
4553 * region. We haven't yet faulted a single
4554 * page.
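 *
 * Illustrative arithmetic (hypothetical addresses, 512K to
 * 64K size down): lpgaddr = 0x400000, addr = 0x46a000; with
 * pgsz now 0x10000, a = P2ALIGN(0x46a000, 0x10000) =
 * 0x460000, so the untouched 0x400000 to 0x460000 prefix is
 * simply dropped from the loop instead of being refaulted.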
4555 */ 4556 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4557 ASSERT(a >= lpgaddr); 4558 off = svd->offset + 4559 (uintptr_t)(a - seg->s_base); 4560 aindx = svd->anon_index + seg_page(seg, a); 4561 vpage = (svd->vpage != NULL) ? 4562 &svd->vpage[seg_page(seg, a)] : NULL; 4563 } 4564 } 4565 } 4566 out: 4567 kmem_free(ppa, ppasize); 4568 if (!err && !vop_size_err) { 4569 SEGVN_VMSTAT_FLTVNPAGES(45); 4570 return (0); 4571 } 4572 if (type == F_SOFTLOCK && a > lpgaddr) { 4573 SEGVN_VMSTAT_FLTVNPAGES(46); 4574 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4575 } 4576 if (!vop_size_err) { 4577 SEGVN_VMSTAT_FLTVNPAGES(47); 4578 return (err); 4579 } 4580 ASSERT(brkcow || tron || type == F_SOFTLOCK); 4581 /* 4582 * Large page end is mapped beyond the end of file and it's a cow 4583 * fault (can be a text replication induced cow) or softlock so we can't 4584 * reduce the map area. For now just demote the segment. This should 4585 * really only happen if the end of the file changed after the mapping 4586 * was established since when large page segments are created we make 4587 * sure they don't extend beyond the end of the file. 4588 */ 4589 SEGVN_VMSTAT_FLTVNPAGES(48); 4590 4591 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4592 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4593 err = 0; 4594 if (seg->s_szc != 0) { 4595 segvn_fltvnpages_clrszc_cnt++; 4596 ASSERT(svd->softlockcnt == 0); 4597 err = segvn_clrszc(seg); 4598 if (err != 0) { 4599 segvn_fltvnpages_clrszc_err++; 4600 } 4601 } 4602 ASSERT(err || seg->s_szc == 0); 4603 SEGVN_LOCK_DOWNGRADE(seg->s_as, &svd->lock); 4604 /* segvn_fault will do its job as if szc had been zero to begin with */ 4605 return (err == 0 ? IE_RETRY : FC_MAKE_ERR(err)); 4606 } 4607 4608 /* 4609 * This routine will attempt to fault in one large page. 4610 * it will use smaller pages if that fails. 4611 * It should only be called for pure anonymous segments. 4612 */ 4613 static faultcode_t 4614 segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 4615 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 4616 caddr_t eaddr, int brkcow) 4617 { 4618 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4619 struct anon_map *amp = svd->amp; 4620 uchar_t segtype = svd->type; 4621 uint_t szc = seg->s_szc; 4622 size_t pgsz = page_get_pagesize(szc); 4623 size_t maxpgsz = pgsz; 4624 pgcnt_t pages = btop(pgsz); 4625 uint_t ppaszc = szc; 4626 caddr_t a = lpgaddr; 4627 ulong_t aindx = svd->anon_index + seg_page(seg, a); 4628 struct vpage *vpage = (svd->vpage != NULL) ? 4629 &svd->vpage[seg_page(seg, a)] : NULL; 4630 page_t **ppa; 4631 uint_t ppa_szc; 4632 faultcode_t err; 4633 int ierr; 4634 uint_t protchk, prot, vpprot; 4635 ulong_t i; 4636 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 4637 anon_sync_obj_t cookie; 4638 int adjszc_chk; 4639 int pgflags = (svd->tr_state == SEGVN_TR_ON) ? 
PG_LOCAL : 0; 4640 4641 ASSERT(szc != 0); 4642 ASSERT(amp != NULL); 4643 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 4644 ASSERT(!(svd->flags & MAP_NORESERVE)); 4645 ASSERT(type != F_SOFTUNLOCK); 4646 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 4647 ASSERT(!brkcow || svd->tr_state == SEGVN_TR_OFF); 4648 ASSERT(svd->tr_state != SEGVN_TR_INIT); 4649 4650 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4651 4652 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]); 4653 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]); 4654 4655 if (svd->flags & MAP_TEXT) { 4656 hat_flag |= HAT_LOAD_TEXT; 4657 } 4658 4659 if (svd->pageprot) { 4660 switch (rw) { 4661 case S_READ: 4662 protchk = PROT_READ; 4663 break; 4664 case S_WRITE: 4665 protchk = PROT_WRITE; 4666 break; 4667 case S_EXEC: 4668 protchk = PROT_EXEC; 4669 break; 4670 case S_OTHER: 4671 default: 4672 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4673 break; 4674 } 4675 VM_STAT_ADD(segvnvmstats.fltanpages[2]); 4676 } else { 4677 prot = svd->prot; 4678 /* caller has already done segment level protection check. */ 4679 } 4680 4681 ppa = kmem_cache_alloc(segvn_szc_cache[ppaszc], KM_SLEEP); 4682 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4683 for (;;) { 4684 adjszc_chk = 0; 4685 for (; a < lpgeaddr; a += pgsz, aindx += pages) { 4686 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 4687 VM_STAT_ADD(segvnvmstats.fltanpages[3]); 4688 ASSERT(vpage != NULL); 4689 prot = VPP_PROT(vpage); 4690 ASSERT(sameprot(seg, a, maxpgsz)); 4691 if ((prot & protchk) == 0) { 4692 err = FC_PROT; 4693 goto error; 4694 } 4695 } 4696 if (adjszc_chk && IS_P2ALIGNED(a, maxpgsz) && 4697 pgsz < maxpgsz) { 4698 ASSERT(a > lpgaddr); 4699 szc = seg->s_szc; 4700 pgsz = maxpgsz; 4701 pages = btop(pgsz); 4702 ASSERT(IS_P2ALIGNED(aindx, pages)); 4703 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, 4704 pgsz); 4705 } 4706 if (type == F_SOFTLOCK) { 4707 atomic_add_long((ulong_t *)&svd->softlockcnt, 4708 pages); 4709 } 4710 anon_array_enter(amp, aindx, &cookie); 4711 ppa_szc = (uint_t)-1; 4712 ierr = anon_map_getpages(amp, aindx, szc, seg, a, 4713 prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow, 4714 segvn_anypgsz, pgflags, svd->cred); 4715 if (ierr != 0) { 4716 anon_array_exit(&cookie); 4717 VM_STAT_ADD(segvnvmstats.fltanpages[4]); 4718 if (type == F_SOFTLOCK) { 4719 atomic_add_long( 4720 (ulong_t *)&svd->softlockcnt, 4721 -pages); 4722 } 4723 if (ierr > 0) { 4724 VM_STAT_ADD(segvnvmstats.fltanpages[6]); 4725 err = FC_MAKE_ERR(ierr); 4726 goto error; 4727 } 4728 break; 4729 } 4730 4731 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 4732 4733 ASSERT(segtype == MAP_SHARED || 4734 ppa[0]->p_szc <= szc); 4735 ASSERT(segtype == MAP_PRIVATE || 4736 ppa[0]->p_szc >= szc); 4737 4738 /* 4739 * Handle pages that have been marked for migration 4740 */ 4741 if (lgrp_optimizations()) 4742 page_migrate(seg, a, ppa, pages); 4743 4744 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 4745 4746 if (segtype == MAP_SHARED) { 4747 vpprot |= PROT_WRITE; 4748 } 4749 4750 hat_memload_array(hat, a, pgsz, ppa, 4751 prot & vpprot, hat_flag); 4752 4753 if (hat_flag & HAT_LOAD_LOCK) { 4754 VM_STAT_ADD(segvnvmstats.fltanpages[7]); 4755 } else { 4756 VM_STAT_ADD(segvnvmstats.fltanpages[8]); 4757 for (i = 0; i < pages; i++) 4758 page_unlock(ppa[i]); 4759 } 4760 if (vpage != NULL) 4761 vpage += pages; 4762 4763 anon_array_exit(&cookie); 4764 adjszc_chk = 1; 4765 } 4766 if (a == lpgeaddr) 4767 break; 4768 ASSERT(a < lpgeaddr); 4769 /* 4770 * ierr == -1 means we failed to 
allocate a large page.
4771 * So do a size down operation.
4772 *
4773 * ierr == -2 means some other process that privately shares
4774 * pages with this process has allocated a larger page and we
4775 * need to retry with larger pages. So do a size up
4776 * operation. This relies on the fact that large pages are
4777 * never partially shared, i.e. if we share any constituent
4778 * page of a large page with another process we must share the
4779 * entire large page. Note this cannot happen for the SOFTLOCK
4780 * case, unless the current address (a) is at the beginning of
4781 * the next page size boundary, because the other process
4782 * couldn't have relocated locked pages.
4783 */
4784 ASSERT(ierr == -1 || ierr == -2);
4785
4786 if (segvn_anypgsz) {
4787 ASSERT(ierr == -2 || szc != 0);
4788 ASSERT(ierr == -1 || szc < seg->s_szc);
4789 szc = (ierr == -1) ? szc - 1 : szc + 1;
4790 } else {
4791 /*
4792 * For non COW faults and segvn_anypgsz == 0
4793 * we need to be careful not to loop forever
4794 * if an existing page is found with szc other
4795 * than 0 or seg->s_szc. This could be due
4796 * to page relocations on behalf of DR or,
4797 * more likely, large page creation. For this
4798 * case simply re-size to the existing page's
4799 * szc if it was returned by anon_map_getpages().
4800 */
4801 if (ppa_szc == (uint_t)-1) {
4802 szc = (ierr == -1) ? 0 : seg->s_szc;
4803 } else {
4804 ASSERT(ppa_szc <= seg->s_szc);
4805 ASSERT(ierr == -2 || ppa_szc < szc);
4806 ASSERT(ierr == -1 || ppa_szc > szc);
4807 szc = ppa_szc;
4808 }
4809 }
4810
4811 pgsz = page_get_pagesize(szc);
4812 pages = btop(pgsz);
4813 ASSERT(type != F_SOFTLOCK || ierr == -1 ||
4814 (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz)));
4815 if (type == F_SOFTLOCK) {
4816 /*
4817 * For softlocks we cannot reduce the fault area
4818 * (calculated based on the largest page size for this
4819 * segment) on a size down, and a is already aligned
4820 * to the next page size, as asserted above, on a
4821 * size up. Therefore just continue in the softlock
4822 * case.
4823 */
4824 VM_STAT_ADD(segvnvmstats.fltanpages[9]);
4825 continue; /* keep lint happy */
4826 } else if (ierr == -2) {
4827
4828 /*
4829 * Size up case. Note lpgaddr may only be needed for
4830 * the softlock case so we don't adjust it here.
4831 */
4832 VM_STAT_ADD(segvnvmstats.fltanpages[10]);
4833 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz);
4834 ASSERT(a >= lpgaddr);
4835 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
4836 aindx = svd->anon_index + seg_page(seg, a);
4837 vpage = (svd->vpage != NULL) ?
4838 &svd->vpage[seg_page(seg, a)] : NULL;
4839 } else {
4840 /*
4841 * Size down case. Note lpgaddr may only be needed for
4842 * the softlock case so we don't adjust it here.
4843 */
4844 VM_STAT_ADD(segvnvmstats.fltanpages[11]);
4845 ASSERT(IS_P2ALIGNED(a, pgsz));
4846 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz));
4847 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
4848 ASSERT(a < lpgeaddr);
4849 if (a < addr) {
4850 /*
4851 * The beginning of the large page region can
4852 * be pulled to the right to make a smaller
4853 * region. We haven't yet faulted a single
4854 * page.
4855 */
4856 VM_STAT_ADD(segvnvmstats.fltanpages[12]);
4857 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz);
4858 ASSERT(a >= lpgaddr);
4859 aindx = svd->anon_index + seg_page(seg, a);
4860 vpage = (svd->vpage != NULL) ?
&svd->vpage[seg_page(seg, a)] : NULL;
4861 }
4862 }
4863 }
4864 VM_STAT_ADD(segvnvmstats.fltanpages[13]);
4865 ANON_LOCK_EXIT(&amp->a_rwlock);
4866 kmem_cache_free(segvn_szc_cache[ppaszc], ppa);
4867 return (0);
4868 error:
4869 VM_STAT_ADD(segvnvmstats.fltanpages[14]);
4870 ANON_LOCK_EXIT(&amp->a_rwlock);
4871 kmem_cache_free(segvn_szc_cache[ppaszc], ppa);
4872 if (type == F_SOFTLOCK && a > lpgaddr) {
4873 VM_STAT_ADD(segvnvmstats.fltanpages[15]);
4874 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER);
4875 }
4876 return (err);
4877 }
4878
4879 int fltadvice = 1; /* set to free behind pages for sequential access */
4880
4881 /*
4882 * This routine is called via a machine specific fault handling routine.
4883 * It is also called by software routines wishing to lock or unlock
4884 * a range of addresses.
4885 *
4886 * Here is the basic algorithm:
4887 * If unlocking
4888 * Call segvn_softunlock
4889 * Return
4890 * endif
4891 * Checking and set up work
4892 * If we will need some non-anonymous pages
4893 * Call VOP_GETPAGE over the range of non-anonymous pages
4894 * endif
4895 * Loop over all addresses requested
4896 * Call segvn_faultpage passing in page list
4897 * to load up translations and handle anonymous pages
4898 * endloop
4899 * Load up translation to any additional pages in page list not
4900 * already handled that fit into this segment
4901 */
4902 static faultcode_t
4903 segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
4904 enum fault_type type, enum seg_rw rw)
4905 {
4906 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
4907 page_t **plp, **ppp, *pp;
4908 u_offset_t off;
4909 caddr_t a;
4910 struct vpage *vpage;
4911 uint_t vpprot, prot;
4912 int err;
4913 page_t *pl[PVN_GETPAGE_NUM + 1];
4914 size_t plsz, pl_alloc_sz;
4915 size_t page;
4916 ulong_t anon_index;
4917 struct anon_map *amp;
4918 int dogetpage = 0;
4919 caddr_t lpgaddr, lpgeaddr;
4920 size_t pgsz;
4921 anon_sync_obj_t cookie;
4922 int brkcow = BREAK_COW_SHARE(rw, type, svd->type);
4923
4924 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
4925 ASSERT(svd->amp == NULL || svd->rcookie == HAT_INVALID_REGION_COOKIE);
4926
4927 /*
4928 * First handle the easy stuff
4929 */
4930 if (type == F_SOFTUNLOCK) {
4931 if (rw == S_READ_NOCOW) {
4932 rw = S_READ;
4933 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
4934 }
4935 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
4936 pgsz = (seg->s_szc == 0) ?
PAGESIZE : 4937 page_get_pagesize(seg->s_szc); 4938 VM_STAT_COND_ADD(pgsz > PAGESIZE, segvnvmstats.fltanpages[16]); 4939 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4940 segvn_softunlock(seg, lpgaddr, lpgeaddr - lpgaddr, rw); 4941 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4942 return (0); 4943 } 4944 4945 ASSERT(svd->tr_state == SEGVN_TR_OFF || 4946 !HAT_IS_REGION_COOKIE_VALID(svd->rcookie)); 4947 if (brkcow == 0) { 4948 if (svd->tr_state == SEGVN_TR_INIT) { 4949 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4950 if (svd->tr_state == SEGVN_TR_INIT) { 4951 ASSERT(svd->vp != NULL && svd->amp == NULL); 4952 ASSERT(svd->flags & MAP_TEXT); 4953 ASSERT(svd->type == MAP_PRIVATE); 4954 segvn_textrepl(seg); 4955 ASSERT(svd->tr_state != SEGVN_TR_INIT); 4956 ASSERT(svd->tr_state != SEGVN_TR_ON || 4957 svd->amp != NULL); 4958 } 4959 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4960 } 4961 } else if (svd->tr_state != SEGVN_TR_OFF) { 4962 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4963 4964 if (rw == S_WRITE && svd->tr_state != SEGVN_TR_OFF) { 4965 ASSERT(!svd->pageprot && !(svd->prot & PROT_WRITE)); 4966 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4967 return (FC_PROT); 4968 } 4969 4970 if (svd->tr_state == SEGVN_TR_ON) { 4971 ASSERT(svd->vp != NULL && svd->amp != NULL); 4972 segvn_textunrepl(seg, 0); 4973 ASSERT(svd->amp == NULL && 4974 svd->tr_state == SEGVN_TR_OFF); 4975 } else if (svd->tr_state != SEGVN_TR_OFF) { 4976 svd->tr_state = SEGVN_TR_OFF; 4977 } 4978 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 4979 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4980 } 4981 4982 top: 4983 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4984 4985 /* 4986 * If we have the same protections for the entire segment, 4987 * insure that the access being attempted is legitimate. 4988 */ 4989 4990 if (svd->pageprot == 0) { 4991 uint_t protchk; 4992 4993 switch (rw) { 4994 case S_READ: 4995 case S_READ_NOCOW: 4996 protchk = PROT_READ; 4997 break; 4998 case S_WRITE: 4999 protchk = PROT_WRITE; 5000 break; 5001 case S_EXEC: 5002 protchk = PROT_EXEC; 5003 break; 5004 case S_OTHER: 5005 default: 5006 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 5007 break; 5008 } 5009 5010 if ((svd->prot & protchk) == 0) { 5011 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5012 return (FC_PROT); /* illegal access type */ 5013 } 5014 } 5015 5016 if (brkcow && HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 5017 /* this must be SOFTLOCK S_READ fault */ 5018 ASSERT(svd->amp == NULL); 5019 ASSERT(svd->tr_state == SEGVN_TR_OFF); 5020 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5021 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5022 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 5023 /* 5024 * this must be the first ever non S_READ_NOCOW 5025 * softlock for this segment. 5026 */ 5027 ASSERT(svd->softlockcnt == 0); 5028 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 5029 HAT_REGION_TEXT); 5030 svd->rcookie = HAT_INVALID_REGION_COOKIE; 5031 } 5032 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5033 goto top; 5034 } 5035 5036 /* 5037 * We can't allow the long term use of softlocks for vmpss segments, 5038 * because in some file truncation cases we should be able to demote 5039 * the segment, which requires that there are no softlocks. The 5040 * only case where it's ok to allow a SOFTLOCK fault against a vmpss 5041 * segment is S_READ_NOCOW, where the caller holds the address space 5042 * locked as writer and calls softunlock before dropping the as lock. 5043 * S_READ_NOCOW is used by /proc to read memory from another user. 
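 * (For example, a debugger reading a stopped process through
 * /proc: the reading thread holds the target's address space
 * lock as writer for the whole operation, so no COW can occur
 * underneath the softlocked pages.)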
5044 * 5045 * Another deadlock between SOFTLOCK and file truncation can happen 5046 * because segvn_fault_vnodepages() calls the FS one pagesize at 5047 * a time. A second VOP_GETPAGE() call by segvn_fault_vnodepages() 5048 * can cause a deadlock because the first set of page_t's remain 5049 * locked SE_SHARED. To avoid this, we demote segments on a first 5050 * SOFTLOCK if they have a length greater than the segment's 5051 * page size. 5052 * 5053 * So for now, we only avoid demoting a segment on a SOFTLOCK when 5054 * the access type is S_READ_NOCOW and the fault length is less than 5055 * or equal to the segment's page size. While this is quite restrictive, 5056 * it should be the most common case of SOFTLOCK against a vmpss 5057 * segment. 5058 * 5059 * For S_READ_NOCOW, it's safe not to do a copy on write because the 5060 * caller makes sure no COW will be caused by another thread for a 5061 * softlocked page. 5062 */ 5063 if (type == F_SOFTLOCK && svd->vp != NULL && seg->s_szc != 0) { 5064 int demote = 0; 5065 5066 if (rw != S_READ_NOCOW) { 5067 demote = 1; 5068 } 5069 if (!demote && len > PAGESIZE) { 5070 pgsz = page_get_pagesize(seg->s_szc); 5071 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, 5072 lpgeaddr); 5073 if (lpgeaddr - lpgaddr > pgsz) { 5074 demote = 1; 5075 } 5076 } 5077 5078 ASSERT(demote || AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5079 5080 if (demote) { 5081 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5082 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5083 if (seg->s_szc != 0) { 5084 segvn_vmpss_clrszc_cnt++; 5085 ASSERT(svd->softlockcnt == 0); 5086 err = segvn_clrszc(seg); 5087 if (err) { 5088 segvn_vmpss_clrszc_err++; 5089 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5090 return (FC_MAKE_ERR(err)); 5091 } 5092 } 5093 ASSERT(seg->s_szc == 0); 5094 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5095 goto top; 5096 } 5097 } 5098 5099 /* 5100 * Check to see if we need to allocate an anon_map structure. 5101 */ 5102 if (svd->amp == NULL && (svd->vp == NULL || brkcow)) { 5103 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 5104 /* 5105 * Drop the "read" lock on the segment and acquire 5106 * the "write" version since we have to allocate the 5107 * anon_map. 5108 */ 5109 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5110 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5111 5112 if (svd->amp == NULL) { 5113 svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP); 5114 svd->amp->a_szc = seg->s_szc; 5115 } 5116 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5117 5118 /* 5119 * Start all over again since segment protections 5120 * may have changed after we dropped the "read" lock. 5121 */ 5122 goto top; 5123 } 5124 5125 /* 5126 * S_READ_NOCOW vs S_READ distinction was 5127 * only needed for the code above. After 5128 * that we treat it as S_READ. 5129 */ 5130 if (rw == S_READ_NOCOW) { 5131 ASSERT(type == F_SOFTLOCK); 5132 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5133 rw = S_READ; 5134 } 5135 5136 amp = svd->amp; 5137 5138 /* 5139 * MADV_SEQUENTIAL work is ignored for large page segments. 
5140 */
5141 if (seg->s_szc != 0) {
5142 pgsz = page_get_pagesize(seg->s_szc);
5143 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
5144 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
5145 if (svd->vp == NULL) {
5146 err = segvn_fault_anonpages(hat, seg, lpgaddr,
5147 lpgeaddr, type, rw, addr, addr + len, brkcow);
5148 } else {
5149 err = segvn_fault_vnodepages(hat, seg, lpgaddr,
5150 lpgeaddr, type, rw, addr, addr + len, brkcow);
5151 if (err == IE_RETRY) {
5152 ASSERT(seg->s_szc == 0);
5153 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock));
5154 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5155 goto top;
5156 }
5157 }
5158 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5159 return (err);
5160 }
5161
5162 page = seg_page(seg, addr);
5163 if (amp != NULL) {
5164 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
5165 anon_index = svd->anon_index + page;
5166
5167 if (type == F_PROT && rw == S_READ &&
5168 svd->tr_state == SEGVN_TR_OFF &&
5169 svd->type == MAP_PRIVATE && svd->pageprot == 0) {
5170 size_t index = anon_index;
5171 struct anon *ap;
5172
5173 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5174 /*
5175 * The fast path could apply to S_WRITE also, except
5176 * that the protection fault could be caused by lazy
5177 * tlb flush when ro->rw. In this case, the pte is
5178 * RW already. But RO in the other cpu's tlb causes
5179 * the fault. Since hat_chgprot won't do anything if
5180 * pte doesn't change, we may end up faulting
5181 * indefinitely until the RO tlb entry gets replaced.
5182 */
5183 for (a = addr; a < addr + len; a += PAGESIZE, index++) {
5184 anon_array_enter(amp, index, &cookie);
5185 ap = anon_get_ptr(amp->ahp, index);
5186 anon_array_exit(&cookie);
5187 if ((ap == NULL) || (ap->an_refcnt != 1)) {
5188 ANON_LOCK_EXIT(&amp->a_rwlock);
5189 goto slow;
5190 }
5191 }
5192 hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot);
5193 ANON_LOCK_EXIT(&amp->a_rwlock);
5194 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5195 return (0);
5196 }
5197 }
5198 slow:
5199
5200 if (svd->vpage == NULL)
5201 vpage = NULL;
5202 else
5203 vpage = &svd->vpage[page];
5204
5205 off = svd->offset + (uintptr_t)(addr - seg->s_base);
5206
5207 /*
5208 * If MADV_SEQUENTIAL has been set for the particular page we
5209 * are faulting on, free behind all pages in the segment and put
5210 * them on the free list.
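 *
 * (A process typically opts in via madvise(3C), e.g.
 * madvise(base, len, MADV_SEQUENTIAL); later faults in the
 * range then trigger the free-behind walk below.)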
5211 */ 5212 5213 if ((page != 0) && fltadvice && svd->tr_state != SEGVN_TR_ON) { 5214 struct vpage *vpp; 5215 ulong_t fanon_index; 5216 size_t fpage; 5217 u_offset_t pgoff, fpgoff; 5218 struct vnode *fvp; 5219 struct anon *fap = NULL; 5220 5221 if (svd->advice == MADV_SEQUENTIAL || 5222 (svd->pageadvice && 5223 VPP_ADVICE(vpage) == MADV_SEQUENTIAL)) { 5224 pgoff = off - PAGESIZE; 5225 fpage = page - 1; 5226 if (vpage != NULL) 5227 vpp = &svd->vpage[fpage]; 5228 if (amp != NULL) 5229 fanon_index = svd->anon_index + fpage; 5230 5231 while (pgoff > svd->offset) { 5232 if (svd->advice != MADV_SEQUENTIAL && 5233 (!svd->pageadvice || (vpage && 5234 VPP_ADVICE(vpp) != MADV_SEQUENTIAL))) 5235 break; 5236 5237 /* 5238 * If this is an anon page, we must find the 5239 * correct <vp, offset> for it 5240 */ 5241 fap = NULL; 5242 if (amp != NULL) { 5243 ANON_LOCK_ENTER(&->a_rwlock, 5244 RW_READER); 5245 anon_array_enter(amp, fanon_index, 5246 &cookie); 5247 fap = anon_get_ptr(amp->ahp, 5248 fanon_index); 5249 if (fap != NULL) { 5250 swap_xlate(fap, &fvp, &fpgoff); 5251 } else { 5252 fpgoff = pgoff; 5253 fvp = svd->vp; 5254 } 5255 anon_array_exit(&cookie); 5256 ANON_LOCK_EXIT(&->a_rwlock); 5257 } else { 5258 fpgoff = pgoff; 5259 fvp = svd->vp; 5260 } 5261 if (fvp == NULL) 5262 break; /* XXX */ 5263 /* 5264 * Skip pages that are free or have an 5265 * "exclusive" lock. 5266 */ 5267 pp = page_lookup_nowait(fvp, fpgoff, SE_SHARED); 5268 if (pp == NULL) 5269 break; 5270 /* 5271 * We don't need the page_struct_lock to test 5272 * as this is only advisory; even if we 5273 * acquire it someone might race in and lock 5274 * the page after we unlock and before the 5275 * PUTPAGE, then VOP_PUTPAGE will do nothing. 5276 */ 5277 if (pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { 5278 /* 5279 * Hold the vnode before releasing 5280 * the page lock to prevent it from 5281 * being freed and re-used by some 5282 * other thread. 5283 */ 5284 VN_HOLD(fvp); 5285 page_unlock(pp); 5286 /* 5287 * We should build a page list 5288 * to kluster putpages XXX 5289 */ 5290 (void) VOP_PUTPAGE(fvp, 5291 (offset_t)fpgoff, PAGESIZE, 5292 (B_DONTNEED|B_FREE|B_ASYNC), 5293 svd->cred, NULL); 5294 VN_RELE(fvp); 5295 } else { 5296 /* 5297 * XXX - Should the loop terminate if 5298 * the page is `locked'? 5299 */ 5300 page_unlock(pp); 5301 } 5302 --vpp; 5303 --fanon_index; 5304 pgoff -= PAGESIZE; 5305 } 5306 } 5307 } 5308 5309 plp = pl; 5310 *plp = NULL; 5311 pl_alloc_sz = 0; 5312 5313 /* 5314 * See if we need to call VOP_GETPAGE for 5315 * *any* of the range being faulted on. 5316 * We can skip all of this work if there 5317 * was no original vnode. 5318 */ 5319 if (svd->vp != NULL) { 5320 u_offset_t vp_off; 5321 size_t vp_len; 5322 struct anon *ap; 5323 vnode_t *vp; 5324 5325 vp_off = off; 5326 vp_len = len; 5327 5328 if (amp == NULL) 5329 dogetpage = 1; 5330 else { 5331 /* 5332 * Only acquire reader lock to prevent amp->ahp 5333 * from being changed. 
It's ok to miss pages,
5334 * hence we don't do anon_array_enter
5335 */
5336 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5337 ap = anon_get_ptr(amp->ahp, anon_index);
5338
5339 if (len <= PAGESIZE)
5340 /* inline non_anon() */
5341 dogetpage = (ap == NULL);
5342 else
5343 dogetpage = non_anon(amp->ahp, anon_index,
5344 &vp_off, &vp_len);
5345 ANON_LOCK_EXIT(&amp->a_rwlock);
5346 }
5347
5348 if (dogetpage) {
5349 enum seg_rw arw;
5350 struct as *as = seg->s_as;
5351
5352 if (len > ptob((sizeof (pl) / sizeof (pl[0])) - 1)) {
5353 /*
5354 * Page list won't fit in local array,
5355 * allocate one of the needed size.
5356 */
5357 pl_alloc_sz =
5358 (btop(len) + 1) * sizeof (page_t *);
5359 plp = kmem_alloc(pl_alloc_sz, KM_SLEEP);
5360 plp[0] = NULL;
5361 plsz = len;
5362 } else if (rw == S_WRITE && svd->type == MAP_PRIVATE ||
5363 svd->tr_state == SEGVN_TR_ON || rw == S_OTHER ||
5364 (((size_t)(addr + PAGESIZE) <
5365 (size_t)(seg->s_base + seg->s_size)) &&
5366 hat_probe(as->a_hat, addr + PAGESIZE))) {
5367 /*
5368 * Ask VOP_GETPAGE to return the exact number
5369 * of pages if
5370 * (a) this is a COW fault, or
5371 * (b) this is a software fault, or
5372 * (c) next page is already mapped.
5373 */
5374 plsz = len;
5375 } else {
5376 /*
5377 * Ask VOP_GETPAGE to return adjacent pages
5378 * within the segment.
5379 */
5380 plsz = MIN((size_t)PVN_GETPAGE_SZ, (size_t)
5381 ((seg->s_base + seg->s_size) - addr));
5382 ASSERT((addr + plsz) <=
5383 (seg->s_base + seg->s_size));
5384 }
5385
5386 /*
5387 * Need to get some non-anonymous pages.
5388 * We need to make only one call to GETPAGE to do
5389 * this to prevent certain deadlocking conditions
5390 * when we are doing locking. In this case
5391 * non_anon() should have picked up the smallest
5392 * range which includes all the non-anonymous
5393 * pages in the requested range. We have to
5394 * be careful regarding which rw flag to pass in
5395 * because on a private mapping, the underlying
5396 * object is never allowed to be written.
5397 */
5398 if (rw == S_WRITE && svd->type == MAP_PRIVATE) {
5399 arw = S_READ;
5400 } else {
5401 arw = rw;
5402 }
5403 vp = svd->vp;
5404 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE,
5405 "segvn_getpage:seg %p addr %p vp %p",
5406 seg, addr, vp);
5407 err = VOP_GETPAGE(vp, (offset_t)vp_off, vp_len,
5408 &vpprot, plp, plsz, seg, addr + (vp_off - off), arw,
5409 svd->cred, NULL);
5410 if (err) {
5411 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5412 segvn_pagelist_rele(plp);
5413 if (pl_alloc_sz)
5414 kmem_free(plp, pl_alloc_sz);
5415 return (FC_MAKE_ERR(err));
5416 }
5417 if (svd->type == MAP_PRIVATE)
5418 vpprot &= ~PROT_WRITE;
5419 }
5420 }
5421
5422 /*
5423 * N.B. at this time the plp array has all the needed non-anon
5424 * pages in addition to (possibly) having some adjacent pages.
5425 */
5426
5427 /*
5428 * Always acquire the anon_array_lock to prevent
5429 * 2 threads from allocating separate anon slots for
5430 * the same "addr".
5431 *
5432 * If this is a copy-on-write fault and we don't already
5433 * have the anon_array_lock, acquire it to prevent the
5434 * fault routine from handling multiple copy-on-write faults
5435 * on the same "addr" in the same address space.
5436 *
5437 * Only one thread should deal with the fault since after
5438 * it is handled, the other threads can acquire a translation
5439 * to the newly created private page. This prevents two or
5440 * more threads from creating different private pages for the
5441 * same fault.
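 *
 * (For example, if two threads write-fault the same private
 * page at once, the winner of anon_array_enter() allocates the
 * anon slot and makes the private copy; the loser finds the
 * slot already filled and just maps the same page.)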
5442 *
5443 * We grab "serialization" lock here if this is a MAP_PRIVATE segment
5444 * to prevent deadlock between this thread and another thread
5445 * which has soft-locked this page and wants to acquire serial_lock.
5446 * (bug 4026339)
5447 *
5448 * The fix for bug 4026339 becomes unnecessary when using the
5449 * locking scheme with a per-amp rwlock and a global set of hash
5450 * locks, anon_array_lock. If we steal a vnode page when low
5451 * on memory and upgrade the page lock through page_rename,
5452 * then the page is PAGE_HANDLED, and nothing needs to be done
5453 * for this page after returning from segvn_faultpage.
5454 *
5455 * But really, the page lock should be downgraded after
5456 * the stolen page is page_rename'd.
5457 */
5458
5459 if (amp != NULL)
5460 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5461
5462 /*
5463 * Ok, now loop over the address range and handle faults
5464 */
5465 for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) {
5466 err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot,
5467 type, rw, brkcow);
5468 if (err) {
5469 if (amp != NULL)
5470 ANON_LOCK_EXIT(&amp->a_rwlock);
5471 if (type == F_SOFTLOCK && a > addr) {
5472 segvn_softunlock(seg, addr, (a - addr),
5473 S_OTHER);
5474 }
5475 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5476 segvn_pagelist_rele(plp);
5477 if (pl_alloc_sz)
5478 kmem_free(plp, pl_alloc_sz);
5479 return (err);
5480 }
5481 if (vpage) {
5482 vpage++;
5483 } else if (svd->vpage) {
5484 page = seg_page(seg, addr);
5485 vpage = &svd->vpage[++page];
5486 }
5487 }
5488
5489 /* Didn't get pages from the underlying fs so we're done */
5490 if (!dogetpage)
5491 goto done;
5492
5493 /*
5494 * Now handle any other pages in the list returned.
5495 * If the page can be used, load up the translations now.
5496 * Note that the for loop will only be entered if "plp"
5497 * is pointing to a non-NULL page pointer which means that
5498 * VOP_GETPAGE() was called and vpprot has been initialized.
5499 */
5500 if (svd->pageprot == 0)
5501 prot = svd->prot & vpprot;
5502
5503
5504 /*
5505 * Large Files: diff should be an unsigned value because we started
5506 * supporting > 2GB segment sizes from 2.5.1 and when a
5507 * large file of size > 2GB gets mapped into the address space
5508 * the diff value can be > 2GB.
5509 */
5510
5511 for (ppp = plp; (pp = *ppp) != NULL; ppp++) {
5512 size_t diff;
5513 struct anon *ap;
5514 int anon_index;
5515 anon_sync_obj_t cookie;
5516 int hat_flag = HAT_LOAD_ADV;
5517
5518 if (svd->flags & MAP_TEXT) {
5519 hat_flag |= HAT_LOAD_TEXT;
5520 }
5521
5522 if (pp == PAGE_HANDLED)
5523 continue;
5524
5525 if (svd->tr_state != SEGVN_TR_ON &&
5526 pp->p_offset >= svd->offset &&
5527 pp->p_offset < svd->offset + seg->s_size) {
5528
5529 diff = pp->p_offset - svd->offset;
5530
5531 /*
5532 * Large Files: Following is the assertion
5533 * validating the above cast.
5534 */
5535 ASSERT(svd->vp == pp->p_vnode);
5536
5537 page = btop(diff);
5538 if (svd->pageprot)
5539 prot = VPP_PROT(&svd->vpage[page]) & vpprot;
5540
5541 /*
5542 * Prevent other threads in the address space from
5543 * creating private pages (i.e., allocating anon slots)
5544 * while we are in the process of loading translations
5545 * to additional pages returned by the underlying
5546 * object.
5547 */
5548 if (amp != NULL) {
5549 anon_index = svd->anon_index + page;
5550 anon_array_enter(amp, anon_index, &cookie);
5551 ap = anon_get_ptr(amp->ahp, anon_index);
5552 }
5553 if ((amp == NULL) || (ap == NULL)) {
5554 if (IS_VMODSORT(pp->p_vnode) ||
5555 enable_mbit_wa) {
5556 if (rw == S_WRITE)
5557 hat_setmod(pp);
5558 else if (rw != S_OTHER &&
5559 !hat_ismod(pp))
5560 prot &= ~PROT_WRITE;
5561 }
5562 /*
5563 * Skip mapping read ahead pages marked
5564 * for migration, so they will get migrated
5565 * properly on fault.
5566 */
5567 ASSERT(amp == NULL ||
5568 svd->rcookie == HAT_INVALID_REGION_COOKIE);
5569 if ((prot & PROT_READ) && !PP_ISMIGRATE(pp)) {
5570 hat_memload_region(hat,
5571 seg->s_base + diff,
5572 pp, prot, hat_flag,
5573 svd->rcookie);
5574 }
5575 }
5576 if (amp != NULL)
5577 anon_array_exit(&cookie);
5578 }
5579 page_unlock(pp);
5580 }
5581 done:
5582 if (amp != NULL)
5583 ANON_LOCK_EXIT(&amp->a_rwlock);
5584 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5585 if (pl_alloc_sz)
5586 kmem_free(plp, pl_alloc_sz);
5587 return (0);
5588 }
5589
5590 /*
5591 * This routine is used to start I/O on pages asynchronously. XXX it will
5592 * only create PAGESIZE pages. At fault time they will be relocated into
5593 * larger pages.
5594 */
5595 static faultcode_t
5596 segvn_faulta(struct seg *seg, caddr_t addr)
5597 {
5598 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
5599 int err;
5600 struct anon_map *amp;
5601 vnode_t *vp;
5602
5603 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5604
5605 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
5606 if ((amp = svd->amp) != NULL) {
5607 struct anon *ap;
5608
5609 /*
5610 * Reader lock to prevent amp->ahp from being changed.
5611 * This is advisory; it's ok to miss a page, so
5612 * we don't take the anon_array_enter() lock.
5613 */
5614 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5615 if ((ap = anon_get_ptr(amp->ahp,
5616 svd->anon_index + seg_page(seg, addr))) != NULL) {
5617
5618 err = anon_getpage(&ap, NULL, NULL,
5619 0, seg, addr, S_READ, svd->cred);
5620
5621 ANON_LOCK_EXIT(&amp->a_rwlock);
5622 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5623 if (err)
5624 return (FC_MAKE_ERR(err));
5625 return (0);
5626 }
5627 ANON_LOCK_EXIT(&amp->a_rwlock);
5628 }
5629
5630 if (svd->vp == NULL) {
5631 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5632 return (0); /* zfod page - do nothing now */
5633 }
5634
5635 vp = svd->vp;
5636 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE,
5637 "segvn_getpage:seg %p addr %p vp %p", seg, addr, vp);
5638 err = VOP_GETPAGE(vp,
5639 (offset_t)(svd->offset + (uintptr_t)(addr - seg->s_base)),
5640 PAGESIZE, NULL, NULL, 0, seg, addr,
5641 S_OTHER, svd->cred, NULL);
5642
5643 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5644 if (err)
5645 return (FC_MAKE_ERR(err));
5646 return (0);
5647 }
5648
5649 static int
5650 segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
5651 {
5652 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
5653 struct vpage *cvp, *svp, *evp;
5654 struct vnode *vp;
5655 size_t pgsz;
5656 pgcnt_t pgcnt;
5657 anon_sync_obj_t cookie;
5658 int unload_done = 0;
5659
5660 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5661
5662 if ((svd->maxprot & prot) != prot)
5663 return (EACCES); /* violated maxprot */
5664
5665 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
5666
5667 /* return if prot is the same */
5668 if (!svd->pageprot && svd->prot == prot) {
5669 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5670 return (0);
5671 }
5672
5673 /*
5674 * Since we change protections we first have to flush the cache.
5675 * This makes sure all the pagelock calls have to recheck
5676 * protections.
5677 */
5678 if (svd->softlockcnt > 0) {
5679 ASSERT(svd->tr_state == SEGVN_TR_OFF);
5680
5681 /*
5682 * If this is a shared segment, a non-zero softlockcnt
5683 * means locked pages are still in use.
5684 */
5685 if (svd->type == MAP_SHARED) {
5686 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5687 return (EAGAIN);
5688 }
5689
5690 /*
5691 * Since we do have the segvn writers lock nobody can fill
5692 * the cache with entries belonging to this seg during
5693 * the purge. The flush either succeeds or we still have
5694 * pending I/Os.
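 * (segvn_purge() below empties this segment's pagelock cache;
 * any softlockcnt that remains after that reflects I/O still in
 * flight, in which case we bail out with EAGAIN rather than
 * changing protections underneath it.)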
5695 */ 5696 segvn_purge(seg); 5697 if (svd->softlockcnt > 0) { 5698 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5699 return (EAGAIN); 5700 } 5701 } 5702 5703 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 5704 ASSERT(svd->amp == NULL); 5705 ASSERT(svd->tr_state == SEGVN_TR_OFF); 5706 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 5707 HAT_REGION_TEXT); 5708 svd->rcookie = HAT_INVALID_REGION_COOKIE; 5709 unload_done = 1; 5710 } else if (svd->tr_state == SEGVN_TR_INIT) { 5711 svd->tr_state = SEGVN_TR_OFF; 5712 } else if (svd->tr_state == SEGVN_TR_ON) { 5713 ASSERT(svd->amp != NULL); 5714 segvn_textunrepl(seg, 0); 5715 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 5716 unload_done = 1; 5717 } 5718 5719 if ((prot & PROT_WRITE) && svd->type == MAP_SHARED && 5720 svd->vp != NULL && (svd->vp->v_flag & VVMEXEC)) { 5721 ASSERT(vn_is_mapped(svd->vp, V_WRITE)); 5722 segvn_inval_trcache(svd->vp); 5723 } 5724 if (seg->s_szc != 0) { 5725 int err; 5726 pgsz = page_get_pagesize(seg->s_szc); 5727 pgcnt = pgsz >> PAGESHIFT; 5728 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 5729 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 5730 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5731 ASSERT(seg->s_base != addr || seg->s_size != len); 5732 /* 5733 * If we are holding the as lock as a reader then 5734 * we need to return IE_RETRY and let the as 5735 * layer drop and re-acquire the lock as a writer. 5736 */ 5737 if (AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) 5738 return (IE_RETRY); 5739 VM_STAT_ADD(segvnvmstats.demoterange[1]); 5740 if (svd->type == MAP_PRIVATE || svd->vp != NULL) { 5741 err = segvn_demote_range(seg, addr, len, 5742 SDR_END, 0); 5743 } else { 5744 uint_t szcvec = map_pgszcvec(seg->s_base, 5745 pgsz, (uintptr_t)seg->s_base, 5746 (svd->flags & MAP_TEXT), MAPPGSZC_SHM, 0); 5747 err = segvn_demote_range(seg, addr, len, 5748 SDR_END, szcvec); 5749 } 5750 if (err == 0) 5751 return (IE_RETRY); 5752 if (err == ENOMEM) 5753 return (IE_NOMEM); 5754 return (err); 5755 } 5756 } 5757 5758 5759 /* 5760 * If it's a private mapping and we're making it writable then we 5761 * may have to reserve the additional swap space now. If we are 5762 * making writable only a part of the segment then we use its vpage 5763 * array to keep a record of the pages for which we have reserved 5764 * swap. In this case we set the pageswap field in the segment's 5765 * segvn structure to record this. 5766 * 5767 * If it's a private mapping to a file (i.e., vp != NULL) and we're 5768 * removing write permission on the entire segment and we haven't 5769 * modified any pages, we can release the swap space. 5770 */ 5771 if (svd->type == MAP_PRIVATE) { 5772 if (prot & PROT_WRITE) { 5773 if (!(svd->flags & MAP_NORESERVE) && 5774 !(svd->swresv && svd->pageswap == 0)) { 5775 size_t sz = 0; 5776 5777 /* 5778 * Start by determining how much swap 5779 * space is required. 5780 */ 5781 if (addr == seg->s_base && 5782 len == seg->s_size && 5783 svd->pageswap == 0) { 5784 /* The whole segment */ 5785 sz = seg->s_size; 5786 } else { 5787 /* 5788 * Make sure that the vpage array 5789 * exists, and make a note of the 5790 * range of elements corresponding 5791 * to len. 5792 */ 5793 segvn_vpage(seg); 5794 svp = &svd->vpage[seg_page(seg, addr)]; 5795 evp = &svd->vpage[seg_page(seg, 5796 addr + len)]; 5797 5798 if (svd->pageswap == 0) { 5799 /* 5800 * This is the first time we've 5801 * asked for a part of this 5802 * segment, so we need to 5803 * reserve everything we've 5804 * been asked for. 
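 * (For instance, a first PROT_WRITE mprotect() over 1M of a
 * 4M MAP_PRIVATE mapping reserves just that 1M: sz = len
 * here, and only the corresponding vpages are marked with
 * VPP_SETSWAPRES() below.)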
5805 */ 5806 sz = len; 5807 } else { 5808 /* 5809 * We have to count the number 5810 * of pages required. 5811 */ 5812 for (cvp = svp; cvp < evp; 5813 cvp++) { 5814 if (!VPP_ISSWAPRES(cvp)) 5815 sz++; 5816 } 5817 sz <<= PAGESHIFT; 5818 } 5819 } 5820 5821 /* Try to reserve the necessary swap. */ 5822 if (anon_resv_zone(sz, 5823 seg->s_as->a_proc->p_zone) == 0) { 5824 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5825 return (IE_NOMEM); 5826 } 5827 5828 /* 5829 * Make a note of how much swap space 5830 * we've reserved. 5831 */ 5832 if (svd->pageswap == 0 && sz == seg->s_size) { 5833 svd->swresv = sz; 5834 } else { 5835 ASSERT(svd->vpage != NULL); 5836 svd->swresv += sz; 5837 svd->pageswap = 1; 5838 for (cvp = svp; cvp < evp; cvp++) { 5839 if (!VPP_ISSWAPRES(cvp)) 5840 VPP_SETSWAPRES(cvp); 5841 } 5842 } 5843 } 5844 } else { 5845 /* 5846 * Swap space is released only if this segment 5847 * does not map anonymous memory, since read faults 5848 * on such segments still need an anon slot to read 5849 * in the data. 5850 */ 5851 if (svd->swresv != 0 && svd->vp != NULL && 5852 svd->amp == NULL && addr == seg->s_base && 5853 len == seg->s_size && svd->pageprot == 0) { 5854 ASSERT(svd->pageswap == 0); 5855 anon_unresv_zone(svd->swresv, 5856 seg->s_as->a_proc->p_zone); 5857 svd->swresv = 0; 5858 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5859 "anon proc:%p %lu %u", seg, 0, 0); 5860 } 5861 } 5862 } 5863 5864 if (addr == seg->s_base && len == seg->s_size && svd->vpage == NULL) { 5865 if (svd->prot == prot) { 5866 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5867 return (0); /* all done */ 5868 } 5869 svd->prot = (uchar_t)prot; 5870 } else if (svd->type == MAP_PRIVATE) { 5871 struct anon *ap = NULL; 5872 page_t *pp; 5873 u_offset_t offset, off; 5874 struct anon_map *amp; 5875 ulong_t anon_idx = 0; 5876 5877 /* 5878 * A vpage structure exists or else the change does not 5879 * involve the entire segment. Establish a vpage structure 5880 * if none is there. Then, for each page in the range, 5881 * adjust its individual permissions. Note that write- 5882 * enabling a MAP_PRIVATE page can affect the claims for 5883 * locked down memory. Overcommitting memory terminates 5884 * the operation. 5885 */ 5886 segvn_vpage(seg); 5887 svd->pageprot = 1; 5888 if ((amp = svd->amp) != NULL) { 5889 anon_idx = svd->anon_index + seg_page(seg, addr); 5890 ASSERT(seg->s_szc == 0 || 5891 IS_P2ALIGNED(anon_idx, pgcnt)); 5892 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5893 } 5894 5895 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 5896 evp = &svd->vpage[seg_page(seg, addr + len)]; 5897 5898 /* 5899 * See Statement at the beginning of segvn_lockop regarding 5900 * the way cowcnts and lckcnts are handled. 
5901 */ 5902 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5903 5904 if (seg->s_szc != 0) { 5905 if (amp != NULL) { 5906 anon_array_enter(amp, anon_idx, 5907 &cookie); 5908 } 5909 if (IS_P2ALIGNED(anon_idx, pgcnt) && 5910 !segvn_claim_pages(seg, svp, offset, 5911 anon_idx, prot)) { 5912 if (amp != NULL) { 5913 anon_array_exit(&cookie); 5914 } 5915 break; 5916 } 5917 if (amp != NULL) { 5918 anon_array_exit(&cookie); 5919 } 5920 anon_idx++; 5921 } else { 5922 if (amp != NULL) { 5923 anon_array_enter(amp, anon_idx, 5924 &cookie); 5925 ap = anon_get_ptr(amp->ahp, anon_idx++); 5926 } 5927 5928 if (VPP_ISPPLOCK(svp) && 5929 VPP_PROT(svp) != prot) { 5930 5931 if (amp == NULL || ap == NULL) { 5932 vp = svd->vp; 5933 off = offset; 5934 } else 5935 swap_xlate(ap, &vp, &off); 5936 if (amp != NULL) 5937 anon_array_exit(&cookie); 5938 5939 if ((pp = page_lookup(vp, off, 5940 SE_SHARED)) == NULL) { 5941 panic("segvn_setprot: no page"); 5942 /*NOTREACHED*/ 5943 } 5944 ASSERT(seg->s_szc == 0); 5945 if ((VPP_PROT(svp) ^ prot) & 5946 PROT_WRITE) { 5947 if (prot & PROT_WRITE) { 5948 if (!page_addclaim( 5949 pp)) { 5950 page_unlock(pp); 5951 break; 5952 } 5953 } else { 5954 if (!page_subclaim( 5955 pp)) { 5956 page_unlock(pp); 5957 break; 5958 } 5959 } 5960 } 5961 page_unlock(pp); 5962 } else if (amp != NULL) 5963 anon_array_exit(&cookie); 5964 } 5965 VPP_SETPROT(svp, prot); 5966 offset += PAGESIZE; 5967 } 5968 if (amp != NULL) 5969 ANON_LOCK_EXIT(&amp->a_rwlock); 5970 5971 /* 5972 * Did we terminate prematurely? If so, simply unload 5973 * the translations to the things we've updated so far. 5974 */ 5975 if (svp != evp) { 5976 if (unload_done) { 5977 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5978 return (IE_NOMEM); 5979 } 5980 len = (svp - &svd->vpage[seg_page(seg, addr)]) * 5981 PAGESIZE; 5982 ASSERT(seg->s_szc == 0 || IS_P2ALIGNED(len, pgsz)); 5983 if (len != 0) 5984 hat_unload(seg->s_as->a_hat, addr, 5985 len, HAT_UNLOAD); 5986 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5987 return (IE_NOMEM); 5988 } 5989 } else { 5990 segvn_vpage(seg); 5991 svd->pageprot = 1; 5992 evp = &svd->vpage[seg_page(seg, addr + len)]; 5993 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5994 VPP_SETPROT(svp, prot); 5995 } 5996 } 5997 5998 if (unload_done) { 5999 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6000 return (0); 6001 } 6002 6003 if (((prot & PROT_WRITE) != 0 && 6004 (svd->vp != NULL || svd->type == MAP_PRIVATE)) || 6005 (prot & ~PROT_USER) == PROT_NONE) { 6006 /* 6007 * Either private or shared data with write access (in 6008 * which case we need to throw out all former translations 6009 * so that we get the right translations set up on fault 6010 * and we don't allow write access to any copy-on-write pages 6011 * that might be around or to prevent write access to pages 6012 * representing holes in a file), or we don't have permission 6013 * to access the memory at all (in which case we have to 6014 * unload any current translations that might exist). 6015 */ 6016 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 6017 } else { 6018 /* 6019 * A shared mapping or a private mapping in which write 6020 * protection is going to be denied - just change all the 6021 * protections over the range of addresses in question. 6022 * segvn does not support any attributes other 6023 * than prot so we can use hat_chgattr.
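* For example, mprotect(PROT_READ) on a MAP_SHARED file mapping only narrows permissions, so existing translations can be adjusted in place; granting PROT_WRITE, by contrast, took the hat_unload() path above so that the next write fault sets up fresh translations (breaking COW where needed).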
6024 */ 6025 hat_chgattr(seg->s_as->a_hat, addr, len, prot); 6026 } 6027 6028 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6029 6030 return (0); 6031 } 6032 6033 /* 6034 * segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize, 6035 * to determine if the seg is capable of mapping the requested szc. 6036 */ 6037 static int 6038 segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) 6039 { 6040 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6041 struct segvn_data *nsvd; 6042 struct anon_map *amp = svd->amp; 6043 struct seg *nseg; 6044 caddr_t eaddr = addr + len, a; 6045 size_t pgsz = page_get_pagesize(szc); 6046 pgcnt_t pgcnt = page_get_pagecnt(szc); 6047 int err; 6048 u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base); 6049 6050 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 6051 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 6052 6053 if (seg->s_szc == szc || segvn_lpg_disable != 0) { 6054 return (0); 6055 } 6056 6057 /* 6058 * addr should always be pgsz aligned but eaddr may be misaligned if 6059 * it's at the end of the segment. 6060 * 6061 * XXX we should assert this condition since as_setpagesize() logic 6062 * guarantees it. 6063 */ 6064 if (!IS_P2ALIGNED(addr, pgsz) || 6065 (!IS_P2ALIGNED(eaddr, pgsz) && 6066 eaddr != seg->s_base + seg->s_size)) { 6067 6068 segvn_setpgsz_align_err++; 6069 return (EINVAL); 6070 } 6071 6072 if (amp != NULL && svd->type == MAP_SHARED) { 6073 ulong_t an_idx = svd->anon_index + seg_page(seg, addr); 6074 if (!IS_P2ALIGNED(an_idx, pgcnt)) { 6075 6076 segvn_setpgsz_anon_align_err++; 6077 return (EINVAL); 6078 } 6079 } 6080 6081 if ((svd->flags & MAP_NORESERVE) || seg->s_as == &kas || 6082 szc > segvn_maxpgszc) { 6083 return (EINVAL); 6084 } 6085 6086 /* paranoid check */ 6087 if (svd->vp != NULL && 6088 (IS_SWAPFSVP(svd->vp) || VN_ISKAS(svd->vp))) { 6089 return (EINVAL); 6090 } 6091 6092 if (seg->s_szc == 0 && svd->vp != NULL && 6093 map_addr_vacalign_check(addr, off)) { 6094 return (EINVAL); 6095 } 6096 6097 /* 6098 * Check that protections are the same within new page 6099 * size boundaries. 6100 */ 6101 if (svd->pageprot) { 6102 for (a = addr; a < eaddr; a += pgsz) { 6103 if ((a + pgsz) > eaddr) { 6104 if (!sameprot(seg, a, eaddr - a)) { 6105 return (EINVAL); 6106 } 6107 } else { 6108 if (!sameprot(seg, a, pgsz)) { 6109 return (EINVAL); 6110 } 6111 } 6112 } 6113 } 6114 6115 /* 6116 * Since we are changing page size we first have to flush 6117 * the cache. This makes sure all the pagelock calls have 6118 * to recheck protections. 6119 */ 6120 if (svd->softlockcnt > 0) { 6121 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6122 6123 /* 6124 * If this is shared segment non 0 softlockcnt 6125 * means locked pages are still in use. 6126 */ 6127 if (svd->type == MAP_SHARED) { 6128 return (EAGAIN); 6129 } 6130 6131 /* 6132 * Since we do have the segvn writers lock nobody can fill 6133 * the cache with entries belonging to this seg during 6134 * the purge. The flush either succeeds or we still have 6135 * pending I/Os. 
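* In the latter case the page size change fails with EAGAIN rather than blocking, and the caller may retry once the outstanding softlocks drain.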
6136 */ 6137 segvn_purge(seg); 6138 if (svd->softlockcnt > 0) { 6139 return (EAGAIN); 6140 } 6141 } 6142 6143 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 6144 ASSERT(svd->amp == NULL); 6145 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6146 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 6147 HAT_REGION_TEXT); 6148 svd->rcookie = HAT_INVALID_REGION_COOKIE; 6149 } else if (svd->tr_state == SEGVN_TR_INIT) { 6150 svd->tr_state = SEGVN_TR_OFF; 6151 } else if (svd->tr_state == SEGVN_TR_ON) { 6152 ASSERT(svd->amp != NULL); 6153 segvn_textunrepl(seg, 1); 6154 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 6155 amp = NULL; 6156 } 6157 6158 /* 6159 * Operation for sub range of existing segment. 6160 */ 6161 if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) { 6162 if (szc < seg->s_szc) { 6163 VM_STAT_ADD(segvnvmstats.demoterange[2]); 6164 err = segvn_demote_range(seg, addr, len, SDR_RANGE, 0); 6165 if (err == 0) { 6166 return (IE_RETRY); 6167 } 6168 if (err == ENOMEM) { 6169 return (IE_NOMEM); 6170 } 6171 return (err); 6172 } 6173 if (addr != seg->s_base) { 6174 nseg = segvn_split_seg(seg, addr); 6175 if (eaddr != (nseg->s_base + nseg->s_size)) { 6176 /* eaddr is szc aligned */ 6177 (void) segvn_split_seg(nseg, eaddr); 6178 } 6179 return (IE_RETRY); 6180 } 6181 if (eaddr != (seg->s_base + seg->s_size)) { 6182 /* eaddr is szc aligned */ 6183 (void) segvn_split_seg(seg, eaddr); 6184 } 6185 return (IE_RETRY); 6186 } 6187 6188 /* 6189 * Break any low level sharing and reset seg->s_szc to 0. 6190 */ 6191 if ((err = segvn_clrszc(seg)) != 0) { 6192 if (err == ENOMEM) { 6193 err = IE_NOMEM; 6194 } 6195 return (err); 6196 } 6197 ASSERT(seg->s_szc == 0); 6198 6199 /* 6200 * If the end of the current segment is not pgsz aligned 6201 * then attempt to concatenate with the next segment. 6202 */ 6203 if (!IS_P2ALIGNED(eaddr, pgsz)) { 6204 nseg = AS_SEGNEXT(seg->s_as, seg); 6205 if (nseg == NULL || nseg == seg || eaddr != nseg->s_base) { 6206 return (ENOMEM); 6207 } 6208 if (nseg->s_ops != &segvn_ops) { 6209 return (EINVAL); 6210 } 6211 nsvd = (struct segvn_data *)nseg->s_data; 6212 if (nsvd->softlockcnt > 0) { 6213 /* 6214 * If this is shared segment non 0 softlockcnt 6215 * means locked pages are still in use. 6216 */ 6217 if (nsvd->type == MAP_SHARED) { 6218 return (EAGAIN); 6219 } 6220 segvn_purge(nseg); 6221 if (nsvd->softlockcnt > 0) { 6222 return (EAGAIN); 6223 } 6224 } 6225 err = segvn_clrszc(nseg); 6226 if (err == ENOMEM) { 6227 err = IE_NOMEM; 6228 } 6229 if (err != 0) { 6230 return (err); 6231 } 6232 ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE); 6233 err = segvn_concat(seg, nseg, 1); 6234 if (err == -1) { 6235 return (EINVAL); 6236 } 6237 if (err == -2) { 6238 return (IE_NOMEM); 6239 } 6240 return (IE_RETRY); 6241 } 6242 6243 /* 6244 * May need to re-align anon array to 6245 * new szc. 
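* For example, if the new szc implies pgcnt = 8 but svd->anon_index is 5, large page regions would straddle anon chunk boundaries, so a fresh anon header is allocated and the slots are copied down to index 0, which is aligned for any page size.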
6246 */ 6247 if (amp != NULL) { 6248 if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) { 6249 struct anon_hdr *nahp; 6250 6251 ASSERT(svd->type == MAP_PRIVATE); 6252 6253 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER); 6254 ASSERT(amp->refcnt == 1); 6255 nahp = anon_create(btop(amp->size), ANON_NOSLEEP); 6256 if (nahp == NULL) { 6257 ANON_LOCK_EXIT(&amp->a_rwlock); 6258 return (IE_NOMEM); 6259 } 6260 if (anon_copy_ptr(amp->ahp, svd->anon_index, 6261 nahp, 0, btop(seg->s_size), ANON_NOSLEEP)) { 6262 anon_release(nahp, btop(amp->size)); 6263 ANON_LOCK_EXIT(&amp->a_rwlock); 6264 return (IE_NOMEM); 6265 } 6266 anon_release(amp->ahp, btop(amp->size)); 6267 amp->ahp = nahp; 6268 svd->anon_index = 0; 6269 ANON_LOCK_EXIT(&amp->a_rwlock); 6270 } 6271 } 6272 if (svd->vp != NULL && szc != 0) { 6273 struct vattr va; 6274 u_offset_t eoffpage = svd->offset; 6275 va.va_mask = AT_SIZE; 6276 eoffpage += seg->s_size; 6277 eoffpage = btopr(eoffpage); 6278 if (VOP_GETATTR(svd->vp, &va, 0, svd->cred, NULL) != 0) { 6279 segvn_setpgsz_getattr_err++; 6280 return (EINVAL); 6281 } 6282 if (btopr(va.va_size) < eoffpage) { 6283 segvn_setpgsz_eof_err++; 6284 return (EINVAL); 6285 } 6286 if (amp != NULL) { 6287 /* 6288 * anon_fill_cow_holes() may call VOP_GETPAGE(). 6289 * don't take anon map lock here to avoid holding it 6290 * across VOP_GETPAGE() calls that may call back into 6291 * segvn for klustering checks. We don't really need 6292 * anon map lock here since it's a private segment and 6293 * we hold as level lock as writers. 6294 */ 6295 if ((err = anon_fill_cow_holes(seg, seg->s_base, 6296 amp->ahp, svd->anon_index, svd->vp, svd->offset, 6297 seg->s_size, szc, svd->prot, svd->vpage, 6298 svd->cred)) != 0) { 6299 return (EINVAL); 6300 } 6301 } 6302 segvn_setvnode_mpss(svd->vp); 6303 } 6304 6305 if (amp != NULL) { 6306 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER); 6307 if (svd->type == MAP_PRIVATE) { 6308 amp->a_szc = szc; 6309 } else if (szc > amp->a_szc) { 6310 amp->a_szc = szc; 6311 } 6312 ANON_LOCK_EXIT(&amp->a_rwlock); 6313 } 6314 6315 seg->s_szc = szc; 6316 6317 return (0); 6318 } 6319 6320 static int 6321 segvn_clrszc(struct seg *seg) 6322 { 6323 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6324 struct anon_map *amp = svd->amp; 6325 size_t pgsz; 6326 pgcnt_t pages; 6327 int err = 0; 6328 caddr_t a = seg->s_base; 6329 caddr_t ea = a + seg->s_size; 6330 ulong_t an_idx = svd->anon_index; 6331 vnode_t *vp = svd->vp; 6332 struct vpage *vpage = svd->vpage; 6333 page_t *anon_pl[1 + 1], *pp; 6334 struct anon *ap, *oldap; 6335 uint_t prot = svd->prot, vpprot; 6336 int pageflag = 0; 6337 6338 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 6339 SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 6340 ASSERT(svd->softlockcnt == 0); 6341 6342 if (vp == NULL && amp == NULL) { 6343 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 6344 seg->s_szc = 0; 6345 return (0); 6346 } 6347 6348 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 6349 ASSERT(svd->amp == NULL); 6350 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6351 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 6352 HAT_REGION_TEXT); 6353 svd->rcookie = HAT_INVALID_REGION_COOKIE; 6354 } else if (svd->tr_state == SEGVN_TR_ON) { 6355 ASSERT(svd->amp != NULL); 6356 segvn_textunrepl(seg, 1); 6357 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 6358 amp = NULL; 6359 } else { 6360 if (svd->tr_state != SEGVN_TR_OFF) { 6361 ASSERT(svd->tr_state == SEGVN_TR_INIT); 6362 svd->tr_state = SEGVN_TR_OFF; 6363 } 6364 6365 /* 6366 * do HAT_UNLOAD_UNMAP since we are changing the pagesize.
6367 * unload argument is 0 when we are freeing the segment 6368 * and unload was already done. 6369 */ 6370 hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size, 6371 HAT_UNLOAD_UNMAP); 6372 } 6373 6374 if (amp == NULL || svd->type == MAP_SHARED) { 6375 seg->s_szc = 0; 6376 return (0); 6377 } 6378 6379 pgsz = page_get_pagesize(seg->s_szc); 6380 pages = btop(pgsz); 6381 6382 /* 6383 * XXX anon rwlock is not really needed because this is a 6384 * private segment and we are writers. 6385 */ 6386 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER); 6387 6388 for (; a < ea; a += pgsz, an_idx += pages) { 6389 if ((oldap = anon_get_ptr(amp->ahp, an_idx)) != NULL) { 6390 ASSERT(vpage != NULL || svd->pageprot == 0); 6391 if (vpage != NULL) { 6392 ASSERT(sameprot(seg, a, pgsz)); 6393 prot = VPP_PROT(vpage); 6394 pageflag = VPP_ISPPLOCK(vpage) ? LOCK_PAGE : 0; 6395 } 6396 if (seg->s_szc != 0) { 6397 ASSERT(vp == NULL || anon_pages(amp->ahp, 6398 an_idx, pages) == pages); 6399 if ((err = anon_map_demotepages(amp, an_idx, 6400 seg, a, prot, vpage, svd->cred)) != 0) { 6401 goto out; 6402 } 6403 } else { 6404 if (oldap->an_refcnt == 1) { 6405 continue; 6406 } 6407 if ((err = anon_getpage(&oldap, &vpprot, 6408 anon_pl, PAGESIZE, seg, a, S_READ, 6409 svd->cred))) { 6410 goto out; 6411 } 6412 if ((pp = anon_private(&ap, seg, a, prot, 6413 anon_pl[0], pageflag, svd->cred)) == NULL) { 6414 err = ENOMEM; 6415 goto out; 6416 } 6417 anon_decref(oldap); 6418 (void) anon_set_ptr(amp->ahp, an_idx, ap, 6419 ANON_SLEEP); 6420 page_unlock(pp); 6421 } 6422 } 6423 vpage = (vpage == NULL) ? NULL : vpage + pages; 6424 } 6425 6426 amp->a_szc = 0; 6427 seg->s_szc = 0; 6428 out: 6429 ANON_LOCK_EXIT(&amp->a_rwlock); 6430 return (err); 6431 } 6432 6433 static int 6434 segvn_claim_pages( 6435 struct seg *seg, 6436 struct vpage *svp, 6437 u_offset_t off, 6438 ulong_t anon_idx, 6439 uint_t prot) 6440 { 6441 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 6442 size_t ppasize = (pgcnt + 1) * sizeof (page_t *); 6443 page_t **ppa; 6444 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6445 struct anon_map *amp = svd->amp; 6446 struct vpage *evp = svp + pgcnt; 6447 caddr_t addr = ((uintptr_t)(svp - svd->vpage) << PAGESHIFT) 6448 + seg->s_base; 6449 struct anon *ap; 6450 struct vnode *vp = svd->vp; 6451 page_t *pp; 6452 pgcnt_t pg_idx, i; 6453 int err = 0; 6454 anoff_t aoff; 6455 int anon = (amp != NULL) ?
1 : 0; 6456 6457 ASSERT(svd->type == MAP_PRIVATE); 6458 ASSERT(svd->vpage != NULL); 6459 ASSERT(seg->s_szc != 0); 6460 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 6461 ASSERT(amp == NULL || IS_P2ALIGNED(anon_idx, pgcnt)); 6462 ASSERT(sameprot(seg, addr, pgcnt << PAGESHIFT)); 6463 6464 if (VPP_PROT(svp) == prot) 6465 return (1); 6466 if (!((VPP_PROT(svp) ^ prot) & PROT_WRITE)) 6467 return (1); 6468 6469 ppa = kmem_alloc(ppasize, KM_SLEEP); 6470 if (anon && vp != NULL) { 6471 if (anon_get_ptr(amp->ahp, anon_idx) == NULL) { 6472 anon = 0; 6473 ASSERT(!anon_pages(amp->ahp, anon_idx, pgcnt)); 6474 } 6475 ASSERT(!anon || 6476 anon_pages(amp->ahp, anon_idx, pgcnt) == pgcnt); 6477 } 6478 6479 for (*ppa = NULL, pg_idx = 0; svp < evp; svp++, anon_idx++) { 6480 if (!VPP_ISPPLOCK(svp)) 6481 continue; 6482 if (anon) { 6483 ap = anon_get_ptr(amp->ahp, anon_idx); 6484 if (ap == NULL) { 6485 panic("segvn_claim_pages: no anon slot"); 6486 } 6487 swap_xlate(ap, &vp, &aoff); 6488 off = (u_offset_t)aoff; 6489 } 6490 ASSERT(vp != NULL); 6491 if ((pp = page_lookup(vp, 6492 (u_offset_t)off, SE_SHARED)) == NULL) { 6493 panic("segvn_claim_pages: no page"); 6494 } 6495 ppa[pg_idx++] = pp; 6496 off += PAGESIZE; 6497 } 6498 6499 if (ppa[0] == NULL) { 6500 kmem_free(ppa, ppasize); 6501 return (1); 6502 } 6503 6504 ASSERT(pg_idx <= pgcnt); 6505 ppa[pg_idx] = NULL; 6506 6507 6508 /* Find each large page within ppa, and adjust its claim */ 6509 6510 /* Does ppa cover a single large page? */ 6511 if (ppa[0]->p_szc == seg->s_szc) { 6512 if (prot & PROT_WRITE) 6513 err = page_addclaim_pages(ppa); 6514 else 6515 err = page_subclaim_pages(ppa); 6516 } else { 6517 for (i = 0; ppa[i]; i += pgcnt) { 6518 ASSERT(IS_P2ALIGNED(page_pptonum(ppa[i]), pgcnt)); 6519 if (prot & PROT_WRITE) 6520 err = page_addclaim_pages(&ppa[i]); 6521 else 6522 err = page_subclaim_pages(&ppa[i]); 6523 if (err == 0) 6524 break; 6525 } 6526 } 6527 6528 for (i = 0; i < pg_idx; i++) { 6529 ASSERT(ppa[i] != NULL); 6530 page_unlock(ppa[i]); 6531 } 6532 6533 kmem_free(ppa, ppasize); 6534 return (err); 6535 } 6536 6537 /* 6538 * Returns right (upper address) segment if split occurred. 6539 * If the address is equal to the beginning or end of its segment it returns 6540 * the current segment. 
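* For example, splitting [base, base + 8M) at base + 2M shrinks the original segment to [base, base + 2M) and allocates a new right-hand segment [base + 2M, base + 8M) that inherits its share of the vpage array, anon map and swap reservation.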
6541 */ 6542 static struct seg * 6543 segvn_split_seg(struct seg *seg, caddr_t addr) 6544 { 6545 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6546 struct seg *nseg; 6547 size_t nsize; 6548 struct segvn_data *nsvd; 6549 6550 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 6551 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6552 6553 ASSERT(addr >= seg->s_base); 6554 ASSERT(addr <= seg->s_base + seg->s_size); 6555 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 6556 6557 if (addr == seg->s_base || addr == seg->s_base + seg->s_size) 6558 return (seg); 6559 6560 nsize = seg->s_base + seg->s_size - addr; 6561 seg->s_size = addr - seg->s_base; 6562 nseg = seg_alloc(seg->s_as, addr, nsize); 6563 ASSERT(nseg != NULL); 6564 nseg->s_ops = seg->s_ops; 6565 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 6566 nseg->s_data = (void *)nsvd; 6567 nseg->s_szc = seg->s_szc; 6568 *nsvd = *svd; 6569 ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE); 6570 nsvd->seg = nseg; 6571 rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL); 6572 6573 if (nsvd->vp != NULL) { 6574 VN_HOLD(nsvd->vp); 6575 nsvd->offset = svd->offset + 6576 (uintptr_t)(nseg->s_base - seg->s_base); 6577 if (nsvd->type == MAP_SHARED) 6578 lgrp_shm_policy_init(NULL, nsvd->vp); 6579 } else { 6580 /* 6581 * The offset for an anonymous segment has no significance in 6582 * terms of an offset into a file. If we were to use the above 6583 * calculation instead, the structures read out of 6584 * /proc/<pid>/xmap would be more difficult to decipher since 6585 * it would be unclear whether two seemingly contiguous 6586 * prxmap_t structures represented different segments or a 6587 * single segment that had been split up into multiple prxmap_t 6588 * structures (e.g. if some part of the segment had not yet 6589 * been faulted in).
6590 */ 6591 nsvd->offset = 0; 6592 } 6593 6594 ASSERT(svd->softlockcnt == 0); 6595 ASSERT(svd->softlockcnt_sbase == 0); 6596 ASSERT(svd->softlockcnt_send == 0); 6597 crhold(svd->cred); 6598 6599 if (svd->vpage != NULL) { 6600 size_t bytes = vpgtob(seg_pages(seg)); 6601 size_t nbytes = vpgtob(seg_pages(nseg)); 6602 struct vpage *ovpage = svd->vpage; 6603 6604 svd->vpage = kmem_alloc(bytes, KM_SLEEP); 6605 bcopy(ovpage, svd->vpage, bytes); 6606 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 6607 bcopy(ovpage + seg_pages(seg), nsvd->vpage, nbytes); 6608 kmem_free(ovpage, bytes + nbytes); 6609 } 6610 if (svd->amp != NULL && svd->type == MAP_PRIVATE) { 6611 struct anon_map *oamp = svd->amp, *namp; 6612 struct anon_hdr *nahp; 6613 6614 ANON_LOCK_ENTER(&oamp->a_rwlock, RW_WRITER); 6615 ASSERT(oamp->refcnt == 1); 6616 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 6617 (void) anon_copy_ptr(oamp->ahp, svd->anon_index, 6618 nahp, 0, btop(seg->s_size), ANON_SLEEP); 6619 6620 namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP); 6621 namp->a_szc = nseg->s_szc; 6622 (void) anon_copy_ptr(oamp->ahp, 6623 svd->anon_index + btop(seg->s_size), 6624 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 6625 anon_release(oamp->ahp, btop(oamp->size)); 6626 oamp->ahp = nahp; 6627 oamp->size = seg->s_size; 6628 svd->anon_index = 0; 6629 nsvd->amp = namp; 6630 nsvd->anon_index = 0; 6631 ANON_LOCK_EXIT(&oamp->a_rwlock); 6632 } else if (svd->amp != NULL) { 6633 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 6634 ASSERT(svd->amp == nsvd->amp); 6635 ASSERT(seg->s_szc <= svd->amp->a_szc); 6636 nsvd->anon_index = svd->anon_index + seg_pages(seg); 6637 ASSERT(IS_P2ALIGNED(nsvd->anon_index, pgcnt)); 6638 ANON_LOCK_ENTER(&svd->amp->a_rwlock, RW_WRITER); 6639 svd->amp->refcnt++; 6640 ANON_LOCK_EXIT(&svd->amp->a_rwlock); 6641 } 6642 6643 /* 6644 * Split the amount of swap reserved. 6645 */ 6646 if (svd->swresv) { 6647 /* 6648 * For MAP_NORESERVE, only allocate swap reserve for pages 6649 * being used. Other segments get enough to cover whole 6650 * segment. 6651 */ 6652 if (svd->flags & MAP_NORESERVE) { 6653 size_t oswresv; 6654 6655 ASSERT(svd->amp); 6656 oswresv = svd->swresv; 6657 svd->swresv = ptob(anon_pages(svd->amp->ahp, 6658 svd->anon_index, btop(seg->s_size))); 6659 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 6660 nsvd->anon_index, btop(nseg->s_size))); 6661 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 6662 } else { 6663 if (svd->pageswap) { 6664 svd->swresv = segvn_count_swap_by_vpages(seg); 6665 ASSERT(nsvd->swresv >= svd->swresv); 6666 nsvd->swresv -= svd->swresv; 6667 } else { 6668 ASSERT(svd->swresv == seg->s_size + 6669 nseg->s_size); 6670 svd->swresv = seg->s_size; 6671 nsvd->swresv = nseg->s_size; 6672 } 6673 } 6674 } 6675 6676 return (nseg); 6677 } 6678 6679 /* 6680 * called on memory operations (unmap, setprot, setpagesize) for a subset 6681 * of a large page segment to either demote the memory range (SDR_RANGE) 6682 * or the ends (SDR_END) by addr/len. 6683 * 6684 * returns 0 on success. returns errno, including ENOMEM, on failure. 
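* For example, an operation on a sub-range of a 4M-page segment whose ends are not 4M aligned uses SDR_END: only the large page(s) straddling addr and addr + len are split off into their own segment(s) and demoted to base pages, while the interior of the range keeps its size code.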
6685 */ 6686 static int 6687 segvn_demote_range( 6688 struct seg *seg, 6689 caddr_t addr, 6690 size_t len, 6691 int flag, 6692 uint_t szcvec) 6693 { 6694 caddr_t eaddr = addr + len; 6695 caddr_t lpgaddr, lpgeaddr; 6696 struct seg *nseg; 6697 struct seg *badseg1 = NULL; 6698 struct seg *badseg2 = NULL; 6699 size_t pgsz; 6700 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6701 int err; 6702 uint_t szc = seg->s_szc; 6703 uint_t tszcvec; 6704 6705 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 6706 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6707 ASSERT(szc != 0); 6708 pgsz = page_get_pagesize(szc); 6709 ASSERT(seg->s_base != addr || seg->s_size != len); 6710 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 6711 ASSERT(svd->softlockcnt == 0); 6712 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 6713 ASSERT(szcvec == 0 || (flag == SDR_END && svd->type == MAP_SHARED)); 6714 6715 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 6716 ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr); 6717 if (flag == SDR_RANGE) { 6718 /* demote entire range */ 6719 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6720 (void) segvn_split_seg(nseg, lpgeaddr); 6721 ASSERT(badseg1->s_base == lpgaddr); 6722 ASSERT(badseg1->s_size == lpgeaddr - lpgaddr); 6723 } else if (addr != lpgaddr) { 6724 ASSERT(flag == SDR_END); 6725 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6726 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz && 6727 eaddr < lpgaddr + 2 * pgsz) { 6728 (void) segvn_split_seg(nseg, lpgeaddr); 6729 ASSERT(badseg1->s_base == lpgaddr); 6730 ASSERT(badseg1->s_size == 2 * pgsz); 6731 } else { 6732 nseg = segvn_split_seg(nseg, lpgaddr + pgsz); 6733 ASSERT(badseg1->s_base == lpgaddr); 6734 ASSERT(badseg1->s_size == pgsz); 6735 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz) { 6736 ASSERT(lpgeaddr - lpgaddr > 2 * pgsz); 6737 nseg = segvn_split_seg(nseg, lpgeaddr - pgsz); 6738 badseg2 = nseg; 6739 (void) segvn_split_seg(nseg, lpgeaddr); 6740 ASSERT(badseg2->s_base == lpgeaddr - pgsz); 6741 ASSERT(badseg2->s_size == pgsz); 6742 } 6743 } 6744 } else { 6745 ASSERT(flag == SDR_END); 6746 ASSERT(eaddr < lpgeaddr); 6747 badseg1 = nseg = segvn_split_seg(seg, lpgeaddr - pgsz); 6748 (void) segvn_split_seg(nseg, lpgeaddr); 6749 ASSERT(badseg1->s_base == lpgeaddr - pgsz); 6750 ASSERT(badseg1->s_size == pgsz); 6751 } 6752 6753 ASSERT(badseg1 != NULL); 6754 ASSERT(badseg1->s_szc == szc); 6755 ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz || 6756 badseg1->s_size == 2 * pgsz); 6757 ASSERT(sameprot(badseg1, badseg1->s_base, pgsz)); 6758 ASSERT(badseg1->s_size == pgsz || 6759 sameprot(badseg1, badseg1->s_base + pgsz, pgsz)); 6760 if (err = segvn_clrszc(badseg1)) { 6761 return (err); 6762 } 6763 ASSERT(badseg1->s_szc == 0); 6764 6765 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6766 uint_t tszc = highbit(tszcvec) - 1; 6767 caddr_t ta = MAX(addr, badseg1->s_base); 6768 caddr_t te; 6769 size_t tpgsz = page_get_pagesize(tszc); 6770 6771 ASSERT(svd->type == MAP_SHARED); 6772 ASSERT(flag == SDR_END); 6773 ASSERT(tszc < szc && tszc > 0); 6774 6775 if (eaddr > badseg1->s_base + badseg1->s_size) { 6776 te = badseg1->s_base + badseg1->s_size; 6777 } else { 6778 te = eaddr; 6779 } 6780 6781 ASSERT(ta <= te); 6782 badseg1->s_szc = tszc; 6783 if (!IS_P2ALIGNED(ta, tpgsz) || !IS_P2ALIGNED(te, tpgsz)) { 6784 if (badseg2 != NULL) { 6785 err = segvn_demote_range(badseg1, ta, te - ta, 6786 SDR_END, tszcvec); 6787 if (err != 0) { 6788 return (err); 6789 } 6790 } else { 6791 
return (segvn_demote_range(badseg1, ta, 6792 te - ta, SDR_END, tszcvec)); 6793 } 6794 } 6795 } 6796 6797 if (badseg2 == NULL) 6798 return (0); 6799 ASSERT(badseg2->s_szc == szc); 6800 ASSERT(badseg2->s_size == pgsz); 6801 ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size)); 6802 if (err = segvn_clrszc(badseg2)) { 6803 return (err); 6804 } 6805 ASSERT(badseg2->s_szc == 0); 6806 6807 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6808 uint_t tszc = highbit(tszcvec) - 1; 6809 size_t tpgsz = page_get_pagesize(tszc); 6810 6811 ASSERT(svd->type == MAP_SHARED); 6812 ASSERT(flag == SDR_END); 6813 ASSERT(tszc < szc && tszc > 0); 6814 ASSERT(badseg2->s_base > addr); 6815 ASSERT(eaddr > badseg2->s_base); 6816 ASSERT(eaddr < badseg2->s_base + badseg2->s_size); 6817 6818 badseg2->s_szc = tszc; 6819 if (!IS_P2ALIGNED(eaddr, tpgsz)) { 6820 return (segvn_demote_range(badseg2, badseg2->s_base, 6821 eaddr - badseg2->s_base, SDR_END, tszcvec)); 6822 } 6823 } 6824 6825 return (0); 6826 } 6827 6828 static int 6829 segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 6830 { 6831 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6832 struct vpage *vp, *evp; 6833 6834 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6835 6836 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6837 /* 6838 * If segment protection can be used, simply check against them. 6839 */ 6840 if (svd->pageprot == 0) { 6841 int err; 6842 6843 err = ((svd->prot & prot) != prot) ? EACCES : 0; 6844 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6845 return (err); 6846 } 6847 6848 /* 6849 * Have to check down to the vpage level. 6850 */ 6851 evp = &svd->vpage[seg_page(seg, addr + len)]; 6852 for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) { 6853 if ((VPP_PROT(vp) & prot) != prot) { 6854 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6855 return (EACCES); 6856 } 6857 } 6858 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6859 return (0); 6860 } 6861 6862 static int 6863 segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) 6864 { 6865 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6866 size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; 6867 6868 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6869 6870 if (pgno != 0) { 6871 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6872 if (svd->pageprot == 0) { 6873 do { 6874 protv[--pgno] = svd->prot; 6875 } while (pgno != 0); 6876 } else { 6877 size_t pgoff = seg_page(seg, addr); 6878 6879 do { 6880 pgno--; 6881 protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]); 6882 } while (pgno != 0); 6883 } 6884 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6885 } 6886 return (0); 6887 } 6888 6889 static u_offset_t 6890 segvn_getoffset(struct seg *seg, caddr_t addr) 6891 { 6892 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6893 6894 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6895 6896 return (svd->offset + (uintptr_t)(addr - seg->s_base)); 6897 } 6898 6899 /*ARGSUSED*/ 6900 static int 6901 segvn_gettype(struct seg *seg, caddr_t addr) 6902 { 6903 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6904 6905 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6906 6907 return (svd->type | (svd->flags & (MAP_NORESERVE | MAP_TEXT | 6908 MAP_INITDATA))); 6909 } 6910 6911 /*ARGSUSED*/ 6912 static int 6913 segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) 6914 { 6915 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6916 6917 
ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6918 6919 *vpp = svd->vp; 6920 return (0); 6921 } 6922 6923 /* 6924 * Check to see if it makes sense to do kluster/read ahead to 6925 * addr + delta relative to the mapping at addr. We assume here 6926 * that delta is a signed PAGESIZE'd multiple (which can be negative). 6927 * 6928 * For segvn, we currently "approve" of the action if we are 6929 * still in the segment and it maps from the same vp/off, 6930 * or if the advice stored in segvn_data or vpages allows it. 6931 * Currently, klustering is disallowed if MADV_RANDOM is set, or if 6932 * MADV_SEQUENTIAL is set and delta is negative. 6933 */ 6933 static int 6934 segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta) 6935 { 6936 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6937 struct anon *oap, *ap; 6938 ssize_t pd; 6939 size_t page; 6940 struct vnode *vp1, *vp2; 6941 u_offset_t off1, off2; 6942 struct anon_map *amp; 6943 6944 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6945 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 6946 SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 6947 6948 if (addr + delta < seg->s_base || 6949 addr + delta >= (seg->s_base + seg->s_size)) 6950 return (-1); /* exceeded segment bounds */ 6951 6952 pd = delta / (ssize_t)PAGESIZE; /* divide to preserve sign bit */ 6953 page = seg_page(seg, addr); 6954 6955 /* 6956 * Check to see if either of the pages addr or addr + delta 6957 * have advice set that prevents klustering (if MADV_RANDOM advice 6958 * is set for entire segment, or MADV_SEQUENTIAL is set and delta 6959 * is negative). 6960 */ 6961 if (svd->advice == MADV_RANDOM || 6962 svd->advice == MADV_SEQUENTIAL && delta < 0) 6963 return (-1); 6964 else if (svd->pageadvice && svd->vpage) { 6965 struct vpage *bvpp, *evpp; 6966 6967 bvpp = &svd->vpage[page]; 6968 evpp = &svd->vpage[page + pd]; 6969 if (VPP_ADVICE(bvpp) == MADV_RANDOM || 6970 VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0) 6971 return (-1); 6972 if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) && 6973 VPP_ADVICE(evpp) == MADV_RANDOM) 6974 return (-1); 6975 } 6976 6977 if (svd->type == MAP_SHARED) 6978 return (0); /* shared mapping - all ok */ 6979 6980 if ((amp = svd->amp) == NULL) 6981 return (0); /* off original vnode */ 6982 6983 page += svd->anon_index; 6984 6985 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 6986 6987 oap = anon_get_ptr(amp->ahp, page); 6988 ap = anon_get_ptr(amp->ahp, page + pd); 6989 6990 ANON_LOCK_EXIT(&amp->a_rwlock); 6991 6992 if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) { 6993 return (-1); /* one with and one without an anon */ 6994 } 6995 6996 if (oap == NULL) { /* implies that ap == NULL */ 6997 return (0); /* off original vnode */ 6998 } 6999 7000 /* 7001 * Now we know we have two anon pointers - check to 7002 * see if they happen to be properly allocated. 7003 */ 7004 7005 /* 7006 * XXX We cheat here and don't lock the anon slots. We can't because 7007 * we may have been called from the anon layer which might already 7008 * have locked them. We are holding a refcnt on the slots so they 7009 * can't disappear. The worst that will happen is we'll get the wrong 7010 * names (vp, off) for the slots and make a poor klustering decision. 7011 */ 7012 swap_xlate(ap, &vp1, &off1); 7013 swap_xlate(oap, &vp2, &off2); 7014 7015 7016 if (!VOP_CMP(vp1, vp2, NULL) || off1 - off2 != delta) 7017 return (-1); 7018 return (0); 7019 } 7020 7021 /* 7022 * Swap the pages of seg out to secondary storage, returning the 7023 * number of bytes of storage freed.
7024 * 7025 * The basic idea is first to unload all translations and then to call 7026 * VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the 7027 * swap device. Pages to which other segments have mappings will remain 7028 * mapped and won't be swapped. Our caller (as_swapout) has already 7029 * performed the unloading step. 7030 * 7031 * The value returned is intended to correlate well with the process's 7032 * memory requirements. However, there are some caveats: 7033 * 1) When given a shared segment as argument, this routine will 7034 * only succeed in swapping out pages for the last sharer of the 7035 * segment. (Previous callers will only have decremented mapping 7036 * reference counts.) 7037 * 2) We assume that the hat layer maintains a large enough translation 7038 * cache to capture process reference patterns. 7039 */ 7040 static size_t 7041 segvn_swapout(struct seg *seg) 7042 { 7043 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7044 struct anon_map *amp; 7045 pgcnt_t pgcnt = 0; 7046 pgcnt_t npages; 7047 pgcnt_t page; 7048 ulong_t anon_index; 7049 7050 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7051 7052 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7053 /* 7054 * Find pages unmapped by our caller and force them 7055 * out to the virtual swap device. 7056 */ 7057 if ((amp = svd->amp) != NULL) 7058 anon_index = svd->anon_index; 7059 npages = seg->s_size >> PAGESHIFT; 7060 for (page = 0; page < npages; page++) { 7061 page_t *pp; 7062 struct anon *ap; 7063 struct vnode *vp; 7064 u_offset_t off; 7065 anon_sync_obj_t cookie; 7066 7067 /* 7068 * Obtain <vp, off> pair for the page, then look it up. 7069 * 7070 * Note that this code is willing to consider regular 7071 * pages as well as anon pages. Is this appropriate here? 7072 */ 7073 ap = NULL; 7074 if (amp != NULL) { 7075 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 7076 if (anon_array_try_enter(amp, anon_index + page, 7077 &cookie)) { 7078 ANON_LOCK_EXIT(&amp->a_rwlock); 7079 continue; 7080 } 7081 ap = anon_get_ptr(amp->ahp, anon_index + page); 7082 if (ap != NULL) { 7083 swap_xlate(ap, &vp, &off); 7084 } else { 7085 vp = svd->vp; 7086 off = svd->offset + ptob(page); 7087 } 7088 anon_array_exit(&cookie); 7089 ANON_LOCK_EXIT(&amp->a_rwlock); 7090 } else { 7091 vp = svd->vp; 7092 off = svd->offset + ptob(page); 7093 } 7094 if (vp == NULL) { /* untouched zfod page */ 7095 ASSERT(ap == NULL); 7096 continue; 7097 } 7098 7099 pp = page_lookup_nowait(vp, off, SE_SHARED); 7100 if (pp == NULL) 7101 continue; 7102 7103 7104 /* 7105 * Examine the page to see whether it can be tossed out, 7106 * keeping track of how many we've found. 7107 */ 7108 if (!page_tryupgrade(pp)) { 7109 /* 7110 * If the page has an i/o lock and no mappings, 7111 * it's very likely that the page is being 7112 * written out as a result of klustering. 7113 * Assume this is so and take credit for it here. 7114 */ 7115 if (!page_io_trylock(pp)) { 7116 if (!hat_page_is_mapped(pp)) 7117 pgcnt++; 7118 } else { 7119 page_io_unlock(pp); 7120 } 7121 page_unlock(pp); 7122 continue; 7123 } 7124 ASSERT(!page_iolock_assert(pp)); 7125 7126 7127 /* 7128 * Skip if page is locked or has mappings. 7129 * We don't need the page_struct_lock to look at lckcnt 7130 * and cowcnt because the page is exclusive locked. 7131 */ 7132 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 7133 hat_page_is_mapped(pp)) { 7134 page_unlock(pp); 7135 continue; 7136 } 7137 7138 /* 7139 * dispose skips large pages so try to demote first.
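* If the demotion fails (e.g. another constituent page of the large page is held elsewhere), the page is simply skipped and stays in memory.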
7140 */ 7141 if (pp->p_szc != 0 && !page_try_demote_pages(pp)) { 7142 page_unlock(pp); 7143 /* 7144 * XXX should skip the remaining page_t's of this 7145 * large page. 7146 */ 7147 continue; 7148 } 7149 7150 ASSERT(pp->p_szc == 0); 7151 7152 /* 7153 * No longer mapped -- we can toss it out. How 7154 * we do so depends on whether or not it's dirty. 7155 */ 7156 if (hat_ismod(pp) && pp->p_vnode) { 7157 /* 7158 * We must clean the page before it can be 7159 * freed. Setting B_FREE will cause pvn_done 7160 * to free the page when the i/o completes. 7161 * XXX: This also causes it to be accounted 7162 * as a pageout instead of a swap: need 7163 * B_SWAPOUT bit to use instead of B_FREE. 7164 * 7165 * Hold the vnode before releasing the page lock 7166 * to prevent it from being freed and re-used by 7167 * some other thread. 7168 */ 7169 VN_HOLD(vp); 7170 page_unlock(pp); 7171 7172 /* 7173 * Queue all i/o requests for the pageout thread 7174 * to avoid saturating the pageout devices. 7175 */ 7176 if (!queue_io_request(vp, off)) 7177 VN_RELE(vp); 7178 } else { 7179 /* 7180 * The page was clean, free it. 7181 * 7182 * XXX: Can we ever encounter modified pages 7183 * with no associated vnode here? 7184 */ 7185 ASSERT(pp->p_vnode != NULL); 7186 /*LINTED: constant in conditional context*/ 7187 VN_DISPOSE(pp, B_FREE, 0, kcred); 7188 } 7189 7190 /* 7191 * Credit now even if i/o is in progress. 7192 */ 7193 pgcnt++; 7194 } 7195 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7196 7197 /* 7198 * Wakeup pageout to initiate i/o on all queued requests. 7199 */ 7200 cv_signal_pageout(); 7201 return (ptob(pgcnt)); 7202 } 7203 7204 /* 7205 * Synchronize primary storage cache with real object in virtual memory. 7206 * 7207 * XXX - Anonymous pages should not be sync'ed out at all. 7208 */ 7209 static int 7210 segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) 7211 { 7212 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7213 struct vpage *vpp; 7214 page_t *pp; 7215 u_offset_t offset; 7216 struct vnode *vp; 7217 u_offset_t off; 7218 caddr_t eaddr; 7219 int bflags; 7220 int err = 0; 7221 int segtype; 7222 int pageprot; 7223 int prot; 7224 ulong_t anon_index; 7225 struct anon_map *amp; 7226 struct anon *ap; 7227 anon_sync_obj_t cookie; 7228 7229 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7230 7231 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7232 7233 if (svd->softlockcnt > 0) { 7234 /* 7235 * If this is shared segment non 0 softlockcnt 7236 * means locked pages are still in use. 7237 */ 7238 if (svd->type == MAP_SHARED) { 7239 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7240 return (EAGAIN); 7241 } 7242 7243 /* 7244 * flush all pages from seg cache 7245 * otherwise we may deadlock in swap_putpage 7246 * for B_INVAL page (4175402). 7247 * 7248 * Even if we grab segvn WRITER's lock 7249 * here, there might be another thread which could've 7250 * successfully performed lookup/insert just before 7251 * we acquired the lock here. So, grabbing either 7252 * lock here is of not much use. Until we devise 7253 * a strategy at upper layers to solve the 7254 * synchronization issues completely, we expect 7255 * applications to handle this appropriately. 7256 */ 7257 segvn_purge(seg); 7258 if (svd->softlockcnt > 0) { 7259 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7260 return (EAGAIN); 7261 } 7262 } else if (svd->type == MAP_SHARED && svd->amp != NULL && 7263 svd->amp->a_softlockcnt > 0) { 7264 /* 7265 * Try to purge this amp's entries from pcache. 
It will 7266 * succeed only if other segments that share the amp have no 7267 * outstanding softlocks. 7268 */ 7269 segvn_purge(seg); 7270 if (svd->amp->a_softlockcnt > 0 || svd->softlockcnt > 0) { 7271 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7272 return (EAGAIN); 7273 } 7274 } 7275 7276 vpp = svd->vpage; 7277 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 7278 bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | 7279 ((flags & MS_INVALIDATE) ? B_INVAL : 0); 7280 7281 if (attr) { 7282 pageprot = attr & ~(SHARED|PRIVATE); 7283 segtype = (attr & SHARED) ? MAP_SHARED : MAP_PRIVATE; 7284 7285 /* 7286 * We are done if the segment types don't match 7287 * or if we have segment level protections and 7288 * they don't match. 7289 */ 7290 if (svd->type != segtype) { 7291 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7292 return (0); 7293 } 7294 if (vpp == NULL) { 7295 if (svd->prot != pageprot) { 7296 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7297 return (0); 7298 } 7299 prot = svd->prot; 7300 } else 7301 vpp = &svd->vpage[seg_page(seg, addr)]; 7302 7303 } else if (svd->vp && svd->amp == NULL && 7304 (flags & MS_INVALIDATE) == 0) { 7305 7306 /* 7307 * No attributes, no anonymous pages and MS_INVALIDATE flag 7308 * is not on, just use one big request. 7309 */ 7310 err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, 7311 bflags, svd->cred, NULL); 7312 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7313 return (err); 7314 } 7315 7316 if ((amp = svd->amp) != NULL) 7317 anon_index = svd->anon_index + seg_page(seg, addr); 7318 7319 for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) { 7320 ap = NULL; 7321 if (amp != NULL) { 7322 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 7323 anon_array_enter(amp, anon_index, &cookie); 7324 ap = anon_get_ptr(amp->ahp, anon_index++); 7325 if (ap != NULL) { 7326 swap_xlate(ap, &vp, &off); 7327 } else { 7328 vp = svd->vp; 7329 off = offset; 7330 } 7331 anon_array_exit(&cookie); 7332 ANON_LOCK_EXIT(&amp->a_rwlock); 7333 } else { 7334 vp = svd->vp; 7335 off = offset; 7336 } 7337 offset += PAGESIZE; 7338 7339 if (vp == NULL) /* untouched zfod page */ 7340 continue; 7341 7342 if (attr) { 7343 if (vpp) { 7344 prot = VPP_PROT(vpp); 7345 vpp++; 7346 } 7347 if (prot != pageprot) { 7348 continue; 7349 } 7350 } 7351 7352 /* 7353 * See if any of these pages are locked -- if so, then we 7354 * will have to truncate an invalidate request at the first 7355 * locked one. We don't need the page_struct_lock to test 7356 * as this is only advisory; even if we acquire it someone 7357 * might race in and lock the page after we unlock and before 7358 * we do the PUTPAGE, then PUTPAGE simply does nothing. 7359 */ 7360 if (flags & MS_INVALIDATE) { 7361 if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { 7362 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 7363 page_unlock(pp); 7364 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7365 return (EBUSY); 7366 } 7367 if (ap != NULL && pp->p_szc != 0 && 7368 page_tryupgrade(pp)) { 7369 if (pp->p_lckcnt == 0 && 7370 pp->p_cowcnt == 0) { 7371 /* 7372 * swapfs VN_DISPOSE() won't 7373 * invalidate large pages. 7374 * Attempt to demote. 7375 * XXX can't help it if it 7376 * fails. But for swapfs 7377 * pages it is no big deal. 7378 */ 7379 (void) page_try_demote_pages( 7380 pp); 7381 } 7382 } 7383 page_unlock(pp); 7384 } 7385 } else if (svd->type == MAP_SHARED && amp != NULL) { 7386 /* 7387 * Avoid writing out to disk ISM's large pages 7388 * because segspt_free_pages() relies on NULL an_pvp 7389 * of anon slots of such pages.
7390 */ 7391 7392 ASSERT(svd->vp == NULL); 7393 /* 7394 * swapfs uses page_lookup_nowait if not freeing or 7395 * invalidating and skips a page if 7396 * page_lookup_nowait returns NULL. 7397 */ 7398 pp = page_lookup_nowait(vp, off, SE_SHARED); 7399 if (pp == NULL) { 7400 continue; 7401 } 7402 if (pp->p_szc != 0) { 7403 page_unlock(pp); 7404 continue; 7405 } 7406 7407 /* 7408 * Note ISM pages are created large so (vp, off)'s 7409 * page cannot suddenly become large after we unlock 7410 * pp. 7411 */ 7412 page_unlock(pp); 7413 } 7414 /* 7415 * XXX - Should ultimately try to kluster 7416 * calls to VOP_PUTPAGE() for performance. 7417 */ 7418 VN_HOLD(vp); 7419 err = VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE, 7420 (bflags | (IS_SWAPFSVP(vp) ? B_PAGE_NOWAIT : 0)), 7421 svd->cred, NULL); 7422 7423 VN_RELE(vp); 7424 if (err) 7425 break; 7426 } 7427 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7428 return (err); 7429 } 7430 7431 /* 7432 * Determine if we have data corresponding to pages in the 7433 * primary storage virtual memory cache (i.e., "in core"). 7434 */ 7435 static size_t 7436 segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) 7437 { 7438 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7439 struct vnode *vp, *avp; 7440 u_offset_t offset, aoffset; 7441 size_t p, ep; 7442 int ret; 7443 struct vpage *vpp; 7444 page_t *pp; 7445 uint_t start; 7446 struct anon_map *amp; /* XXX - for locknest */ 7447 struct anon *ap; 7448 uint_t attr; 7449 anon_sync_obj_t cookie; 7450 7451 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7452 7453 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7454 if (svd->amp == NULL && svd->vp == NULL) { 7455 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7456 bzero(vec, btopr(len)); 7457 return (len); /* no anonymous pages created yet */ 7458 } 7459 7460 p = seg_page(seg, addr); 7461 ep = seg_page(seg, addr + len); 7462 start = svd->vp ? SEG_PAGE_VNODEBACKED : 0; 7463 7464 amp = svd->amp; 7465 for (; p < ep; p++, addr += PAGESIZE) { 7466 vpp = (svd->vpage) ? &svd->vpage[p]: NULL; 7467 ret = start; 7468 ap = NULL; 7469 avp = NULL; 7470 /* Grab the vnode/offset for the anon slot */ 7471 if (amp != NULL) { 7472 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 7473 anon_array_enter(amp, svd->anon_index + p, &cookie); 7474 ap = anon_get_ptr(amp->ahp, svd->anon_index + p); 7475 if (ap != NULL) { 7476 swap_xlate(ap, &avp, &aoffset); 7477 } 7478 anon_array_exit(&cookie); 7479 ANON_LOCK_EXIT(&amp->a_rwlock); 7480 } 7481 if ((avp != NULL) && page_exists(avp, aoffset)) { 7482 /* A page exists for the anon slot */ 7483 ret |= SEG_PAGE_INCORE; 7484 7485 /* 7486 * If page is mapped and writable 7487 */ 7488 attr = (uint_t)0; 7489 if ((hat_getattr(seg->s_as->a_hat, addr, 7490 &attr) != -1) && (attr & PROT_WRITE)) { 7491 ret |= SEG_PAGE_ANON; 7492 } 7493 /* 7494 * Don't get page_struct lock for lckcnt and cowcnt, 7495 * since this is purely advisory. 7496 */ 7497 if ((pp = page_lookup_nowait(avp, aoffset, 7498 SE_SHARED)) != NULL) { 7499 if (pp->p_lckcnt) 7500 ret |= SEG_PAGE_SOFTLOCK; 7501 if (pp->p_cowcnt) 7502 ret |= SEG_PAGE_HASCOW; 7503 page_unlock(pp); 7504 } 7505 } 7506 7507 /* Gather vnode statistics */ 7508 vp = svd->vp; 7509 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 7510 7511 if (vp != NULL) { 7512 /* 7513 * Try to obtain a "shared" lock on the page 7514 * without blocking. If this fails, determine 7515 * if the page is in memory.
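* Either way the page counts as incore; the nowait lookup only decides whether we can also sample p_lckcnt/p_cowcnt for the SEG_PAGE_SOFTLOCK and SEG_PAGE_HASCOW bits.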
7516 */ 7517 pp = page_lookup_nowait(vp, offset, SE_SHARED); 7518 if ((pp == NULL) && (page_exists(vp, offset))) { 7519 /* Page is incore, and is named */ 7520 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 7521 } 7522 /* 7523 * Don't get page_struct lock for lckcnt and cowcnt, 7524 * since this is purely advisory. 7525 */ 7526 if (pp != NULL) { 7527 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 7528 if (pp->p_lckcnt) 7529 ret |= SEG_PAGE_SOFTLOCK; 7530 if (pp->p_cowcnt) 7531 ret |= SEG_PAGE_HASCOW; 7532 page_unlock(pp); 7533 } 7534 } 7535 7536 /* Gather virtual page information */ 7537 if (vpp) { 7538 if (VPP_ISPPLOCK(vpp)) 7539 ret |= SEG_PAGE_LOCKED; 7540 vpp++; 7541 } 7542 7543 *vec++ = (char)ret; 7544 } 7545 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7546 return (len); 7547 } 7548 7549 /* 7550 * Statement for p_cowcnts/p_lckcnts. 7551 * 7552 * p_cowcnt is updated while mlock/munlocking MAP_PRIVATE and PROT_WRITE region 7553 * irrespective of the following factors or anything else: 7554 * 7555 * (1) anon slots are populated or not 7556 * (2) cow is broken or not 7557 * (3) refcnt on ap is 1 or greater than 1 7558 * 7559 * If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock 7560 * and munlock. 7561 * 7562 * 7563 * Handling p_cowcnts/p_lckcnts during copy-on-write fault: 7564 * 7565 * if vpage has PROT_WRITE 7566 * transfer cowcnt on the oldpage -> cowcnt on the newpage 7567 * else 7568 * transfer lckcnt on the oldpage -> lckcnt on the newpage 7569 * 7570 * During copy-on-write, decrement p_cowcnt on the oldpage and increment 7571 * p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE. 7572 * 7573 * We may also break COW if softlocking on read access in the physio case. 7574 * In this case, vpage may not have PROT_WRITE. So, we need to decrement 7575 * p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the 7576 * vpage doesn't have PROT_WRITE. 7577 * 7578 * 7579 * Handling p_cowcnts/p_lckcnts during mprotect on mlocked region: 7580 * 7581 * If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and 7582 * increment p_lckcnt by calling page_subclaim() which takes care of 7583 * availrmem accounting and p_lckcnt overflow. 7584 * 7585 * If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and 7586 * increment p_cowcnt by calling page_addclaim() which takes care of 7587 * availrmem availability and p_cowcnt overflow. 7588 */ 7589 7590 /* 7591 * Lock down (or unlock) pages mapped by this segment. 7592 * 7593 * XXX only creates PAGESIZE pages if anon slots are not initialized. 7594 * At fault time they will be relocated into larger pages. 
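* Following the statement above: mlock() of a writable MAP_PRIVATE page raises p_cowcnt whether or not COW has actually been broken, and a later mprotect() that drops PROT_WRITE converts that claim into a p_lckcnt via page_subclaim().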
7595 */ 7596 static int 7597 segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 7598 int attr, int op, ulong_t *lockmap, size_t pos) 7599 { 7600 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7601 struct vpage *vpp; 7602 struct vpage *evp; 7603 page_t *pp; 7604 u_offset_t offset; 7605 u_offset_t off; 7606 int segtype; 7607 int pageprot; 7608 int claim; 7609 struct vnode *vp; 7610 ulong_t anon_index; 7611 struct anon_map *amp; 7612 struct anon *ap; 7613 struct vattr va; 7614 anon_sync_obj_t cookie; 7615 struct kshmid *sp = NULL; 7616 struct proc *p = curproc; 7617 kproject_t *proj = NULL; 7618 int chargeproc = 1; 7619 size_t locked_bytes = 0; 7620 size_t unlocked_bytes = 0; 7621 int err = 0; 7622 7623 /* 7624 * Hold write lock on address space because may split or concatenate 7625 * segments 7626 */ 7627 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7628 7629 /* 7630 * If this is a shm, use shm's project and zone, else use 7631 * project and zone of calling process 7632 */ 7633 7634 /* Determine if this segment backs a sysV shm */ 7635 if (svd->amp != NULL && svd->amp->a_sp != NULL) { 7636 ASSERT(svd->type == MAP_SHARED); 7637 ASSERT(svd->tr_state == SEGVN_TR_OFF); 7638 sp = svd->amp->a_sp; 7639 proj = sp->shm_perm.ipc_proj; 7640 chargeproc = 0; 7641 } 7642 7643 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 7644 if (attr) { 7645 pageprot = attr & ~(SHARED|PRIVATE); 7646 segtype = attr & SHARED ? MAP_SHARED : MAP_PRIVATE; 7647 7648 /* 7649 * We are done if the segment types don't match 7650 * or if we have segment level protections and 7651 * they don't match. 7652 */ 7653 if (svd->type != segtype) { 7654 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7655 return (0); 7656 } 7657 if (svd->pageprot == 0 && svd->prot != pageprot) { 7658 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7659 return (0); 7660 } 7661 } 7662 7663 if (op == MC_LOCK) { 7664 if (svd->tr_state == SEGVN_TR_INIT) { 7665 svd->tr_state = SEGVN_TR_OFF; 7666 } else if (svd->tr_state == SEGVN_TR_ON) { 7667 ASSERT(svd->amp != NULL); 7668 segvn_textunrepl(seg, 0); 7669 ASSERT(svd->amp == NULL && 7670 svd->tr_state == SEGVN_TR_OFF); 7671 } 7672 } 7673 7674 /* 7675 * If we're locking, then we must create a vpage structure if 7676 * none exists. If we're unlocking, then check to see if there 7677 * is a vpage -- if not, then we could not have locked anything. 7678 */ 7679 7680 if ((vpp = svd->vpage) == NULL) { 7681 if (op == MC_LOCK) 7682 segvn_vpage(seg); 7683 else { 7684 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7685 return (0); 7686 } 7687 } 7688 7689 /* 7690 * The anonymous data vector (i.e., previously 7691 * unreferenced mapping to swap space) can be allocated 7692 * by lazily testing for its existence. 
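* Thus an MC_LOCK of a never-touched ZFOD region allocates the anon map up front, so that anon_zero() below has slots to fill and every page in the range can be created and pinned.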
7693 */ 7694 if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) { 7695 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 7696 svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP); 7697 svd->amp->a_szc = seg->s_szc; 7698 } 7699 7700 if ((amp = svd->amp) != NULL) { 7701 anon_index = svd->anon_index + seg_page(seg, addr); 7702 } 7703 7704 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 7705 evp = &svd->vpage[seg_page(seg, addr + len)]; 7706 7707 if (sp != NULL) 7708 mutex_enter(&sp->shm_mlock); 7709 7710 /* determine number of unlocked bytes in range for lock operation */ 7711 if (op == MC_LOCK) { 7712 7713 if (sp == NULL) { 7714 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; 7715 vpp++) { 7716 if (!VPP_ISPPLOCK(vpp)) 7717 unlocked_bytes += PAGESIZE; 7718 } 7719 } else { 7720 ulong_t i_idx, i_edx; 7721 anon_sync_obj_t i_cookie; 7722 struct anon *i_ap; 7723 struct vnode *i_vp; 7724 u_offset_t i_off; 7725 7726 /* Only count sysV pages once for locked memory */ 7727 i_edx = svd->anon_index + seg_page(seg, addr + len); 7728 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 7729 for (i_idx = anon_index; i_idx < i_edx; i_idx++) { 7730 anon_array_enter(amp, i_idx, &i_cookie); 7731 i_ap = anon_get_ptr(amp->ahp, i_idx); 7732 if (i_ap == NULL) { 7733 unlocked_bytes += PAGESIZE; 7734 anon_array_exit(&i_cookie); 7735 continue; 7736 } 7737 swap_xlate(i_ap, &i_vp, &i_off); 7738 anon_array_exit(&i_cookie); 7739 pp = page_lookup(i_vp, i_off, SE_SHARED); 7740 if (pp == NULL) { 7741 unlocked_bytes += PAGESIZE; 7742 continue; 7743 } else if (pp->p_lckcnt == 0) 7744 unlocked_bytes += PAGESIZE; 7745 page_unlock(pp); 7746 } 7747 ANON_LOCK_EXIT(&amp->a_rwlock); 7748 } 7749 7750 mutex_enter(&p->p_lock); 7751 err = rctl_incr_locked_mem(p, proj, unlocked_bytes, 7752 chargeproc); 7753 mutex_exit(&p->p_lock); 7754 7755 if (err) { 7756 if (sp != NULL) 7757 mutex_exit(&sp->shm_mlock); 7758 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7759 return (err); 7760 } 7761 } 7762 /* 7763 * Loop over all pages in the range. Process if we're locking and 7764 * page has not already been locked in this mapping; or if we're 7765 * unlocking and the page has been locked. 7766 */ 7767 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; 7768 vpp++, pos++, addr += PAGESIZE, offset += PAGESIZE, anon_index++) { 7769 if ((attr == 0 || VPP_PROT(vpp) == pageprot) && 7770 ((op == MC_LOCK && !VPP_ISPPLOCK(vpp)) || 7771 (op == MC_UNLOCK && VPP_ISPPLOCK(vpp)))) { 7772 7773 if (amp != NULL) 7774 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 7775 /* 7776 * If this isn't a MAP_NORESERVE segment and 7777 * we're locking, allocate anon slots if they 7778 * don't exist. The page is brought in later on. 7779 */ 7780 if (op == MC_LOCK && svd->vp == NULL && 7781 ((svd->flags & MAP_NORESERVE) == 0) && 7782 amp != NULL && 7783 ((ap = anon_get_ptr(amp->ahp, anon_index)) 7784 == NULL)) { 7785 anon_array_enter(amp, anon_index, &cookie); 7786 7787 if ((ap = anon_get_ptr(amp->ahp, 7788 anon_index)) == NULL) { 7789 pp = anon_zero(seg, addr, &ap, 7790 svd->cred); 7791 if (pp == NULL) { 7792 anon_array_exit(&cookie); 7793 ANON_LOCK_EXIT(&amp->a_rwlock); 7794 err = ENOMEM; 7795 goto out; 7796 } 7797 ASSERT(anon_get_ptr(amp->ahp, 7798 anon_index) == NULL); 7799 (void) anon_set_ptr(amp->ahp, 7800 anon_index, ap, ANON_SLEEP); 7801 page_unlock(pp); 7802 } 7803 anon_array_exit(&cookie); 7804 } 7805 7806 /* 7807 * Get name for page, accounting for 7808 * existence of private copy.
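* A COW-broken page is named by its anon slot's (swap vnode, offset) pair obtained via swap_xlate(); an untouched file page keeps the (svd->vp, offset) name it was mapped with.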
7809 */ 7810 ap = NULL; 7811 if (amp != NULL) { 7812 anon_array_enter(amp, anon_index, &cookie); 7813 ap = anon_get_ptr(amp->ahp, anon_index); 7814 if (ap != NULL) { 7815 swap_xlate(ap, &vp, &off); 7816 } else { 7817 if (svd->vp == NULL && 7818 (svd->flags & MAP_NORESERVE)) { 7819 anon_array_exit(&cookie); 7820 ANON_LOCK_EXIT(&amp->a_rwlock); 7821 continue; 7822 } 7823 vp = svd->vp; 7824 off = offset; 7825 } 7826 if (op != MC_LOCK || ap == NULL) { 7827 anon_array_exit(&cookie); 7828 ANON_LOCK_EXIT(&amp->a_rwlock); 7829 } 7830 } else { 7831 vp = svd->vp; 7832 off = offset; 7833 } 7834 7835 /* 7836 * Get page frame. It's ok if the page is 7837 * not available when we're unlocking, as this 7838 * may simply mean that a page we locked got 7839 * truncated out of existence after we locked it. 7840 * 7841 * Invoke VOP_GETPAGE() to obtain the page struct 7842 * since we may need to read it from disk if it's 7843 * been paged out. 7844 */ 7845 if (op != MC_LOCK) 7846 pp = page_lookup(vp, off, SE_SHARED); 7847 else { 7848 page_t *pl[1 + 1]; 7849 int error; 7850 7851 ASSERT(vp != NULL); 7852 7853 error = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, 7854 (uint_t *)NULL, pl, PAGESIZE, seg, addr, 7855 S_OTHER, svd->cred, NULL); 7856 7857 if (error && ap != NULL) { 7858 anon_array_exit(&cookie); 7859 ANON_LOCK_EXIT(&amp->a_rwlock); 7860 } 7861 7862 /* 7863 * If the error is EDEADLK then we must bounce 7864 * up and drop all vm subsystem locks and then 7865 * retry the operation later. 7866 * This behavior is a temporary measure because 7867 * ufs/sds logging is badly designed and will 7868 * deadlock if we don't allow this bounce to 7869 * happen. The real solution is to re-design 7870 * the logging code to work properly. See bug 7871 * 4125102 for details of the problem. 7872 */ 7873 if (error == EDEADLK) { 7874 err = error; 7875 goto out; 7876 } 7877 /* 7878 * Quit if we fail to fault in the page. Treat 7879 * the failure as an error, unless the addr 7880 * is mapped beyond the end of a file. 7881 */ 7882 if (error && svd->vp) { 7883 va.va_mask = AT_SIZE; 7884 if (VOP_GETATTR(svd->vp, &va, 0, 7885 svd->cred, NULL) != 0) { 7886 err = EIO; 7887 goto out; 7888 } 7889 if (btopr(va.va_size) >= 7890 btopr(off + 1)) { 7891 err = EIO; 7892 goto out; 7893 } 7894 goto out; 7895 7896 } else if (error) { 7897 err = EIO; 7898 goto out; 7899 } 7900 pp = pl[0]; 7901 ASSERT(pp != NULL); 7902 } 7903 7904 /* 7905 * See Statement at the beginning of this routine. 7906 * 7907 * claim is always set if MAP_PRIVATE and PROT_WRITE 7908 * irrespective of following factors: 7909 * 7910 * (1) anon slots are populated or not 7911 * (2) cow is broken or not 7912 * (3) refcnt on ap is 1 or greater than 1 7913 * 7914 * See 4140683 for details 7915 */ 7916 claim = ((VPP_PROT(vpp) & PROT_WRITE) && 7917 (svd->type == MAP_PRIVATE)); 7918 7919 /* 7920 * Perform page-level operation appropriate to 7921 * operation. If locking, undo the SOFTLOCK 7922 * performed to bring the page into memory 7923 * after setting the lock. If unlocking, 7924 * and no page was found, account for the claim 7925 * separately.
7900 pp = pl[0]; 7901 ASSERT(pp != NULL); 7902 } 7903 7904 /* 7905 * See Statement at the beginning of this routine. 7906 * 7907 * claim is always set if MAP_PRIVATE and PROT_WRITE 7908 * irrespective of following factors: 7909 * 7910 * (1) anon slots are populated or not 7911 * (2) cow is broken or not 7912 * (3) refcnt on ap is 1 or greater than 1 7913 * 7914 * See 4140683 for details 7915 */ 7916 claim = ((VPP_PROT(vpp) & PROT_WRITE) && 7917 (svd->type == MAP_PRIVATE)); 7918 7919 /* 7920 * Perform page-level operation appropriate to 7921 * operation. If locking, undo the SOFTLOCK 7922 * performed to bring the page into memory 7923 * after setting the lock. If unlocking, 7924 * and no page was found, account for the claim 7925 * separately. 7926 */ 7927 if (op == MC_LOCK) { 7928 int ret = 1; /* Assume success */ 7929 7930 ASSERT(!VPP_ISPPLOCK(vpp)); 7931 7932 ret = page_pp_lock(pp, claim, 0); 7933 if (ap != NULL) { 7934 if (ap->an_pvp != NULL) { 7935 anon_swap_free(ap, pp); 7936 } 7937 anon_array_exit(&cookie); 7938 ANON_LOCK_EXIT(&amp->a_rwlock); 7939 } 7940 if (ret == 0) { 7941 /* locking page failed */ 7942 page_unlock(pp); 7943 err = EAGAIN; 7944 goto out; 7945 } 7946 VPP_SETPPLOCK(vpp); 7947 if (sp != NULL) { 7948 if (pp->p_lckcnt == 1) 7949 locked_bytes += PAGESIZE; 7950 } else 7951 locked_bytes += PAGESIZE; 7952 7953 if (lockmap != (ulong_t *)NULL) 7954 BT_SET(lockmap, pos); 7955 7956 page_unlock(pp); 7957 } else { 7958 ASSERT(VPP_ISPPLOCK(vpp)); 7959 if (pp != NULL) { 7960 /* sysV pages should be locked */ 7961 ASSERT(sp == NULL || pp->p_lckcnt > 0); 7962 page_pp_unlock(pp, claim, 0); 7963 if (sp != NULL) { 7964 if (pp->p_lckcnt == 0) 7965 unlocked_bytes 7966 += PAGESIZE; 7967 } else 7968 unlocked_bytes += PAGESIZE; 7969 page_unlock(pp); 7970 } else { 7971 ASSERT(sp == NULL); 7972 unlocked_bytes += PAGESIZE; 7973 } 7974 VPP_CLRPPLOCK(vpp); 7975 } 7976 } 7977 } 7978 out: 7979 if (op == MC_LOCK) { 7980 /* Credit back bytes that did not get locked */ 7981 if ((unlocked_bytes - locked_bytes) > 0) { 7982 if (proj == NULL) 7983 mutex_enter(&p->p_lock); 7984 rctl_decr_locked_mem(p, proj, 7985 (unlocked_bytes - locked_bytes), chargeproc); 7986 if (proj == NULL) 7987 mutex_exit(&p->p_lock); 7988 } 7989 7990 } else { 7991 /* Account bytes that were unlocked */ 7992 if (unlocked_bytes > 0) { 7993 if (proj == NULL) 7994 mutex_enter(&p->p_lock); 7995 rctl_decr_locked_mem(p, proj, unlocked_bytes, 7996 chargeproc); 7997 if (proj == NULL) 7998 mutex_exit(&p->p_lock); 7999 } 8000 } 8001 if (sp != NULL) 8002 mutex_exit(&sp->shm_mlock); 8003 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8004 8005 return (err); 8006 } 8007 8008 /* 8009 * Set advice from user for specified pages. 8010 * There are 9 types of advice: 8011 * MADV_NORMAL - Normal (default) behavior (whatever that is) 8012 * MADV_RANDOM - Random page references 8013 * do not allow readahead or 'klustering' 8014 * MADV_SEQUENTIAL - Sequential page references 8015 * Pages previous to the one currently being 8016 * accessed (determined by fault) are 'not needed' 8017 * and are freed immediately 8018 * MADV_WILLNEED - Pages are likely to be used (fault ahead in mctl) 8019 * MADV_DONTNEED - Pages are not needed (synced out in mctl) 8020 * MADV_FREE - Contents can be discarded 8021 * MADV_ACCESS_DEFAULT- Default access 8022 * MADV_ACCESS_LWP - Next LWP will access heavily 8023 * MADV_ACCESS_MANY- Many LWPs or processes will access heavily 8024 */
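/*
 * For context, this entry point is normally reached from userland via
 * madvise(3C), which is implemented on top of memcntl(2) with
 * MC_ADVISE and routed here through as_ctl(), e.g.:
 *
 *	if (madvise(addr, len, MADV_SEQUENTIAL) != 0)
 *		perror("madvise");
 */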
8025 static int 8026 segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) 8027 { 8028 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8029 size_t page; 8030 int err = 0; 8031 int already_set; 8032 struct anon_map *amp; 8033 ulong_t anon_index; 8034 struct seg *next; 8035 lgrp_mem_policy_t policy; 8036 struct seg *prev; 8037 struct vnode *vp; 8038 8039 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 8040 8041 /* 8042 * In case of MADV_FREE, we won't be modifying any segment private 8043 * data structures; so, we only need to grab READER's lock 8044 */ 8045 if (behav != MADV_FREE) { 8046 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 8047 if (svd->tr_state != SEGVN_TR_OFF) { 8048 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8049 return (0); 8050 } 8051 } else { 8052 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8053 } 8054 8055 /* 8056 * Large pages are assumed to be only turned on when accesses to the 8057 * segment's address range have spatial and temporal locality. That 8058 * justifies ignoring MADV_SEQUENTIAL for large page segments. 8059 * Also, ignore advice affecting lgroup memory allocation 8060 * if we don't need to do lgroup optimizations on this system 8061 */ 8062 8063 if ((behav == MADV_SEQUENTIAL && 8064 (seg->s_szc != 0 || HAT_IS_REGION_COOKIE_VALID(svd->rcookie))) || 8065 (!lgrp_optimizations() && (behav == MADV_ACCESS_DEFAULT || 8066 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY))) { 8067 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8068 return (0); 8069 } 8070 8071 if (behav == MADV_SEQUENTIAL || behav == MADV_ACCESS_DEFAULT || 8072 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY) { 8073 /* 8074 * Since we are going to unload hat mappings 8075 * we first have to flush the cache. Otherwise 8076 * this might lead to system panic if another 8077 * thread is doing physio on the range whose 8078 * mappings are unloaded by madvise(3C). 8079 */ 8080 if (svd->softlockcnt > 0) { 8081 /* 8082 * If this is a shared segment non 0 softlockcnt 8083 * means locked pages are still in use. 8084 */ 8085 if (svd->type == MAP_SHARED) { 8086 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8087 return (EAGAIN); 8088 } 8089 /* 8090 * Since we do have the segvn writers lock 8091 * nobody can fill the cache with entries 8092 * belonging to this seg during the purge. 8093 * The flush either succeeds or we still 8094 * have pending I/Os. In the latter case, 8095 * madvise(3C) fails. 8096 */ 8097 segvn_purge(seg); 8098 if (svd->softlockcnt > 0) { 8099 /* 8100 * Since madvise(3C) is advisory and 8101 * it's not part of UNIX98, madvise(3C) 8102 * failure here doesn't cause any hardship. 8103 * Note that we don't block in "as" layer. 8104 */ 8105 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8106 return (EAGAIN); 8107 } 8108 } else if (svd->type == MAP_SHARED && svd->amp != NULL && 8109 svd->amp->a_softlockcnt > 0) { 8110 /* 8111 * Try to purge this amp's entries from pcache. It 8112 * will succeed only if other segments that share the 8113 * amp have no outstanding softlocks. 8114 */ 8115 segvn_purge(seg); 8116 } 8117 } 8118 8119 amp = svd->amp; 8120 vp = svd->vp; 8121 if (behav == MADV_FREE) { 8122 /* 8123 * MADV_FREE is not supported for segments with an 8124 * underlying object; if anonmap is NULL, anon slots 8125 * are not yet populated and there is nothing for 8126 * us to do. As MADV_FREE is advisory, we don't 8127 * return error in either case. 8128 */ 8129 if (vp != NULL || amp == NULL) { 8130 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8131 return (0); 8132 } 8133 8134 segvn_purge(seg); 8135 8136 page = seg_page(seg, addr); 8137 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 8138 anon_disclaim(amp, svd->anon_index + page, len); 8139 ANON_LOCK_EXIT(&amp->a_rwlock); 8140 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8141 return (0); 8142 } 8143 8144 /* 8145 * If advice is to be applied to entire segment, 8146 * use advice field in seg_data structure 8147 * otherwise use appropriate vpage entry.
8148 */ 8149 if ((addr == seg->s_base) && (len == seg->s_size)) { 8150 switch (behav) { 8151 case MADV_ACCESS_LWP: 8152 case MADV_ACCESS_MANY: 8153 case MADV_ACCESS_DEFAULT: 8154 /* 8155 * Set memory allocation policy for this segment 8156 */ 8157 policy = lgrp_madv_to_policy(behav, len, svd->type); 8158 if (svd->type == MAP_SHARED) 8159 already_set = lgrp_shm_policy_set(policy, amp, 8160 svd->anon_index, vp, svd->offset, len); 8161 else { 8162 /* 8163 * For private memory, need writers lock on 8164 * address space because the segment may be 8165 * split or concatenated when changing policy 8166 */ 8167 if (AS_READ_HELD(seg->s_as, 8168 &seg->s_as->a_lock)) { 8169 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8170 return (IE_RETRY); 8171 } 8172 8173 already_set = lgrp_privm_policy_set(policy, 8174 &svd->policy_info, len); 8175 } 8176 8177 /* 8178 * If policy set already and it shouldn't be reapplied, 8179 * don't do anything. 8180 */ 8181 if (already_set && 8182 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 8183 break; 8184 8185 /* 8186 * Mark any existing pages in given range for 8187 * migration 8188 */ 8189 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 8190 vp, svd->offset, 1); 8191 8192 /* 8193 * If same policy set already or this is a shared 8194 * memory segment, don't need to try to concatenate 8195 * segment with adjacent ones. 8196 */ 8197 if (already_set || svd->type == MAP_SHARED) 8198 break; 8199 8200 /* 8201 * Try to concatenate this segment with previous 8202 * one and next one, since we changed policy for 8203 * this one and it may be compatible with adjacent 8204 * ones now. 8205 */ 8206 prev = AS_SEGPREV(seg->s_as, seg); 8207 next = AS_SEGNEXT(seg->s_as, seg); 8208 8209 if (next && next->s_ops == &segvn_ops && 8210 addr + len == next->s_base) 8211 (void) segvn_concat(seg, next, 1); 8212 8213 if (prev && prev->s_ops == &segvn_ops && 8214 addr == prev->s_base + prev->s_size) { 8215 /* 8216 * Drop lock for private data of current 8217 * segment before concatenating (deleting) it 8218 * and return IE_REATTACH to tell as_ctl() that 8219 * current segment has changed 8220 */ 8221 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8222 if (!segvn_concat(prev, seg, 1)) 8223 err = IE_REATTACH; 8224 8225 return (err); 8226 } 8227 break; 8228 8229 case MADV_SEQUENTIAL: 8230 /* 8231 * unloading mapping guarantees 8232 * detection in segvn_fault 8233 */ 8234 ASSERT(seg->s_szc == 0); 8235 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 8236 hat_unload(seg->s_as->a_hat, addr, len, 8237 HAT_UNLOAD); 8238 /* FALLTHROUGH */ 8239 case MADV_NORMAL: 8240 case MADV_RANDOM: 8241 svd->advice = (uchar_t)behav; 8242 svd->pageadvice = 0; 8243 break; 8244 case MADV_WILLNEED: /* handled in memcntl */ 8245 case MADV_DONTNEED: /* handled in memcntl */ 8246 case MADV_FREE: /* handled above */ 8247 break; 8248 default: 8249 err = EINVAL; 8250 } 8251 } else { 8252 caddr_t eaddr; 8253 struct seg *new_seg; 8254 struct segvn_data *new_svd; 8255 u_offset_t off; 8256 caddr_t oldeaddr; 8257 8258 page = seg_page(seg, addr); 8259 8260 segvn_vpage(seg); 8261 8262 switch (behav) { 8263 struct vpage *bvpp, *evpp; 8264 8265 case MADV_ACCESS_LWP: 8266 case MADV_ACCESS_MANY: 8267 case MADV_ACCESS_DEFAULT: 8268 /* 8269 * Set memory allocation policy for portion of this 8270 * segment 8271 */ 8272 8273 /* 8274 * Align address and length of advice to page 8275 * boundaries for large pages 8276 */ 8277 if (seg->s_szc != 0) { 8278 size_t pgsz; 8279 8280 pgsz = page_get_pagesize(seg->s_szc); 8281 addr = (caddr_t)P2ALIGN((uintptr_t)addr, 
pgsz); 8282 len = P2ROUNDUP(len, pgsz); 8283 } 8284 8285 /* 8286 * Check to see whether policy is set already 8287 */ 8288 policy = lgrp_madv_to_policy(behav, len, svd->type); 8289 8290 anon_index = svd->anon_index + page; 8291 off = svd->offset + (uintptr_t)(addr - seg->s_base); 8292 8293 if (svd->type == MAP_SHARED) 8294 already_set = lgrp_shm_policy_set(policy, amp, 8295 anon_index, vp, off, len); 8296 else 8297 already_set = 8298 (policy == svd->policy_info.mem_policy); 8299 8300 /* 8301 * If policy set already and it shouldn't be reapplied, 8302 * don't do anything. 8303 */ 8304 if (already_set && 8305 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 8306 break; 8307 8308 /* 8309 * For private memory, need writers lock on 8310 * address space because the segment may be 8311 * split or concatenated when changing policy 8312 */ 8313 if (svd->type == MAP_PRIVATE && 8314 AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) { 8315 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8316 return (IE_RETRY); 8317 } 8318 8319 /* 8320 * Mark any existing pages in given range for 8321 * migration 8322 */ 8323 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 8324 vp, svd->offset, 1); 8325 8326 /* 8327 * Don't need to try to split or concatenate 8328 * segments, since policy is same or this is a shared 8329 * memory segment 8330 */ 8331 if (already_set || svd->type == MAP_SHARED) 8332 break; 8333 8334 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 8335 ASSERT(svd->amp == NULL); 8336 ASSERT(svd->tr_state == SEGVN_TR_OFF); 8337 ASSERT(svd->softlockcnt == 0); 8338 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 8339 HAT_REGION_TEXT); 8340 svd->rcookie = HAT_INVALID_REGION_COOKIE; 8341 } 8342 8343 /* 8344 * Split off new segment if advice only applies to a 8345 * portion of existing segment starting in middle 8346 */ 8347 new_seg = NULL; 8348 eaddr = addr + len; 8349 oldeaddr = seg->s_base + seg->s_size; 8350 if (addr > seg->s_base) { 8351 /* 8352 * Must flush I/O page cache 8353 * before splitting segment 8354 */ 8355 if (svd->softlockcnt > 0) 8356 segvn_purge(seg); 8357 8358 /* 8359 * Split segment and return IE_REATTACH to tell 8360 * as_ctl() that current segment changed 8361 */ 8362 new_seg = segvn_split_seg(seg, addr); 8363 new_svd = (struct segvn_data *)new_seg->s_data; 8364 err = IE_REATTACH; 8365 8366 /* 8367 * If new segment ends where old one 8368 * did, try to concatenate the new 8369 * segment with next one. 8370 */ 8371 if (eaddr == oldeaddr) { 8372 /* 8373 * Set policy for new segment 8374 */ 8375 (void) lgrp_privm_policy_set(policy, 8376 &new_svd->policy_info, 8377 new_seg->s_size); 8378 8379 next = AS_SEGNEXT(new_seg->s_as, 8380 new_seg); 8381 8382 if (next && 8383 next->s_ops == &segvn_ops && 8384 eaddr == next->s_base) 8385 (void) segvn_concat(new_seg, 8386 next, 1); 8387 } 8388 } 8389 8390 /* 8391 * Split off end of existing segment if advice only 8392 * applies to a portion of segment ending before 8393 * end of the existing segment 8394 */ 8395 if (eaddr < oldeaddr) { 8396 /* 8397 * Must flush I/O page cache 8398 * before splitting segment 8399 */ 8400 if (svd->softlockcnt > 0) 8401 segvn_purge(seg); 8402 8403 /* 8404 * If beginning of old segment was already 8405 * split off, use new segment to split end off 8406 * from. 
8407 */ 8408 if (new_seg != NULL && new_seg != seg) { 8409 /* 8410 * Split segment 8411 */ 8412 (void) segvn_split_seg(new_seg, eaddr); 8413 8414 /* 8415 * Set policy for new segment 8416 */ 8417 (void) lgrp_privm_policy_set(policy, 8418 &new_svd->policy_info, 8419 new_seg->s_size); 8420 } else { 8421 /* 8422 * Split segment and return IE_REATTACH 8423 * to tell as_ctl() that current 8424 * segment changed 8425 */ 8426 (void) segvn_split_seg(seg, eaddr); 8427 err = IE_REATTACH; 8428 8429 (void) lgrp_privm_policy_set(policy, 8430 &svd->policy_info, seg->s_size); 8431 8432 /* 8433 * If new segment starts where old one 8434 * did, try to concatenate it with 8435 * previous segment. 8436 */ 8437 if (addr == seg->s_base) { 8438 prev = AS_SEGPREV(seg->s_as, 8439 seg); 8440 8441 /* 8442 * Drop lock for private data 8443 * of current segment before 8444 * concatenating (deleting) it 8445 */ 8446 if (prev && 8447 prev->s_ops == 8448 &segvn_ops && 8449 addr == prev->s_base + 8450 prev->s_size) { 8451 SEGVN_LOCK_EXIT( 8452 seg->s_as, 8453 &svd->lock); 8454 (void) segvn_concat( 8455 prev, seg, 1); 8456 return (err); 8457 } 8458 } 8459 } 8460 } 8461 break; 8462 case MADV_SEQUENTIAL: 8463 ASSERT(seg->s_szc == 0); 8464 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 8465 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 8466 /* FALLTHROUGH */ 8467 case MADV_NORMAL: 8468 case MADV_RANDOM: 8469 bvpp = &svd->vpage[page]; 8470 evpp = &svd->vpage[page + (len >> PAGESHIFT)]; 8471 for (; bvpp < evpp; bvpp++) 8472 VPP_SETADVICE(bvpp, behav); 8473 svd->advice = MADV_NORMAL; 8474 break; 8475 case MADV_WILLNEED: /* handled in memcntl */ 8476 case MADV_DONTNEED: /* handled in memcntl */ 8477 case MADV_FREE: /* handled above */ 8478 break; 8479 default: 8480 err = EINVAL; 8481 } 8482 } 8483 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8484 return (err); 8485 } 8486 8487 /* 8488 * Create a vpage structure for this seg. 8489 */ 8490 static void 8491 segvn_vpage(struct seg *seg) 8492 { 8493 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8494 struct vpage *vp, *evp; 8495 8496 ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 8497 8498 /* 8499 * If no vpage structure exists, allocate one. Copy the protections 8500 * and the advice from the segment itself to the individual pages. 8501 */ 8502 if (svd->vpage == NULL) { 8503 svd->pageadvice = 1; 8504 svd->vpage = kmem_zalloc(seg_pages(seg) * sizeof (struct vpage), 8505 KM_SLEEP); 8506 evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)]; 8507 for (vp = svd->vpage; vp < evp; vp++) { 8508 VPP_SETPROT(vp, svd->prot); 8509 VPP_SETADVICE(vp, svd->advice); 8510 } 8511 } 8512 } 8513 8514 /* 8515 * Dump the pages belonging to this segvn segment. 
8516 */ 8517 static void 8518 segvn_dump(struct seg *seg) 8519 { 8520 struct segvn_data *svd; 8521 page_t *pp; 8522 struct anon_map *amp; 8523 ulong_t anon_index; 8524 struct vnode *vp; 8525 u_offset_t off, offset; 8526 pfn_t pfn; 8527 pgcnt_t page, npages; 8528 caddr_t addr; 8529 8530 npages = seg_pages(seg); 8531 svd = (struct segvn_data *)seg->s_data; 8532 vp = svd->vp; 8533 off = offset = svd->offset; 8534 addr = seg->s_base; 8535 8536 if ((amp = svd->amp) != NULL) { 8537 anon_index = svd->anon_index; 8538 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 8539 } 8540 8541 for (page = 0; page < npages; page++, offset += PAGESIZE) { 8542 struct anon *ap; 8543 int we_own_it = 0; 8544 8545 if (amp && (ap = anon_get_ptr(svd->amp->ahp, anon_index++))) { 8546 swap_xlate_nopanic(ap, &vp, &off); 8547 } else { 8548 vp = svd->vp; 8549 off = offset; 8550 } 8551 8552 /* 8553 * If pp == NULL, the page either does not exist 8554 * or is exclusively locked. So determine if it 8555 * exists before searching for it. 8556 */ 8557 8558 if ((pp = page_lookup_nowait(vp, off, SE_SHARED))) 8559 we_own_it = 1; 8560 else 8561 pp = page_exists(vp, off); 8562 8563 if (pp) { 8564 pfn = page_pptonum(pp); 8565 dump_addpage(seg->s_as, addr, pfn); 8566 if (we_own_it) 8567 page_unlock(pp); 8568 } 8569 addr += PAGESIZE; 8570 dump_timeleft = dump_timeout; 8571 } 8572 8573 if (amp != NULL) 8574 ANON_LOCK_EXIT(&amp->a_rwlock); 8575 } 8576 8577 #ifdef DEBUG 8578 static uint32_t segvn_pglock_mtbf = 0; 8579 #endif 8580 8581 #define PCACHE_SHWLIST ((page_t *)-2) 8582 #define NOPCACHE_SHWLIST ((page_t *)-1) 8583 8584 /* 8585 * Lock/Unlock anon pages over a given range. Return shadow list. This routine 8586 * uses the global segment pcache to cache shadow lists (i.e. pp arrays) of pages 8587 * to avoid the overhead of per page locking and unlocking for subsequent IOs to 8588 * the same parts of the segment. Currently shadow list creation is only 8589 * supported for pure anon segments. MAP_PRIVATE segment pcache entries are 8590 * tagged with segment pointer, starting virtual address and length. Using this 8591 * approach for MAP_SHARED segments would add many pcache entries for the same 8592 * set of pages and lead to long hash chains that decrease pcache lookup 8593 * performance. To avoid this issue for shared segments the shared anon map and 8594 * starting anon index are used for pcache entry tagging. This allows all 8595 * segments to share pcache entries for the same anon range and reduces pcache 8596 * chain length as well as memory overhead from duplicate shadow lists and 8597 * pcache entries. 8598 * 8599 * softlockcnt field in segvn_data structure counts the number of F_SOFTLOCK'd 8600 * pages via segvn_fault() and pagelock'd pages via this routine. But pagelock 8601 * part of softlockcnt accounting is done differently for private and shared 8602 * segments. In private segment case softlock is only incremented when a new 8603 * shadow list is created but not when an existing one is found via 8604 * seg_plookup(). pcache entries have reference count incremented/decremented 8605 * by each seg_plookup()/seg_pinactive() operation. Only entries that have 0 8606 * reference count can be purged (and purging is needed before segment can be 8607 * freed). When a private segment pcache entry is purged segvn_reclaim() will 8608 * decrement softlockcnt. Since in private segment case each of its pcache 8609 * entries only belongs to this segment we can expect that when 8610 * segvn_pagelock(L_PAGEUNLOCK) was called for all outstanding IOs in this 8611 * segment purge will succeed and softlockcnt will drop to 0. In shared 8612 * segment case reference count in pcache entry counts active locks from many 8613 * different segments so we can't expect segment purging to succeed even when 8614 * segvn_pagelock(L_PAGEUNLOCK) was called for all outstanding IOs in this 8615 * segment. To be able to determine when there are no pending pagelocks in 8616 * shared segment case we don't rely on purging to make softlockcnt drop to 0 8617 * but instead softlockcnt is incremented and decremented for every 8618 * segvn_pagelock(L_PAGELOCK/L_PAGEUNLOCK) call regardless of whether a new shadow 8619 * list was created or an existing one was found. When softlockcnt drops to 0 8620 * this segment no longer has any claims for pcached shadow lists and the 8621 * segment can be freed even if there are still active pcache entries 8622 * shared by this segment's anon map. Shared segment pcache entries belong to the 8623 * anon map and are typically removed when the anon map is freed after all 8624 * processes destroy the segments that use this anon map. 8625 */
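/*
 * Typical caller contract (an illustrative sketch, not code from any
 * particular driver): a physio style consumer locks the pages down,
 * performs IO directly against them and then drops the locks with
 * matching addr/len:
 *
 *	struct page **pl;
 *
 *	if (as_pagelock(as, &pl, addr, len, rw) == 0) {
 *		... IO directly against the page_t's in pl ...
 *		as_pageunlock(as, pl, addr, len, rw);
 *	}
 *
 * as_pagelock() and as_pageunlock() reach this routine as the
 * L_PAGELOCK and L_PAGEUNLOCK cases below.
 */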
8626 static int 8627 segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, 8628 enum lock_type type, enum seg_rw rw) 8629 { 8630 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8631 size_t np; 8632 pgcnt_t adjustpages; 8633 pgcnt_t npages; 8634 ulong_t anon_index; 8635 uint_t protchk = (rw == S_READ) ? PROT_READ : PROT_WRITE; 8636 uint_t error; 8637 struct anon_map *amp; 8638 pgcnt_t anpgcnt; 8639 struct page **pplist, **pl, *pp; 8640 caddr_t a; 8641 size_t page; 8642 caddr_t lpgaddr, lpgeaddr; 8643 anon_sync_obj_t cookie; 8644 int anlock; 8645 struct anon_map *pamp; 8646 caddr_t paddr; 8647 seg_preclaim_cbfunc_t preclaim_callback; 8648 size_t pgsz; 8649 int use_pcache; 8650 size_t wlen; 8651 uint_t pflags = 0; 8652 int sftlck_sbase = 0; 8653 int sftlck_send = 0; 8654 8655 #ifdef DEBUG 8656 if (type == L_PAGELOCK && segvn_pglock_mtbf) { 8657 hrtime_t ts = gethrtime(); 8658 if ((ts % segvn_pglock_mtbf) == 0) { 8659 return (ENOTSUP); 8660 } 8661 if ((ts % segvn_pglock_mtbf) == 1) { 8662 return (EFAULT); 8663 } 8664 } 8665 #endif 8666 8667 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START, 8668 "segvn_pagelock: start seg %p addr %p", seg, addr); 8669 8670 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 8671 ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK); 8672 8673 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8674 8675 /* 8676 * for now we only support pagelock to anon memory. We would have to 8677 * check protections for vnode objects and call into the vnode driver. 8678 * That's too much for a fast path. Let the fault entry point handle 8679 * it.
8680 */ 8681 if (svd->vp != NULL) { 8682 if (type == L_PAGELOCK) { 8683 error = ENOTSUP; 8684 goto out; 8685 } 8686 panic("segvn_pagelock(L_PAGEUNLOCK): vp != NULL"); 8687 } 8688 if ((amp = svd->amp) == NULL) { 8689 if (type == L_PAGELOCK) { 8690 error = EFAULT; 8691 goto out; 8692 } 8693 panic("segvn_pagelock(L_PAGEUNLOCK): amp == NULL"); 8694 } 8695 if (rw != S_READ && rw != S_WRITE) { 8696 if (type == L_PAGELOCK) { 8697 error = ENOTSUP; 8698 goto out; 8699 } 8700 panic("segvn_pagelock(L_PAGEUNLOCK): bad rw"); 8701 } 8702 8703 if (seg->s_szc != 0) { 8704 /* 8705 * We are adjusting the pagelock region to the large page size 8706 * boundary because the unlocked part of a large page cannot 8707 * be freed anyway unless all constituent pages of a large 8708 * page are locked. Bigger regions reduce pcache chain length 8709 * and improve lookup performance. The tradeoff is that the 8710 * very first segvn_pagelock() call for a given page is more 8711 * expensive if only 1 page_t is needed for IO. This is only 8712 * an issue if pcache entry doesn't get reused by several 8713 * subsequent calls. We optimize here for the case when pcache 8714 * is heavily used by repeated IOs to the same address range. 8715 * 8716 * Note segment's page size cannot change while we are holding 8717 * as lock. And then it cannot change while softlockcnt is 8718 * not 0. This will allow us to correctly recalculate large 8719 * page size region for the matching pageunlock/reclaim call 8720 * since as_pageunlock() caller must always match 8721 * as_pagelock() call's addr and len. 8722 * 8723 * For pageunlock *ppp points to the pointer of page_t that 8724 * corresponds to the real unadjusted start address. Similar 8725 * for pagelock *ppp must point to the pointer of page_t that 8726 * corresponds to the real unadjusted start address. 8727 */ 8728 pgsz = page_get_pagesize(seg->s_szc); 8729 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 8730 adjustpages = btop((uintptr_t)(addr - lpgaddr)); 8731 } else if (len < segvn_pglock_comb_thrshld) { 8732 lpgaddr = addr; 8733 lpgeaddr = addr + len; 8734 adjustpages = 0; 8735 pgsz = PAGESIZE; 8736 } else { 8737 /* 8738 * Align the address range of large enough requests to allow 8739 * combining of different shadow lists into 1 to reduce memory 8740 * overhead from potentially overlapping large shadow lists 8741 * (worst case is we have a 1MB IO into buffers with start 8742 * addresses separated by 4K). Alignment is only possible if 8743 * padded chunks have sufficient access permissions. Note 8744 * permissions won't change between L_PAGELOCK and 8745 * L_PAGEUNLOCK calls since non 0 softlockcnt will force 8746 * segvn_setprot() to wait until softlockcnt drops to 0. This 8747 * allows us to determine in L_PAGEUNLOCK the same range we 8748 * computed in L_PAGELOCK. 8749 * 8750 * If alignment is limited by segment ends set 8751 * sftlck_sbase/sftlck_send flags. In L_PAGELOCK case when 8752 * these flags are set bump softlockcnt_sbase/softlockcnt_send 8753 * per segment counters. In L_PAGEUNLOCK case decrease 8754 * softlockcnt_sbase/softlockcnt_send counters if 8755 * sftlck_sbase/sftlck_send flags are set. When 8756 * softlockcnt_sbase/softlockcnt_send are non 0 8757 * segvn_concat()/segvn_extend_prev()/segvn_extend_next() 8758 * won't merge the segments. 
This restriction combined with 8759 * restriction on segment unmapping and splitting for segments 8760 * that have non 0 softlockcnt allows L_PAGEUNLOCK to 8761 * correctly determine the same range that was previously 8762 * locked by matching L_PAGELOCK. 8763 */ 8764 pflags = SEGP_PSHIFT | (segvn_pglock_comb_bshift << 16); 8765 pgsz = PAGESIZE; 8766 if (svd->type == MAP_PRIVATE) { 8767 lpgaddr = (caddr_t)P2ALIGN((uintptr_t)addr, 8768 segvn_pglock_comb_balign); 8769 if (lpgaddr < seg->s_base) { 8770 lpgaddr = seg->s_base; 8771 sftlck_sbase = 1; 8772 } 8773 } else { 8774 ulong_t aix = svd->anon_index + seg_page(seg, addr); 8775 ulong_t aaix = P2ALIGN(aix, segvn_pglock_comb_palign); 8776 if (aaix < svd->anon_index) { 8777 lpgaddr = seg->s_base; 8778 sftlck_sbase = 1; 8779 } else { 8780 lpgaddr = addr - ptob(aix - aaix); 8781 ASSERT(lpgaddr >= seg->s_base); 8782 } 8783 } 8784 if (svd->pageprot && lpgaddr != addr) { 8785 struct vpage *vp = &svd->vpage[seg_page(seg, lpgaddr)]; 8786 struct vpage *evp = &svd->vpage[seg_page(seg, addr)]; 8787 while (vp < evp) { 8788 if ((VPP_PROT(vp) & protchk) == 0) { 8789 break; 8790 } 8791 vp++; 8792 } 8793 if (vp < evp) { 8794 lpgaddr = addr; 8795 pflags = 0; 8796 } 8797 } 8798 lpgeaddr = addr + len; 8799 if (pflags) { 8800 if (svd->type == MAP_PRIVATE) { 8801 lpgeaddr = (caddr_t)P2ROUNDUP( 8802 (uintptr_t)lpgeaddr, 8803 segvn_pglock_comb_balign); 8804 } else { 8805 ulong_t aix = svd->anon_index + 8806 seg_page(seg, lpgeaddr); 8807 ulong_t aaix = P2ROUNDUP(aix, 8808 segvn_pglock_comb_palign); 8809 if (aaix < aix) { 8810 lpgeaddr = 0; 8811 } else { 8812 lpgeaddr += ptob(aaix - aix); 8813 } 8814 } 8815 if (lpgeaddr == 0 || 8816 lpgeaddr > seg->s_base + seg->s_size) { 8817 lpgeaddr = seg->s_base + seg->s_size; 8818 sftlck_send = 1; 8819 } 8820 } 8821 if (svd->pageprot && lpgeaddr != addr + len) { 8822 struct vpage *vp; 8823 struct vpage *evp; 8824 8825 vp = &svd->vpage[seg_page(seg, addr + len)]; 8826 evp = &svd->vpage[seg_page(seg, lpgeaddr)]; 8827 8828 while (vp < evp) { 8829 if ((VPP_PROT(vp) & protchk) == 0) { 8830 break; 8831 } 8832 vp++; 8833 } 8834 if (vp < evp) { 8835 lpgeaddr = addr + len; 8836 } 8837 } 8838 adjustpages = btop((uintptr_t)(addr - lpgaddr)); 8839 } 8840 8841 /* 8842 * For MAP_SHARED segments we create pcache entries tagged by amp and 8843 * anon index so that we can share pcache entries with other segments 8844 * that map this amp. For private segments pcache entries are tagged 8845 * with segment and virtual address. 8846 */ 8847 if (svd->type == MAP_SHARED) { 8848 pamp = amp; 8849 paddr = (caddr_t)((lpgaddr - seg->s_base) + 8850 ptob(svd->anon_index)); 8851 preclaim_callback = shamp_reclaim; 8852 } else { 8853 pamp = NULL; 8854 paddr = lpgaddr; 8855 preclaim_callback = segvn_reclaim; 8856 } 8857 8858 if (type == L_PAGEUNLOCK) { 8859 VM_STAT_ADD(segvnvmstats.pagelock[0]); 8860 8861 /* 8862 * update hat ref bits for /proc. We need to make sure 8863 * that threads tracing the ref and mod bits of the 8864 * address space get the right data. 8865 * Note: page ref and mod bits are updated at reclaim time 8866 */ 8867 if (seg->s_as->a_vbits) { 8868 for (a = addr; a < addr + len; a += PAGESIZE) { 8869 if (rw == S_WRITE) { 8870 hat_setstat(seg->s_as, a, 8871 PAGESIZE, P_REF | P_MOD); 8872 } else { 8873 hat_setstat(seg->s_as, a, 8874 PAGESIZE, P_REF); 8875 } 8876 } 8877 } 8878 8879 /* 8880 * Check the shadow list entry after the last page used in 8881 * this IO request. 
If it's NOPCACHE_SHWLIST the shadow list 8882 * was not inserted into pcache and is not large page 8883 * adjusted. In this case call reclaim callback directly and 8884 * don't adjust the shadow list start and size for large 8885 * pages. 8886 */ 8887 npages = btop(len); 8888 if ((*ppp)[npages] == NOPCACHE_SHWLIST) { 8889 void *ptag; 8890 if (pamp != NULL) { 8891 ASSERT(svd->type == MAP_SHARED); 8892 ptag = (void *)pamp; 8893 paddr = (caddr_t)((addr - seg->s_base) + 8894 ptob(svd->anon_index)); 8895 } else { 8896 ptag = (void *)seg; 8897 paddr = addr; 8898 } 8899 (*preclaim_callback)(ptag, paddr, len, *ppp, rw, 0); 8900 } else { 8901 ASSERT((*ppp)[npages] == PCACHE_SHWLIST || 8902 IS_SWAPFSVP((*ppp)[npages]->p_vnode)); 8903 len = lpgeaddr - lpgaddr; 8904 npages = btop(len); 8905 seg_pinactive(seg, pamp, paddr, len, 8906 *ppp - adjustpages, rw, pflags, preclaim_callback); 8907 } 8908 8909 if (pamp != NULL) { 8910 ASSERT(svd->type == MAP_SHARED); 8911 ASSERT(svd->softlockcnt >= npages); 8912 atomic_add_long((ulong_t *)&svd->softlockcnt, -npages); 8913 } 8914 8915 if (sftlck_sbase) { 8916 ASSERT(svd->softlockcnt_sbase > 0); 8917 atomic_dec_ulong((ulong_t *)&svd->softlockcnt_sbase); 8918 } 8919 if (sftlck_send) { 8920 ASSERT(svd->softlockcnt_send > 0); 8921 atomic_dec_ulong((ulong_t *)&svd->softlockcnt_send); 8922 } 8923 8924 /* 8925 * If someone is blocked while unmapping, we purge 8926 * segment page cache and thus reclaim pplist synchronously 8927 * without waiting for seg_pasync_thread. This speeds up 8928 * unmapping in cases where munmap(2) is called, while 8929 * raw async i/o is still in progress or where a thread 8930 * exits on data fault in a multithreaded application. 8931 */ 8932 if (AS_ISUNMAPWAIT(seg->s_as)) { 8933 if (svd->softlockcnt == 0) { 8934 mutex_enter(&seg->s_as->a_contents); 8935 if (AS_ISUNMAPWAIT(seg->s_as)) { 8936 AS_CLRUNMAPWAIT(seg->s_as); 8937 cv_broadcast(&seg->s_as->a_cv); 8938 } 8939 mutex_exit(&seg->s_as->a_contents); 8940 } else if (pamp == NULL) { 8941 /* 8942 * softlockcnt is not 0 and this is a 8943 * MAP_PRIVATE segment. Try to purge its 8944 * pcache entries to reduce softlockcnt. 8945 * If it drops to 0 segvn_reclaim() 8946 * will wake up a thread waiting on 8947 * unmapwait flag. 8948 * 8949 * We don't purge MAP_SHARED segments with non 8950 * 0 softlockcnt since IO is still in progress 8951 * for such segments. 8952 */ 8953 ASSERT(svd->type == MAP_PRIVATE); 8954 segvn_purge(seg); 8955 } 8956 } 8957 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8958 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 8959 "segvn_pagelock: unlock seg %p addr %p", seg, addr); 8960 return (0); 8961 } 8962 8963 /* The L_PAGELOCK case ... */ 8964 8965 VM_STAT_ADD(segvnvmstats.pagelock[1]); 8966 8967 /* 8968 * For MAP_SHARED segments we have to check protections before 8969 * seg_plookup() since pcache entries may be shared by many segments 8970 * with potentially different page protections. 
8971 */ 8972 if (pamp != NULL) { 8973 ASSERT(svd->type == MAP_SHARED); 8974 if (svd->pageprot == 0) { 8975 if ((svd->prot & protchk) == 0) { 8976 error = EACCES; 8977 goto out; 8978 } 8979 } else { 8980 /* 8981 * check page protections 8982 */ 8983 caddr_t ea; 8984 8985 if (seg->s_szc) { 8986 a = lpgaddr; 8987 ea = lpgeaddr; 8988 } else { 8989 a = addr; 8990 ea = addr + len; 8991 } 8992 for (; a < ea; a += pgsz) { 8993 struct vpage *vp; 8994 8995 ASSERT(seg->s_szc == 0 || 8996 sameprot(seg, a, pgsz)); 8997 vp = &svd->vpage[seg_page(seg, a)]; 8998 if ((VPP_PROT(vp) & protchk) == 0) { 8999 error = EACCES; 9000 goto out; 9001 } 9002 } 9003 } 9004 } 9005 9006 /* 9007 * try to find pages in segment page cache 9008 */ 9009 pplist = seg_plookup(seg, pamp, paddr, lpgeaddr - lpgaddr, rw, pflags); 9010 if (pplist != NULL) { 9011 if (pamp != NULL) { 9012 npages = btop((uintptr_t)(lpgeaddr - lpgaddr)); 9013 ASSERT(svd->type == MAP_SHARED); 9014 atomic_add_long((ulong_t *)&svd->softlockcnt, 9015 npages); 9016 } 9017 if (sftlck_sbase) { 9018 atomic_inc_ulong((ulong_t *)&svd->softlockcnt_sbase); 9019 } 9020 if (sftlck_send) { 9021 atomic_inc_ulong((ulong_t *)&svd->softlockcnt_send); 9022 } 9023 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 9024 *ppp = pplist + adjustpages; 9025 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END, 9026 "segvn_pagelock: cache hit seg %p addr %p", seg, addr); 9027 return (0); 9028 } 9029 9030 /* 9031 * For MAP_SHARED segments we already verified above that segment 9032 * protections allow this pagelock operation. 9033 */ 9034 if (pamp == NULL) { 9035 ASSERT(svd->type == MAP_PRIVATE); 9036 if (svd->pageprot == 0) { 9037 if ((svd->prot & protchk) == 0) { 9038 error = EACCES; 9039 goto out; 9040 } 9041 if (svd->prot & PROT_WRITE) { 9042 wlen = lpgeaddr - lpgaddr; 9043 } else { 9044 wlen = 0; 9045 ASSERT(rw == S_READ); 9046 } 9047 } else { 9048 int wcont = 1; 9049 /* 9050 * check page protections 9051 */ 9052 for (a = lpgaddr, wlen = 0; a < lpgeaddr; a += pgsz) { 9053 struct vpage *vp; 9054 9055 ASSERT(seg->s_szc == 0 || 9056 sameprot(seg, a, pgsz)); 9057 vp = &svd->vpage[seg_page(seg, a)]; 9058 if ((VPP_PROT(vp) & protchk) == 0) { 9059 error = EACCES; 9060 goto out; 9061 } 9062 if (wcont && (VPP_PROT(vp) & PROT_WRITE)) { 9063 wlen += pgsz; 9064 } else { 9065 wcont = 0; 9066 ASSERT(rw == S_READ); 9067 } 9068 } 9069 } 9070 ASSERT(rw == S_READ || wlen == lpgeaddr - lpgaddr); 9071 ASSERT(rw == S_WRITE || wlen <= lpgeaddr - lpgaddr); 9072 } 9073 9074 /* 9075 * Only build large page adjusted shadow list if we expect to insert 9076 * it into pcache. For large enough pages it's a big overhead to 9077 * create a shadow list of the entire large page. But this overhead 9078 * should be amortized over repeated pcache hits on subsequent reuse 9079 * of this shadow list (IO into any range within this shadow list will 9080 * find it in pcache since we large page align the request for pcache 9081 * lookups). pcache performance is improved with bigger shadow lists 9082 * as it reduces the time to pcache the entire big segment and reduces 9083 * pcache chain length. 9084 */ 9085 if (seg_pinsert_check(seg, pamp, paddr, 9086 lpgeaddr - lpgaddr, pflags) == SEGP_SUCCESS) { 9087 addr = lpgaddr; 9088 len = lpgeaddr - lpgaddr; 9089 use_pcache = 1; 9090 } else { 9091 use_pcache = 0; 9092 /* 9093 * Since this entry will not be inserted into the pcache, we 9094 * will not do any adjustments to the starting address or 9095 * size of the memory to be locked. 
9096 */ 9097 adjustpages = 0; 9098 } 9099 npages = btop(len); 9100 9101 pplist = kmem_alloc(sizeof (page_t *) * (npages + 1), KM_SLEEP); 9102 pl = pplist; 9103 *ppp = pplist + adjustpages; 9104 /* 9105 * If use_pcache is 0 this shadow list is not large page adjusted. 9106 * Record this info in the last entry of shadow array so that 9107 * L_PAGEUNLOCK can determine if it should large page adjust the 9108 * address range to find the real range that was locked. 9109 */ 9110 pl[npages] = use_pcache ? PCACHE_SHWLIST : NOPCACHE_SHWLIST; 9111 9112 page = seg_page(seg, addr); 9113 anon_index = svd->anon_index + page; 9114 9115 anlock = 0; 9116 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 9117 ASSERT(amp->a_szc >= seg->s_szc); 9118 anpgcnt = page_get_pagecnt(amp->a_szc);
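/*
 * Shadow list layout for npages == 4 (illustrative):
 *
 *	pl[0]  pl[1]  pl[2]  pl[3]  pl[4]
 *	 pp     pp     pp     pp    PCACHE_SHWLIST or NOPCACHE_SHWLIST
 *
 * The extra terminating entry set above is what lets L_PAGEUNLOCK
 * tell a pcache inserted (large page adjusted) list apart from one
 * that must be reclaimed directly.
 */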
9119 for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) { 9120 struct anon *ap; 9121 struct vnode *vp; 9122 u_offset_t off; 9123 9124 /* 9125 * Lock and unlock anon array only once per large page. 9126 * anon_array_enter() locks the root anon slot according to 9127 * a_szc which can't change while anon map is locked. We lock 9128 * anon the first time through this loop and each time we 9129 * reach anon index that corresponds to a root of a large 9130 * page. 9131 */ 9132 if (a == addr || P2PHASE(anon_index, anpgcnt) == 0) { 9133 ASSERT(anlock == 0); 9134 anon_array_enter(amp, anon_index, &cookie); 9135 anlock = 1; 9136 } 9137 ap = anon_get_ptr(amp->ahp, anon_index); 9138 9139 /* 9140 * We must never use seg_pcache for COW pages 9141 * because we might end up with original page still 9142 * lying in seg_pcache even after private page is 9143 * created. This leads to data corruption as 9144 * aio_write refers to the page still in cache 9145 * while all other accesses refer to the private 9146 * page. 9147 */ 9148 if (ap == NULL || ap->an_refcnt != 1) { 9149 struct vpage *vpage; 9150 9151 if (seg->s_szc) { 9152 error = EFAULT; 9153 break; 9154 } 9155 if (svd->vpage != NULL) { 9156 vpage = &svd->vpage[seg_page(seg, a)]; 9157 } else { 9158 vpage = NULL; 9159 } 9160 ASSERT(anlock); 9161 anon_array_exit(&cookie); 9162 anlock = 0; 9163 pp = NULL; 9164 error = segvn_faultpage(seg->s_as->a_hat, seg, a, 0, 9165 vpage, &pp, 0, F_INVAL, rw, 1); 9166 if (error) { 9167 error = fc_decode(error); 9168 break; 9169 } 9170 anon_array_enter(amp, anon_index, &cookie); 9171 anlock = 1; 9172 ap = anon_get_ptr(amp->ahp, anon_index); 9173 if (ap == NULL || ap->an_refcnt != 1) { 9174 error = EFAULT; 9175 break; 9176 } 9177 } 9178 swap_xlate(ap, &vp, &off); 9179 pp = page_lookup_nowait(vp, off, SE_SHARED); 9180 if (pp == NULL) { 9181 error = EFAULT; 9182 break; 9183 } 9184 if (ap->an_pvp != NULL) { 9185 anon_swap_free(ap, pp); 9186 } 9187 /* 9188 * Unlock anon if this is the last slot in a large page. 9189 */ 9190 if (P2PHASE(anon_index, anpgcnt) == anpgcnt - 1) { 9191 ASSERT(anlock); 9192 anon_array_exit(&cookie); 9193 anlock = 0; 9194 } 9195 *pplist++ = pp; 9196 } 9197 if (anlock) { /* Ensure the lock is dropped */ 9198 anon_array_exit(&cookie); 9199 } 9200 ANON_LOCK_EXIT(&amp->a_rwlock); 9201 9202 if (a >= addr + len) { 9203 atomic_add_long((ulong_t *)&svd->softlockcnt, npages); 9204 if (pamp != NULL) { 9205 ASSERT(svd->type == MAP_SHARED); 9206 atomic_add_long((ulong_t *)&pamp->a_softlockcnt, 9207 npages); 9208 wlen = len; 9209 } 9210 if (sftlck_sbase) { 9211 atomic_inc_ulong((ulong_t *)&svd->softlockcnt_sbase); 9212 } 9213 if (sftlck_send) { 9214 atomic_inc_ulong((ulong_t *)&svd->softlockcnt_send); 9215 } 9216 if (use_pcache) { 9217 (void) seg_pinsert(seg, pamp, paddr, len, wlen, pl, 9218 rw, pflags, preclaim_callback); 9219 } 9220 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 9221 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END, 9222 "segvn_pagelock: cache fill seg %p addr %p", seg, addr); 9223 return (0); 9224 } 9225 9226 pplist = pl; 9227 np = ((uintptr_t)(a - addr)) >> PAGESHIFT; 9228 while (np > (uint_t)0) { 9229 ASSERT(PAGE_LOCKED(*pplist)); 9230 page_unlock(*pplist); 9231 np--; 9232 pplist++; 9233 } 9234 kmem_free(pl, sizeof (page_t *) * (npages + 1)); 9235 out: 9236 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 9237 *ppp = NULL; 9238 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 9239 "segvn_pagelock: cache miss seg %p addr %p", seg, addr); 9240 return (error); 9241 } 9242 9243 /* 9244 * purge any cached pages in the I/O page cache 9245 */ 9246 static void 9247 segvn_purge(struct seg *seg) 9248 { 9249 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 9250 9251 /* 9252 * pcache is only used by pure anon segments. 9253 */ 9254 if (svd->amp == NULL || svd->vp != NULL) { 9255 return; 9256 } 9257 9258 /* 9259 * For MAP_SHARED segments non 0 segment's softlockcnt means 9260 * active IO is still in progress via this segment. So we only 9261 * purge MAP_SHARED segments when their softlockcnt is 0. 9262 */ 9263 if (svd->type == MAP_PRIVATE) { 9264 if (svd->softlockcnt) { 9265 seg_ppurge(seg, NULL, 0); 9266 } 9267 } else if (svd->softlockcnt == 0 && svd->amp->a_softlockcnt != 0) { 9268 seg_ppurge(seg, svd->amp, 0); 9269 } 9270 }
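/*
 * The reclaim callbacks below are invoked by pcache with the same tag
 * that was passed to seg_pinsert(): a struct seg * for MAP_PRIVATE
 * segments (segvn_reclaim) or an amp_t * for MAP_SHARED segments
 * (shamp_reclaim), i.e.
 *
 *	(*preclaim_callback)(ptag, paddr, len, pl, rw, async);
 */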
9271 9272 /* 9273 * If async argument is not 0 we are called from pcache async thread and don't 9274 * hold AS lock. 9275 */ 9276 9277 /*ARGSUSED*/ 9278 static int 9279 segvn_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist, 9280 enum seg_rw rw, int async) 9281 { 9282 struct seg *seg = (struct seg *)ptag; 9283 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 9284 pgcnt_t np, npages; 9285 struct page **pl; 9286 9287 npages = np = btop(len); 9288 ASSERT(npages); 9289 9290 ASSERT(svd->vp == NULL && svd->amp != NULL); 9291 ASSERT(svd->softlockcnt >= npages); 9292 ASSERT(async || AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 9293 9294 pl = pplist; 9295 9296 ASSERT(pl[np] == NOPCACHE_SHWLIST || pl[np] == PCACHE_SHWLIST); 9297 ASSERT(!async || pl[np] == PCACHE_SHWLIST); 9298 9299 while (np > (uint_t)0) { 9300 if (rw == S_WRITE) { 9301 hat_setrefmod(*pplist); 9302 } else { 9303 hat_setref(*pplist); 9304 } 9305 page_unlock(*pplist); 9306 np--; 9307 pplist++; 9308 } 9309 9310 kmem_free(pl, sizeof (page_t *) * (npages + 1)); 9311 9312 /* 9313 * If we are pcache async thread we don't hold AS lock. This means if 9314 * softlockcnt drops to 0 after the decrement below address space may 9315 * get freed. We can't allow it since after softlock decrement to 0 we 9316 * still need to access as structure for possible wakeup of unmap 9317 * waiters. To prevent the disappearance of as we take this segment's 9318 * segfree_syncmtx. segvn_free() also takes this mutex as a barrier to 9319 * make sure this routine completes before segment is freed. 9320 * 9321 * The second complication we have to deal with in async case is a 9322 * possibility of missed wake up of unmap wait thread. When we don't 9323 * hold as lock here we may take a_contents lock before unmap wait 9324 * thread that was first to see softlockcnt was still not 0. As a 9325 * result we'll fail to wake up an unmap wait thread. To avoid this 9326 * race we set nounmapwait flag in as structure if we drop softlockcnt 9327 * to 0 when we were called by pcache async thread. unmapwait thread 9328 * will not block if this flag is set. 9329 */ 9330 if (async) { 9331 mutex_enter(&svd->segfree_syncmtx); 9332 } 9333 9334 if (!atomic_add_long_nv((ulong_t *)&svd->softlockcnt, -npages)) { 9335 if (async || AS_ISUNMAPWAIT(seg->s_as)) { 9336 mutex_enter(&seg->s_as->a_contents); 9337 if (async) { 9338 AS_SETNOUNMAPWAIT(seg->s_as); 9339 } 9340 if (AS_ISUNMAPWAIT(seg->s_as)) { 9341 AS_CLRUNMAPWAIT(seg->s_as); 9342 cv_broadcast(&seg->s_as->a_cv); 9343 } 9344 mutex_exit(&seg->s_as->a_contents); 9345 } 9346 } 9347 9348 if (async) { 9349 mutex_exit(&svd->segfree_syncmtx); 9350 } 9351 return (0); 9352 } 9353 9354 /*ARGSUSED*/ 9355 static int 9356 shamp_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist, 9357 enum seg_rw rw, int async) 9358 { 9359 amp_t *amp = (amp_t *)ptag; 9360 pgcnt_t np, npages; 9361 struct page **pl; 9362 9363 npages = np = btop(len); 9364 ASSERT(npages); 9365 ASSERT(amp->a_softlockcnt >= npages); 9366 9367 pl = pplist; 9368 9369 ASSERT(pl[np] == NOPCACHE_SHWLIST || pl[np] == PCACHE_SHWLIST); 9370 ASSERT(!async || pl[np] == PCACHE_SHWLIST); 9371 9372 while (np > (uint_t)0) { 9373 if (rw == S_WRITE) { 9374 hat_setrefmod(*pplist); 9375 } else { 9376 hat_setref(*pplist); 9377 } 9378 page_unlock(*pplist); 9379 np--; 9380 pplist++; 9381 } 9382 9383 kmem_free(pl, sizeof (page_t *) * (npages + 1)); 9384 9385 /* 9386 * If somebody sleeps in anonmap_purge() wake them up if a_softlockcnt 9387 * drops to 0. anon map can't be freed until a_softlockcnt drops to 0 9388 * and anonmap_purge() acquires a_purgemtx. 9389 */ 9390 mutex_enter(&amp->a_purgemtx); 9391 if (!atomic_add_long_nv((ulong_t *)&amp->a_softlockcnt, -npages) && 9392 amp->a_purgewait) { 9393 amp->a_purgewait = 0; 9394 cv_broadcast(&amp->a_purgecv); 9395 } 9396 mutex_exit(&amp->a_purgemtx); 9397 return (0); 9398 }
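/*
 * Usage sketch (illustrative): consumers obtain a memory ID through
 * the generic segment op, e.g.
 *
 *	memid_t id;
 *
 *	if (SEGOP_GETMEMID(seg, addr, &id) == 0) {
 *		... id.val[0]/id.val[1] name the backing memory ...
 *	}
 *
 * For MAP_PRIVATE the ID is (as, vaddr); for MAP_SHARED anon memory
 * it is (anon pointer, offset within page), so processes sharing the
 * same page compute the same ID.
 */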
9399 9400 /* 9401 * get a memory ID for an addr in a given segment 9402 * 9403 * XXX only creates PAGESIZE pages if anon slots are not initialized. 9404 * At fault time they will be relocated into larger pages. 9405 */ 9406 static int 9407 segvn_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) 9408 { 9409 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 9410 struct anon *ap = NULL; 9411 ulong_t anon_index; 9412 struct anon_map *amp; 9413 anon_sync_obj_t cookie; 9414 9415 if (svd->type == MAP_PRIVATE) { 9416 memidp->val[0] = (uintptr_t)seg->s_as; 9417 memidp->val[1] = (uintptr_t)addr; 9418 return (0); 9419 } 9420 9421 if (svd->type == MAP_SHARED) { 9422 if (svd->vp) { 9423 memidp->val[0] = (uintptr_t)svd->vp; 9424 memidp->val[1] = (u_longlong_t)svd->offset + 9425 (uintptr_t)(addr - seg->s_base); 9426 return (0); 9427 } else { 9428 9429 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 9430 if ((amp = svd->amp) != NULL) { 9431 anon_index = svd->anon_index + 9432 seg_page(seg, addr); 9433 } 9434 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 9435 9436 ASSERT(amp != NULL); 9437 9438 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 9439 anon_array_enter(amp, anon_index, &cookie); 9440 ap = anon_get_ptr(amp->ahp, anon_index); 9441 if (ap == NULL) { 9442 page_t *pp; 9443 9444 pp = anon_zero(seg, addr, &ap, svd->cred); 9445 if (pp == NULL) { 9446 anon_array_exit(&cookie); 9447 ANON_LOCK_EXIT(&amp->a_rwlock); 9448 return (ENOMEM); 9449 } 9450 ASSERT(anon_get_ptr(amp->ahp, anon_index) 9451 == NULL); 9452 (void) anon_set_ptr(amp->ahp, anon_index, 9453 ap, ANON_SLEEP); 9454 page_unlock(pp); 9455 } 9456 9457 anon_array_exit(&cookie); 9458 ANON_LOCK_EXIT(&amp->a_rwlock); 9459 9460 memidp->val[0] = (uintptr_t)ap; 9461 memidp->val[1] = (uintptr_t)addr & PAGEOFFSET; 9462 return (0); 9463 } 9464 } 9465 return (EINVAL); 9466 } 9467 9468 static int 9469 sameprot(struct seg *seg, caddr_t a, size_t len) 9470 { 9471 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 9472 struct vpage *vpage; 9473 spgcnt_t pages = btop(len); 9474 uint_t prot; 9475 9476 if (svd->pageprot == 0) 9477 return (1); 9478 9479 ASSERT(svd->vpage != NULL); 9480 9481 vpage = &svd->vpage[seg_page(seg, a)]; 9482 prot = VPP_PROT(vpage); 9483 vpage++; 9484 pages--; 9485 while (pages-- > 0) { 9486 if (prot != VPP_PROT(vpage)) 9487 return (0); 9488 vpage++; 9489 } 9490 return (1); 9491 } 9492 9493 /* 9494 * Get memory allocation policy info for specified address in given segment 9495 */ 9496 static lgrp_mem_policy_info_t * 9497 segvn_getpolicy(struct seg *seg, caddr_t addr) 9498 { 9499 struct anon_map *amp; 9500 ulong_t anon_index; 9501 lgrp_mem_policy_info_t *policy_info; 9502 struct segvn_data *svn_data; 9503 u_offset_t vn_off; 9504 vnode_t *vp; 9505 9506 ASSERT(seg != NULL); 9507 9508 svn_data = (struct segvn_data *)seg->s_data; 9509 if (svn_data == NULL) 9510 return (NULL); 9511 9512 /* 9513 * Get policy info for private or shared memory 9514 */ 9515 if (svn_data->type != MAP_SHARED) { 9516 if (svn_data->tr_state != SEGVN_TR_ON) { 9517 policy_info = &svn_data->policy_info; 9518 } else { 9519 policy_info = &svn_data->tr_policy_info; 9520 ASSERT(policy_info->mem_policy == 9521 LGRP_MEM_POLICY_NEXT_SEG); 9522 } 9523 } else { 9524 amp = svn_data->amp; 9525 anon_index = svn_data->anon_index + seg_page(seg, addr); 9526 vp = svn_data->vp; 9527 vn_off = svn_data->offset + (uintptr_t)(addr - seg->s_base); 9528 policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off); 9529 } 9530 9531 return (policy_info); 9532 } 9533 9534 /*ARGSUSED*/ 9535 static int 9536 segvn_capable(struct seg *seg, segcapability_t capability) 9537 { 9538 return (0); 9539 } 9540 9541 /* 9542 * Bind text vnode segment to an amp.
If we bind successfully mappings will be 9543 * established to per vnode mapping per lgroup amp pages instead of to vnode 9544 * pages. There's one amp per vnode text mapping per lgroup. Many processes 9545 * may share the same text replication amp. If a suitable amp doesn't already 9546 * exist in svntr hash table create a new one. We may fail to bind to amp if 9547 * segment is not eligible for text replication. Code below first checks for 9548 * these conditions. If binding is successful segment tr_state is set to on 9549 * and svd->amp points to the amp to use. Otherwise tr_state is set to off and 9550 * svd->amp remains as NULL. 9551 */ 9552 static void 9553 segvn_textrepl(struct seg *seg) 9554 { 9555 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 9556 vnode_t *vp = svd->vp; 9557 u_offset_t off = svd->offset; 9558 size_t size = seg->s_size; 9559 u_offset_t eoff = off + size; 9560 uint_t szc = seg->s_szc; 9561 ulong_t hash = SVNTR_HASH_FUNC(vp); 9562 svntr_t *svntrp; 9563 struct vattr va; 9564 proc_t *p = seg->s_as->a_proc; 9565 lgrp_id_t lgrp_id; 9566 lgrp_id_t olid; 9567 int first; 9568 struct anon_map *amp; 9569 9570 ASSERT(AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 9571 ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 9572 ASSERT(p != NULL); 9573 ASSERT(svd->tr_state == SEGVN_TR_INIT); 9574 ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie)); 9575 ASSERT(svd->flags & MAP_TEXT); 9576 ASSERT(svd->type == MAP_PRIVATE); 9577 ASSERT(vp != NULL && svd->amp == NULL); 9578 ASSERT(!svd->pageprot && !(svd->prot & PROT_WRITE)); 9579 ASSERT(!(svd->flags & MAP_NORESERVE) && svd->swresv == 0); 9580 ASSERT(seg->s_as != &kas); 9581 ASSERT(off < eoff); 9582 ASSERT(svntr_hashtab != NULL); 9583 9584 /* 9585 * If numa optimizations are no longer desired bail out. 9586 */ 9587 if (!lgrp_optimizations()) { 9588 svd->tr_state = SEGVN_TR_OFF; 9589 return; 9590 } 9591 9592 /* 9593 * Avoid creating anon maps with size bigger than the file size. 9594 * If VOP_GETATTR() call fails bail out. 9595 */ 9596 va.va_mask = AT_SIZE | AT_MTIME | AT_CTIME; 9597 if (VOP_GETATTR(vp, &va, 0, svd->cred, NULL) != 0) { 9598 svd->tr_state = SEGVN_TR_OFF; 9599 SEGVN_TR_ADDSTAT(gaerr); 9600 return; 9601 } 9602 if (btopr(va.va_size) < btopr(eoff)) { 9603 svd->tr_state = SEGVN_TR_OFF; 9604 SEGVN_TR_ADDSTAT(overmap); 9605 return; 9606 } 9607 9608 /* 9609 * VVMEXEC may not be set yet if exec() prefaults text segment. Set 9610 * this flag now before vn_is_mapped(V_WRITE) so that MAP_SHARED 9611 * mapping that checks if trcache for this vnode needs to be 9612 * invalidated can't miss us. 9613 */ 9614 if (!(vp->v_flag & VVMEXEC)) { 9615 mutex_enter(&vp->v_lock); 9616 vp->v_flag |= VVMEXEC; 9617 mutex_exit(&vp->v_lock); 9618 } 9619 mutex_enter(&svntr_hashtab[hash].tr_lock); 9620 /* 9621 * Bail out if potentially MAP_SHARED writable mappings exist to this 9622 * vnode. We don't want to use old file contents from existing 9623 * replicas if this mapping was established after the original file 9624 * was changed. 
9625 */ 9626 if (vn_is_mapped(vp, V_WRITE)) { 9627 mutex_exit(&svntr_hashtab[hash].tr_lock); 9628 svd->tr_state = SEGVN_TR_OFF; 9629 SEGVN_TR_ADDSTAT(wrcnt); 9630 return; 9631 } 9632 svntrp = svntr_hashtab[hash].tr_head; 9633 for (; svntrp != NULL; svntrp = svntrp->tr_next) { 9634 ASSERT(svntrp->tr_refcnt != 0); 9635 if (svntrp->tr_vp != vp) { 9636 continue; 9637 } 9638 9639 /* 9640 * Bail out if the file or its attributes were changed after 9641 * this replication entry was created since we need to use the 9642 * latest file contents. Note that mtime test alone is not 9643 * sufficient because a user can explicitly change mtime via 9644 * utimes(2) interfaces back to the old value after modifying 9645 * the file contents. To detect this case we also have to test 9646 * ctime which among other things records the time of the last 9647 * mtime change by utimes(2). ctime is not changed when the file 9648 * is only read or executed so we expect that typically existing 9649 * replication amps can be used most of the time. 9650 */ 9651 if (!svntrp->tr_valid || 9652 svntrp->tr_mtime.tv_sec != va.va_mtime.tv_sec || 9653 svntrp->tr_mtime.tv_nsec != va.va_mtime.tv_nsec || 9654 svntrp->tr_ctime.tv_sec != va.va_ctime.tv_sec || 9655 svntrp->tr_ctime.tv_nsec != va.va_ctime.tv_nsec) { 9656 mutex_exit(&svntr_hashtab[hash].tr_lock); 9657 svd->tr_state = SEGVN_TR_OFF; 9658 SEGVN_TR_ADDSTAT(stale); 9659 return; 9660 } 9661 /* 9662 * if off, eoff and szc match current segment we found the 9663 * existing entry we can use. 9664 */ 9665 if (svntrp->tr_off == off && svntrp->tr_eoff == eoff && 9666 svntrp->tr_szc == szc) { 9667 break; 9668 } 9669 /* 9670 * Don't create different but overlapping in file offsets 9671 * entries to avoid replication of the same file pages more 9672 * than once per lgroup. 9673 */ 9674 if ((off >= svntrp->tr_off && off < svntrp->tr_eoff) || 9675 (eoff > svntrp->tr_off && eoff <= svntrp->tr_eoff)) { 9676 mutex_exit(&svntr_hashtab[hash].tr_lock); 9677 svd->tr_state = SEGVN_TR_OFF; 9678 SEGVN_TR_ADDSTAT(overlap); 9679 return; 9680 } 9681 } 9682 /* 9683 * If we didn't find an existing entry create a new one. 9684 */ 9685 if (svntrp == NULL) { 9686 svntrp = kmem_cache_alloc(svntr_cache, KM_NOSLEEP); 9687 if (svntrp == NULL) { 9688 mutex_exit(&svntr_hashtab[hash].tr_lock); 9689 svd->tr_state = SEGVN_TR_OFF; 9690 SEGVN_TR_ADDSTAT(nokmem); 9691 return; 9692 } 9693 #ifdef DEBUG 9694 { 9695 lgrp_id_t i; 9696 for (i = 0; i < NLGRPS_MAX; i++) { 9697 ASSERT(svntrp->tr_amp[i] == NULL); 9698 } 9699 } 9700 #endif /* DEBUG */ 9701 svntrp->tr_vp = vp; 9702 svntrp->tr_off = off; 9703 svntrp->tr_eoff = eoff; 9704 svntrp->tr_szc = szc; 9705 svntrp->tr_valid = 1; 9706 svntrp->tr_mtime = va.va_mtime; 9707 svntrp->tr_ctime = va.va_ctime; 9708 svntrp->tr_refcnt = 0; 9709 svntrp->tr_next = svntr_hashtab[hash].tr_head; 9710 svntr_hashtab[hash].tr_head = svntrp; 9711 } 9712 first = 1; 9713 again: 9714 /* 9715 * We want to pick a replica with pages on main thread's (t_tid = 1, 9716 * aka T1) lgrp. Currently text replication is only optimized for 9717 * workloads that either have all threads of a process on the same 9718 * lgrp or execute their large text primarily on main thread. 9719 */ 9720 lgrp_id = p->p_t1_lgrpid; 9721 if (lgrp_id == LGRP_NONE) { 9722 /* 9723 * In case exec() prefaults text on non main thread use 9724 * current thread lgrpid. It will become main thread anyway 9725 * soon. 9726 */ 9727 lgrp_id = lgrp_home_id(curthread); 9728 }
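/*
 * p_tr_lgrpid transitions, summarizing the comment below:
 *
 *	LGRP_NONE  -> lgrp_id     first replicated segment is bound
 *	lgrp_id    -> NLGRPS_MAX  T1 later observed on another lgrp
 *
 * NLGRPS_MAX acts as a "mixed" sentinel; once it is set the
 * asynchronous segvn thread rechecks which lgrp each replicated
 * segment should use.
 */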
9729 /* 9730 * Set p_tr_lgrpid to lgrpid if it hasn't been set yet. Otherwise 9731 * just set it to NLGRPS_MAX if it's different from current process T1 9732 * home lgrp. p_tr_lgrpid is used to detect if process uses text 9733 * replication and T1 new home is different from lgrp used for text 9734 * replication. When this happens asynchronous segvn thread rechecks if 9735 * segments should change lgrps used for text replication. If we fail 9736 * to set p_tr_lgrpid with atomic_cas_32 then set it to NLGRPS_MAX 9737 * without cas if it's not already NLGRPS_MAX and not equal lgrp_id 9738 * we want to use. We don't need to use cas in this case because 9739 * another thread that races in between our non atomic check and set 9740 * may only change p_tr_lgrpid to NLGRPS_MAX at this point. 9741 */ 9742 ASSERT(lgrp_id != LGRP_NONE && lgrp_id < NLGRPS_MAX); 9743 olid = p->p_tr_lgrpid; 9744 if (lgrp_id != olid && olid != NLGRPS_MAX) { 9745 lgrp_id_t nlid = (olid == LGRP_NONE) ? lgrp_id : NLGRPS_MAX; 9746 if (atomic_cas_32((uint32_t *)&p->p_tr_lgrpid, olid, nlid) != 9747 olid) { 9748 olid = p->p_tr_lgrpid; 9749 ASSERT(olid != LGRP_NONE); 9750 if (olid != lgrp_id && olid != NLGRPS_MAX) { 9751 p->p_tr_lgrpid = NLGRPS_MAX; 9752 } 9753 } 9754 ASSERT(p->p_tr_lgrpid != LGRP_NONE); 9755 membar_producer(); 9756 /* 9757 * lgrp_move_thread() won't schedule async recheck after 9758 * p->p_t1_lgrpid update unless p->p_tr_lgrpid is not 9759 * LGRP_NONE. Recheck p_t1_lgrpid once now that p->p_tr_lgrpid 9760 * is not LGRP_NONE. 9761 */ 9762 if (first && p->p_t1_lgrpid != LGRP_NONE && 9763 p->p_t1_lgrpid != lgrp_id) { 9764 first = 0; 9765 goto again; 9766 } 9767 } 9768 /* 9769 * If no amp was created yet for lgrp_id create a new one as long as 9770 * we have enough memory to afford it. 9771 */ 9772 if ((amp = svntrp->tr_amp[lgrp_id]) == NULL) { 9773 size_t trmem = atomic_add_long_nv(&segvn_textrepl_bytes, size); 9774 if (trmem > segvn_textrepl_max_bytes) { 9775 SEGVN_TR_ADDSTAT(normem); 9776 goto fail; 9777 } 9778 if (anon_try_resv_zone(size, NULL) == 0) { 9779 SEGVN_TR_ADDSTAT(noanon); 9780 goto fail; 9781 } 9782 amp = anonmap_alloc(size, size, ANON_NOSLEEP); 9783 if (amp == NULL) { 9784 anon_unresv_zone(size, NULL); 9785 SEGVN_TR_ADDSTAT(nokmem); 9786 goto fail; 9787 } 9788 ASSERT(amp->refcnt == 1); 9789 amp->a_szc = szc; 9790 svntrp->tr_amp[lgrp_id] = amp; 9791 SEGVN_TR_ADDSTAT(newamp); 9792 } 9793 svntrp->tr_refcnt++; 9794 ASSERT(svd->svn_trnext == NULL); 9795 ASSERT(svd->svn_trprev == NULL); 9796 svd->svn_trnext = svntrp->tr_svnhead; 9797 svd->svn_trprev = NULL; 9798 if (svntrp->tr_svnhead != NULL) { 9799 svntrp->tr_svnhead->svn_trprev = svd; 9800 } 9801 svntrp->tr_svnhead = svd; 9802 ASSERT(amp->a_szc == szc && amp->size == size && amp->swresv == size); 9803 ASSERT(amp->refcnt >= 1); 9804 svd->amp = amp; 9805 svd->anon_index = 0; 9806 svd->tr_policy_info.mem_policy = LGRP_MEM_POLICY_NEXT_SEG; 9807 svd->tr_policy_info.mem_lgrpid = lgrp_id; 9808 svd->tr_state = SEGVN_TR_ON; 9809 mutex_exit(&svntr_hashtab[hash].tr_lock); 9810 SEGVN_TR_ADDSTAT(repl); 9811 return; 9812 fail: 9813 ASSERT(segvn_textrepl_bytes >= size); 9814 atomic_add_long(&segvn_textrepl_bytes, -size); 9815 ASSERT(svntrp != NULL); 9816 ASSERT(svntrp->tr_amp[lgrp_id] == NULL); 9817 if (svntrp->tr_refcnt == 0) { 9818 ASSERT(svntrp == svntr_hashtab[hash].tr_head); 9819 svntr_hashtab[hash].tr_head = svntrp->tr_next; 9820 mutex_exit(&svntr_hashtab[hash].tr_lock); 9821 kmem_cache_free(svntr_cache, svntrp); 9822 } else { 9823 mutex_exit(&svntr_hashtab[hash].tr_lock); 9824 } 9825 svd->tr_state = SEGVN_TR_OFF; 9826 } 9827 9828
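/*
 * Life cycle note: segvn_textrepl() above binds an eligible
 * MAP_PRIVATE text segment to a per (vnode, offset range, szc,
 * lgroup) anon map; segvn_textunrepl() below undoes the binding and,
 * on the last reference, frees the svntr entry together with all of
 * its per lgroup anon maps.
 */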

/*
 * Convert seg back to a regular vnode mapping seg by unbinding it from its
 * text replication amp. This routine is most typically called when the
 * segment is unmapped but can also be called when the segment no longer
 * qualifies for text replication (e.g. due to protection changes). If
 * unload_unmap is set, use the HAT_UNLOAD_UNMAP flag in
 * hat_unload_callback(). If we are the last user of the svntr entry, free
 * all its anon maps and remove it from the hash table.
 */
static void
segvn_textunrepl(struct seg *seg, int unload_unmap)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	vnode_t *vp = svd->vp;
	u_offset_t off = svd->offset;
	size_t size = seg->s_size;
	u_offset_t eoff = off + size;
	uint_t szc = seg->s_szc;
	ulong_t hash = SVNTR_HASH_FUNC(vp);
	svntr_t *svntrp;
	svntr_t **prv_svntrp;
	lgrp_id_t lgrp_id = svd->tr_policy_info.mem_lgrpid;
	lgrp_id_t i;

	ASSERT(AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
	ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) ||
	    SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
	ASSERT(svd->tr_state == SEGVN_TR_ON);
	ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie));
	ASSERT(svd->amp != NULL);
	ASSERT(svd->amp->refcnt >= 1);
	ASSERT(svd->anon_index == 0);
	ASSERT(lgrp_id != LGRP_NONE && lgrp_id < NLGRPS_MAX);
	ASSERT(svntr_hashtab != NULL);

	mutex_enter(&svntr_hashtab[hash].tr_lock);
	prv_svntrp = &svntr_hashtab[hash].tr_head;
	for (; (svntrp = *prv_svntrp) != NULL; prv_svntrp = &svntrp->tr_next) {
		ASSERT(svntrp->tr_refcnt != 0);
		if (svntrp->tr_vp == vp && svntrp->tr_off == off &&
		    svntrp->tr_eoff == eoff && svntrp->tr_szc == szc) {
			break;
		}
	}
	if (svntrp == NULL) {
		panic("segvn_textunrepl: svntr record not found");
	}
	if (svntrp->tr_amp[lgrp_id] != svd->amp) {
		panic("segvn_textunrepl: amp mismatch");
	}
	svd->tr_state = SEGVN_TR_OFF;
	svd->amp = NULL;
	if (svd->svn_trprev == NULL) {
		ASSERT(svntrp->tr_svnhead == svd);
		svntrp->tr_svnhead = svd->svn_trnext;
		if (svntrp->tr_svnhead != NULL) {
			svntrp->tr_svnhead->svn_trprev = NULL;
		}
		svd->svn_trnext = NULL;
	} else {
		svd->svn_trprev->svn_trnext = svd->svn_trnext;
		if (svd->svn_trnext != NULL) {
			svd->svn_trnext->svn_trprev = svd->svn_trprev;
			svd->svn_trnext = NULL;
		}
		svd->svn_trprev = NULL;
	}
	if (--svntrp->tr_refcnt) {
		mutex_exit(&svntr_hashtab[hash].tr_lock);
		goto done;
	}
	*prv_svntrp = svntrp->tr_next;
	mutex_exit(&svntr_hashtab[hash].tr_lock);
	for (i = 0; i < NLGRPS_MAX; i++) {
		struct anon_map *amp = svntrp->tr_amp[i];
		if (amp == NULL) {
			continue;
		}
		ASSERT(amp->refcnt == 1);
		ASSERT(amp->swresv == size);
		ASSERT(amp->size == size);
		ASSERT(amp->a_szc == szc);
		if (amp->a_szc != 0) {
			anon_free_pages(amp->ahp, 0, size, szc);
		} else {
			anon_free(amp->ahp, 0, size);
		}
		svntrp->tr_amp[i] = NULL;
		ASSERT(segvn_textrepl_bytes >= size);
		atomic_add_long(&segvn_textrepl_bytes, -size);
		anon_unresv_zone(amp->swresv, NULL);
		amp->refcnt = 0;
		anonmap_free(amp);
	}
	kmem_cache_free(svntr_cache, svntrp);
done:
	hat_unload_callback(seg->s_as->a_hat, seg->s_base, size,
	    unload_unmap ? HAT_UNLOAD_UNMAP : 0, NULL);
}
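
/*
 * Illustrative sketch (guarded by #if 0, not part of this file's build):
 * the intrusive doubly-linked list unlink performed above, modeled on a
 * generic node type. node_t, headp and list_unlink are invented names;
 * the real code threads svn_trprev/svn_trnext through segvn_data and
 * keeps the head in svntr_t::tr_svnhead, with a NULL prev pointer
 * marking the head element.
 */
#if 0
#include <stddef.h>

typedef struct node {
	struct node *prev;
	struct node *next;
} node_t;

static void
list_unlink(node_t **headp, node_t *n)
{
	if (n->prev == NULL) {		/* n is the list head */
		*headp = n->next;
		if (n->next != NULL)
			n->next->prev = NULL;
	} else {
		n->prev->next = n->next;
		if (n->next != NULL)
			n->next->prev = n->prev;
	}
	n->next = NULL;
	n->prev = NULL;
}
#endif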

/*
 * This is called when a MAP_SHARED writable mapping is created to a vnode
 * that is currently used for execution (VVMEXEC flag is set). In this case we
 * need to prevent further use of existing replicas.
 */
static void
segvn_inval_trcache(vnode_t *vp)
{
	ulong_t hash = SVNTR_HASH_FUNC(vp);
	svntr_t *svntrp;

	ASSERT(vp->v_flag & VVMEXEC);

	if (svntr_hashtab == NULL) {
		return;
	}

	mutex_enter(&svntr_hashtab[hash].tr_lock);
	svntrp = svntr_hashtab[hash].tr_head;
	for (; svntrp != NULL; svntrp = svntrp->tr_next) {
		ASSERT(svntrp->tr_refcnt != 0);
		if (svntrp->tr_vp == vp && svntrp->tr_valid) {
			svntrp->tr_valid = 0;
		}
	}
	mutex_exit(&svntr_hashtab[hash].tr_lock);
}

static void
segvn_trasync_thread(void)
{
	callb_cpr_t cpr_info;
	kmutex_t cpr_lock;	/* just for CPR stuff */

	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);

	CALLB_CPR_INIT(&cpr_info, &cpr_lock,
	    callb_generic_cpr, "segvn_async");

	if (segvn_update_textrepl_interval == 0) {
		segvn_update_textrepl_interval = segvn_update_tr_time * hz;
	} else {
		segvn_update_textrepl_interval *= hz;
	}
	(void) timeout(segvn_trupdate_wakeup, NULL,
	    segvn_update_textrepl_interval);

	for (;;) {
		mutex_enter(&cpr_lock);
		CALLB_CPR_SAFE_BEGIN(&cpr_info);
		mutex_exit(&cpr_lock);
		sema_p(&segvn_trasync_sem);
		mutex_enter(&cpr_lock);
		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
		mutex_exit(&cpr_lock);
		segvn_trupdate();
	}
}

static uint64_t segvn_lgrp_trthr_migrs_snpsht = 0;

static void
segvn_trupdate_wakeup(void *dummy)
{
	uint64_t cur_lgrp_trthr_migrs = lgrp_get_trthr_migrations();

	if (cur_lgrp_trthr_migrs != segvn_lgrp_trthr_migrs_snpsht) {
		segvn_lgrp_trthr_migrs_snpsht = cur_lgrp_trthr_migrs;
		sema_v(&segvn_trasync_sem);
	}

	if (!segvn_disable_textrepl_update &&
	    segvn_update_textrepl_interval != 0) {
		(void) timeout(segvn_trupdate_wakeup, dummy,
		    segvn_update_textrepl_interval);
	}
}

static void
segvn_trupdate(void)
{
	ulong_t hash;
	svntr_t *svntrp;
	segvn_data_t *svd;

	ASSERT(svntr_hashtab != NULL);

	for (hash = 0; hash < svntr_hashtab_sz; hash++) {
		mutex_enter(&svntr_hashtab[hash].tr_lock);
		svntrp = svntr_hashtab[hash].tr_head;
		for (; svntrp != NULL; svntrp = svntrp->tr_next) {
			ASSERT(svntrp->tr_refcnt != 0);
			svd = svntrp->tr_svnhead;
			for (; svd != NULL; svd = svd->svn_trnext) {
				segvn_trupdate_seg(svd->seg, svd, svntrp,
				    hash);
			}
		}
		mutex_exit(&svntr_hashtab[hash].tr_lock);
	}
}
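
/*
 * Illustrative sketch (guarded by #if 0, not part of this file's build):
 * the wakeup pattern used by segvn_trupdate_wakeup() above, modeled with
 * POSIX primitives. The worker sleeps on a semaphore and is posted only
 * when the migration counter has moved since the last snapshot, so an
 * idle system incurs no worker wakeups. worker_sem, migrs_snapshot,
 * periodic_tick and get_migration_count are invented names; the last
 * stands in for lgrp_get_trthr_migrations().
 */
#if 0
#include <semaphore.h>
#include <stdint.h>

static sem_t worker_sem;
static uint64_t migrs_snapshot;

extern uint64_t get_migration_count(void);	/* invented stand-in */

static void
periodic_tick(void)
{
	uint64_t cur = get_migration_count();

	if (cur != migrs_snapshot) {
		migrs_snapshot = cur;
		(void) sem_post(&worker_sem);	/* wake the async worker */
	}
	/* re-arm the timer here, as timeout(9F) is re-armed above */
}
#endif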

static void
segvn_trupdate_seg(struct seg *seg,
    segvn_data_t *svd,
    svntr_t *svntrp,
    ulong_t hash)
{
	proc_t *p;
	lgrp_id_t lgrp_id;
	struct as *as;
	size_t size;
	struct anon_map *amp;

	ASSERT(svd->vp != NULL);
	ASSERT(svd->vp == svntrp->tr_vp);
	ASSERT(svd->offset == svntrp->tr_off);
	ASSERT(svd->offset + seg->s_size == svntrp->tr_eoff);
	ASSERT(seg != NULL);
	ASSERT(svd->seg == seg);
	ASSERT(seg->s_data == (void *)svd);
	ASSERT(seg->s_szc == svntrp->tr_szc);
	ASSERT(svd->tr_state == SEGVN_TR_ON);
	ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie));
	ASSERT(svd->amp != NULL);
	ASSERT(svd->tr_policy_info.mem_policy == LGRP_MEM_POLICY_NEXT_SEG);
	ASSERT(svd->tr_policy_info.mem_lgrpid != LGRP_NONE);
	ASSERT(svd->tr_policy_info.mem_lgrpid < NLGRPS_MAX);
	ASSERT(svntrp->tr_amp[svd->tr_policy_info.mem_lgrpid] == svd->amp);
	ASSERT(svntrp->tr_refcnt != 0);
	ASSERT(mutex_owned(&svntr_hashtab[hash].tr_lock));

	as = seg->s_as;
	ASSERT(as != NULL && as != &kas);
	p = as->a_proc;
	ASSERT(p != NULL);
	ASSERT(p->p_tr_lgrpid != LGRP_NONE);
	lgrp_id = p->p_t1_lgrpid;
	if (lgrp_id == LGRP_NONE) {
		return;
	}
	ASSERT(lgrp_id < NLGRPS_MAX);
	if (svd->tr_policy_info.mem_lgrpid == lgrp_id) {
		return;
	}

	/*
	 * Use tryenter locking since we are taking the as/seg locks and the
	 * svntr hash lock in the reverse of the order used by the
	 * synchronous thread.
	 */
	if (!AS_LOCK_TRYENTER(as, &as->a_lock, RW_READER)) {
		SEGVN_TR_ADDSTAT(nolock);
		if (segvn_lgrp_trthr_migrs_snpsht) {
			segvn_lgrp_trthr_migrs_snpsht = 0;
		}
		return;
	}
	if (!SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock, RW_WRITER)) {
		AS_LOCK_EXIT(as, &as->a_lock);
		SEGVN_TR_ADDSTAT(nolock);
		if (segvn_lgrp_trthr_migrs_snpsht) {
			segvn_lgrp_trthr_migrs_snpsht = 0;
		}
		return;
	}
	size = seg->s_size;
	if (svntrp->tr_amp[lgrp_id] == NULL) {
		size_t trmem = atomic_add_long_nv(&segvn_textrepl_bytes, size);
		if (trmem > segvn_textrepl_max_bytes) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			AS_LOCK_EXIT(as, &as->a_lock);
			atomic_add_long(&segvn_textrepl_bytes, -size);
			SEGVN_TR_ADDSTAT(normem);
			return;
		}
		if (anon_try_resv_zone(size, NULL) == 0) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			AS_LOCK_EXIT(as, &as->a_lock);
			atomic_add_long(&segvn_textrepl_bytes, -size);
			SEGVN_TR_ADDSTAT(noanon);
			return;
		}
		amp = anonmap_alloc(size, size, KM_NOSLEEP);
		if (amp == NULL) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			AS_LOCK_EXIT(as, &as->a_lock);
			atomic_add_long(&segvn_textrepl_bytes, -size);
			anon_unresv_zone(size, NULL);
			SEGVN_TR_ADDSTAT(nokmem);
			return;
		}
		ASSERT(amp->refcnt == 1);
		amp->a_szc = seg->s_szc;
		svntrp->tr_amp[lgrp_id] = amp;
	}
	/*
	 * We don't need to drop the bucket lock but here we give other
	 * threads a chance. svntr and svd can't be unlinked as long as the
	 * segment lock is held as a writer and the AS lock is held as well.
	 * After we retake the bucket lock we'll continue from where we left
	 * off. We'll be able to reach the end of either list since new
	 * entries are always added to the beginning of the lists.
	 */
	mutex_exit(&svntr_hashtab[hash].tr_lock);
	hat_unload_callback(as->a_hat, seg->s_base, size, 0, NULL);
	mutex_enter(&svntr_hashtab[hash].tr_lock);

	ASSERT(svd->tr_state == SEGVN_TR_ON);
	ASSERT(svd->amp != NULL);
	ASSERT(svd->tr_policy_info.mem_policy == LGRP_MEM_POLICY_NEXT_SEG);
	ASSERT(svd->tr_policy_info.mem_lgrpid != lgrp_id);
	ASSERT(svd->amp != svntrp->tr_amp[lgrp_id]);

	svd->tr_policy_info.mem_lgrpid = lgrp_id;
	svd->amp = svntrp->tr_amp[lgrp_id];
	p->p_tr_lgrpid = NLGRPS_MAX;
	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
	AS_LOCK_EXIT(as, &as->a_lock);

	ASSERT(svntrp->tr_refcnt != 0);
	ASSERT(svd->vp == svntrp->tr_vp);
	ASSERT(svd->tr_policy_info.mem_lgrpid == lgrp_id);
	ASSERT(svd->amp != NULL && svd->amp == svntrp->tr_amp[lgrp_id]);
	ASSERT(svd->seg == seg);
	ASSERT(svd->tr_state == SEGVN_TR_ON);

	SEGVN_TR_ADDSTAT(asyncrepl);
}
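
/*
 * Illustrative sketch (guarded by #if 0, not part of this file's build):
 * the lock-ordering dodge used by segvn_trupdate_seg() above. The
 * synchronous path takes the as/seg locks before the svntr bucket lock;
 * this async path already holds the bucket lock, so it may only *try*
 * the as/seg locks and must back off on failure to avoid deadlock. All
 * names are invented, and pthread locks stand in for kmutex_t/krwlock_t.
 */
#if 0
#include <pthread.h>

static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_rwlock_t as_lock = PTHREAD_RWLOCK_INITIALIZER;

static int
async_update(void)
{
	(void) pthread_mutex_lock(&bucket_lock);
	/* Reverse order: try, never block, while holding bucket_lock. */
	if (pthread_rwlock_tryrdlock(&as_lock) != 0) {
		(void) pthread_mutex_unlock(&bucket_lock);
		return (-1);	/* back off; retry on a later pass */
	}
	/* ... perform the update under both locks ... */
	(void) pthread_rwlock_unlock(&as_lock);
	(void) pthread_mutex_unlock(&bucket_lock);
	return (0);
}
#endif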