42 */
43
44 #include <sys/types.h>
45 #include <sys/t_lock.h>
46 #include <sys/param.h>
47 #include <sys/errno.h>
48 #include <sys/systm.h>
49 #include <sys/mman.h>
50 #include <sys/sysmacros.h>
51 #include <sys/cpuvar.h>
52 #include <sys/sysinfo.h>
53 #include <sys/kmem.h>
54 #include <sys/vnode.h>
55 #include <sys/vmsystm.h>
56 #include <sys/cmn_err.h>
57 #include <sys/debug.h>
58 #include <sys/tnf_probe.h>
59 #include <sys/vtrace.h>
60
61 #include <vm/hat.h>
62 #include <vm/xhat.h>
63 #include <vm/as.h>
64 #include <vm/seg.h>
65 #include <vm/seg_vn.h>
66 #include <vm/seg_dev.h>
67 #include <vm/seg_kmem.h>
68 #include <vm/seg_map.h>
69 #include <vm/seg_spt.h>
70 #include <vm/page.h>
71
72 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
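/*
 * deadlk_wait is consumed via delay(deadlk_wait) on the EDEADLK retry
 * paths below. As a rough illustration (assuming the common default
 * hz of 100, which is tunable), one tick is about 10ms between fault
 * retries.
 */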
73
74 static struct kmem_cache *as_cache;
75
76 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
77 static void as_clearwatchprot(struct as *, caddr_t, size_t);
78 int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
79
80
81 /*
82 * Verifying the segment lists is very time-consuming; it may not be
457 seg = avl_find(&as->a_segtree, &addr, &where);
458
459 if (seg == NULL)
460 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
461
462 if (seg == NULL)
463 seg = avl_last(&as->a_segtree);
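/*
 * A sketch of the lookup above: avl_find() looks for a segment whose
 * base matches the new address exactly, avl_nearest(AVL_AFTER) falls
 * back to the first segment at a higher address (the would-be
 * successor), and avl_last() covers the case where the new segment
 * would become the tail of the list.
 */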
464
465 if (seg != NULL) {
466 caddr_t base = seg->s_base;
467
468 /*
469 * If top of seg is below the requested address, then
470 * the insertion point is at the end of the linked list,
471 * and seg points to the tail of the list. Otherwise,
472 * the insertion point is immediately before seg.
473 */
474 if (base + seg->s_size > addr) {
475 if (addr >= base || eaddr > base) {
476 #ifdef __sparc
477 extern struct seg_ops segnf_ops;
478
479 /*
480 * no-fault segs must disappear if overlaid.
481 * XXX need new segment type so
482 * we don't have to check s_ops
483 */
484 if (seg->s_ops == &segnf_ops) {
485 seg_unmap(seg);
486 goto again;
487 }
488 #endif
489 return (-1); /* overlapping segment */
490 }
491 }
492 }
493 as->a_seglast = newseg;
494 avl_insert(&as->a_segtree, newseg, where);
495
496 #ifdef VERIFY_SEGLIST
497 as_verify(as);
654
655 as->a_flags = 0;
656 as->a_vbits = 0;
657 as->a_hrm = NULL;
658 as->a_seglast = NULL;
659 as->a_size = 0;
660 as->a_resvsize = 0;
661 as->a_updatedir = 0;
662 gethrestime(&as->a_updatetime);
663 as->a_objectdir = NULL;
664 as->a_sizedir = 0;
665 as->a_userlimit = (caddr_t)USERLIMIT;
666 as->a_lastgap = NULL;
667 as->a_lastgaphl = NULL;
668 as->a_callbacks = NULL;
669
670 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
671 as->a_hat = hat_alloc(as); /* create hat for default system mmu */
672 AS_LOCK_EXIT(as, &as->a_lock);
673
674 as->a_xhat = NULL;
675
676 return (as);
677 }
678
679 /*
680 * Free an address space data structure.
681 * Need to free the hat first and then
682 * all the segments on this as and finally
683 * the space for the as struct itself.
684 */
685 void
686 as_free(struct as *as)
687 {
688 struct hat *hat = as->a_hat;
689 struct seg *seg, *next;
690 int called = 0;
691
692 top:
693 /*
694 * Invoke ALL callbacks. as_do_callbacks will do one callback
695 * per call, and not return (-1) until the callback has completed.
696 * When as_do_callbacks returns zero, all callbacks have completed.
697 */
698 mutex_enter(&as->a_contents);
699 while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
700 ;
701
702 /* This will prevent new XHATs from attaching to as */
703 if (!called)
704 AS_SETBUSY(as);
705 mutex_exit(&as->a_contents);
706 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
707
708 if (!called) {
709 called = 1;
710 hat_free_start(hat);
711 if (as->a_xhat != NULL)
712 xhat_free_start_all(as);
713 }
714 for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
715 int err;
716
717 next = AS_SEGNEXT(as, seg);
718 retry:
719 err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
720 if (err == EAGAIN) {
721 mutex_enter(&as->a_contents);
722 if (as->a_callbacks) {
723 AS_LOCK_EXIT(as, &as->a_lock);
724 } else if (!AS_ISNOUNMAPWAIT(as)) {
725 /*
726 * Memory is currently locked. Wait for a
727 * cv_signal that it has been unlocked, then
728 * try the operation again.
729 */
730 if (AS_ISUNMAPWAIT(as) == 0)
731 cv_broadcast(&as->a_cv);
732 AS_SETUNMAPWAIT(as);
733 AS_LOCK_EXIT(as, &as->a_lock);
734 while (AS_ISUNMAPWAIT(as))
735 cv_wait(&as->a_cv, &as->a_contents);
736 } else {
737 /*
738 * We may have raced with
739 * segvn_reclaim()/segspt_reclaim(). In this
742 * 0. We don't drop as writer lock so our
743 * number of retries without sleeping should
744 * be very small. See segvn_reclaim() for
745 * more comments.
746 */
747 AS_CLRNOUNMAPWAIT(as);
748 mutex_exit(&as->a_contents);
749 goto retry;
750 }
751 mutex_exit(&as->a_contents);
752 goto top;
753 } else {
754 /*
755 * We do not expect any other error return at this
756 * time. This is similar to an ASSERT in seg_unmap()
757 */
758 ASSERT(err == 0);
759 }
760 }
761 hat_free_end(hat);
762 if (as->a_xhat != NULL)
763 xhat_free_end_all(as);
764 AS_LOCK_EXIT(as, &as->a_lock);
765
766 /* /proc stuff */
767 ASSERT(avl_numnodes(&as->a_wpage) == 0);
768 if (as->a_objectdir) {
769 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
770 as->a_objectdir = NULL;
771 as->a_sizedir = 0;
772 }
773
774 /*
775 * Free the struct as back to kmem. Assert it has no segments.
776 */
777 ASSERT(avl_numnodes(&as->a_segtree) == 0);
778 kmem_cache_free(as_cache, as);
779 }
780
781 int
782 as_dup(struct as *as, struct proc *forkedproc)
783 {
784 struct as *newas;
785 struct seg *seg, *newseg;
786 size_t purgesize = 0;
787 int error;
788
789 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
790 as_clearwatch(as);
791 newas = as_alloc();
792 newas->a_userlimit = as->a_userlimit;
793 newas->a_proc = forkedproc;
794
795 AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER);
796
797 /* This will prevent new XHATs from attaching */
798 mutex_enter(&as->a_contents);
799 AS_SETBUSY(as);
800 mutex_exit(&as->a_contents);
801 mutex_enter(&newas->a_contents);
802 AS_SETBUSY(newas);
803 mutex_exit(&newas->a_contents);
804
805 (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
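/*
 * hat_dup() is called twice: once here with HAT_DUP_SRD before any
 * segments are duplicated, and again with HAT_DUP_ALL after the loop
 * below. The intent (as we read it) is that the first pass sets up
 * shared-region state so that SEGOP_DUP() of segments backed by it
 * can attach, and the second pass copies the remaining translations.
 */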
806
807 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
808
809 if (seg->s_flags & S_PURGE) {
810 purgesize += seg->s_size;
811 continue;
812 }
813
814 newseg = seg_alloc(newas, seg->s_base, seg->s_size);
815 if (newseg == NULL) {
816 AS_LOCK_EXIT(newas, &newas->a_lock);
817 as_setwatch(as);
818 mutex_enter(&as->a_contents);
819 AS_CLRBUSY(as);
820 mutex_exit(&as->a_contents);
821 AS_LOCK_EXIT(as, &as->a_lock);
822 as_free(newas);
823 return (-1);
824 }
825 if ((error = SEGOP_DUP(seg, newseg)) != 0) {
826 /*
827 * We call seg_free() on the new seg
828 * because the segment is not set up
829 * completely; i.e. it has no ops.
830 */
831 as_setwatch(as);
832 mutex_enter(&as->a_contents);
833 AS_CLRBUSY(as);
834 mutex_exit(&as->a_contents);
835 AS_LOCK_EXIT(as, &as->a_lock);
836 seg_free(newseg);
837 AS_LOCK_EXIT(newas, &newas->a_lock);
838 as_free(newas);
839 return (error);
840 }
841 newas->a_size += seg->s_size;
842 }
843 newas->a_resvsize = as->a_resvsize - purgesize;
844
845 error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
846 if (as->a_xhat != NULL)
847 error |= xhat_dup_all(as, newas, NULL, 0, HAT_DUP_ALL);
848
849 mutex_enter(&newas->a_contents);
850 AS_CLRBUSY(newas);
851 mutex_exit(&newas->a_contents);
852 AS_LOCK_EXIT(newas, &newas->a_lock);
853
854 as_setwatch(as);
855 mutex_enter(&as->a_contents);
856 AS_CLRBUSY(as);
857 mutex_exit(&as->a_contents);
858 AS_LOCK_EXIT(as, &as->a_lock);
859 if (error != 0) {
860 as_free(newas);
861 return (error);
862 }
863 forkedproc->p_as = newas;
864 return (0);
865 }
866
867 /*
868 * Handle a ``fault'' at addr for size bytes.
869 */
870 faultcode_t
871 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
872 enum fault_type type, enum seg_rw rw)
873 {
874 struct seg *seg;
875 caddr_t raddr; /* rounded down addr */
876 size_t rsize; /* rounded up size */
877 size_t ssize;
878 faultcode_t res = 0;
879 caddr_t addrsav;
880 struct seg *segsav;
881 int as_lock_held;
882 klwp_t *lwp = ttolwp(curthread);
883 int is_xhat = 0;
884 int holding_wpage = 0;
885 extern struct seg_ops segdev_ops;
886
887
888
889 if (as->a_hat != hat) {
890 /* This must be an XHAT then */
891 is_xhat = 1;
892
893 if ((type != F_INVAL) || (as == &kas))
894 return (FC_NOSUPPORT);
895 }
896
897 retry:
898 if (!is_xhat) {
899 /*
900 * Indicate that the lwp is not to be stopped while waiting
901 * for a pagefault. This is to avoid deadlock while debugging
902 * a process via /proc over NFS (in particular).
903 */
904 if (lwp != NULL)
905 lwp->lwp_nostop++;
906
907 /*
908 * same length must be used when we softlock and softunlock.
909 * We don't support softunlocking lengths less than
910 * the original length when there is largepage support.
911 * See seg_dev.c for more comments.
912 */
913 switch (type) {
914
915 case F_SOFTLOCK:
916 CPU_STATS_ADD_K(vm, softlock, 1);
917 break;
918
919 case F_SOFTUNLOCK:
920 break;
921
922 case F_PROT:
923 CPU_STATS_ADD_K(vm, prot_fault, 1);
924 break;
925
926 case F_INVAL:
927 CPU_STATS_ENTER_K();
928 CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
929 if (as == &kas)
930 CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
931 CPU_STATS_EXIT_K();
932 break;
933 }
934 }
935
936 /* Kernel probe */
937 TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
938 tnf_opaque, address, addr,
939 tnf_fault_type, fault_type, type,
940 tnf_seg_access, access, rw);
941
942 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
943 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
944 (size_t)raddr;
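/*
 * A worked example of the rounding above, assuming 4K pages
 * (PAGESIZE 0x1000): for addr 0x12f80 and size 0x100, raddr becomes
 * 0x12000 and rsize becomes 0x2000, i.e. two pages, because
 * [addr, addr + size) straddles a page boundary.
 */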
945
946 /*
947 * XXX -- Don't grab the as lock for segkmap. We should grab it for
948 * correctness, but then we could be stuck holding this lock for
949 * a LONG time if the fault needs to be resolved on a slow
950 * filesystem, and then no-one will be able to exec new commands,
951 * as exec'ing requires the write lock on the as.
952 */
953 if (as == &kas && segkmap && segkmap->s_base <= raddr &&
954 raddr + size < segkmap->s_base + segkmap->s_size) {
955 /*
956 * if (as==&kas), this can't be XHAT: we've already returned
957 * FC_NOSUPPORT.
958 */
959 seg = segkmap;
960 as_lock_held = 0;
961 } else {
962 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
963 if (is_xhat && avl_numnodes(&as->a_wpage) != 0) {
964 /*
965 * Grab and hold the writers' lock on the as
966 * if the fault is to a watched page.
967 * This will keep CPUs from "peeking" at the
968 * address range while we're temporarily boosting
969 * the permissions for the XHAT device to
970 * resolve the fault in the segment layer.
971 *
972 * We could check whether faulted address
973 * is within a watched page and only then grab
974 * the writer lock, but this is simpler.
975 */
976 AS_LOCK_EXIT(as, &as->a_lock);
977 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
978 }
979
980 seg = as_segat(as, raddr);
981 if (seg == NULL) {
982 AS_LOCK_EXIT(as, &as->a_lock);
983 if ((lwp != NULL) && (!is_xhat))
984 lwp->lwp_nostop--;
985 return (FC_NOMAP);
986 }
987
988 as_lock_held = 1;
989 }
990
991 addrsav = raddr;
992 segsav = seg;
993
994 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
995 if (raddr >= seg->s_base + seg->s_size) {
996 seg = AS_SEGNEXT(as, seg);
997 if (seg == NULL || raddr != seg->s_base) {
998 res = FC_NOMAP;
999 break;
1000 }
1001 }
1002 if (raddr + rsize > seg->s_base + seg->s_size)
1003 ssize = seg->s_base + seg->s_size - raddr;
1004 else
1005 ssize = rsize;
1006
1007 if (!is_xhat || (seg->s_ops != &segdev_ops)) {
1008
1009 if (is_xhat && avl_numnodes(&as->a_wpage) != 0 &&
1010 pr_is_watchpage_as(raddr, rw, as)) {
1011 /*
1012 * Handle watch pages. If we're faulting on a
1013 * watched page from an X-hat, we have to
1014 * restore the original permissions while we
1015 * handle the fault.
1016 */
1017 as_clearwatch(as);
1018 holding_wpage = 1;
1019 }
1020
1021 res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
1022
1023 /* Restore watchpoints */
1024 if (holding_wpage) {
1025 as_setwatch(as);
1026 holding_wpage = 0;
1027 }
1028
1029 if (res != 0)
1030 break;
1031 } else {
1032 /* XHAT does not support seg_dev */
1033 res = FC_NOSUPPORT;
1034 break;
1035 }
1036 }
1037
1038 /*
1039 * If we were SOFTLOCKing and encountered a failure,
1040 * we must SOFTUNLOCK the range we already did. (Maybe we
1041 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
1042 * right here...)
1043 */
1044 if (res != 0 && type == F_SOFTLOCK) {
1045 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
1046 if (addrsav >= seg->s_base + seg->s_size)
1047 seg = AS_SEGNEXT(as, seg);
1048 ASSERT(seg != NULL);
1049 /*
1050 * Now call the fault routine again to perform the
1051 * unlock using S_OTHER instead of the rw variable
1052 * since we never got a chance to touch the pages.
1053 */
1054 if (raddr > seg->s_base + seg->s_size)
1055 ssize = seg->s_base + seg->s_size - addrsav;
1056 else
1057 ssize = raddr - addrsav;
1058 (void) SEGOP_FAULT(hat, seg, addrsav, ssize,
1059 F_SOFTUNLOCK, S_OTHER);
1060 }
1061 }
1062 if (as_lock_held)
1063 AS_LOCK_EXIT(as, &as->a_lock);
1064 if ((lwp != NULL) && (!is_xhat))
1065 lwp->lwp_nostop--;
1066
1067 /*
1068 * If the lower levels returned EDEADLK for a fault,
1069 * it means that we should retry the fault. Let's wait
1070 * a bit to let the deadlock-causing condition clear.
1071 * This is part of a gross hack to work around a design flaw
1072 * in the ufs/sds logging code and should go away when the
1073 * logging code is re-designed to fix the problem. See bug
1074 * 4125102 for details of the problem.
1075 */
1076 if (FC_ERRNO(res) == EDEADLK) {
1077 delay(deadlk_wait);
1078 res = 0;
1079 goto retry;
1080 }
1081 return (res);
1082 }
1083
1084
1108 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1109 (size_t)raddr;
1110
1111 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1112 seg = as_segat(as, raddr);
1113 if (seg == NULL) {
1114 AS_LOCK_EXIT(as, &as->a_lock);
1115 if (lwp != NULL)
1116 lwp->lwp_nostop--;
1117 return (FC_NOMAP);
1118 }
1119
1120 for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1121 if (raddr >= seg->s_base + seg->s_size) {
1122 seg = AS_SEGNEXT(as, seg);
1123 if (seg == NULL || raddr != seg->s_base) {
1124 res = FC_NOMAP;
1125 break;
1126 }
1127 }
1128 res = SEGOP_FAULTA(seg, raddr);
1129 if (res != 0)
1130 break;
1131 }
1132 AS_LOCK_EXIT(as, &as->a_lock);
1133 if (lwp != NULL)
1134 lwp->lwp_nostop--;
1135 /*
1136 * If the lower levels returned EDEADLK for a fault,
1137 * it means that we should retry the fault. Let's wait
1138 * a bit to let the deadlock-causing condition clear.
1139 * This is part of a gross hack to work around a design flaw
1140 * in the ufs/sds logging code and should go away when the
1141 * logging code is re-designed to fix the problem. See bug
1142 * 4125102 for details of the problem.
1143 */
1144 if (FC_ERRNO(res) == EDEADLK) {
1145 delay(deadlk_wait);
1146 res = 0;
1147 goto retry;
1148 }
1198 seg = as_segat(as, raddr);
1199 if (seg == NULL) {
1200 as_setwatch(as);
1201 AS_LOCK_EXIT(as, &as->a_lock);
1202 return (ENOMEM);
1203 }
1204
1205 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1206 if (raddr >= seg->s_base + seg->s_size) {
1207 seg = AS_SEGNEXT(as, seg);
1208 if (seg == NULL || raddr != seg->s_base) {
1209 error = ENOMEM;
1210 break;
1211 }
1212 }
1213 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1214 ssize = seg->s_base + seg->s_size - raddr;
1215 else
1216 ssize = rsize;
1217 retry:
1218 error = SEGOP_SETPROT(seg, raddr, ssize, prot);
1219
1220 if (error == IE_NOMEM) {
1221 error = EAGAIN;
1222 break;
1223 }
1224
1225 if (error == IE_RETRY) {
1226 AS_LOCK_EXIT(as, &as->a_lock);
1227 writer = 1;
1228 goto setprot_top;
1229 }
1230
1231 if (error == EAGAIN) {
1232 /*
1233 * Make sure we have a_lock as writer.
1234 */
1235 if (writer == 0) {
1236 AS_LOCK_EXIT(as, &as->a_lock);
1237 writer = 1;
1238 goto setprot_top;
1349 seg = as_segat(as, raddr);
1350 if (seg == NULL) {
1351 as_setwatch(as);
1352 AS_LOCK_EXIT(as, &as->a_lock);
1353 return (ENOMEM);
1354 }
1355
1356 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1357 if (raddr >= seg->s_base + seg->s_size) {
1358 seg = AS_SEGNEXT(as, seg);
1359 if (seg == NULL || raddr != seg->s_base) {
1360 error = ENOMEM;
1361 break;
1362 }
1363 }
1364 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1365 ssize = seg->s_base + seg->s_size - raddr;
1366 else
1367 ssize = rsize;
1368
1369 error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
1370 if (error != 0)
1371 break;
1372 }
1373 as_setwatch(as);
1374 AS_LOCK_EXIT(as, &as->a_lock);
1375 return (error);
1376 }
1377
1378 int
1379 as_unmap(struct as *as, caddr_t addr, size_t size)
1380 {
1381 struct seg *seg, *seg_next;
1382 struct as_callback *cb;
1383 caddr_t raddr, eaddr;
1384 size_t ssize, rsize = 0;
1385 int err;
1386
1387 top:
1388 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1389 eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1415 else
1416 ssize = eaddr - raddr;
1417
1418 /*
1419 * Save next segment pointer since seg can be
1420 * destroyed during the segment unmap operation.
1421 */
1422 seg_next = AS_SEGNEXT(as, seg);
1423
1424 /*
1425 * We didn't count /dev/null mappings, so ignore them here.
1426 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1427 * we have to do this check here while we have seg.)
1428 */
1429 rsize = 0;
1430 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1431 !SEG_IS_PARTIAL_RESV(seg))
1432 rsize = ssize;
1433
1434 retry:
1435 err = SEGOP_UNMAP(seg, raddr, ssize);
1436 if (err == EAGAIN) {
1437 /*
1438 * Memory is currently locked. It must be unlocked
1439 * before this operation can succeed through a retry.
1440 * The possible reasons for locked memory and
1441 * corresponding strategies for unlocking are:
1442 * (1) Normal I/O
1443 * wait for a signal that the I/O operation
1444 * has completed and the memory is unlocked.
1445 * (2) Asynchronous I/O
1446 * The aio subsystem does not unlock pages when
1447 * the I/O is completed. Those pages are unlocked
1448 * when the application calls aiowait/aioerror.
1449 * So, to prevent blocking forever, cv_broadcast()
1450 * is done to wake up aio_cleanup_thread.
1451 * Subsequently, segvn_reclaim will be called, and
1452 * that will do AS_CLRUNMAPWAIT() and wake us up.
1453 * (3) Long term page locking:
1454 * Drivers intending to have pages locked for a
1455 * period considerably longer than for normal I/O
1853 */
1854 void
1855 as_purge(struct as *as)
1856 {
1857 struct seg *seg;
1858 struct seg *next_seg;
1859
1860 /*
1861 * The setting of AS_NEEDSPURGE is protected by as_rangelock(), so
1862 * there is no need to grab the a_contents mutex for this check.
1863 */
1864 if ((as->a_flags & AS_NEEDSPURGE) == 0)
1865 return;
1866
1867 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1868 next_seg = NULL;
1869 seg = AS_SEGFIRST(as);
1870 while (seg != NULL) {
1871 next_seg = AS_SEGNEXT(as, seg);
1872 if (seg->s_flags & S_PURGE)
1873 SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1874 seg = next_seg;
1875 }
1876 AS_LOCK_EXIT(as, &as->a_lock);
1877
1878 mutex_enter(&as->a_contents);
1879 as->a_flags &= ~AS_NEEDSPURGE;
1880 mutex_exit(&as->a_contents);
1881 }
1882
1883 /*
1884 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1885 * range of addresses at least "minlen" long, where the base of the range is
1886 * at "off" phase from an "align" boundary and there is space for a
1887 * "redzone"-sized redzone on eithe rside of the range. Thus,
1888 * if align was 4M and off was 16k, the user wants a hole which will start
1889 * 16k into a 4M page.
1890 *
1891 * If flags specifies AH_HI, the hole will have the highest possible address
1892 * in the range. We use the as->a_lastgap field to figure out where to
1893 * start looking for a gap.
2071 * -1 is returned.
2072 *
2073 * NOTE: This routine is not correct when base+len overflows caddr_t.
2074 */
2075 int
2076 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
2077 caddr_t addr)
2078 {
2079
2080 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
2081 }
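/*
 * A minimal usage sketch (hypothetical values): to search a window
 * [base, base + len) for a 64K hole at the lowest possible address,
 * a caller could do
 *
 *	if (as_gap(as, 0x10000, &base, &len, AH_LO, NULL) == 0) {
 *		...	[base, base + len) is now a mappable hole
 *	}
 */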
2082
2083 /*
2084 * Return the next range within [base, base + len) that is backed
2085 * with "real memory". Skip holes and non-seg_vn segments.
2086 * We're lazy and only return one segment at a time.
2087 */
2088 int
2089 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2090 {
2091 extern struct seg_ops segspt_shmops; /* needs a header file */
2092 struct seg *seg;
2093 caddr_t addr, eaddr;
2094 caddr_t segend;
2095
2096 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2097
2098 addr = *basep;
2099 eaddr = addr + *lenp;
2100
2101 seg = as_findseg(as, addr, 0);
2102 if (seg != NULL)
2103 addr = MAX(seg->s_base, addr);
2104
2105 for (;;) {
2106 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2107 AS_LOCK_EXIT(as, &as->a_lock);
2108 return (EINVAL);
2109 }
2110
2111 if (seg->s_ops == &segvn_ops) {
2124 }
2125
2126 seg = AS_SEGNEXT(as, seg);
2127
2128 if (seg != NULL)
2129 addr = seg->s_base;
2130 }
2131
2132 *basep = addr;
2133
2134 if (segend > eaddr)
2135 *lenp = eaddr - addr;
2136 else
2137 *lenp = segend - addr;
2138
2139 AS_LOCK_EXIT(as, &as->a_lock);
2140 return (0);
2141 }
2142
2143 /*
2144 * Swap the pages associated with the address space as out to
2145 * secondary storage, returning the number of bytes actually
2146 * swapped.
2147 *
2148 * The value returned is intended to correlate well with the process's
2149 * memory requirements. Its usefulness for this purpose depends on
2150 * how well the segment-level routines do at returning accurate
2151 * information.
2152 */
2153 size_t
2154 as_swapout(struct as *as)
2155 {
2156 struct seg *seg;
2157 size_t swpcnt = 0;
2158
2159 /*
2160 * Kernel-only processes have given up their address
2161 * spaces. Of course, we shouldn't be attempting to
2162 * swap out such processes in the first place...
2163 */
2164 if (as == NULL)
2165 return (0);
2166
2167 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2168
2169 /* Prevent XHATs from attaching */
2170 mutex_enter(&as->a_contents);
2171 AS_SETBUSY(as);
2172 mutex_exit(&as->a_contents);
2173
2174
2175 /*
2176 * Free all mapping resources associated with the address
2177 * space. The segment-level swapout routines capitalize
2178 * on this unmapping by scavenging pages that have become
2179 * unmapped here.
2180 */
2181 hat_swapout(as->a_hat);
2182 if (as->a_xhat != NULL)
2183 xhat_swapout_all(as);
2184
2185 mutex_enter(&as->a_contents);
2186 AS_CLRBUSY(as);
2187 mutex_exit(&as->a_contents);
2188
2189 /*
2190 * Call the swapout routines of all segments in the address
2191 * space to do the actual work, accumulating the amount of
2192 * space reclaimed.
2193 */
2194 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2195 struct seg_ops *ov = seg->s_ops;
2196
2197 /*
2198 * We have to check to see if the seg has
2199 * an ops vector because the seg may have
2200 * been in the middle of being set up when
2201 * the process was picked for swapout.
2202 */
2203 if ((ov != NULL) && (ov->swapout != NULL))
2204 swpcnt += SEGOP_SWAPOUT(seg);
2205 }
2206 AS_LOCK_EXIT(as, &as->a_lock);
2207 return (swpcnt);
2208 }
2209
2210 /*
2211 * Determine whether data from the mappings in interval [addr, addr + size)
2212 * are in the primary memory (core) cache.
2213 */
2214 int
2215 as_incore(struct as *as, caddr_t addr,
2216 size_t size, char *vec, size_t *sizep)
2217 {
2218 struct seg *seg;
2219 size_t ssize;
2220 caddr_t raddr; /* rounded down addr */
2221 size_t rsize; /* rounded up size */
2222 size_t isize; /* iteration size */
2223 int error = 0; /* result, assume success */
2224
2225 *sizep = 0;
2226 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2227 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2228 (size_t)raddr;
2229
2230 if (raddr + rsize < raddr) /* check for wraparound */
2231 return (ENOMEM);
2232
2233 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2234 seg = as_segat(as, raddr);
2235 if (seg == NULL) {
2236 AS_LOCK_EXIT(as, &as->a_lock);
2237 return (-1);
2238 }
2239
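/*
 * Walk the range segment by segment. vec advances one byte per page
 * (btopr(ssize) entries per segment), matching the mincore(2)-style
 * per-page vector that SEGOP_INCORE() fills in.
 */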
2240 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2241 if (raddr >= seg->s_base + seg->s_size) {
2242 seg = AS_SEGNEXT(as, seg);
2243 if (seg == NULL || raddr != seg->s_base) {
2244 error = -1;
2245 break;
2246 }
2247 }
2248 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2249 ssize = seg->s_base + seg->s_size - raddr;
2250 else
2251 ssize = rsize;
2252 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
2253 if (isize != ssize) {
2254 error = -1;
2255 break;
2256 }
2257 vec += btopr(ssize);
2258 }
2259 AS_LOCK_EXIT(as, &as->a_lock);
2260 return (error);
2261 }
2262
2263 static void
2264 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2265 ulong_t *bitmap, size_t position, size_t npages)
2266 {
2267 caddr_t range_start;
2268 size_t pos1 = position;
2269 size_t pos2;
2270 size_t size;
2271 size_t end_pos = npages + position;
2272
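/*
 * bt_range() yields the next run of consecutive set bits in the
 * bitmap, i.e. the next stretch of pages this caller actually locked.
 * For example, with bits {0, 1, 4, 5} set it reports the runs [0, 2)
 * and [4, 6), and only those page ranges are unlocked below.
 */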
2273 while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2274 size = ptob((pos2 - pos1));
2275 range_start = (caddr_t)((uintptr_t)addr +
2276 ptob(pos1 - position));
2277
2278 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
2279 (ulong_t *)NULL, (size_t)NULL);
2280 pos1 = pos2;
2281 }
2282 }
2283
2284 static void
2285 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2286 caddr_t raddr, size_t rsize)
2287 {
2288 struct seg *seg = as_segat(as, raddr);
2289 size_t ssize;
2290
2291 while (rsize != 0) {
2292 if (raddr >= seg->s_base + seg->s_size)
2293 seg = AS_SEGNEXT(as, seg);
2294
2295 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2296 ssize = seg->s_base + seg->s_size - raddr;
2297 else
2298 ssize = rsize;
2354 if (seg == NULL) {
2355 AS_LOCK_EXIT(as, &as->a_lock);
2356 return (0);
2357 }
2358
2359 do {
2360 raddr = (caddr_t)((uintptr_t)seg->s_base &
2361 (uintptr_t)PAGEMASK);
2362 rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2363 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2364 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2365
2366 mlock_size = BT_BITOUL(btopr(rlen));
2367 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2368 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2369 AS_LOCK_EXIT(as, &as->a_lock);
2370 return (EAGAIN);
2371 }
2372
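/*
 * Sizing sketch for mlock_map above: one bit per page of the rounded
 * address space, packed into ulong_t words by BT_BITOUL(). Assuming
 * 4K pages and a 64-bit kernel, locking 1MB of mappings needs 256
 * bits, i.e. 4 words (32 bytes) of map.
 */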
2373 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2374 error = SEGOP_LOCKOP(seg, seg->s_base,
2375 seg->s_size, attr, MC_LOCK, mlock_map, pos);
2376 if (error != 0)
2377 break;
2378 pos += seg_pages(seg);
2379 }
2380
2381 if (error) {
2382 for (seg = AS_SEGFIRST(as); seg != NULL;
2383 seg = AS_SEGNEXT(as, seg)) {
2384
2385 raddr = (caddr_t)((uintptr_t)seg->s_base &
2386 (uintptr_t)PAGEMASK);
2387 npages = seg_pages(seg);
2388 as_segunlock(seg, raddr, attr, mlock_map,
2389 idx, npages);
2390 idx += npages;
2391 }
2392 }
2393
2394 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2395 AS_LOCK_EXIT(as, &as->a_lock);
2396 goto lockerr;
2397 } else if (func == MC_UNLOCKAS) {
2398 mutex_enter(&as->a_contents);
2399 AS_CLRPGLCK(as);
2400 mutex_exit(&as->a_contents);
2401
2402 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2403 error = SEGOP_LOCKOP(seg, seg->s_base,
2404 seg->s_size, attr, MC_UNLOCK, NULL, 0);
2405 if (error != 0)
2406 break;
2407 }
2408
2409 AS_LOCK_EXIT(as, &as->a_lock);
2410 goto lockerr;
2411 }
2412
2413 /*
2414 * Normalize addresses and sizes.
2415 */
2416 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2417 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2418 (size_t)raddr;
2419
2420 if (raddr + rsize < raddr) { /* check for wraparound */
2421 AS_LOCK_EXIT(as, &as->a_lock);
2422 return (ENOMEM);
2423 }
2461 }
2462 AS_LOCK_EXIT(as, &as->a_lock);
2463 return (ENOMEM);
2464 }
2465 }
2466 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2467 ssize = seg->s_base + seg->s_size - raddr;
2468 else
2469 ssize = rsize;
2470
2471 /*
2472 * Dispatch on specific function.
2473 */
2474 switch (func) {
2475
2476 /*
2477 * Synchronize cached data from mappings with backing
2478 * objects.
2479 */
2480 case MC_SYNC:
2481 if (error = SEGOP_SYNC(seg, raddr, ssize,
2482 attr, (uint_t)arg)) {
2483 AS_LOCK_EXIT(as, &as->a_lock);
2484 return (error);
2485 }
2486 break;
2487
2488 /*
2489 * Lock pages in memory.
2490 */
2491 case MC_LOCK:
2492 if (error = SEGOP_LOCKOP(seg, raddr, ssize,
2493 attr, func, mlock_map, pos)) {
2494 as_unlockerr(as, attr, mlock_map, initraddr,
2495 initrsize - rsize + ssize);
2496 kmem_free(mlock_map, mlock_size *
2497 sizeof (ulong_t));
2498 AS_LOCK_EXIT(as, &as->a_lock);
2499 goto lockerr;
2500 }
2501 break;
2502
2503 /*
2504 * Unlock mapped pages.
2505 */
2506 case MC_UNLOCK:
2507 (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
2508 (ulong_t *)NULL, (size_t)NULL);
2509 break;
2510
2511 /*
2512 * Store VM advise for mapped pages in segment layer.
2513 */
2514 case MC_ADVISE:
2515 error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);
2516
2517 /*
2518 * Check for regular errors and special retry error
2519 */
2520 if (error) {
2521 if (error == IE_RETRY) {
2522 /*
2523 * Need to acquire writers lock, so
2524 * have to drop readers lock and start
2525 * all over again
2526 */
2527 AS_LOCK_EXIT(as, &as->a_lock);
2528 goto retry;
2529 } else if (error == IE_REATTACH) {
2530 /*
2531 * Find segment for current address
2532 * because current segment just got
2533 * split or concatenated
2534 */
2535 seg = as_segat(as, raddr);
2536 if (seg == NULL) {
2537 AS_LOCK_EXIT(as, &as->a_lock);
2538 return (ENOMEM);
2539 }
2540 } else {
2541 /*
2542 * Regular error
2543 */
2544 AS_LOCK_EXIT(as, &as->a_lock);
2545 return (error);
2546 }
2547 }
2548 break;
2549
2550 case MC_INHERIT_ZERO:
2551 if (seg->s_ops->inherit == NULL) {
2552 error = ENOTSUP;
2553 } else {
2554 error = SEGOP_INHERIT(seg, raddr, ssize,
2555 SEGP_INH_ZERO);
2556 }
2557 if (error != 0) {
2558 AS_LOCK_EXIT(as, &as->a_lock);
2559 return (error);
2560 }
2561 break;
2562
2563 /*
2564 * Can't happen.
2565 */
2566 default:
2567 panic("as_ctl: bad operation %d", func);
2568 /*NOTREACHED*/
2569 }
2570
2571 rsize -= ssize;
2572 raddr += ssize;
2573 }
2574
2575 if (func == MC_LOCK)
2576 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2620 * as expected by the caller. Save pointers to per segment shadow lists at
2621 * the tail of plist so that they can be used during as_pageunlock().
2622 */
2623 static int
2624 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2625 caddr_t addr, size_t size, enum seg_rw rw)
2626 {
2627 caddr_t sv_addr = addr;
2628 size_t sv_size = size;
2629 struct seg *sv_seg = seg;
2630 ulong_t segcnt = 1;
2631 ulong_t cnt;
2632 size_t ssize;
2633 pgcnt_t npages = btop(size);
2634 page_t **plist;
2635 page_t **pl;
2636 int error;
2637 caddr_t eaddr;
2638 faultcode_t fault_err = 0;
2639 pgcnt_t pl_off;
2640 extern struct seg_ops segspt_shmops;
2641
2642 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2643 ASSERT(seg != NULL);
2644 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2645 ASSERT(addr + size > seg->s_base + seg->s_size);
2646 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2647 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2648
2649 /*
2650 * Count the number of segments covered by the range we are about to
2651 * lock. The segment count is used to size the shadow list we return
2652 * to the caller.
2653 */
2654 for (; size != 0; size -= ssize, addr += ssize) {
2655 if (addr >= seg->s_base + seg->s_size) {
2656
2657 seg = AS_SEGNEXT(as, seg);
2658 if (seg == NULL || addr != seg->s_base) {
2659 AS_LOCK_EXIT(as, &as->a_lock);
2660 return (EFAULT);
2661 }
2662 /*
2663 * Do a quick check if subsequent segments
2664 * will most likely support pagelock.
2665 */
2666 if (seg->s_ops == &segvn_ops) {
2667 vnode_t *vp;
2668
2669 if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
2670 vp != NULL) {
2671 AS_LOCK_EXIT(as, &as->a_lock);
2672 goto slow;
2673 }
2674 } else if (seg->s_ops != &segspt_shmops) {
2675 AS_LOCK_EXIT(as, &as->a_lock);
2676 goto slow;
2677 }
2678 segcnt++;
2679 }
2680 if (addr + size > seg->s_base + seg->s_size) {
2681 ssize = seg->s_base + seg->s_size - addr;
2682 } else {
2683 ssize = size;
2684 }
2685 }
2686 ASSERT(segcnt > 1);
2687
2688 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
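/*
 * Layout of the shadow list allocated above (a sketch):
 *
 *	plist[0 .. npages - 1]	merged per-page pointers handed
 *				back to the caller
 *	plist[npages .. npages + segcnt - 1]
 *				one shadow-list pointer per segment,
 *				used again at unlock time
 */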
2689
2690 addr = sv_addr;
2691 size = sv_size;
2692 seg = sv_seg;
2693
2694 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2695 if (addr >= seg->s_base + seg->s_size) {
2696 seg = AS_SEGNEXT(as, seg);
2697 ASSERT(seg != NULL && addr == seg->s_base);
2698 cnt++;
2699 ASSERT(cnt < segcnt);
2700 }
2701 if (addr + size > seg->s_base + seg->s_size) {
2702 ssize = seg->s_base + seg->s_size - addr;
2703 } else {
2704 ssize = size;
2705 }
2706 pl = &plist[npages + cnt];
2707 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2708 L_PAGELOCK, rw);
2709 if (error) {
2710 break;
2711 }
2712 ASSERT(plist[npages + cnt] != NULL);
2713 ASSERT(pl_off + btop(ssize) <= npages);
2714 bcopy(plist[npages + cnt], &plist[pl_off],
2715 btop(ssize) * sizeof (page_t *));
2716 pl_off += btop(ssize);
2717 }
2718
2719 if (size == 0) {
2720 AS_LOCK_EXIT(as, &as->a_lock);
2721 ASSERT(cnt == segcnt - 1);
2722 *ppp = plist;
2723 return (0);
2724 }
2725
2726 /*
2727 * One of the pagelock calls failed. The error type is in the error variable.
2730 * back to the caller.
2731 */
2732
2733 eaddr = addr;
2734 seg = sv_seg;
2735
2736 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2737 if (addr >= seg->s_base + seg->s_size) {
2738 seg = AS_SEGNEXT(as, seg);
2739 ASSERT(seg != NULL && addr == seg->s_base);
2740 cnt++;
2741 ASSERT(cnt < segcnt);
2742 }
2743 if (eaddr > seg->s_base + seg->s_size) {
2744 ssize = seg->s_base + seg->s_size - addr;
2745 } else {
2746 ssize = eaddr - addr;
2747 }
2748 pl = &plist[npages + cnt];
2749 ASSERT(*pl != NULL);
2750 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2751 L_PAGEUNLOCK, rw);
2752 }
2753
2754 AS_LOCK_EXIT(as, &as->a_lock);
2755
2756 kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2757
2758 if (error != ENOTSUP && error != EFAULT) {
2759 return (error);
2760 }
2761
2762 slow:
2763 /*
2764 * If we are here because pagelock failed due to the need to cow fault
2765 * in the pages we want to lock, F_SOFTLOCK will do this job and in
2766 * the next as_pagelock() call for this address range pagelock will
2767 * hopefully succeed.
2768 */
2769 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2770 if (fault_err != 0) {
2805 seg = as_segat(as, raddr);
2806 if (seg == NULL) {
2807 AS_LOCK_EXIT(as, &as->a_lock);
2808 return (EFAULT);
2809 }
2810 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2811 if (raddr + rsize > seg->s_base + seg->s_size) {
2812 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2813 }
2814 if (raddr + rsize <= raddr) {
2815 AS_LOCK_EXIT(as, &as->a_lock);
2816 return (EFAULT);
2817 }
2818
2819 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2820 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2821
2822 /*
2823 * try to lock pages and pass back shadow list
2824 */
2825 err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2826
2827 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2828
2829 AS_LOCK_EXIT(as, &as->a_lock);
2830
2831 if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2832 return (err);
2833 }
2834
2835 /*
2836 * Use F_SOFTLOCK to lock the pages because pagelock failed either due
2837 * to no pagelock support for this segment or pages need to be cow
2838 * faulted in. If fault is needed F_SOFTLOCK will do this job for
2839 * this as_pagelock() call and in the next as_pagelock() call for the
2840 * same address range pagelock will hopefully succeed.
2841 */
2842 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2843 if (fault_err != 0) {
2844 return (fc_decode(fault_err));
2845 }
2868 ASSERT(seg != NULL);
2869 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2870 ASSERT(addr + size > seg->s_base + seg->s_size);
2871 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2872 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2873 ASSERT(plist != NULL);
2874
2875 for (cnt = 0; addr < eaddr; addr += ssize) {
2876 if (addr >= seg->s_base + seg->s_size) {
2877 seg = AS_SEGNEXT(as, seg);
2878 ASSERT(seg != NULL && addr == seg->s_base);
2879 cnt++;
2880 }
2881 if (eaddr > seg->s_base + seg->s_size) {
2882 ssize = seg->s_base + seg->s_size - addr;
2883 } else {
2884 ssize = eaddr - addr;
2885 }
2886 pl = &plist[npages + cnt];
2887 ASSERT(*pl != NULL);
2888 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2889 L_PAGEUNLOCK, rw);
2890 }
2891 ASSERT(cnt > 0);
2892 AS_LOCK_EXIT(as, &as->a_lock);
2893
2894 cnt++;
2895 kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2896 }
2897
2898 /*
2899 * unlock pages in a given address range
2900 */
2901 void
2902 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2903 enum seg_rw rw)
2904 {
2905 struct seg *seg;
2906 size_t rsize;
2907 caddr_t raddr;
2908
2914 * falling back to as_fault
2915 */
2916 if (pp == NULL) {
2917 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2918 return;
2919 }
2920
2921 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2922 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2923 (size_t)raddr;
2924
2925 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2926 seg = as_segat(as, raddr);
2927 ASSERT(seg != NULL);
2928
2929 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2930 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2931
2932 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2933 if (raddr + rsize <= seg->s_base + seg->s_size) {
2934 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2935 } else {
2936 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2937 return;
2938 }
2939 AS_LOCK_EXIT(as, &as->a_lock);
2940 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2941 }
2942
2943 int
2944 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2945 boolean_t wait)
2946 {
2947 struct seg *seg;
2948 size_t ssize;
2949 caddr_t raddr; /* rounded down addr */
2950 size_t rsize; /* rounded up size */
2951 int error = 0;
2952 size_t pgsz = page_get_pagesize(szc);
2953
2954 setpgsz_top:
2969 as_setwatch(as);
2970 AS_LOCK_EXIT(as, &as->a_lock);
2971 return (ENOMEM);
2972 }
2973
2974 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2975 if (raddr >= seg->s_base + seg->s_size) {
2976 seg = AS_SEGNEXT(as, seg);
2977 if (seg == NULL || raddr != seg->s_base) {
2978 error = ENOMEM;
2979 break;
2980 }
2981 }
2982 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2983 ssize = seg->s_base + seg->s_size - raddr;
2984 } else {
2985 ssize = rsize;
2986 }
2987
2988 retry:
2989 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
2990
2991 if (error == IE_NOMEM) {
2992 error = EAGAIN;
2993 break;
2994 }
2995
2996 if (error == IE_RETRY) {
2997 AS_LOCK_EXIT(as, &as->a_lock);
2998 goto setpgsz_top;
2999 }
3000
3001 if (error == ENOTSUP) {
3002 error = EINVAL;
3003 break;
3004 }
3005
3006 if (wait && (error == EAGAIN)) {
3007 /*
3008 * Memory is currently locked. It must be unlocked
3009 * before this operation can succeed through a retry.
3048 * number of retries without sleeping should
3049 * be very small. See segvn_reclaim() for
3050 * more comments.
3051 */
3052 AS_CLRNOUNMAPWAIT(as);
3053 mutex_exit(&as->a_contents);
3054 goto retry;
3055 }
3056 mutex_exit(&as->a_contents);
3057 goto setpgsz_top;
3058 } else if (error != 0) {
3059 break;
3060 }
3061 }
3062 as_setwatch(as);
3063 AS_LOCK_EXIT(as, &as->a_lock);
3064 return (error);
3065 }
3066
3067 /*
3068 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
3069 * in its chunk where s_szc is less than the szc we want to set.
3070 */
3071 static int
3072 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3073 int *retry)
3074 {
3075 struct seg *seg;
3076 size_t ssize;
3077 int error;
3078
3079 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3080
3081 seg = as_segat(as, raddr);
3082 if (seg == NULL) {
3083 panic("as_iset3_default_lpsize: no seg");
3084 }
3085
3086 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
3087 if (raddr >= seg->s_base + seg->s_size) {
3088 seg = AS_SEGNEXT(as, seg);
3089 if (seg == NULL || raddr != seg->s_base) {
3090 panic("as_iset3_default_lpsize: as changed");
3091 }
3092 }
3093 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3094 ssize = seg->s_base + seg->s_size - raddr;
3095 } else {
3096 ssize = rsize;
3097 }
3098
3099 if (szc > seg->s_szc) {
3100 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
3101 /* Only retry on EINVAL segments that have no vnode. */
3102 if (error == EINVAL) {
3103 vnode_t *vp = NULL;
3104 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) &&
3105 (SEGOP_GETVP(seg, raddr, &vp) != 0 ||
3106 vp == NULL)) {
3107 *retry = 1;
3108 } else {
3109 *retry = 0;
3110 }
3111 }
3112 if (error) {
3113 return (error);
3114 }
3115 }
3116 }
3117 return (0);
3118 }
3119
3120 /*
3121 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
3122 * pagesize on each segment in its range, but if any fails with EINVAL,
3123 * then it reduces the pagesizes to the next size in the bitmap and
3124 * retries as_iset3_default_lpsize(). The reason why the code retries
3125 * smaller allowed sizes on EINVAL is because (a) the anon offset may not
3328 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3329 again:
3330 error = 0;
3331
3332 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3333 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3334 (size_t)raddr;
3335
3336 if (raddr + rsize < raddr) { /* check for wraparound */
3337 AS_LOCK_EXIT(as, &as->a_lock);
3338 return (ENOMEM);
3339 }
3340 as_clearwatchprot(as, raddr, rsize);
3341 seg = as_segat(as, raddr);
3342 if (seg == NULL) {
3343 as_setwatch(as);
3344 AS_LOCK_EXIT(as, &as->a_lock);
3345 return (ENOMEM);
3346 }
3347 if (seg->s_ops == &segvn_ops) {
3348 rtype = SEGOP_GETTYPE(seg, addr);
3349 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3350 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3351 segvn = 1;
3352 } else {
3353 segvn = 0;
3354 }
3355 setaddr = raddr;
3356 setsize = 0;
3357
3358 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3359 if (raddr >= (seg->s_base + seg->s_size)) {
3360 seg = AS_SEGNEXT(as, seg);
3361 if (seg == NULL || raddr != seg->s_base) {
3362 error = ENOMEM;
3363 break;
3364 }
3365 if (seg->s_ops == &segvn_ops) {
3366 stype = SEGOP_GETTYPE(seg, raddr);
3367 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3368 stype &= (MAP_SHARED | MAP_PRIVATE);
3369 if (segvn && (rflags != sflags ||
3370 rtype != stype)) {
3371 /*
3372 * The next segment is also segvn but
3373 * has different flags and/or type.
3374 */
3375 ASSERT(setsize != 0);
3376 error = as_iset_default_lpsize(as,
3377 setaddr, setsize, rflags, rtype);
3378 if (error) {
3379 break;
3380 }
3381 rflags = sflags;
3382 rtype = stype;
3383 setaddr = raddr;
3384 setsize = 0;
3385 } else if (!segvn) {
3386 rflags = sflags;
3460 as_setwatch(struct as *as)
3461 {
3462 struct watched_page *pwp;
3463 struct seg *seg;
3464 caddr_t vaddr;
3465 uint_t prot;
3466 int err, retrycnt;
3467
3468 if (avl_numnodes(&as->a_wpage) == 0)
3469 return;
3470
3471 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3472
3473 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3474 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3475 retrycnt = 0;
3476 retry:
3477 vaddr = pwp->wp_vaddr;
3478 if (pwp->wp_oprot != 0 || /* already set up */
3479 (seg = as_segat(as, vaddr)) == NULL ||
3480 SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
3481 continue;
3482
3483 pwp->wp_oprot = prot;
3484 if (pwp->wp_read)
3485 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3486 if (pwp->wp_write)
3487 prot &= ~PROT_WRITE;
3488 if (pwp->wp_exec)
3489 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
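/*
 * Note the asymmetry above: a write watchpoint only needs PROT_WRITE
 * removed, but read and exec watchpoints drop read, write and exec
 * together, since (on the MMUs this code targets, as we understand
 * it) a page cannot remain writable or executable while reads of it
 * still fault.
 */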
3490 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3491 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3492 if (err == IE_RETRY) {
3493 pwp->wp_oprot = 0;
3494 ASSERT(retrycnt == 0);
3495 retrycnt++;
3496 goto retry;
3497 }
3498 }
3499 pwp->wp_prot = prot;
3500 }
3501 }
3502
3503 /*
3504 * Clear all of the watched pages in the address space.
3505 */
3506 void
3507 as_clearwatch(struct as *as)
3508 {
3509 struct watched_page *pwp;
3510 struct seg *seg;
3511 caddr_t vaddr;
3512 uint_t prot;
3513 int err, retrycnt;
3514
3515 if (avl_numnodes(&as->a_wpage) == 0)
3516 return;
3517
3518 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3519
3520 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3521 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3522 retrycnt = 0;
3523 retry:
3524 vaddr = pwp->wp_vaddr;
3525 if (pwp->wp_oprot == 0 || /* not set up */
3526 (seg = as_segat(as, vaddr)) == NULL)
3527 continue;
3528
3529 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3530 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3531 if (err == IE_RETRY) {
3532 ASSERT(retrycnt == 0);
3533 retrycnt++;
3534 goto retry;
3535 }
3536 }
3537 pwp->wp_oprot = 0;
3538 pwp->wp_prot = 0;
3539 }
3540 }
3541
3542 /*
3543 * Force a new setup for all the watched pages in the range.
3544 */
3545 static void
3546 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3547 {
3548 struct watched_page *pwp;
3549 struct watched_page tpw;
3550 caddr_t eaddr = addr + size;
3564 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3565
3566 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3567 retrycnt = 0;
3568 vaddr = pwp->wp_vaddr;
3569
3570 wprot = prot;
3571 if (pwp->wp_read)
3572 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3573 if (pwp->wp_write)
3574 wprot &= ~PROT_WRITE;
3575 if (pwp->wp_exec)
3576 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3577 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3578 retry:
3579 seg = as_segat(as, vaddr);
3580 if (seg == NULL) {
3581 panic("as_setwatchprot: no seg");
3582 /*NOTREACHED*/
3583 }
3584 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot);
3585 if (err == IE_RETRY) {
3586 ASSERT(retrycnt == 0);
3587 retrycnt++;
3588 goto retry;
3589 }
3590 }
3591 pwp->wp_oprot = prot;
3592 pwp->wp_prot = wprot;
3593
3594 pwp = AVL_NEXT(&as->a_wpage, pwp);
3595 }
3596 }
3597
3598 /*
3599 * Clear all of the watched pages in the range.
3600 */
3601 static void
3602 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3603 {
3604 caddr_t eaddr = addr + size;
3611
3612 if (avl_numnodes(&as->a_wpage) == 0)
3613 return;
3614
3615 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3616 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3617 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3618
3619 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3620
3621 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3622
3623 if ((prot = pwp->wp_oprot) != 0) {
3624 retrycnt = 0;
3625
3626 if (prot != pwp->wp_prot) {
3627 retry:
3628 seg = as_segat(as, pwp->wp_vaddr);
3629 if (seg == NULL)
3630 continue;
3631 err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
3632 PAGESIZE, prot);
3633 if (err == IE_RETRY) {
3634 ASSERT(retrycnt == 0);
3635 retrycnt++;
3636 goto retry;
3637
3638 }
3639 }
3640 pwp->wp_oprot = 0;
3641 pwp->wp_prot = 0;
3642 }
3643
3644 pwp = AVL_NEXT(&as->a_wpage, pwp);
3645 }
3646 }
3647
3648 void
3649 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3650 {
3651 struct proc *p;
3660 }
3661 }
3662 mutex_exit(&pidlock);
3663 }
3664
3665 /*
3666 * return memory object ID
3667 */
3668 int
3669 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3670 {
3671 struct seg *seg;
3672 int sts;
3673
3674 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
3675 seg = as_segat(as, addr);
3676 if (seg == NULL) {
3677 AS_LOCK_EXIT(as, &as->a_lock);
3678 return (EFAULT);
3679 }
3680 /*
3681 * catch old drivers which may not support getmemid
3682 */
3683 if (seg->s_ops->getmemid == NULL) {
3684 AS_LOCK_EXIT(as, &as->a_lock);
3685 return (ENODEV);
3686 }
3687
3688 sts = SEGOP_GETMEMID(seg, addr, memidp);
3689
3690 AS_LOCK_EXIT(as, &as->a_lock);
3691 return (sts);
3692 }
42 */
43
44 #include <sys/types.h>
45 #include <sys/t_lock.h>
46 #include <sys/param.h>
47 #include <sys/errno.h>
48 #include <sys/systm.h>
49 #include <sys/mman.h>
50 #include <sys/sysmacros.h>
51 #include <sys/cpuvar.h>
52 #include <sys/sysinfo.h>
53 #include <sys/kmem.h>
54 #include <sys/vnode.h>
55 #include <sys/vmsystm.h>
56 #include <sys/cmn_err.h>
57 #include <sys/debug.h>
58 #include <sys/tnf_probe.h>
59 #include <sys/vtrace.h>
60
61 #include <vm/hat.h>
62 #include <vm/as.h>
63 #include <vm/seg.h>
64 #include <vm/seg_vn.h>
65 #include <vm/seg_dev.h>
66 #include <vm/seg_kmem.h>
67 #include <vm/seg_map.h>
68 #include <vm/seg_spt.h>
69 #include <vm/page.h>
70
71 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
72
73 static struct kmem_cache *as_cache;
74
75 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
76 static void as_clearwatchprot(struct as *, caddr_t, size_t);
77 int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
78
79
80 /*
81 * Verifying the segment lists is very time-consuming; it may not be
456 seg = avl_find(&as->a_segtree, &addr, &where);
457
458 if (seg == NULL)
459 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
460
461 if (seg == NULL)
462 seg = avl_last(&as->a_segtree);
463
464 if (seg != NULL) {
465 caddr_t base = seg->s_base;
466
467 /*
468 * If top of seg is below the requested address, then
469 * the insertion point is at the end of the linked list,
470 * and seg points to the tail of the list. Otherwise,
471 * the insertion point is immediately before seg.
472 */
473 if (base + seg->s_size > addr) {
474 if (addr >= base || eaddr > base) {
475 #ifdef __sparc
476 extern const struct seg_ops segnf_ops;
477
478 /*
479 * no-fault segs must disappear if overlaid.
480 * XXX need new segment type so
481 * we don't have to check s_ops
482 */
483 if (seg->s_ops == &segnf_ops) {
484 seg_unmap(seg);
485 goto again;
486 }
487 #endif
488 return (-1); /* overlapping segment */
489 }
490 }
491 }
492 as->a_seglast = newseg;
493 avl_insert(&as->a_segtree, newseg, where);
494
495 #ifdef VERIFY_SEGLIST
496 as_verify(as);
653
654 as->a_flags = 0;
655 as->a_vbits = 0;
656 as->a_hrm = NULL;
657 as->a_seglast = NULL;
658 as->a_size = 0;
659 as->a_resvsize = 0;
660 as->a_updatedir = 0;
661 gethrestime(&as->a_updatetime);
662 as->a_objectdir = NULL;
663 as->a_sizedir = 0;
664 as->a_userlimit = (caddr_t)USERLIMIT;
665 as->a_lastgap = NULL;
666 as->a_lastgaphl = NULL;
667 as->a_callbacks = NULL;
668
669 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
670 as->a_hat = hat_alloc(as); /* create hat for default system mmu */
671 AS_LOCK_EXIT(as, &as->a_lock);
672
673 return (as);
674 }
675
676 /*
677 * Free an address space data structure.
678 * Need to free the hat first and then
679 * all the segments on this as and finally
680 * the space for the as struct itself.
681 */
682 void
683 as_free(struct as *as)
684 {
685 struct hat *hat = as->a_hat;
686 struct seg *seg, *next;
687 boolean_t free_started = B_FALSE;
688
689 top:
690 /*
691 * Invoke ALL callbacks. as_do_callbacks will do one callback
692 * per call, and not return (-1) until the callback has completed.
693 * When as_do_callbacks returns zero, all callbacks have completed.
694 */
695 mutex_enter(&as->a_contents);
696 while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
697 ;
698
699 mutex_exit(&as->a_contents);
700 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
701
702 if (!free_started) {
703 free_started = B_TRUE;
704 hat_free_start(hat);
705 }
706 for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
707 int err;
708
709 next = AS_SEGNEXT(as, seg);
710 retry:
711 err = segop_unmap(seg, seg->s_base, seg->s_size);
712 if (err == EAGAIN) {
713 mutex_enter(&as->a_contents);
714 if (as->a_callbacks) {
715 AS_LOCK_EXIT(as, &as->a_lock);
716 } else if (!AS_ISNOUNMAPWAIT(as)) {
717 /*
718 * Memory is currently locked. Wait for a
719 * cv_signal that it has been unlocked, then
720 * try the operation again.
721 */
722 if (AS_ISUNMAPWAIT(as) == 0)
723 cv_broadcast(&as->a_cv);
724 AS_SETUNMAPWAIT(as);
725 AS_LOCK_EXIT(as, &as->a_lock);
726 while (AS_ISUNMAPWAIT(as))
727 cv_wait(&as->a_cv, &as->a_contents);
728 } else {
729 /*
730 * We may have raced with
731 * segvn_reclaim()/segspt_reclaim(). In this
734 * 0. We don't drop as writer lock so our
735 * number of retries without sleeping should
736 * be very small. See segvn_reclaim() for
737 * more comments.
738 */
739 AS_CLRNOUNMAPWAIT(as);
740 mutex_exit(&as->a_contents);
741 goto retry;
742 }
743 mutex_exit(&as->a_contents);
744 goto top;
745 } else {
746 /*
747 * We do not expect any other error return at this
748 * time. This is similar to an ASSERT in seg_unmap()
749 */
750 ASSERT(err == 0);
751 }
752 }
753 hat_free_end(hat);
754 AS_LOCK_EXIT(as, &as->a_lock);
755
756 /* /proc stuff */
757 ASSERT(avl_numnodes(&as->a_wpage) == 0);
758 if (as->a_objectdir) {
759 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
760 as->a_objectdir = NULL;
761 as->a_sizedir = 0;
762 }
763
764 /*
765 * Free the struct as back to kmem. Assert it has no segments.
766 */
767 ASSERT(avl_numnodes(&as->a_segtree) == 0);
768 kmem_cache_free(as_cache, as);
769 }
770
771 int
772 as_dup(struct as *as, struct proc *forkedproc)
773 {
774 struct as *newas;
775 struct seg *seg, *newseg;
776 size_t purgesize = 0;
777 int error;
778
779 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
780 as_clearwatch(as);
781 newas = as_alloc();
782 newas->a_userlimit = as->a_userlimit;
783 newas->a_proc = forkedproc;
784
785 AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER);
786
787 (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
788
789 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
790
791 if (seg->s_flags & S_PURGE) {
792 purgesize += seg->s_size;
793 continue;
794 }
795
796 newseg = seg_alloc(newas, seg->s_base, seg->s_size);
797 if (newseg == NULL) {
798 AS_LOCK_EXIT(newas, &newas->a_lock);
799 as_setwatch(as);
800 AS_LOCK_EXIT(as, &as->a_lock);
801 as_free(newas);
802 return (-1);
803 }
804 if ((error = segop_dup(seg, newseg)) != 0) {
805 /*
806 * We call seg_free() on the new seg
807 * because the segment is not set up
808 * completely; i.e. it has no ops.
809 */
810 as_setwatch(as);
811 AS_LOCK_EXIT(as, &as->a_lock);
812 seg_free(newseg);
813 AS_LOCK_EXIT(newas, &newas->a_lock);
814 as_free(newas);
815 return (error);
816 }
817 newas->a_size += seg->s_size;
818 }
819 newas->a_resvsize = as->a_resvsize - purgesize;
820
821 error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
822
823 AS_LOCK_EXIT(newas, &newas->a_lock);
824
825 as_setwatch(as);
826 AS_LOCK_EXIT(as, &as->a_lock);
827 if (error != 0) {
828 as_free(newas);
829 return (error);
830 }
831 forkedproc->p_as = newas;
832 return (0);
833 }
834
835 /*
836 * Handle a ``fault'' at addr for size bytes.
837 */
838 faultcode_t
839 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
840 enum fault_type type, enum seg_rw rw)
841 {
842 struct seg *seg;
843 caddr_t raddr; /* rounded down addr */
844 size_t rsize; /* rounded up size */
845 size_t ssize;
846 faultcode_t res = 0;
847 caddr_t addrsav;
848 struct seg *segsav;
849 int as_lock_held;
850 klwp_t *lwp = ttolwp(curthread);
851 int holding_wpage = 0;
852
853
854
855 retry:
856 /*
857 * Indicate that the lwp is not to be stopped while waiting for a
858 * pagefault. This is to avoid deadlock while debugging a process
859 * via /proc over NFS (in particular).
860 */
861 if (lwp != NULL)
862 lwp->lwp_nostop++;
863
864 /*
865 * same length must be used when we softlock and softunlock. We
866 * don't support softunlocking lengths less than the original length
867 * when there is largepage support. See seg_dev.c for more
868 * comments.
869 */
870 switch (type) {
871
872 case F_SOFTLOCK:
873 CPU_STATS_ADD_K(vm, softlock, 1);
874 break;
875
876 case F_SOFTUNLOCK:
877 break;
878
879 case F_PROT:
880 CPU_STATS_ADD_K(vm, prot_fault, 1);
881 break;
882
883 case F_INVAL:
884 CPU_STATS_ENTER_K();
885 CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
886 if (as == &kas)
887 CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
888 CPU_STATS_EXIT_K();
889 break;
890 }
891
892 /* Kernel probe */
893 TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
894 tnf_opaque, address, addr,
895 tnf_fault_type, fault_type, type,
896 tnf_seg_access, access, rw);
897
898 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
899 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
900 (size_t)raddr;
901
902 /*
903 * XXX -- Don't grab the as lock for segkmap. We should grab it for
904 * correctness, but then we could be stuck holding this lock for
905 * a LONG time if the fault needs to be resolved on a slow
906 * filesystem, and then no-one will be able to exec new commands,
907 * as exec'ing requires the write lock on the as.
908 */
909 if (as == &kas && segkmap && segkmap->s_base <= raddr &&
910 raddr + size < segkmap->s_base + segkmap->s_size) {
911 seg = segkmap;
912 as_lock_held = 0;
913 } else {
914 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
915
916 seg = as_segat(as, raddr);
917 if (seg == NULL) {
918 AS_LOCK_EXIT(as, &as->a_lock);
919 if (lwp != NULL)
920 lwp->lwp_nostop--;
921 return (FC_NOMAP);
922 }
923
924 as_lock_held = 1;
925 }
926
927 addrsav = raddr;
928 segsav = seg;
929
930 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
931 if (raddr >= seg->s_base + seg->s_size) {
932 seg = AS_SEGNEXT(as, seg);
933 if (seg == NULL || raddr != seg->s_base) {
934 res = FC_NOMAP;
935 break;
936 }
937 }
938 if (raddr + rsize > seg->s_base + seg->s_size)
939 ssize = seg->s_base + seg->s_size - raddr;
940 else
941 ssize = rsize;
942
943 res = segop_fault(hat, seg, raddr, ssize, type, rw);
944
945 /* Restore watchpoints */
946 if (holding_wpage) {
947 as_setwatch(as);
948 holding_wpage = 0;
949 }
950
951 if (res != 0)
952 break;
953 }
954
955 /*
956 * If we were SOFTLOCKing and encountered a failure,
957 * we must SOFTUNLOCK the range we already did. (Maybe we
958 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
959 * right here...)
960 */
961 if (res != 0 && type == F_SOFTLOCK) {
962 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
963 if (addrsav >= seg->s_base + seg->s_size)
964 seg = AS_SEGNEXT(as, seg);
965 ASSERT(seg != NULL);
966 /*
967 * Now call the fault routine again to perform the
968 * unlock using S_OTHER instead of the rw variable
969 * since we never got a chance to touch the pages.
970 */
971 if (raddr > seg->s_base + seg->s_size)
972 ssize = seg->s_base + seg->s_size - addrsav;
973 else
974 ssize = raddr - addrsav;
975 (void) segop_fault(hat, seg, addrsav, ssize,
976 F_SOFTUNLOCK, S_OTHER);
977 }
978 }
979 if (as_lock_held)
980 AS_LOCK_EXIT(as, &as->a_lock);
981 if (lwp != NULL)
982 lwp->lwp_nostop--;
983
984 /*
985 * If the lower levels returned EDEADLK for a fault,
986 * it means that we should retry the fault. Wait a bit
987 * as well, to let the deadlock-causing condition clear.
988 * This is part of a gross hack to work around a design flaw
989 * in the ufs/sds logging code and should go away when the
990 * logging code is redesigned to fix the problem. See bug
991 * 4125102 for details.
992 */
993 if (FC_ERRNO(res) == EDEADLK) {
994 delay(deadlk_wait);
995 res = 0;
996 goto retry;
997 }
998 return (res);
999 }
1000
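/*
 * A minimal sketch of how a caller might consume the faultcode_t
 * returned by as_fault() (illustrative; "res" and "error" are assumed
 * locals). fc_decode() converts a faultcode_t into an errno value:
 *
 *	res = as_fault(as->a_hat, as, addr, PAGESIZE, F_INVAL, S_READ);
 *	if (res != 0)
 *		error = fc_decode(res);
 */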
1001
1025 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1026 (size_t)raddr;
1027
1028 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1029 seg = as_segat(as, raddr);
1030 if (seg == NULL) {
1031 AS_LOCK_EXIT(as, &as->a_lock);
1032 if (lwp != NULL)
1033 lwp->lwp_nostop--;
1034 return (FC_NOMAP);
1035 }
1036
1037 for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1038 if (raddr >= seg->s_base + seg->s_size) {
1039 seg = AS_SEGNEXT(as, seg);
1040 if (seg == NULL || raddr != seg->s_base) {
1041 res = FC_NOMAP;
1042 break;
1043 }
1044 }
1045 res = segop_faulta(seg, raddr);
1046 if (res != 0)
1047 break;
1048 }
1049 AS_LOCK_EXIT(as, &as->a_lock);
1050 if (lwp != NULL)
1051 lwp->lwp_nostop--;
1052 /*
1053 * If the lower levels returned EDEADLK for a fault,
1054 * it means that we should retry the fault. Wait a bit
1055 * as well, to let the deadlock-causing condition clear.
1056 * This is part of a gross hack to work around a design flaw
1057 * in the ufs/sds logging code and should go away when the
1058 * logging code is redesigned to fix the problem. See bug
1059 * 4125102 for details.
1060 */
1061 if (FC_ERRNO(res) == EDEADLK) {
1062 delay(deadlk_wait);
1063 res = 0;
1064 goto retry;
1065 }
1115 seg = as_segat(as, raddr);
1116 if (seg == NULL) {
1117 as_setwatch(as);
1118 AS_LOCK_EXIT(as, &as->a_lock);
1119 return (ENOMEM);
1120 }
1121
1122 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1123 if (raddr >= seg->s_base + seg->s_size) {
1124 seg = AS_SEGNEXT(as, seg);
1125 if (seg == NULL || raddr != seg->s_base) {
1126 error = ENOMEM;
1127 break;
1128 }
1129 }
1130 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1131 ssize = seg->s_base + seg->s_size - raddr;
1132 else
1133 ssize = rsize;
1134 retry:
1135 error = segop_setprot(seg, raddr, ssize, prot);
1136
1137 if (error == IE_NOMEM) {
1138 error = EAGAIN;
1139 break;
1140 }
1141
1142 if (error == IE_RETRY) {
1143 AS_LOCK_EXIT(as, &as->a_lock);
1144 writer = 1;
1145 goto setprot_top;
1146 }
1147
1148 if (error == EAGAIN) {
1149 /*
1150 * Make sure we have a_lock as writer.
1151 */
1152 if (writer == 0) {
1153 AS_LOCK_EXIT(as, &as->a_lock);
1154 writer = 1;
1155 goto setprot_top;
1266 seg = as_segat(as, raddr);
1267 if (seg == NULL) {
1268 as_setwatch(as);
1269 AS_LOCK_EXIT(as, &as->a_lock);
1270 return (ENOMEM);
1271 }
1272
1273 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1274 if (raddr >= seg->s_base + seg->s_size) {
1275 seg = AS_SEGNEXT(as, seg);
1276 if (seg == NULL || raddr != seg->s_base) {
1277 error = ENOMEM;
1278 break;
1279 }
1280 }
1281 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1282 ssize = seg->s_base + seg->s_size - raddr;
1283 else
1284 ssize = rsize;
1285
1286 error = segop_checkprot(seg, raddr, ssize, prot);
1287 if (error != 0)
1288 break;
1289 }
1290 as_setwatch(as);
1291 AS_LOCK_EXIT(as, &as->a_lock);
1292 return (error);
1293 }
1294
1295 int
1296 as_unmap(struct as *as, caddr_t addr, size_t size)
1297 {
1298 struct seg *seg, *seg_next;
1299 struct as_callback *cb;
1300 caddr_t raddr, eaddr;
1301 size_t ssize, rsize = 0;
1302 int err;
1303
1304 top:
1305 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1306 eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1332 else
1333 ssize = eaddr - raddr;
1334
1335 /*
1336 * Save next segment pointer since seg can be
1337 * destroyed during the segment unmap operation.
1338 */
1339 seg_next = AS_SEGNEXT(as, seg);
1340
1341 /*
1342 * We didn't count /dev/null mappings, so ignore them here.
1343 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1344 * we have to do this check here while we have seg.)
1345 */
1346 rsize = 0;
1347 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1348 !SEG_IS_PARTIAL_RESV(seg))
1349 rsize = ssize;
1350
1351 retry:
1352 err = segop_unmap(seg, raddr, ssize);
1353 if (err == EAGAIN) {
1354 /*
1355 * Memory is currently locked. It must be unlocked
1356 * before this operation can succeed through a retry.
1357 * The possible reasons for locked memory and
1358 * corresponding strategies for unlocking are:
1359 * (1) Normal I/O
1360 * wait for a signal that the I/O operation
1361 * has completed and the memory is unlocked.
1362 * (2) Asynchronous I/O
1363 * The aio subsystem does not unlock pages when
1364 * the I/O is completed. Those pages are unlocked
1365 * when the application calls aiowait/aioerror.
1366 * So, to prevent blocking forever, cv_broadcast()
1367 * is done to wake up aio_cleanup_thread.
1368 * Subsequently, segvn_reclaim will be called, and
1369 * that will do AS_CLRUNMAPWAIT() and wake us up.
1370 * (3) Long term page locking:
1371 * Drivers intending to have pages locked for a
1372 * period considerably longer than for normal I/O
1770 */
1771 void
1772 as_purge(struct as *as)
1773 {
1774 struct seg *seg;
1775 struct seg *next_seg;
1776
1777 /*
1778 * The setting of AS_NEEDSPURGE is protected by as_rangelock(), so
1779 * there is no need to grab the a_contents mutex for this check.
1780 */
1781 if ((as->a_flags & AS_NEEDSPURGE) == 0)
1782 return;
1783
1784 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1785 next_seg = NULL;
1786 seg = AS_SEGFIRST(as);
1787 while (seg != NULL) {
1788 next_seg = AS_SEGNEXT(as, seg);
1789 if (seg->s_flags & S_PURGE)
1790 (void) segop_unmap(seg, seg->s_base, seg->s_size);
1791 seg = next_seg;
1792 }
1793 AS_LOCK_EXIT(as, &as->a_lock);
1794
1795 mutex_enter(&as->a_contents);
1796 as->a_flags &= ~AS_NEEDSPURGE;
1797 mutex_exit(&as->a_contents);
1798 }
1799
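/*
 * A sketch of the assumed producer side of the purge protocol: a
 * caller marking a segment for deferred unmapping would, under
 * as_rangelock(), set S_PURGE on the segment and AS_NEEDSPURGE on
 * the address space (illustrative only):
 *
 *	as_rangelock(as);
 *	seg->s_flags |= S_PURGE;
 *	mutex_enter(&as->a_contents);
 *	as->a_flags |= AS_NEEDSPURGE;
 *	mutex_exit(&as->a_contents);
 *	as_rangeunlock(as);
 */
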
1800 /*
1801 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1802 * range of addresses at least "minlen" long, where the base of the range is
1803 * at "off" phase from an "align" boundary and there is space for a
1804 * "redzone"-sized redzone on eithe rside of the range. Thus,
1805 * if align was 4M and off was 16k, the user wants a hole which will start
1806 * 16k into a 4M page.
1807 *
1808 * If flags specifies AH_HI, the hole will have the highest possible address
1809 * in the range. We use the as->a_lastgap field to figure out where to
1810 * start looking for a gap.
1988 * -1 is returned.
1989 *
1990 * NOTE: This routine is not correct when base+len overflows caddr_t.
1991 */
1992 int
1993 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
1994 caddr_t addr)
1995 {
1996
1997 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
1998 }
1999
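/*
 * A minimal sketch of a typical as_gap() call (hypothetical caller;
 * "minlen" is an assumed local). AH_HI requests the highest suitable
 * hole within the search range described by *basep and *lenp:
 *
 *	caddr_t base = NULL;
 *	size_t len = (size_t)as->a_userlimit;
 *	if (as_gap(as, minlen, &base, &len, AH_HI, NULL) == 0) {
 *		...	([base, base + len) is a usable hole)
 *	}
 */
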
2000 /*
2001 * Return the next range within [base, base + len) that is backed
2002 * with "real memory". Skip holes and non-seg_vn segments.
2003 * We're lazy and only return one segment at a time.
2004 */
2005 int
2006 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2007 {
2008 extern const struct seg_ops segspt_shmops; /* needs a header file */
2009 struct seg *seg;
2010 caddr_t addr, eaddr;
2011 caddr_t segend;
2012
2013 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2014
2015 addr = *basep;
2016 eaddr = addr + *lenp;
2017
2018 seg = as_findseg(as, addr, 0);
2019 if (seg != NULL)
2020 addr = MAX(seg->s_base, addr);
2021
2022 for (;;) {
2023 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2024 AS_LOCK_EXIT(as, &as->a_lock);
2025 return (EINVAL);
2026 }
2027
2028 if (seg->s_ops == &segvn_ops) {
2041 }
2042
2043 seg = AS_SEGNEXT(as, seg);
2044
2045 if (seg != NULL)
2046 addr = seg->s_base;
2047 }
2048
2049 *basep = addr;
2050
2051 if (segend > eaddr)
2052 *lenp = eaddr - addr;
2053 else
2054 *lenp = segend - addr;
2055
2056 AS_LOCK_EXIT(as, &as->a_lock);
2057 return (0);
2058 }
2059
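/*
 * Because as_memory() returns only one backed range per call, a
 * caller that wants every range loops, advancing past each returned
 * chunk. A minimal sketch ("start" and "end" are assumed locals):
 *
 *	caddr_t base = start;
 *	size_t len = end - start;
 *	while (as_memory(as, &base, &len) == 0) {
 *		...	(process [base, base + len))
 *		base += len;
 *		len = end - base;
 *	}
 */
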
2060 /*
2061 * Determine whether data from the mappings in interval [addr, addr + size)
2062 * are in the primary memory (core) cache.
2063 */
2064 int
2065 as_incore(struct as *as, caddr_t addr,
2066 size_t size, char *vec, size_t *sizep)
2067 {
2068 struct seg *seg;
2069 size_t ssize;
2070 caddr_t raddr; /* rounded down addr */
2071 size_t rsize; /* rounded up size */
2072 size_t isize; /* iteration size */
2073 int error = 0; /* result, assume success */
2074
2075 *sizep = 0;
2076 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2077 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2078 (size_t)raddr;
2079
2080 if (raddr + rsize < raddr) /* check for wraparound */
2082
2083 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2084 seg = as_segat(as, raddr);
2085 if (seg == NULL) {
2086 AS_LOCK_EXIT(as, &as->a_lock);
2087 return (-1);
2088 }
2089
2090 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2091 if (raddr >= seg->s_base + seg->s_size) {
2092 seg = AS_SEGNEXT(as, seg);
2093 if (seg == NULL || raddr != seg->s_base) {
2094 error = -1;
2095 break;
2096 }
2097 }
2098 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2099 ssize = seg->s_base + seg->s_size - raddr;
2100 else
2101 ssize = rsize;
2102 *sizep += isize = segop_incore(seg, raddr, ssize, vec);
2103 if (isize != ssize) {
2104 error = -1;
2105 break;
2106 }
2107 vec += btopr(ssize);
2108 }
2109 AS_LOCK_EXIT(as, &as->a_lock);
2110 return (error);
2111 }
2112
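/*
 * A sketch of the as_incore() contract as seen by an mincore(2)-style
 * caller (illustrative; the allocation size is an assumption): "vec"
 * holds one byte per page, and *sizep accumulates the number of bytes
 * found in core.
 *
 *	char *vec = kmem_zalloc(btopr(len), KM_SLEEP);
 *	size_t incore_bytes;
 *	if (as_incore(as, addr, len, vec, &incore_bytes) == 0) {
 *		...	(vec[i] != 0 means page i of the range is resident)
 *	}
 *	kmem_free(vec, btopr(len));
 */
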
2113 static void
2114 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2115 ulong_t *bitmap, size_t position, size_t npages)
2116 {
2117 caddr_t range_start;
2118 size_t pos1 = position;
2119 size_t pos2;
2120 size_t size;
2121 size_t end_pos = npages + position;
2122
2123 while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2124 size = ptob((pos2 - pos1));
2125 range_start = (caddr_t)((uintptr_t)addr +
2126 ptob(pos1 - position));
2127
2128 (void) segop_lockop(seg, range_start, size, attr, MC_UNLOCK,
2129 (ulong_t *)NULL, (size_t)0);
2130 pos1 = pos2;
2131 }
2132 }
2133
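/*
 * as_segunlock() above relies on bt_range() to walk the MC_LOCK
 * shadow bitmap: each call (as assumed here) finds the next run of
 * set bits at or after pos1 and returns it as the half-open interval
 * [pos1, pos2). Each run is converted back to a byte range with
 * ptob() and unlocked with a single MC_UNLOCK lockop.
 */
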
2134 static void
2135 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2136 caddr_t raddr, size_t rsize)
2137 {
2138 struct seg *seg = as_segat(as, raddr);
2139 size_t ssize;
2140
2141 while (rsize != 0) {
2142 if (raddr >= seg->s_base + seg->s_size)
2143 seg = AS_SEGNEXT(as, seg);
2144
2145 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2146 ssize = seg->s_base + seg->s_size - raddr;
2147 else
2148 ssize = rsize;
2204 if (seg == NULL) {
2205 AS_LOCK_EXIT(as, &as->a_lock);
2206 return (0);
2207 }
2208
2209 do {
2210 raddr = (caddr_t)((uintptr_t)seg->s_base &
2211 (uintptr_t)PAGEMASK);
2212 rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2213 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2214 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2215
2216 mlock_size = BT_BITOUL(btopr(rlen));
2217 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2218 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2219 AS_LOCK_EXIT(as, &as->a_lock);
2220 return (EAGAIN);
2221 }
2222
2223 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2224 error = segop_lockop(seg, seg->s_base,
2225 seg->s_size, attr, MC_LOCK, mlock_map, pos);
2226 if (error != 0)
2227 break;
2228 pos += seg_pages(seg);
2229 }
2230
2231 if (error) {
2232 for (seg = AS_SEGFIRST(as); seg != NULL;
2233 seg = AS_SEGNEXT(as, seg)) {
2234
2235 raddr = (caddr_t)((uintptr_t)seg->s_base &
2236 (uintptr_t)PAGEMASK);
2237 npages = seg_pages(seg);
2238 as_segunlock(seg, raddr, attr, mlock_map,
2239 idx, npages);
2240 idx += npages;
2241 }
2242 }
2243
2244 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2245 AS_LOCK_EXIT(as, &as->a_lock);
2246 goto lockerr;
2247 } else if (func == MC_UNLOCKAS) {
2248 mutex_enter(&as->a_contents);
2249 AS_CLRPGLCK(as);
2250 mutex_exit(&as->a_contents);
2251
2252 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2253 error = segop_lockop(seg, seg->s_base,
2254 seg->s_size, attr, MC_UNLOCK, NULL, 0);
2255 if (error != 0)
2256 break;
2257 }
2258
2259 AS_LOCK_EXIT(as, &as->a_lock);
2260 goto lockerr;
2261 }
2262
2263 /*
2264 * Normalize addresses and sizes.
2265 */
2266 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2267 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2268 (size_t)raddr;
2269
2270 if (raddr + rsize < raddr) { /* check for wraparound */
2271 AS_LOCK_EXIT(as, &as->a_lock);
2272 return (ENOMEM);
2273 }
2311 }
2312 AS_LOCK_EXIT(as, &as->a_lock);
2313 return (ENOMEM);
2314 }
2315 }
2316 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2317 ssize = seg->s_base + seg->s_size - raddr;
2318 else
2319 ssize = rsize;
2320
2321 /*
2322 * Dispatch on specific function.
2323 */
2324 switch (func) {
2325
2326 /*
2327 * Synchronize cached data from mappings with backing
2328 * objects.
2329 */
2330 case MC_SYNC:
2331 if (error = segop_sync(seg, raddr, ssize,
2332 attr, (uint_t)arg)) {
2333 AS_LOCK_EXIT(as, &as->a_lock);
2334 return (error);
2335 }
2336 break;
2337
2338 /*
2339 * Lock pages in memory.
2340 */
2341 case MC_LOCK:
2342 if (error = segop_lockop(seg, raddr, ssize,
2343 attr, func, mlock_map, pos)) {
2344 as_unlockerr(as, attr, mlock_map, initraddr,
2345 initrsize - rsize + ssize);
2346 kmem_free(mlock_map, mlock_size *
2347 sizeof (ulong_t));
2348 AS_LOCK_EXIT(as, &as->a_lock);
2349 goto lockerr;
2350 }
2351 break;
2352
2353 /*
2354 * Unlock mapped pages.
2355 */
2356 case MC_UNLOCK:
2357 (void) segop_lockop(seg, raddr, ssize, attr, func,
2358 (ulong_t *)NULL, (size_t)0);
2359 break;
2360
2361 /*
2362 * Store VM advise for mapped pages in segment layer.
2363 */
2364 case MC_ADVISE:
2365 error = segop_advise(seg, raddr, ssize, (uint_t)arg);
2366
2367 /*
2368 * Check for regular errors and special retry error
2369 */
2370 if (error) {
2371 if (error == IE_RETRY) {
2372 /*
2373 * Need to acquire writers lock, so
2374 * have to drop readers lock and start
2375 * all over again
2376 */
2377 AS_LOCK_EXIT(as, &as->a_lock);
2378 goto retry;
2379 } else if (error == IE_REATTACH) {
2380 /*
2381 * Find segment for current address
2382 * because current segment just got
2383 * split or concatenated
2384 */
2385 seg = as_segat(as, raddr);
2386 if (seg == NULL) {
2387 AS_LOCK_EXIT(as, &as->a_lock);
2388 return (ENOMEM);
2389 }
2390 } else {
2391 /*
2392 * Regular error
2393 */
2394 AS_LOCK_EXIT(as, &as->a_lock);
2395 return (error);
2396 }
2397 }
2398 break;
2399
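/*
 * Arrange for the range to be inherited as zero-filled
 * pages in a forked child.
 */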
2400 case MC_INHERIT_ZERO:
2401 error = segop_inherit(seg, raddr, ssize, SEGP_INH_ZERO);
2402 if (error != 0) {
2403 AS_LOCK_EXIT(as, &as->a_lock);
2404 return (error);
2405 }
2406 break;
2407
2408 /*
2409 * Can't happen.
2410 */
2411 default:
2412 panic("as_ctl: bad operation %d", func);
2413 /*NOTREACHED*/
2414 }
2415
2416 rsize -= ssize;
2417 raddr += ssize;
2418 }
2419
2420 if (func == MC_LOCK)
2421 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2465 * as expected by the caller. Save pointers to per segment shadow lists at
2466 * the tail of plist so that they can be used during as_pageunlock().
2467 */
2468 static int
2469 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2470 caddr_t addr, size_t size, enum seg_rw rw)
2471 {
2472 caddr_t sv_addr = addr;
2473 size_t sv_size = size;
2474 struct seg *sv_seg = seg;
2475 ulong_t segcnt = 1;
2476 ulong_t cnt;
2477 size_t ssize;
2478 pgcnt_t npages = btop(size);
2479 page_t **plist;
2480 page_t **pl;
2481 int error;
2482 caddr_t eaddr;
2483 faultcode_t fault_err = 0;
2484 pgcnt_t pl_off;
2485 extern const struct seg_ops segspt_shmops;
2486
2487 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2488 ASSERT(seg != NULL);
2489 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2490 ASSERT(addr + size > seg->s_base + seg->s_size);
2491 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2492 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2493
2494 /*
2495 * Count the number of segments covered by the range we are about to
2496 * lock. The segment count is used to size the shadow list we return
2497 * back to the caller.
2498 */
2499 for (; size != 0; size -= ssize, addr += ssize) {
2500 if (addr >= seg->s_base + seg->s_size) {
2501
2502 seg = AS_SEGNEXT(as, seg);
2503 if (seg == NULL || addr != seg->s_base) {
2504 AS_LOCK_EXIT(as, &as->a_lock);
2505 return (EFAULT);
2506 }
2507 /*
2508 * Do a quick check whether subsequent
2509 * segments are likely to support pagelock.
2510 */
2511 if (seg->s_ops == &segvn_ops) {
2512 vnode_t *vp;
2513
2514 if (segop_getvp(seg, addr, &vp) != 0 ||
2515 vp != NULL) {
2516 AS_LOCK_EXIT(as, &as->a_lock);
2517 goto slow;
2518 }
2519 } else if (seg->s_ops != &segspt_shmops) {
2520 AS_LOCK_EXIT(as, &as->a_lock);
2521 goto slow;
2522 }
2523 segcnt++;
2524 }
2525 if (addr + size > seg->s_base + seg->s_size) {
2526 ssize = seg->s_base + seg->s_size - addr;
2527 } else {
2528 ssize = size;
2529 }
2530 }
2531 ASSERT(segcnt > 1);
2532
2533 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2534
2535 addr = sv_addr;
2536 size = sv_size;
2537 seg = sv_seg;
2538
2539 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2540 if (addr >= seg->s_base + seg->s_size) {
2541 seg = AS_SEGNEXT(as, seg);
2542 ASSERT(seg != NULL && addr == seg->s_base);
2543 cnt++;
2544 ASSERT(cnt < segcnt);
2545 }
2546 if (addr + size > seg->s_base + seg->s_size) {
2547 ssize = seg->s_base + seg->s_size - addr;
2548 } else {
2549 ssize = size;
2550 }
2551 pl = &plist[npages + cnt];
2552 error = segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2553 L_PAGELOCK, rw);
2554 if (error) {
2555 break;
2556 }
2557 ASSERT(plist[npages + cnt] != NULL);
2558 ASSERT(pl_off + btop(ssize) <= npages);
2559 bcopy(plist[npages + cnt], &plist[pl_off],
2560 btop(ssize) * sizeof (page_t *));
2561 pl_off += btop(ssize);
2562 }
2563
2564 if (size == 0) {
2565 AS_LOCK_EXIT(as, &as->a_lock);
2566 ASSERT(cnt == segcnt - 1);
2567 *ppp = plist;
2568 return (0);
2569 }
2570
2571 /*
2572 * One of the pagelock calls failed; the error type is in the error
2573 * variable. Unlock what we have locked so far, and retry with
2574 * F_SOFTLOCK if the error is either EFAULT or ENOTSUP; otherwise
2575 * just return the error back to the caller.
2576 */
2577
2578 eaddr = addr;
2579 seg = sv_seg;
2580
2581 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2582 if (addr >= seg->s_base + seg->s_size) {
2583 seg = AS_SEGNEXT(as, seg);
2584 ASSERT(seg != NULL && addr == seg->s_base);
2585 cnt++;
2586 ASSERT(cnt < segcnt);
2587 }
2588 if (eaddr > seg->s_base + seg->s_size) {
2589 ssize = seg->s_base + seg->s_size - addr;
2590 } else {
2591 ssize = eaddr - addr;
2592 }
2593 pl = &plist[npages + cnt];
2594 ASSERT(*pl != NULL);
2595 (void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2596 L_PAGEUNLOCK, rw);
2597 }
2598
2599 AS_LOCK_EXIT(as, &as->a_lock);
2600
2601 kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2602
2603 if (error != ENOTSUP && error != EFAULT) {
2604 return (error);
2605 }
2606
2607 slow:
2608 /*
2609 * If we are here because pagelock failed due to the need to cow-fault
2610 * in the pages we want to lock, F_SOFTLOCK will do this job, and in
2611 * the next as_pagelock() call for this address range pagelock will
2612 * hopefully succeed.
2613 */
2614 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2615 if (fault_err != 0) {
2650 seg = as_segat(as, raddr);
2651 if (seg == NULL) {
2652 AS_LOCK_EXIT(as, &as->a_lock);
2653 return (EFAULT);
2654 }
2655 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2656 if (raddr + rsize > seg->s_base + seg->s_size) {
2657 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2658 }
2659 if (raddr + rsize <= raddr) {
2660 AS_LOCK_EXIT(as, &as->a_lock);
2661 return (EFAULT);
2662 }
2663
2664 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2665 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2666
2667 /*
2668 * try to lock pages and pass back shadow list
2669 */
2670 err = segop_pagelock(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2671
2672 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2673
2674 AS_LOCK_EXIT(as, &as->a_lock);
2675
2676 if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2677 return (err);
2678 }
2679
2680 /*
2681 * Use F_SOFTLOCK to lock the pages, because pagelock failed either
2682 * due to no pagelock support for this segment or because pages need
2683 * to be cow-faulted in. If a fault is needed, F_SOFTLOCK will do this
2684 * job for this as_pagelock() call, and in the next as_pagelock() call
2685 * for the same address range the pagelock call will hopefully succeed.
2686 */
2687 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2688 if (fault_err != 0) {
2689 return (fc_decode(fault_err));
2690 }
2713 ASSERT(seg != NULL);
2714 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2715 ASSERT(addr + size > seg->s_base + seg->s_size);
2716 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2717 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2718 ASSERT(plist != NULL);
2719
2720 for (cnt = 0; addr < eaddr; addr += ssize) {
2721 if (addr >= seg->s_base + seg->s_size) {
2722 seg = AS_SEGNEXT(as, seg);
2723 ASSERT(seg != NULL && addr == seg->s_base);
2724 cnt++;
2725 }
2726 if (eaddr > seg->s_base + seg->s_size) {
2727 ssize = seg->s_base + seg->s_size - addr;
2728 } else {
2729 ssize = eaddr - addr;
2730 }
2731 pl = &plist[npages + cnt];
2732 ASSERT(*pl != NULL);
2733 (void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2734 L_PAGEUNLOCK, rw);
2735 }
2736 ASSERT(cnt > 0);
2737 AS_LOCK_EXIT(as, &as->a_lock);
2738
2739 cnt++;
2740 kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2741 }
2742
2743 /*
2744 * unlock pages in a given address range
2745 */
2746 void
2747 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2748 enum seg_rw rw)
2749 {
2750 struct seg *seg;
2751 size_t rsize;
2752 caddr_t raddr;
2753
2759 * falling back to as_fault
2760 */
2761 if (pp == NULL) {
2762 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2763 return;
2764 }
2765
2766 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2767 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2768 (size_t)raddr;
2769
2770 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2771 seg = as_segat(as, raddr);
2772 ASSERT(seg != NULL);
2773
2774 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2775 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2776
2777 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2778 if (raddr + rsize <= seg->s_base + seg->s_size) {
2779 segop_pagelock(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2780 } else {
2781 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2782 return;
2783 }
2784 AS_LOCK_EXIT(as, &as->a_lock);
2785 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2786 }
2787
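/*
 * A minimal sketch of the intended pairing with as_pagelock()
 * (hypothetical physio-style caller; error handling elided). The
 * shadow list returned by as_pagelock() must be handed back unchanged
 * to as_pageunlock():
 *
 *	struct page **pplist;
 *	if (as_pagelock(as, &pplist, uaddr, ulen, S_WRITE) == 0) {
 *		...	(perform I/O against the locked pages)
 *		as_pageunlock(as, pplist, uaddr, ulen, S_WRITE);
 *	}
 */
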
2788 int
2789 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2790 boolean_t wait)
2791 {
2792 struct seg *seg;
2793 size_t ssize;
2794 caddr_t raddr; /* rounded down addr */
2795 size_t rsize; /* rounded up size */
2796 int error = 0;
2797 size_t pgsz = page_get_pagesize(szc);
2798
2799 setpgsz_top:
2814 as_setwatch(as);
2815 AS_LOCK_EXIT(as, &as->a_lock);
2816 return (ENOMEM);
2817 }
2818
2819 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2820 if (raddr >= seg->s_base + seg->s_size) {
2821 seg = AS_SEGNEXT(as, seg);
2822 if (seg == NULL || raddr != seg->s_base) {
2823 error = ENOMEM;
2824 break;
2825 }
2826 }
2827 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2828 ssize = seg->s_base + seg->s_size - raddr;
2829 } else {
2830 ssize = rsize;
2831 }
2832
2833 retry:
2834 error = segop_setpagesize(seg, raddr, ssize, szc);
2835
2836 if (error == IE_NOMEM) {
2837 error = EAGAIN;
2838 break;
2839 }
2840
2841 if (error == IE_RETRY) {
2842 AS_LOCK_EXIT(as, &as->a_lock);
2843 goto setpgsz_top;
2844 }
2845
2846 if (error == ENOTSUP) {
2847 error = EINVAL;
2848 break;
2849 }
2850
2851 if (wait && (error == EAGAIN)) {
2852 /*
2853 * Memory is currently locked. It must be unlocked
2854 * before this operation can succeed through a retry.
2893 * number of retries without sleeping should
2894 * be very small. See segvn_reclaim() for
2895 * more comments.
2896 */
2897 AS_CLRNOUNMAPWAIT(as);
2898 mutex_exit(&as->a_contents);
2899 goto retry;
2900 }
2901 mutex_exit(&as->a_contents);
2902 goto setpgsz_top;
2903 } else if (error != 0) {
2904 break;
2905 }
2906 }
2907 as_setwatch(as);
2908 AS_LOCK_EXIT(as, &as->a_lock);
2909 return (error);
2910 }
2911
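/*
 * A sketch of a typical as_setpagesize() call (illustrative; assumes
 * page_szc() as the inverse of page_get_pagesize(), mapping a byte
 * size to a size code):
 *
 *	int szc = page_szc(pgsz);
 *	if (szc != -1)
 *		error = as_setpagesize(as, addr, len, (uint_t)szc, B_TRUE);
 */
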
2912 /*
2913 * as_iset3_default_lpsize() just calls segop_setpagesize() on all segments
2914 * in its chunk where s_szc is less than the szc we want to set.
2915 */
2916 static int
2917 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
2918 int *retry)
2919 {
2920 struct seg *seg;
2921 size_t ssize;
2922 int error;
2923
2924 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
2925
2926 seg = as_segat(as, raddr);
2927 if (seg == NULL) {
2928 panic("as_iset3_default_lpsize: no seg");
2929 }
2930
2931 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2932 if (raddr >= seg->s_base + seg->s_size) {
2933 seg = AS_SEGNEXT(as, seg);
2934 if (seg == NULL || raddr != seg->s_base) {
2935 panic("as_iset3_default_lpsize: as changed");
2936 }
2937 }
2938 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2939 ssize = seg->s_base + seg->s_size - raddr;
2940 } else {
2941 ssize = rsize;
2942 }
2943
2944 if (szc > seg->s_szc) {
2945 error = segop_setpagesize(seg, raddr, ssize, szc);
2946 /* Only retry on EINVAL segments that have no vnode. */
2947 if (error == EINVAL) {
2948 vnode_t *vp = NULL;
2949 if ((segop_gettype(seg, raddr) & MAP_SHARED) &&
2950 (segop_getvp(seg, raddr, &vp) != 0 ||
2951 vp == NULL)) {
2952 *retry = 1;
2953 } else {
2954 *retry = 0;
2955 }
2956 }
2957 if (error) {
2958 return (error);
2959 }
2960 }
2961 }
2962 return (0);
2963 }
2964
2965 /*
2966 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
2967 * pagesize on each segment in its range, but if any fails with EINVAL,
2968 * then it reduces the pagesizes to the next size in the bitmap and
2969 * retries as_iset3_default_lpsize(). The reason why the code retries
2970 * smaller allowed sizes on EINVAL is because (a) the anon offset may not
3173 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3174 again:
3175 error = 0;
3176
3177 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3178 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3179 (size_t)raddr;
3180
3181 if (raddr + rsize < raddr) { /* check for wraparound */
3182 AS_LOCK_EXIT(as, &as->a_lock);
3183 return (ENOMEM);
3184 }
3185 as_clearwatchprot(as, raddr, rsize);
3186 seg = as_segat(as, raddr);
3187 if (seg == NULL) {
3188 as_setwatch(as);
3189 AS_LOCK_EXIT(as, &as->a_lock);
3190 return (ENOMEM);
3191 }
3192 if (seg->s_ops == &segvn_ops) {
3193 rtype = segop_gettype(seg, addr);
3194 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3195 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3196 segvn = 1;
3197 } else {
3198 segvn = 0;
3199 }
3200 setaddr = raddr;
3201 setsize = 0;
3202
3203 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3204 if (raddr >= (seg->s_base + seg->s_size)) {
3205 seg = AS_SEGNEXT(as, seg);
3206 if (seg == NULL || raddr != seg->s_base) {
3207 error = ENOMEM;
3208 break;
3209 }
3210 if (seg->s_ops == &segvn_ops) {
3211 stype = segop_gettype(seg, raddr);
3212 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3213 stype &= (MAP_SHARED | MAP_PRIVATE);
3214 if (segvn && (rflags != sflags ||
3215 rtype != stype)) {
3216 /*
3217 * The next segment is also segvn but
3218 * has different flags and/or type.
3219 */
3220 ASSERT(setsize != 0);
3221 error = as_iset_default_lpsize(as,
3222 setaddr, setsize, rflags, rtype);
3223 if (error) {
3224 break;
3225 }
3226 rflags = sflags;
3227 rtype = stype;
3228 setaddr = raddr;
3229 setsize = 0;
3230 } else if (!segvn) {
3231 rflags = sflags;
3305 as_setwatch(struct as *as)
3306 {
3307 struct watched_page *pwp;
3308 struct seg *seg;
3309 caddr_t vaddr;
3310 uint_t prot;
3311 int err, retrycnt;
3312
3313 if (avl_numnodes(&as->a_wpage) == 0)
3314 return;
3315
3316 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3317
3318 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3319 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3320 retrycnt = 0;
3321 retry:
3322 vaddr = pwp->wp_vaddr;
3323 if (pwp->wp_oprot != 0 || /* already set up */
3324 (seg = as_segat(as, vaddr)) == NULL ||
3325 segop_getprot(seg, vaddr, 0, &prot) != 0)
3326 continue;
3327
3328 pwp->wp_oprot = prot;
3329 if (pwp->wp_read)
3330 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3331 if (pwp->wp_write)
3332 prot &= ~PROT_WRITE;
3333 if (pwp->wp_exec)
3334 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3335 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3336 err = segop_setprot(seg, vaddr, PAGESIZE, prot);
3337 if (err == IE_RETRY) {
3338 pwp->wp_oprot = 0;
3339 ASSERT(retrycnt == 0);
3340 retrycnt++;
3341 goto retry;
3342 }
3343 }
3344 pwp->wp_prot = prot;
3345 }
3346 }
3347
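/*
 * An example of the protection arithmetic above: for a watched page
 * whose original protections are wp_oprot = (PROT_READ|PROT_WRITE),
 * a write watchpoint (wp_write) yields wp_prot = PROT_READ, so only
 * stores trap, while a read watchpoint (wp_read) strips read, write
 * and exec, yielding wp_prot = 0, so any access traps.
 */
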
3348 /*
3349 * Clear all of the watched pages in the address space.
3350 */
3351 void
3352 as_clearwatch(struct as *as)
3353 {
3354 struct watched_page *pwp;
3355 struct seg *seg;
3356 caddr_t vaddr;
3357 uint_t prot;
3358 int err, retrycnt;
3359
3360 if (avl_numnodes(&as->a_wpage) == 0)
3361 return;
3362
3363 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3364
3365 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3366 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3367 retrycnt = 0;
3368 retry:
3369 vaddr = pwp->wp_vaddr;
3370 if (pwp->wp_oprot == 0 || /* not set up */
3371 (seg = as_segat(as, vaddr)) == NULL)
3372 continue;
3373
3374 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3375 err = segop_setprot(seg, vaddr, PAGESIZE, prot);
3376 if (err == IE_RETRY) {
3377 ASSERT(retrycnt == 0);
3378 retrycnt++;
3379 goto retry;
3380 }
3381 }
3382 pwp->wp_oprot = 0;
3383 pwp->wp_prot = 0;
3384 }
3385 }
3386
3387 /*
3388 * Force a new setup for all the watched pages in the range.
3389 */
3390 static void
3391 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3392 {
3393 struct watched_page *pwp;
3394 struct watched_page tpw;
3395 caddr_t eaddr = addr + size;
3409 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3410
3411 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3412 retrycnt = 0;
3413 vaddr = pwp->wp_vaddr;
3414
3415 wprot = prot;
3416 if (pwp->wp_read)
3417 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3418 if (pwp->wp_write)
3419 wprot &= ~PROT_WRITE;
3420 if (pwp->wp_exec)
3421 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3422 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3423 retry:
3424 seg = as_segat(as, vaddr);
3425 if (seg == NULL) {
3426 panic("as_setwatchprot: no seg");
3427 /*NOTREACHED*/
3428 }
3429 err = segop_setprot(seg, vaddr, PAGESIZE, wprot);
3430 if (err == IE_RETRY) {
3431 ASSERT(retrycnt == 0);
3432 retrycnt++;
3433 goto retry;
3434 }
3435 }
3436 pwp->wp_oprot = prot;
3437 pwp->wp_prot = wprot;
3438
3439 pwp = AVL_NEXT(&as->a_wpage, pwp);
3440 }
3441 }
3442
3443 /*
3444 * Clear all of the watched pages in the range.
3445 */
3446 static void
3447 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3448 {
3449 caddr_t eaddr = addr + size;
3456
3457 if (avl_numnodes(&as->a_wpage) == 0)
3458 return;
3459
3460 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3461 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3462 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3463
3464 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3465
3466 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3467
3468 if ((prot = pwp->wp_oprot) != 0) {
3469 retrycnt = 0;
3470
3471 if (prot != pwp->wp_prot) {
3472 retry:
3473 seg = as_segat(as, pwp->wp_vaddr);
3474 if (seg == NULL)
3475 continue;
3476 err = segop_setprot(seg, pwp->wp_vaddr,
3477 PAGESIZE, prot);
3478 if (err == IE_RETRY) {
3479 ASSERT(retrycnt == 0);
3480 retrycnt++;
3481 goto retry;
3483 }
3484 }
3485 pwp->wp_oprot = 0;
3486 pwp->wp_prot = 0;
3487 }
3488
3489 pwp = AVL_NEXT(&as->a_wpage, pwp);
3490 }
3491 }
3492
3493 void
3494 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3495 {
3496 struct proc *p;
3505 }
3506 }
3507 mutex_exit(&pidlock);
3508 }
3509
3510 /*
3511 * return memory object ID
3512 */
3513 int
3514 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3515 {
3516 struct seg *seg;
3517 int sts;
3518
3519 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
3520 seg = as_segat(as, addr);
3521 if (seg == NULL) {
3522 AS_LOCK_EXIT(as, &as->a_lock);
3523 return (EFAULT);
3524 }
3525
3526 sts = segop_getmemid(seg, addr, memidp);
3527
3528 AS_LOCK_EXIT(as, &as->a_lock);
3529 return (sts);
3530 }
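
/*
 * A minimal sketch of an as_getmemid() caller (illustrative): the
 * returned memid_t identifies the memory object backing "addr"
 * independently of the mapping, so two mappings can be compared for
 * identity by comparing their memids.
 *
 *	memid_t memid;
 *	if (as_getmemid(as, addr, &memid) == 0) {
 *		...	(memid identifies the backing object)
 *	}
 */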