/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * sun4u Memory Scrubbing
 *
 * On detection of a correctable memory ECC error, the sun4u kernel
 * returns the corrected data to the requester and re-writes it
 * to memory (DRAM).  So if the correctable error was transient,
 * the error has effectively been cleaned (scrubbed) from memory.
 *
 * Scrubbing thus reduces the likelihood that multiple transient errors
 * will occur in the same memory word, making uncorrectable errors due
 * to transients less likely.
 *
 * Thus is born the desire that every memory location be periodically
 * accessed.
 *
 * This file implements a memory scrubbing thread.  This scrubber
 * guarantees that all of physical memory is accessed periodically
 * (memscrub_period_sec -- 12 hours).
 *
 * It attempts to do this as unobtrusively as possible.  The thread
 * schedules itself to wake up at an interval such that if it reads
 * memscrub_span_pages (32MB) on each wakeup, it will read all of physical
 * memory in memscrub_period_sec (12 hours).
 *
 * The scrubber uses the block load and prefetch hardware to read memory
 * at 1300MB/s, so it reads spans of 32MB in 0.025 seconds.  Unlike the
 * original sun4d scrubber the sun4u scrubber does not read ahead if the
 * system is idle because we can read memory very efficiently.
 *
 * The scrubber maintains a private copy of the phys_install memory list
 * to keep track of what memory should be scrubbed.
 *
 * The global routines memscrub_add_span() and memscrub_delete_span() are
 * used to add and delete from this list.  If hotplug memory is later
 * supported these two routines can be used to notify the scrubber of
 * memory configuration changes.
 *
 * The following parameters can be set via /etc/system
 *
 * memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES (32MB)
 * memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC (12 hours)
 * memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI (MINCLSYSPRI)
 * memscrub_delay_start_sec = (5 minutes)
 * memscrub_verbose = (0)
 * memscrub_override_ticks = (1 tick)
 * disable_memscrub = (0)
 * pause_memscrub = (0)
 * read_all_memscrub = (0)
 *
 * The scrubber will print NOTICE messages of what it is doing if
 * "memscrub_verbose" is set.
 *
 * If the scrubber's sleep time calculation drops to zero ticks,
 * memscrub_override_ticks will be used as the sleep time instead. The
 * sleep time should only drop to zero on a system with over 131.84
 * terabytes of memory, or where the default scrubber parameters have
 * been adjusted. For example, reducing memscrub_span_pages or
 * memscrub_period_sec causes the sleep time to drop to zero with less
 * memory. Note that since the sleep time is calculated in clock ticks,
 * using hires clock ticks allows for more memory before the sleep time
 * becomes zero.
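 *
 * As a rough derivation of the 131.84TB figure (an illustrative sketch
 * assuming hz = 100 and 8K pages): interval_ticks = period_ticks /
 * (phys_pages / span_pages) reaches zero when phys_pages / span_pages
 * exceeds period_ticks = 12 * 60 * 60 * 100 = 4,320,000.  With
 * span_pages = 4096 (32MB), that is 4,320,000 * 4096 pages of 8K each,
 * or roughly 131.84 terabytes.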
 *
 * The scrubber will exit (or never be started) if it finds the variable
 * "disable_memscrub" set.
 *
 * The scrubber will pause (not read memory) when "pause_memscrub"
 * is set.  It will check the state of pause_memscrub at each wakeup
 * period.  The scrubber will not make up for lost time.  If you
 * pause the scrubber for a prolonged period of time you can use
 * the "read_all_memscrub" switch (see below) to catch up. In addition,
 * pause_memscrub is used internally by the post memory DR callbacks.
 * It is set for the small period of time during which the callbacks
 * are executing. This ensures "memscrub_lock" will be released,
 * allowing the callbacks to finish.
 *
 * The scrubber will read all memory if "read_all_memscrub" is set.
 * The normal span read will also occur during the wakeup.
 *
 * MEMSCRUB_MIN_PAGES (32MB) is the minimum amount of memory a system
 * must have before we'll start the scrubber.
 *
 * MEMSCRUB_DFL_SPAN_PAGES (32MB) is based on the guess that 0.025 sec
 * is a reasonable minimum amount of time for each run of the thread.
 *
 * MEMSCRUB_DFL_PERIOD_SEC (12 hours) is nearly a total guess --
 * twice the frequency the hardware folk estimated would be necessary.
 *
 * MEMSCRUB_DFL_THREAD_PRI (MINCLSYSPRI) is based on the assumption
 * that the scrubber should get its fair share of time (since it
 * is short).  At a priority of 0 the scrubber will be starved.
 */
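
/*
 * For example, an /etc/system sketch (illustrative values, not
 * recommendations) that halves the scrub period and enables the
 * NOTICE messages would look like:
 *
 *	set memscrub_period_sec = 21600
 *	set memscrub_verbose = 1
 */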

#include <sys/systm.h>		/* timeout, types, t_lock */
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>	/* MIN */
#include <sys/memlist.h>	/* memlist */
#include <sys/mem_config.h>	/* memory add/delete */
#include <sys/kmem.h>		/* KMEM_NOSLEEP */
#include <sys/cpuvar.h>		/* ncpus_online */
#include <sys/debug.h>		/* ASSERTs */
#include <sys/machsystm.h>	/* lddphys */
#include <sys/cpu_module.h>	/* vtag_flushpage */
#include <sys/kstat.h>
#include <sys/atomic.h>		/* atomic_add_32 */

#include <vm/hat.h>
#include <vm/seg_kmem.h>
#include <vm/hat_sfmmu.h>	/* XXX FIXME - delete */

#include <sys/time.h>
#include <sys/callb.h>		/* CPR callback */
#include <sys/ontrap.h>

/*
 * Should really have paddr_t defined, but it is broken.  Use
 * ms_paddr_t in the meantime to make the code cleaner.
 */
typedef uint64_t ms_paddr_t;

/*
 * Global Routines:
 */
int memscrub_add_span(pfn_t pfn, pgcnt_t pages);
int memscrub_delete_span(pfn_t pfn, pgcnt_t pages);
int memscrub_init(void);
void memscrub_induced_error(void);

/*
 * Global Data:
 */

/*
 * scrub if we have at least this many pages
 */
#define MEMSCRUB_MIN_PAGES (32 * 1024 * 1024 / PAGESIZE)

/*
 * scan all of physical memory at least once every MEMSCRUB_DFL_PERIOD_SEC
 */
#define MEMSCRUB_DFL_PERIOD_SEC (12 * 60 * 60)  /* 12 hours */

/*
 * scan at least MEMSCRUB_DFL_SPAN_PAGES each iteration
 */
#define MEMSCRUB_DFL_SPAN_PAGES ((32 * 1024 * 1024) / PAGESIZE)
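
/*
 * With the typical sun4u PAGESIZE of 8K, both of the above work out to
 * (32 * 1024 * 1024) / 8192 = 4096 pages.
 */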

/*
 * almost anything is higher priority than scrubbing
 */
#define MEMSCRUB_DFL_THREAD_PRI MINCLSYSPRI

/*
 * size used when scanning memory
 */
#define MEMSCRUB_BLOCK_SIZE             256
#define MEMSCRUB_BLOCK_SIZE_SHIFT       8       /* log2(MEMSCRUB_BLOCK_SIZE) */
#define MEMSCRUB_BLOCKS_PER_PAGE        (PAGESIZE >> MEMSCRUB_BLOCK_SIZE_SHIFT)

#define MEMSCRUB_BPP4M          (MMU_PAGESIZE4M >> MEMSCRUB_BLOCK_SIZE_SHIFT)
#define MEMSCRUB_BPP512K        (MMU_PAGESIZE512K >> MEMSCRUB_BLOCK_SIZE_SHIFT)
#define MEMSCRUB_BPP64K         (MMU_PAGESIZE64K >> MEMSCRUB_BLOCK_SIZE_SHIFT)
#define MEMSCRUB_BPP            (MMU_PAGESIZE >> MEMSCRUB_BLOCK_SIZE_SHIFT)
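
/*
 * For example, with MEMSCRUB_BLOCK_SIZE = 256 and 8K pages,
 * MEMSCRUB_BLOCKS_PER_PAGE = 8192 >> 8 = 32 blocks per page, and
 * MEMSCRUB_BPP4M = 4M >> 8 = 16384 blocks per 4MB page.
 */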

/*
 * This message indicates that we have exceeded the limitations of
 * the memscrubber. See the comments above regarding what would
 * cause the sleep time to become zero. In DEBUG mode, this message
 * is logged on the console and in the messages file. In non-DEBUG
 * mode, it is only logged in the messages file.
 */
#ifdef DEBUG
#define MEMSCRUB_OVERRIDE_MSG   "Memory scrubber sleep time is zero " \
        "seconds, consuming entire CPU."
#else
#define MEMSCRUB_OVERRIDE_MSG   "!Memory scrubber sleep time is zero " \
        "seconds, consuming entire CPU."
#endif /* DEBUG */

/*
 * we can patch these defaults in /etc/system if necessary
 */
uint_t disable_memscrub = 0;
uint_t pause_memscrub = 0;
uint_t read_all_memscrub = 0;
uint_t memscrub_verbose = 0;
uint_t memscrub_all_idle = 0;
uint_t memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES;
uint_t memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC;
uint_t memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI;
uint_t memscrub_delay_start_sec = 5 * 60;
uint_t memscrub_override_ticks = 1;

/*
 * Static Routines
 */
static void memscrubber(void);
static void memscrub_cleanup(void);
static int memscrub_add_span_gen(pfn_t, pgcnt_t, struct memlist **, uint_t *);
static int memscrub_verify_span(ms_paddr_t *addrp, pgcnt_t *pagesp);
static void memscrub_scan(uint_t blks, ms_paddr_t src);

/*
 * Static Data
 */

static struct memlist *memscrub_memlist;
static uint_t memscrub_phys_pages;

/*
 * memscrub_lock protects memscrub_memlist, interval_ticks, cprinfo, ...
 */
static kcondvar_t memscrub_cv;
static kmutex_t memscrub_lock;

static void memscrub_init_mem_config(void);
static void memscrub_uninit_mem_config(void);

/*
 * Linked list of memscrub aware spans having retired pages.
 * Currently enabled only on sun4u USIII-based platforms.
 */
typedef struct memscrub_page_retire_span {
        ms_paddr_t                              address;
        struct memscrub_page_retire_span        *next;
} memscrub_page_retire_span_t;

static memscrub_page_retire_span_t *memscrub_page_retire_span_list = NULL;

static void memscrub_page_retire_span_add(ms_paddr_t);
static void memscrub_page_retire_span_delete(ms_paddr_t);
static int memscrub_page_retire_span_search(ms_paddr_t);
static void memscrub_page_retire_span_list_update(void);

/*
 * add_to_page_retire_list: set by cpu_async_log_err() via
 * memscrub_induced_error() when a CE/UE occurs on a retired page as a
 * result of a memscrub read.  Cleared by memscrub after it updates the
 * global page retire span list.  It piggybacks on the protection of
 * memscrub_lock, which is held while it is set and cleared.
 * Note: when cpu_async_log_err() calls memscrub_induced_error(), it runs
 * in softint context, fired on the CPU on which the memscrub thread is
 * currently running.  The memscrub thread has its affinity set during
 * memscrub_read(), so migration to a new CPU is not expected.
 */
static int add_to_page_retire_list = 0;

/*
 * Keep track of some interesting statistics
 */
static struct memscrub_kstats {
        kstat_named_t   done_early;     /* ahead of schedule */
        kstat_named_t   early_sec;      /* by cumulative num secs */
        kstat_named_t   done_late;      /* behind schedule */
        kstat_named_t   late_sec;       /* by cumulative num secs */
        kstat_named_t   interval_ticks; /* num ticks between intervals */
        kstat_named_t   force_run;      /* forced to run, non-timeout */
        kstat_named_t   errors_found;   /* num errors found by memscrub */
} memscrub_counts = {
        { "done_early",         KSTAT_DATA_UINT32 },
        { "early_sec",          KSTAT_DATA_UINT32 },
        { "done_late",          KSTAT_DATA_UINT32 },
        { "late_sec",           KSTAT_DATA_UINT32 },
        { "interval_ticks",     KSTAT_DATA_UINT32 },
        { "force_run",          KSTAT_DATA_UINT32 },
        { "errors_found",       KSTAT_DATA_UINT32 },
};

#define MEMSCRUB_STAT_INC(stat) memscrub_counts.stat.value.ui32++
#define MEMSCRUB_STAT_SET(stat, val) memscrub_counts.stat.value.ui32 = (val)
#define MEMSCRUB_STAT_NINC(stat, val) memscrub_counts.stat.value.ui32 += (val)
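
/*
 * These counters can be inspected from userland once the kstat is
 * installed; for example (a sketch, exact output will vary):
 *
 *	# kstat -m unix -n memscrub_kstat
 */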

static struct kstat *memscrub_ksp = (struct kstat *)NULL;

static timeout_id_t memscrub_tid = 0;   /* keep track of timeout id */

/*
 * create memscrub_memlist from phys_install list
 * initialize locks, set memscrub_phys_pages.
 */
int
memscrub_init(void)
{
        struct memlist *src;

        /*
         * only startup the scrubber if we have a minimum
         * number of pages
         */
        if (physinstalled >= MEMSCRUB_MIN_PAGES) {

                /*
                 * initialize locks
                 */
                mutex_init(&memscrub_lock, NULL, MUTEX_DRIVER, NULL);
                cv_init(&memscrub_cv, NULL, CV_DRIVER, NULL);

                /*
                 * copy phys_install to memscrub_memlist
                 */
                for (src = phys_install; src; src = src->ml_next) {
                        if (memscrub_add_span(
                            (pfn_t)(src->ml_address >> PAGESHIFT),
                            (pgcnt_t)(src->ml_size >> PAGESHIFT))) {
                                memscrub_cleanup();
                                return (-1);
                        }
                }

                /*
                 * initialize kstats
                 */
                memscrub_ksp = kstat_create("unix", 0, "memscrub_kstat",
                    "misc", KSTAT_TYPE_NAMED,
                    sizeof (memscrub_counts) / sizeof (kstat_named_t),
                    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);

                if (memscrub_ksp) {
                        memscrub_ksp->ks_data = (void *)&memscrub_counts;
                        kstat_install(memscrub_ksp);
                } else {
                        cmn_err(CE_NOTE, "Memscrubber cannot create kstats\n");
                }

                /*
                 * create memscrubber thread
                 */
                (void) thread_create(NULL, 0, (void (*)())memscrubber,
                    NULL, 0, &p0, TS_RUN, memscrub_thread_pri);

                /*
                 * We don't want callbacks changing the list
                 * if there is no thread running. We do not
                 * attempt to deal with stopping/starting scrubbing
                 * on memory size changes.
                 */
                memscrub_init_mem_config();
        }

        return (0);
}

static void
memscrub_cleanup(void)
{
        memscrub_uninit_mem_config();
        while (memscrub_memlist) {
                (void) memscrub_delete_span(
                    (pfn_t)(memscrub_memlist->ml_address >> PAGESHIFT),
                    (pgcnt_t)(memscrub_memlist->ml_size >> PAGESHIFT));
        }
        if (memscrub_ksp)
                kstat_delete(memscrub_ksp);
        cv_destroy(&memscrub_cv);
        mutex_destroy(&memscrub_lock);
}

#ifdef MEMSCRUB_DEBUG
static void
memscrub_printmemlist(char *title, struct memlist *listp)
{
        struct memlist *list;

        cmn_err(CE_CONT, "%s:\n", title);

        for (list = listp; list; list = list->ml_next) {
                cmn_err(CE_CONT, "addr = 0x%llx, size = 0x%llx\n",
                    list->ml_address, list->ml_size);
        }
}
#endif /* MEMSCRUB_DEBUG */

/* ARGSUSED */
static void
memscrub_wakeup(void *c)
{
        /*
         * grab mutex to guarantee that our wakeup call
         * arrives after we go to sleep -- so we can't sleep forever.
         */
        mutex_enter(&memscrub_lock);
        cv_signal(&memscrub_cv);
        mutex_exit(&memscrub_lock);
}

/*
 * provide an interface external to the memscrubber
 * which will force the memscrub thread to run vs.
 * waiting for the timeout, if one is set
 */
void
memscrub_run(void)
{
        MEMSCRUB_STAT_INC(force_run);
        if (memscrub_tid) {
                (void) untimeout(memscrub_tid);
                memscrub_wakeup((void *)NULL);
        }
}

/*
 * This calculation doesn't account for the time
 * that the actual scan consumes -- so we'd fall
 * slightly behind schedule with this interval.
 * The error is very small.
 */

static uint_t
compute_interval_ticks(void)
{
        /*
         * We use msp_safe and mpp_safe below to ensure that nobody
         * sets memscrub_span_pages or memscrub_phys_pages to 0 out
         * from under us.
         */
        static uint_t msp_safe, mpp_safe;
        static uint_t interval_ticks, period_ticks;
        msp_safe = memscrub_span_pages;
        mpp_safe = memscrub_phys_pages;

        period_ticks = memscrub_period_sec * hz;
        interval_ticks = period_ticks;

        ASSERT(mutex_owned(&memscrub_lock));

        if ((msp_safe != 0) && (mpp_safe != 0)) {
                if (memscrub_phys_pages <= msp_safe) {
                        interval_ticks = period_ticks;
                } else {
                        interval_ticks = (period_ticks /
                            (mpp_safe / msp_safe));
                }
        }
        return (interval_ticks);
}
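
/*
 * As a worked example (illustrative, assuming hz = 100 and 8K pages):
 * a 32GB system has mpp_safe = 4,194,304 pages; with msp_safe = 4096
 * pages (32MB) and period_ticks = 43200 * 100, interval_ticks =
 * 4,320,000 / (4,194,304 / 4096) = 4,320,000 / 1024 = 4218 ticks,
 * i.e. the scrubber wakes up roughly every 42 seconds.
 */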

void
memscrubber(void)
{
        ms_paddr_t address, addr;
        time_t deadline;
        pgcnt_t pages;
        uint_t reached_end = 1;
        uint_t paused_message = 0;
        uint_t interval_ticks = 0;
        uint_t sleep_warn_printed = 0;
        callb_cpr_t cprinfo;

        /*
         * notify CPR of our existence
         */
        CALLB_CPR_INIT(&cprinfo, &memscrub_lock, callb_generic_cpr, "memscrub");

        mutex_enter(&memscrub_lock);

        if (memscrub_memlist == NULL) {
                cmn_err(CE_WARN, "memscrub_memlist not initialized.");
                goto memscrub_exit;
        }

        address = memscrub_memlist->ml_address;

        deadline = gethrestime_sec() + memscrub_delay_start_sec;

        for (;;) {
                if (disable_memscrub)
                        break;

                /*
                 * compute interval_ticks
                 */
                interval_ticks = compute_interval_ticks();

                /*
                 * If the calculated sleep time is zero, and pause_memscrub
                 * has been set, make sure we sleep so that another thread
                 * can acquire memscrub_lock.
                 */
                if (interval_ticks == 0 && pause_memscrub) {
                        interval_ticks = hz;
                }

                /*
                 * And as a fail safe, under normal non-paused operation, do
                 * not allow the sleep time to be zero.
                 */
                if (interval_ticks == 0) {
                        interval_ticks = memscrub_override_ticks;
                        if (!sleep_warn_printed) {
                                cmn_err(CE_NOTE, MEMSCRUB_OVERRIDE_MSG);
                                sleep_warn_printed = 1;
                        }
                }

                MEMSCRUB_STAT_SET(interval_ticks, interval_ticks);

                /*
                 * Did we just reach the end of memory? If we are at the
                 * end of memory, delay end of memory processing until
                 * pause_memscrub is not set.
                 */
                if (reached_end && !pause_memscrub) {
                        time_t now = gethrestime_sec();

                        if (now >= deadline) {
                                MEMSCRUB_STAT_INC(done_late);
                                MEMSCRUB_STAT_NINC(late_sec, now - deadline);
                                /*
                                 * past deadline, start right away
                                 */
                                interval_ticks = 0;

                                deadline = now + memscrub_period_sec;
                        } else {
                                /*
                                 * we finished ahead of schedule.
                                 * wait till previous deadline before re-start.
                                 */
                                interval_ticks = (deadline - now) * hz;
                                MEMSCRUB_STAT_INC(done_early);
                                MEMSCRUB_STAT_NINC(early_sec, deadline - now);
                                deadline += memscrub_period_sec;
                        }
                        reached_end = 0;
                        sleep_warn_printed = 0;
                }

                if (interval_ticks != 0) {
                        /*
                         * it is safe from our standpoint for CPR to
                         * suspend the system
                         */
                        CALLB_CPR_SAFE_BEGIN(&cprinfo);

                        /*
                         * hit the snooze bar
                         */
                        memscrub_tid = timeout(memscrub_wakeup, NULL,
                            interval_ticks);

                        /*
                         * go to sleep
                         */
                        cv_wait(&memscrub_cv, &memscrub_lock);

                        /*
                         * at this point, no timeout should be set
                         */
                        memscrub_tid = 0;

                        /*
                         * we need to go to work and will be modifying
                         * our internal state and mapping/unmapping
                         * TTEs
                         */
                        CALLB_CPR_SAFE_END(&cprinfo, &memscrub_lock);
                }

                if (memscrub_phys_pages == 0) {
                        cmn_err(CE_WARN, "Memory scrubber has 0 pages to read");
                        goto memscrub_exit;
                }

                if (!pause_memscrub) {
                        if (paused_message) {
                                paused_message = 0;
                                if (memscrub_verbose)
                                        cmn_err(CE_NOTE, "Memory scrubber "
                                            "resuming");
                        }

                        if (read_all_memscrub) {
                                if (memscrub_verbose)
                                        cmn_err(CE_NOTE, "Memory scrubber "
                                            "reading all memory per request");

                                addr = memscrub_memlist->ml_address;
                                reached_end = 0;
                                while (!reached_end) {
                                        if (disable_memscrub)
                                                break;
                                        pages = memscrub_phys_pages;
                                        reached_end = memscrub_verify_span(
                                            &addr, &pages);
                                        memscrub_scan(pages *
                                            MEMSCRUB_BLOCKS_PER_PAGE, addr);
                                        addr += ((uint64_t)pages * PAGESIZE);
                                }
                                read_all_memscrub = 0;
                        }

                        /*
                         * read 1 span
                         */
                        pages = memscrub_span_pages;

                        if (disable_memscrub)
                                break;

                        /*
                         * determine physical address range
                         */
                        reached_end = memscrub_verify_span(&address,
                            &pages);

                        memscrub_scan(pages * MEMSCRUB_BLOCKS_PER_PAGE,
                            address);

                        address += ((uint64_t)pages * PAGESIZE);
                }

                if (pause_memscrub && !paused_message) {
                        paused_message = 1;
                        if (memscrub_verbose)
                                cmn_err(CE_NOTE, "Memory scrubber paused");
                }
        }

memscrub_exit:
        cmn_err(CE_NOTE, "Memory scrubber exiting");
        CALLB_CPR_EXIT(&cprinfo);
        memscrub_cleanup();
        thread_exit();
        /* NOTREACHED */
}

/*
 * Condition address and size so that they span legal physical
 * addresses.
 *
 * When appropriate, address will be rounded up to the start of the
 * next struct memlist, and pages will be rounded down to the end of
 * the memlist size.
 *
 * Returns 1 if we reached the end of the list, else returns 0.
 */
static int
memscrub_verify_span(ms_paddr_t *addrp, pgcnt_t *pagesp)
{
        struct memlist *mlp;
        ms_paddr_t address = *addrp;
        uint64_t bytes = (uint64_t)*pagesp * PAGESIZE;
        uint64_t bytes_remaining;
        int reached_end = 0;

        ASSERT(mutex_owned(&memscrub_lock));

        /*
         * find memlist struct that contains addrp
         * assumes memlist is sorted by ascending address.
         */
        for (mlp = memscrub_memlist; mlp != NULL; mlp = mlp->ml_next) {
                /*
                 * if before this chunk, round up to beginning
                 */
                if (address < mlp->ml_address) {
                        address = mlp->ml_address;
                        break;
                }
                /*
                 * if before end of chunk, then we found it
                 */
                if (address < (mlp->ml_address + mlp->ml_size))
                        break;

                /* else go to next struct memlist */
        }
        /*
         * if we hit end of list, start at beginning
         */
        if (mlp == NULL) {
                mlp = memscrub_memlist;
                address = mlp->ml_address;
        }

        /*
         * now we have a legal address and its mlp; condition bytes
         */
        bytes_remaining = (mlp->ml_address + mlp->ml_size) - address;

        if (bytes > bytes_remaining)
                bytes = bytes_remaining;

        /*
         * will this span take us to end of list?
         */
        if ((mlp->ml_next == NULL) &&
            ((mlp->ml_address + mlp->ml_size) == (address + bytes)))
                reached_end = 1;

        /* return values */
        *addrp = address;
        *pagesp = bytes / PAGESIZE;

        return (reached_end);
}
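
/*
 * For example (an illustrative sketch): with a memlist of
 * { [0x0, 128MB), [0x800000000, 0x800000000 + 2GB) }, a request for
 * 32MB starting at 0x7f00000 is clipped to the 1MB remaining in the
 * first chunk, while a request starting at 0x10000000 (a hole) is
 * rounded up to address 0x800000000 in the second chunk.
 */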

/*
 * add a span to the memscrub list
 * add to memscrub_phys_pages
 */
int
memscrub_add_span(pfn_t pfn, pgcnt_t pages)
{
#ifdef MEMSCRUB_DEBUG
        ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
        uint64_t bytes = (uint64_t)pages << PAGESHIFT;
#endif /* MEMSCRUB_DEBUG */

        int retval;

        mutex_enter(&memscrub_lock);

#ifdef MEMSCRUB_DEBUG
        memscrub_printmemlist("memscrub_memlist before", memscrub_memlist);
        cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
        cmn_err(CE_CONT, "memscrub_add_span: address: 0x%llx"
            " size: 0x%llx\n", address, bytes);
#endif /* MEMSCRUB_DEBUG */

        retval = memscrub_add_span_gen(pfn, pages, &memscrub_memlist,
            &memscrub_phys_pages);

#ifdef MEMSCRUB_DEBUG
        memscrub_printmemlist("memscrub_memlist after", memscrub_memlist);
        cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
#endif /* MEMSCRUB_DEBUG */

        mutex_exit(&memscrub_lock);

        return (retval);
}

static int
memscrub_add_span_gen(
        pfn_t pfn,
        pgcnt_t pages,
        struct memlist **list,
        uint_t *npgs)
{
        ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
        uint64_t bytes = (uint64_t)pages << PAGESHIFT;
        struct memlist *dst;
        struct memlist *prev, *next;
        int retval = 0;

        /*
         * allocate a new struct memlist
         */

        dst = (struct memlist *)
            kmem_alloc(sizeof (struct memlist), KM_NOSLEEP);

        if (dst == NULL) {
                retval = -1;
                goto add_done;
        }

        dst->ml_address = address;
        dst->ml_size = bytes;

        /*
         * first insert
         */
        if (*list == NULL) {
                dst->ml_prev = NULL;
                dst->ml_next = NULL;
                *list = dst;

                goto add_done;
        }

        /*
         * insert into sorted list
         */
        for (prev = NULL, next = *list;
            next != NULL;
            prev = next, next = next->ml_next) {
                if (address > (next->ml_address + next->ml_size))
                        continue;

                /*
                 * else insert here
                 */

                /*
                 * prepend to next
                 */
                if ((address + bytes) == next->ml_address) {
                        kmem_free(dst, sizeof (struct memlist));

                        next->ml_address = address;
                        next->ml_size += bytes;

                        goto add_done;
                }

                /*
                 * append to next
                 */
                if (address == (next->ml_address + next->ml_size)) {
                        kmem_free(dst, sizeof (struct memlist));

                        if (next->ml_next) {
                                /*
                                 * don't overlap with next->ml_next
                                 */
                                if ((address + bytes) >
                                    next->ml_next->ml_address) {
                                        retval = -1;
                                        goto add_done;
                                }
                                /*
                                 * concatenate next and next->ml_next
                                 */
                                if ((address + bytes) ==
                                    next->ml_next->ml_address) {
                                        struct memlist *mlp = next->ml_next;

                                        if (next == *list)
                                                *list = next->ml_next;

                                        mlp->ml_address = next->ml_address;
                                        mlp->ml_size += next->ml_size;
                                        mlp->ml_size += bytes;

                                        if (next->ml_prev)
                                                next->ml_prev->ml_next = mlp;
                                        mlp->ml_prev = next->ml_prev;

                                        kmem_free(next,
                                            sizeof (struct memlist));
                                        goto add_done;
                                }
                        }

                        next->ml_size += bytes;

                        goto add_done;
                }

                /* don't overlap with next */
                if ((address + bytes) > next->ml_address) {
                        retval = -1;
                        kmem_free(dst, sizeof (struct memlist));
                        goto add_done;
                }

                /*
                 * insert before next
                 */
                dst->ml_prev = prev;
                dst->ml_next = next;
                next->ml_prev = dst;
                if (prev == NULL) {
                        *list = dst;
                } else {
                        prev->ml_next = dst;
                }
                goto add_done;
        }       /* end for */

        /*
         * end of list, prev is valid and next is NULL
         */
        prev->ml_next = dst;
        dst->ml_prev = prev;
        dst->ml_next = NULL;

add_done:

        if (retval != -1)
                *npgs += pages;

        return (retval);
}
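
/*
 * To illustrate the coalescing cases (addresses are hypothetical):
 * with an existing span [0x1000000, 0x2000000), adding
 * [0x800000, 0x1000000) prepends, yielding one span
 * [0x800000, 0x2000000); adding [0x2000000, 0x2800000) appends; and
 * adding a span that exactly fills the gap between two existing spans
 * concatenates them into a single entry.
 */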

/*
 * delete a span from the memscrub list
 * subtract from memscrub_phys_pages
 */
int
memscrub_delete_span(pfn_t pfn, pgcnt_t pages)
{
        ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
        uint64_t bytes = (uint64_t)pages << PAGESHIFT;
        struct memlist *dst, *next;
        int retval = 0;

        mutex_enter(&memscrub_lock);

#ifdef MEMSCRUB_DEBUG
        memscrub_printmemlist("memscrub_memlist Before", memscrub_memlist);
        cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
        cmn_err(CE_CONT, "memscrub_delete_span: 0x%llx 0x%llx\n",
            address, bytes);
#endif /* MEMSCRUB_DEBUG */

        /*
         * find struct memlist containing page
         */
        for (next = memscrub_memlist; next != NULL; next = next->ml_next) {
                if ((address >= next->ml_address) &&
                    (address < next->ml_address + next->ml_size))
                        break;
        }

        /*
         * if start address not in list
         */
        if (next == NULL) {
                retval = -1;
                goto delete_done;
        }

        /*
         * error if size goes off end of this struct memlist
         */
        if (address + bytes > next->ml_address + next->ml_size) {
                retval = -1;
                goto delete_done;
        }

        /*
         * pages at beginning of struct memlist
         */
        if (address == next->ml_address) {
                /*
                 * if start & size match, delete from list
                 */
                if (bytes == next->ml_size) {
                        if (next == memscrub_memlist)
                                memscrub_memlist = next->ml_next;
                        if (next->ml_prev != NULL)
                                next->ml_prev->ml_next = next->ml_next;
                        if (next->ml_next != NULL)
                                next->ml_next->ml_prev = next->ml_prev;

                        kmem_free(next, sizeof (struct memlist));
                } else {
                        /*
                         * increment start address by bytes
                         */
                        next->ml_address += bytes;
                        next->ml_size -= bytes;
                }
                goto delete_done;
        }

        /*
         * pages at end of struct memlist
         */
        if (address + bytes == next->ml_address + next->ml_size) {
                /*
                 * decrement size by bytes
                 */
                next->ml_size -= bytes;
                goto delete_done;
        }

        /*
         * delete a span in the middle of the struct memlist
         */
        {
                /*
                 * create a new struct memlist
                 */
                dst = (struct memlist *)
                    kmem_alloc(sizeof (struct memlist), KM_NOSLEEP);

                if (dst == NULL) {
                        retval = -1;
                        goto delete_done;
                }

                /*
                 * The new struct memlist gets the addresses starting
                 * after the deleted span, until the end; the existing
                 * struct memlist keeps the addresses up to the start
                 * of the deleted span.
                 */
                dst->ml_address = address + bytes;
                dst->ml_size =
                    (next->ml_address + next->ml_size) - dst->ml_address;
                next->ml_size = address - next->ml_address;

                /*
                 * link in new memlist after old
                 */
                dst->ml_next = next->ml_next;
                dst->ml_prev = next;

                if (next->ml_next != NULL)
                        next->ml_next->ml_prev = dst;
                next->ml_next = dst;
        }

delete_done:
        if (retval != -1) {
                memscrub_phys_pages -= pages;
                if (memscrub_phys_pages == 0)
                        disable_memscrub = 1;
        }

#ifdef MEMSCRUB_DEBUG
        memscrub_printmemlist("memscrub_memlist After", memscrub_memlist);
        cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
#endif /* MEMSCRUB_DEBUG */

        mutex_exit(&memscrub_lock);
        return (retval);
}
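
/*
 * For example (hypothetical addresses): deleting [0x1800000, 0x1900000)
 * from a span [0x1000000, 0x2000000) splits it into two entries,
 * [0x1000000, 0x1800000) and [0x1900000, 0x2000000), and decrements
 * memscrub_phys_pages by the deleted page count.
 */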

static void
memscrub_scan(uint_t blks, ms_paddr_t src)
{
        uint_t          psz, bpp, pgsread;
        pfn_t           pfn;
        ms_paddr_t      pa;
        caddr_t         va;
        on_trap_data_t  otd;
        int             scan_mmu_pagesize = 0;
        int             retired_pages = 0;

        extern void memscrub_read(caddr_t src, uint_t blks);

        ASSERT(mutex_owned(&memscrub_lock));

        pgsread = 0;
        pa = src;

        if (memscrub_page_retire_span_list != NULL) {
                if (memscrub_page_retire_span_search(src)) {
                        /* retired pages in current span */
                        scan_mmu_pagesize = 1;
                }
        }

#ifdef MEMSCRUB_DEBUG
        cmn_err(CE_NOTE, "scan_mmu_pagesize = %d\n", scan_mmu_pagesize);
#endif /* MEMSCRUB_DEBUG */

        while (blks != 0) {
                /* Ensure the PA is properly aligned */
                if (((pa & MMU_PAGEMASK4M) == pa) &&
                    (blks >= MEMSCRUB_BPP4M)) {
                        psz = MMU_PAGESIZE4M;
                        bpp = MEMSCRUB_BPP4M;
                } else if (((pa & MMU_PAGEMASK512K) == pa) &&
                    (blks >= MEMSCRUB_BPP512K)) {
                        psz = MMU_PAGESIZE512K;
                        bpp = MEMSCRUB_BPP512K;
                } else if (((pa & MMU_PAGEMASK64K) == pa) &&
                    (blks >= MEMSCRUB_BPP64K)) {
                        psz = MMU_PAGESIZE64K;
                        bpp = MEMSCRUB_BPP64K;
                } else if ((pa & MMU_PAGEMASK) == pa) {
                        psz = MMU_PAGESIZE;
                        bpp = MEMSCRUB_BPP;
                } else {
                        if (memscrub_verbose) {
                                cmn_err(CE_NOTE, "Memory scrubber ignoring "
                                    "non-page aligned block starting at 0x%"
                                    PRIx64, src);
                        }
                        return;
                }
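
                /*
                 * For example (an illustrative sketch): pa = 0x4400000
                 * is 4MB-aligned, so with blks >= MEMSCRUB_BPP4M (16384
                 * blocks of 256 bytes) the chunk is mapped and read
                 * using a single 4MB page, while a merely 8K-aligned pa
                 * falls through to psz = MMU_PAGESIZE.
                 */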
                if (blks < bpp)
                        bpp = blks;

#ifdef MEMSCRUB_DEBUG
                cmn_err(CE_NOTE, "Going to run psz=%x, "
                    "bpp=%x pa=%llx\n", psz, bpp, pa);
#endif /* MEMSCRUB_DEBUG */

                /*
                 * MEMSCRUBBASE is a 4MB aligned page in the
                 * kernel so that we can quickly map the PA
                 * to a VA for the block loads performed in
                 * memscrub_read.
                 */
                pfn = mmu_btop(pa);
                va = (caddr_t)MEMSCRUBBASE;
                hat_devload(kas.a_hat, va, psz, pfn, PROT_READ,
                    HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);

                /*
                 * Can't allow the memscrubber to migrate across CPUs as
                 * we need to know whether CEEN is enabled for the current
                 * CPU to enable us to scrub the memory. Don't use
                 * kpreempt_disable as the time we take to scan a span (even
                 * without cpu_check_ce having to manually cpu_check_block)
                 * is too long to hold a higher priority thread (eg, RT)
                 * off cpu.
                 */
                thread_affinity_set(curthread, CPU_CURRENT);

                /*
                 * Protect the read scrub from async faults.  For now, we
                 * simply maintain a count of such faults caught.
                 */

                if (!on_trap(&otd, OT_DATA_EC) && !scan_mmu_pagesize) {
                        memscrub_read(va, bpp);
                        /*
                         * Check if CEs require logging
                         */
                        cpu_check_ce(SCRUBBER_CEEN_CHECK,
                            (uint64_t)pa, va, psz);
                        no_trap();
                        thread_affinity_clear(curthread);
                } else {
                        no_trap();
                        thread_affinity_clear(curthread);

                        /*
                         * Got an async error..
                         * Try rescanning it at MMU_PAGESIZE
                         * granularity if we were trying to
                         * read at a larger page size.
                         * This is to ensure we continue to
                         * scan the rest of the span.
                         * OR scan at MMU_PAGESIZE granularity to avoid
                         * reading retired pages' memory when
                         * scan_mmu_pagesize is set.
                         */
                        if (psz > MMU_PAGESIZE || scan_mmu_pagesize) {
                                caddr_t vaddr = va;
                                ms_paddr_t paddr = pa;
                                int tmp = 0;
                                for (; tmp < bpp; tmp += MEMSCRUB_BPP) {
                                        /* Don't scrub retired pages */
                                        if (page_retire_check(paddr, NULL)
                                            == 0) {
                                                vaddr += MMU_PAGESIZE;
                                                paddr += MMU_PAGESIZE;
                                                retired_pages++;
                                                continue;
                                        }
                                        thread_affinity_set(curthread,
                                            CPU_CURRENT);
                                        if (!on_trap(&otd, OT_DATA_EC)) {
                                                memscrub_read(vaddr,
                                                    MEMSCRUB_BPP);
                                                cpu_check_ce(
                                                    SCRUBBER_CEEN_CHECK,
                                                    (uint64_t)paddr, vaddr,
                                                    MMU_PAGESIZE);
                                                no_trap();
                                        } else {
                                                no_trap();
                                                MEMSCRUB_STAT_INC(errors_found);
                                        }
                                        thread_affinity_clear(curthread);
                                        vaddr += MMU_PAGESIZE;
                                        paddr += MMU_PAGESIZE;
                                }
                        }
                }
                hat_unload(kas.a_hat, va, psz, HAT_UNLOAD_UNLOCK);

                blks -= bpp;
                pa += psz;
                pgsread++;
        }

        /*
         * If we just finished scrubbing MMU_PAGESIZE at a time but found
         * no retired pages, delete the span from the global list.
         */
        if (scan_mmu_pagesize && retired_pages == 0)
                memscrub_page_retire_span_delete(src);

        /*
         * We encountered a CE/UE on a retired page during the memscrub
         * read of the current span.  Add the span to the global list so
         * that we avoid reading it in the future.
         */
        if (add_to_page_retire_list) {
                if (!memscrub_page_retire_span_search(src))
                        memscrub_page_retire_span_add(src);
                add_to_page_retire_list = 0;
        }

        if (memscrub_verbose) {
                cmn_err(CE_NOTE, "Memory scrubber read 0x%x pages starting "
                    "at 0x%" PRIx64, pgsread, src);
        }
}

/*
 * Called by cpu_async_log_err() when a memscrub read causes
 * a CE/UE on a retired page.
 */
void
memscrub_induced_error(void)
{
        add_to_page_retire_list = 1;
}

/*
 * Called by page_retire() when toxic pages cannot be retired
 * immediately and are scheduled for retirement.  The memscrubber
 * stops scrubbing them to avoid further CE/UEs.
 */
void
memscrub_notify(ms_paddr_t pa)
{
        mutex_enter(&memscrub_lock);
        if (!memscrub_page_retire_span_search(pa))
                memscrub_page_retire_span_add(pa);
        mutex_exit(&memscrub_lock);
}

/*
 * Called by memscrub_scan() and memscrub_notify().
 * pa: physical address of the span with the CE/UE; add it to the
 * global list.
 */
static void
memscrub_page_retire_span_add(ms_paddr_t pa)
{
        memscrub_page_retire_span_t *new_span;

        new_span = (memscrub_page_retire_span_t *)
            kmem_zalloc(sizeof (memscrub_page_retire_span_t), KM_NOSLEEP);

        if (new_span == NULL) {
#ifdef MEMSCRUB_DEBUG
                cmn_err(CE_NOTE, "failed to allocate new span - span with"
                    " retired page(s) not tracked.\n");
#endif /* MEMSCRUB_DEBUG */
                return;
        }

        new_span->address = pa;
        new_span->next = memscrub_page_retire_span_list;
        memscrub_page_retire_span_list = new_span;
}

/*
 * Called by memscrub_scan().
 * pa: physical address of the span to be removed from the global list.
 */
static void
memscrub_page_retire_span_delete(ms_paddr_t pa)
{
        memscrub_page_retire_span_t *prev_span, *next_span;

        prev_span = memscrub_page_retire_span_list;
        next_span = memscrub_page_retire_span_list->next;

        if (pa == prev_span->address) {
                memscrub_page_retire_span_list = next_span;
                kmem_free(prev_span, sizeof (memscrub_page_retire_span_t));
                return;
        }

        while (next_span) {
                if (pa == next_span->address) {
                        prev_span->next = next_span->next;
                        kmem_free(next_span,
                            sizeof (memscrub_page_retire_span_t));
                        return;
                }
                prev_span = next_span;
                next_span = next_span->next;
        }
}

/*
 * Called by memscrub_scan() and memscrub_notify().
 * pa: physical address of the span to be searched for in the global list.
 */
static int
memscrub_page_retire_span_search(ms_paddr_t pa)
{
        memscrub_page_retire_span_t *next_span = memscrub_page_retire_span_list;

        while (next_span) {
                if (pa == next_span->address)
                        return (1);
                next_span = next_span->next;
        }
        return (0);
}

/*
 * Called from new_memscrub() as a result of memory delete.
 * We use page_numtopp_nolock() to determine whether each span still
 * refers to a valid PA, and drop the ones that do not.
 */
static void
memscrub_page_retire_span_list_update(void)
{
        memscrub_page_retire_span_t *prev, *cur, *next;

        if (memscrub_page_retire_span_list == NULL)
                return;

        prev = cur = memscrub_page_retire_span_list;
        next = cur->next;

        while (cur) {
                if (page_numtopp_nolock(mmu_btop(cur->address)) == NULL) {
                        if (cur == memscrub_page_retire_span_list) {
                                memscrub_page_retire_span_list = next;
                                kmem_free(cur,
                                    sizeof (memscrub_page_retire_span_t));
                                prev = cur = memscrub_page_retire_span_list;
                        } else {
                                prev->next = cur->next;
                                kmem_free(cur,
                                    sizeof (memscrub_page_retire_span_t));
                                cur = next;
                        }
                } else {
                        prev = cur;
                        cur = next;
                }
                if (cur != NULL)
                        next = cur->next;
        }
}

/*
 * The memory add/delete callback mechanism does not pass in the
 * page ranges. The phys_install list has been updated though, so
 * create a new scrub list from it.
 */

static int
new_memscrub(int update_page_retire_list)
{
        struct memlist *src, *list, *old_list;
        uint_t npgs;

        /*
         * copy phys_install to memscrub_memlist
         */
        list = NULL;
        npgs = 0;
        memlist_read_lock();
        for (src = phys_install; src; src = src->ml_next) {
                if (memscrub_add_span_gen((pfn_t)(src->ml_address >> PAGESHIFT),
                    (pgcnt_t)(src->ml_size >> PAGESHIFT), &list, &npgs)) {
                        memlist_read_unlock();
                        while (list) {
                                struct memlist *el;

                                el = list;
                                list = list->ml_next;
                                kmem_free(el, sizeof (struct memlist));
                        }
                        return (-1);
                }
        }
        memlist_read_unlock();

        mutex_enter(&memscrub_lock);
        memscrub_phys_pages = npgs;
        old_list = memscrub_memlist;
        memscrub_memlist = list;

        if (update_page_retire_list)
                memscrub_page_retire_span_list_update();

        mutex_exit(&memscrub_lock);

        while (old_list) {
                struct memlist *el;

                el = old_list;
                old_list = old_list->ml_next;
                kmem_free(el, sizeof (struct memlist));
        }

        return (0);
}

/*ARGSUSED*/
static void
memscrub_mem_config_post_add(
        void *arg,
        pgcnt_t delta_pages)
{
        /*
         * We increment pause_memscrub before entering new_memscrub(). This
         * will force the memscrubber to sleep, allowing the DR callback
         * thread to acquire memscrub_lock in new_memscrub(). The use of
         * atomic_inc_32()/atomic_dec_32() allows concurrent memory DR
         * operations to use the callbacks safely.
         */
        atomic_inc_32(&pause_memscrub);
        ASSERT(pause_memscrub != 0);

        /*
         * "Don't care" if we are not scrubbing new memory.
         */
        (void) new_memscrub(0);         /* retain page retire list */

        /* Restore the pause setting. */
        atomic_dec_32(&pause_memscrub);
}

/*ARGSUSED*/
static int
memscrub_mem_config_pre_del(
        void *arg,
        pgcnt_t delta_pages)
{
        /* Nothing to do. */
        return (0);
}

/*ARGSUSED*/
static void
memscrub_mem_config_post_del(
        void *arg,
        pgcnt_t delta_pages,
        int cancelled)
{
        /*
         * We increment pause_memscrub before entering new_memscrub(). This
         * will force the memscrubber to sleep, allowing the DR callback
         * thread to acquire memscrub_lock in new_memscrub(). The use of
         * atomic_inc_32()/atomic_dec_32() allows concurrent memory DR
         * operations to use the callbacks safely.
         */
        atomic_inc_32(&pause_memscrub);
        ASSERT(pause_memscrub != 0);

        /*
         * Must stop scrubbing deleted memory as it may be disconnected.
         */
        if (new_memscrub(1)) {  /* update page retire list */
                disable_memscrub = 1;
        }

        /* Restore the pause setting. */
        atomic_dec_32(&pause_memscrub);
}

static kphysm_setup_vector_t memscrub_mem_config_vec = {
        KPHYSM_SETUP_VECTOR_VERSION,
        memscrub_mem_config_post_add,
        memscrub_mem_config_pre_del,
        memscrub_mem_config_post_del,
};

static void
memscrub_init_mem_config(void)
{
        int ret;

        ret = kphysm_setup_func_register(&memscrub_mem_config_vec,
            (void *)NULL);
        ASSERT(ret == 0);
}

static void
memscrub_uninit_mem_config(void)
{
        /* This call is OK if the register call was not done. */
        kphysm_setup_func_unregister(&memscrub_mem_config_vec, (void *)NULL);
}