1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #include <sys/types.h>
  28 #include <sys/sysmacros.h>
  29 #include <sys/machsystm.h>
  30 #include <sys/machparam.h>
  31 #include <sys/cmn_err.h>
  32 #include <sys/stat.h>
  33 #include <sys/mach_descrip.h>
  34 #include <sys/memnode.h>
  35 #include <sys/mdesc.h>
  36 #include <sys/mpo.h>
  37 #include <vm/page.h>
  38 #include <vm/vm_dep.h>
  39 #include <vm/hat_sfmmu.h>
  40 #include <sys/promif.h>
  41 
  42 /*
  43  * MPO and the sun4v memory representation
  44  * ---------------------------------------
  45  *
  46  * Latency groups are defined in the sun4v architecture by memory-latency-group
  47  * nodes in the Machine Description, as specified in FWARC/2007/260.  These
  48  * tie together cpu nodes and mblock nodes, and contain mask and match
  49  * properties that identify the portion of an mblock that belongs to the
  50  * lgroup.  Mask and match are defined in the Physical Address (PA) space,
  51  * but an mblock defines Real Addresses (RA).  To translate, the mblock
  52  * includes the property address-congruence-offset, hereafter referred to as
  53  * ra_to_pa.  A real address ra is a member of an lgroup if
  54  *
  55  *      (ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
  56  *
  57  * The MD is traversed, and information on all mblocks is kept in the array
  58  * mpo_mblock[].  Information on all CPUs, including which lgroup they map
  59  * to, is kept in the array mpo_cpu[].
  60  *
  61  * This implementation makes (and verifies) the simplifying assumption that
  62  * the mask bits are the same for all defined lgroups, and that all 1 bits in
  63  * the mask are contiguous.  Thus the number of lgroups is bounded by the
  64  * number of possible mask values, and the lgrp_handle_t is defined as the
  65  * mask value, shifted right to eliminate the 0 bit positions in mask.  The
  66  * masks and values are also referred to as "home bits" in the code.
  67  *
  68  * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
  69  * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
  70  * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
  71  * home bits.  This yields the mem_node.
  72  *
  73  * Interfaces
  74  * ----------
  75  *
  76  * This file exports the following entry points:
  77  *
  78  * plat_lgrp_init()
  79  * plat_build_mem_nodes()
  80  * plat_lgrp_cpu_to_hand()
  81  * plat_lgrp_latency()
  82  * plat_pfn_to_mem_node()
  83  *      These implement the usual platform lgroup interfaces.
  84  *
  85  * plat_rapfn_to_papfn()
  86  *      Recover the PA page coloring bits from an RA.
  87  *
  88  * plat_mem_node_iterator_init()
  89  *      Initialize an iterator to efficiently step through pages in a mem_node.
  90  *
  91  * plat_mem_node_intersect_range()
  92  *      Find the intersection with a mem_node.
  93  *
  94  * plat_slice_add()
  95  * plat_slice_del()
  96  *      Platform hooks to add/delete a pfn range.
  97  *
  98  * Internal Organization
  99  * ---------------------
 100  *
 101  * A number of routines are used by both the boot and DR code to (re)build
 102  * the appropriate MPO structures.
 103  *
 104  * mblock_alloc()
 105  *      Allocate memory for mblocks and stripes as
 106  *      appropriate for boot or memory DR.
 107  *
 108  * mblock_free()
 109  *      Free memory allocated by mblock_alloc.
 110  *
 111  * mblock_update()
 112  *      Build mblocks based on mblock nodes read from the MD.
 113  *
 114  * mblock_update_add()
 115  *      Rebuild mblocks after a memory DR add operation.
 116  *
 117  * mblock_update_del()
 118  *      Rebuild mblocks after a memory DR delete operation.
 119  *
 120  * mblock_install()
 121  *      Install mblocks as the new configuration.
 122  *
 123  * mstripe_update()
 124  *      Build stripes based on mblocks.
 125  *
 126  * mnode_update()
 127  *      Call memnode layer to add/del a pfn range, based on stripes.
 128  *
 129  * The platform interfaces allocate all memory required for the
 130  * particular update first, block access to the MPO structures
 131  * while they are updated, and free old structures after the update.
 132  */
 133 
 134 int     sun4v_mpo_enable = 1;
 135 int     sun4v_mpo_debug = 0;
 136 char    sun4v_mpo_status[256] = "";
 137 
 138 /* Save CPU info from the MD and associate CPUs with lgroups */
 139 static  struct cpu_md mpo_cpu[NCPU];
 140 
 141 /* Save lgroup info from the MD */
 142 #define MAX_MD_LGROUPS 32
 143 static  struct  lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
 144 static  int     n_lgrpnodes = 0;
 145 static  int     n_locality_groups = 0;
 146 static  int     max_locality_groups = 0;
 147 static  int     szc_mask0 = 0;
 148 
 149 /* Save mblocks from the MD */
 150 #define SMALL_MBLOCKS_COUNT     8
 151 static  struct  mblock_md *mpo_mblock;
 152 static  struct  mblock_md small_mpo_mblocks[SMALL_MBLOCKS_COUNT];
 153 static  int     n_mblocks = 0;
 154 
 155 /* Save mem_node stripes calculate from mblocks and lgroups. */
 156 static mem_stripe_t *mem_stripes;
 157 static  mem_stripe_t small_mem_stripes[SMALL_MBLOCKS_COUNT * MAX_MEM_NODES];
 158 static  int     n_mem_stripes = 0;
 159 static  pfn_t   mnode_stride;   /* distance between stripes, start to start */
 160 static  int     stripe_shift;   /* stride/stripes expressed as a shift */
 161 static  pfn_t   mnode_pages;    /* mem_node stripe width */
 162 
 163 /* Save home mask and shift used to calculate lgrp_handle_t values */
 164 static  uint64_t home_mask = 0;
 165 static  pfn_t   home_mask_pfn = 0;
 166 static  int     home_mask_shift = 0;
 167 static  uint_t  home_mask_pfn_shift = 0;
 168 
 169 /* Save lowest and highest latencies found across all lgroups */
 170 static  int     lower_latency = 0;
 171 static  int     higher_latency = 0;
 172 
 173 static  pfn_t   base_ra_to_pa_pfn = 0;  /* ra_to_pa for single mblock memory */
 174 static  int     mpo_genid;              /* config gen; updated by mem DR */
 175 static  mpo_config_t mpo_config;        /* current mblocks and stripes */
 176 
 177 typedef enum { U_ADD, U_ADD_ALL, U_DEL } update_t;
 178 
 179 static  int     valid_pages(md_t *md, mde_cookie_t cpu0);
 180 static  int     unique_home_mem_lg_count(uint64_t mem_lg_homeset);
 181 static  int     fix_interleave(void);
 182 
 183 static int  mblock_alloc(mpo_config_t *, update_t, int nmblocks);
 184 static void mblock_install(mpo_config_t *);
 185 static void mblock_free(mpo_config_t *);
 186 static void mblock_update(mpo_config_t *, md_t, mde_cookie_t *mblocknodes);
 187 static void mblock_update_add(mpo_config_t *);
 188 static void mblock_update_del(mpo_config_t *, mpo_config_t *, pfn_t, pfn_t);
 189 static void mstripe_update(mpo_config_t *);
 190 static void mnode_update(mpo_config_t *, pfn_t, pfn_t, update_t);
 191 
 192 /* Debug support */
 193 #if defined(DEBUG) && !defined(lint)
 194 #define VALIDATE_SLICE(base, end) {                                     \
 195         ASSERT(IS_P2ALIGNED(ptob(base), TTEBYTES(TTE256M)));            \
 196         ASSERT(IS_P2ALIGNED(ptob(end - base + 1), TTEBYTES(TTE256M)));  \
 197 }
 198 #define MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args)
 199 #else
 200 #define VALIDATE_SLICE(base, end)
 201 #define MPO_DEBUG(...)
 202 #endif  /* DEBUG */
 203 
 204 /* Record status message, viewable from mdb */
 205 #define MPO_STATUS(args...) {                                                 \
 206         (void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args);   \
 207         MPO_DEBUG(sun4v_mpo_status);                                          \
 208 }
 209 
 210 /*
 211  * The MPO locks are to protect the MPO metadata while that
 212  * information is updated as a result of a memory DR operation.
 213  * The read lock must be acquired to read the metadata and the
 214  * write locks must be acquired to update it.
 215  */
 216 #define mpo_rd_lock     kpreempt_disable
 217 #define mpo_rd_unlock   kpreempt_enable
 218 
 219 static void
 220 mpo_wr_lock()
 221 {
 222         mutex_enter(&cpu_lock);
 223         pause_cpus(NULL, NULL);
 224         mutex_exit(&cpu_lock);
 225 }
 226 
 227 static void
 228 mpo_wr_unlock()
 229 {
 230         mutex_enter(&cpu_lock);
 231         start_cpus();
 232         mutex_exit(&cpu_lock);
 233 }
 234 
 235 /*
 236  * Routine to read a uint64_t from a given md
 237  */
 238 static  int64_t
 239 get_int(md_t md, mde_cookie_t node, char *propname, uint64_t *val)
 240 {
 241         int err = md_get_prop_val(md, node, propname, val);
 242         return (err);
 243 }
 244 
 245 static int
 246 mblock_cmp(const void *a, const void *b)
 247 {
 248         struct mblock_md *m1 = (struct mblock_md *)a;
 249         struct mblock_md *m2 = (struct mblock_md *)b;
 250 
 251         if (m1->base < m2->base)
 252                 return (-1);
 253         else if (m1->base == m2->base)
 254                 return (0);
 255         else
 256                 return (1);
 257 }
 258 
 259 static void
 260 mblock_sort(struct mblock_md *mblocks, int n)
 261 {
 262         extern void qsort(void *, size_t, size_t,
 263             int (*)(const void *, const void *));
 264 
 265         qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
 266 }
 267 
 268 static void
 269 mpo_update_tunables(void)
 270 {
 271         int i, ncpu_min;
 272 
 273         /*
 274          * lgrp_expand_proc_thresh is the minimum load on the lgroups
 275          * this process is currently running on before considering
 276          *  expanding threads to another lgroup.
 277          *
 278          * lgrp_expand_proc_diff determines how much less the remote lgroup
 279          *  must be loaded before expanding to it.
 280          *
 281          * On sun4v CMT processors, threads share a core pipeline, and
 282          * at less than 100% utilization, best throughput is obtained by
 283          * spreading threads across more cores, even if some are in a
 284          * different lgroup.  Spread threads to a new lgroup if the
 285          * current group is more than 50% loaded.  Because of virtualization,
 286          * lgroups may have different numbers of CPUs, but the tunables
 287          * apply to all lgroups, so find the smallest lgroup and compute
 288          * 50% loading.
 289          */
 290 
 291         ncpu_min = NCPU;
 292         for (i = 0; i < n_lgrpnodes; i++) {
 293                 int ncpu = mpo_lgroup[i].ncpu;
 294                 if (ncpu != 0 && ncpu < ncpu_min)
 295                         ncpu_min = ncpu;
 296         }
 297         lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;
 298 
 299         /* new home may only be half as loaded as the existing home to use it */
 300         lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;
 301 
 302         lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;
 303 }
 304 
 305 static mde_cookie_t
 306 cpuid_to_cpunode(md_t *md, int cpuid)
 307 {
 308         mde_cookie_t    rootnode, foundnode, *cpunodes;
 309         uint64_t        cpuid_prop;
 310         int     n_cpunodes, i;
 311 
 312         if (md == NULL)
 313                 return (MDE_INVAL_ELEM_COOKIE);
 314 
 315         rootnode = md_root_node(md);
 316         if (rootnode == MDE_INVAL_ELEM_COOKIE)
 317                 return (MDE_INVAL_ELEM_COOKIE);
 318 
 319         n_cpunodes = md_alloc_scan_dag(md, rootnode, PROP_LG_CPU,
 320             "fwd", &cpunodes);
 321         if (n_cpunodes <= 0 || n_cpunodes > NCPU)
 322                 goto cpuid_fail;
 323 
 324         for (i = 0; i < n_cpunodes; i++) {
 325                 if (md_get_prop_val(md, cpunodes[i], PROP_LG_CPU_ID,
 326                     &cpuid_prop))
 327                         break;
 328                 if (cpuid_prop == (uint64_t)cpuid) {
 329                         foundnode = cpunodes[i];
 330                         md_free_scan_dag(md, &cpunodes);
 331                         return (foundnode);
 332                 }
 333         }
 334 cpuid_fail:
 335         if (n_cpunodes > 0)
 336                 md_free_scan_dag(md, &cpunodes);
 337         return (MDE_INVAL_ELEM_COOKIE);
 338 }
 339 
 340 static int
 341 mpo_cpu_to_lgroup(md_t *md, mde_cookie_t cpunode)
 342 {
 343         mde_cookie_t *nodes;
 344         uint64_t latency, lowest_latency;
 345         uint64_t address_match, lowest_address_match;
 346         int n_lgroups, j, result = 0;
 347 
 348         /* Find lgroup nodes reachable from this cpu */
 349         n_lgroups = md_alloc_scan_dag(md, cpunode, PROP_LG_MEM_LG,
 350             "fwd", &nodes);
 351 
 352         lowest_latency = ~(0UL);
 353 
 354         /* Find the lgroup node with the smallest latency */
 355         for (j = 0; j < n_lgroups; j++) {
 356                 result = get_int(md, nodes[j], PROP_LG_LATENCY,
 357                     &latency);
 358                 result |= get_int(md, nodes[j], PROP_LG_MATCH,
 359                     &address_match);
 360                 if (result != 0) {
 361                         j = -1;
 362                         goto to_lgrp_done;
 363                 }
 364                 if (latency < lowest_latency) {
 365                         lowest_latency = latency;
 366                         lowest_address_match = address_match;
 367                 }
 368         }
 369         for (j = 0; j < n_lgrpnodes; j++) {
 370                 if ((mpo_lgroup[j].latency == lowest_latency) &&
 371                     (mpo_lgroup[j].addr_match == lowest_address_match))
 372                         break;
 373         }
 374         if (j == n_lgrpnodes)
 375                 j = -1;
 376 
 377 to_lgrp_done:
 378         if (n_lgroups > 0)
 379                 md_free_scan_dag(md, &nodes);
 380         return (j);
 381 }
 382 
 383 /* Called when DR'ing in a CPU */
 384 void
 385 mpo_cpu_add(md_t *md, int cpuid)
 386 {
 387         mde_cookie_t cpunode;
 388 
 389         int i;
 390 
 391         if (n_lgrpnodes <= 0)
 392                 return;
 393 
 394         if (md == NULL)
 395                 goto add_fail;
 396 
 397         cpunode = cpuid_to_cpunode(md, cpuid);
 398         if (cpunode == MDE_INVAL_ELEM_COOKIE)
 399                 goto add_fail;
 400 
 401         i = mpo_cpu_to_lgroup(md, cpunode);
 402         if (i == -1)
 403                 goto add_fail;
 404 
 405         mpo_cpu[cpuid].lgrp_index = i;
 406         mpo_cpu[cpuid].home = mpo_lgroup[i].addr_match >> home_mask_shift;
 407         mpo_lgroup[i].ncpu++;
 408         mpo_update_tunables();
 409         return;
 410 add_fail:
 411         panic("mpo_cpu_add: Cannot read MD");
 412 }
 413 
 414 /* Called when DR'ing out a CPU */
 415 void
 416 mpo_cpu_remove(int cpuid)
 417 {
 418         int i;
 419 
 420         if (n_lgrpnodes <= 0)
 421                 return;
 422 
 423         i = mpo_cpu[cpuid].lgrp_index;
 424         mpo_lgroup[i].ncpu--;
 425         mpo_cpu[cpuid].home = 0;
 426         mpo_cpu[cpuid].lgrp_index = -1;
 427         mpo_update_tunables();
 428 }
 429 
 430 static mde_cookie_t
 431 md_get_root(md_t *md)
 432 {
 433         mde_cookie_t root = MDE_INVAL_ELEM_COOKIE;
 434         int n_nodes;
 435 
 436         n_nodes = md_node_count(md);
 437 
 438         if (n_nodes <= 0) {
 439                 MPO_STATUS("md_get_root: No nodes in node count\n");
 440                 return (root);
 441         }
 442 
 443         root = md_root_node(md);
 444 
 445         if (root == MDE_INVAL_ELEM_COOKIE) {
 446                 MPO_STATUS("md_get_root: Root node is missing\n");
 447                 return (root);
 448         }
 449 
 450         MPO_DEBUG("md_get_root: Node Count: %d\n", n_nodes);
 451         MPO_DEBUG("md_get_root: md: %p\n", md);
 452         MPO_DEBUG("md_get_root: root: %lx\n", root);
 453 done:
 454         return (root);
 455 }
 456 
 457 static int
 458 lgrp_update(md_t *md, mde_cookie_t root)
 459 {
 460         int i, j, result;
 461         int ret_val = 0;
 462         int sub_page_fix;
 463         mde_cookie_t *nodes, *lgrpnodes;
 464 
 465         n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
 466             "fwd", &lgrpnodes);
 467 
 468         if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
 469                 MPO_STATUS("lgrp_update: No Lgroups\n");
 470                 ret_val = -1;
 471                 goto fail;
 472         }
 473 
 474         MPO_DEBUG("lgrp_update: mem_lgs: %d\n", n_lgrpnodes);
 475 
 476         for (i = 0; i < n_lgrpnodes; i++) {
 477                 mpo_lgroup[i].node = lgrpnodes[i];
 478                 mpo_lgroup[i].id = i;
 479                 mpo_lgroup[i].ncpu = 0;
 480                 result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
 481                     &mpo_lgroup[i].addr_mask);
 482                 result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
 483                     &mpo_lgroup[i].addr_match);
 484 
 485                 /*
 486                  * If either the mask or match properties are missing, set to 0
 487                  */
 488                 if (result < 0) {
 489                         mpo_lgroup[i].addr_mask = 0;
 490                         mpo_lgroup[i].addr_match = 0;
 491                 }
 492 
 493                 /* Set latency to 0 if property not present */
 494 
 495                 result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
 496                     &mpo_lgroup[i].latency);
 497                 if (result < 0)
 498                         mpo_lgroup[i].latency = 0;
 499         }
 500 
 501         /*
 502          * Sub-page level interleave is not yet supported.  Check for it,
 503          * and remove sub-page interleaved lgroups from mpo_lgroup and
 504          * n_lgrpnodes.  If no lgroups are left, return.
 505          */
 506 
 507         sub_page_fix = fix_interleave();
 508         if (n_lgrpnodes == 0) {
 509                 ret_val = -1;
 510                 goto fail;
 511         }
 512 
 513         /* Ensure that all of the addr_mask values are the same */
 514 
 515         for (i = 0; i < n_lgrpnodes; i++) {
 516                 if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
 517                         MPO_STATUS("lgrp_update: "
 518                             "addr_mask values are not the same\n");
 519                         ret_val = -1;
 520                         goto fail;
 521                 }
 522         }
 523 
 524         /*
 525          * Ensure that all lgrp nodes see all the mblocks. However, if
 526          * sub-page interleave is being fixed, they do not, so skip
 527          * the check.
 528          */
 529 
 530         if (sub_page_fix == 0) {
 531                 for (i = 0; i < n_lgrpnodes; i++) {
 532                         j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
 533                             PROP_LG_MBLOCK, "fwd", &nodes);
 534                         md_free_scan_dag(md, &nodes);
 535                         if (j != n_mblocks) {
 536                                 MPO_STATUS("lgrp_update: "
 537                                     "sub-page interleave is being fixed\n");
 538                                 ret_val = -1;
 539                                 goto fail;
 540                         }
 541                 }
 542         }
 543 fail:
 544         if (n_lgrpnodes > 0) {
 545                 md_free_scan_dag(md, &lgrpnodes);
 546                 for (i = 0; i < n_lgrpnodes; i++)
 547                         mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;
 548         }
 549 
 550         return (ret_val);
 551 }
 552 
 553 /*
 554  *
 555  * Traverse the MD to determine:
 556  *
 557  *  Number of CPU nodes, lgrp_nodes, and mblocks
 558  *  Then for each lgrp_node, obtain the appropriate data.
 559  *  For each CPU, determine its home locality and store it.
 560  *  For each mblock, retrieve its data and store it.
 561  */
 562 static  int
 563 lgrp_traverse(md_t *md)
 564 {
 565         mde_cookie_t root, *cpunodes, *mblocknodes;
 566         int o;
 567         uint64_t i, k, stripe, stride;
 568         uint64_t mem_lg_homeset = 0;
 569         int ret_val = 0;
 570         int result = 0;
 571         int n_cpunodes = 0;
 572         mpo_config_t new_config;
 573 
 574         if ((root = md_get_root(md)) == MDE_INVAL_ELEM_COOKIE) {
 575                 ret_val = -1;
 576                 goto fail;
 577         }
 578 
 579         n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK, "fwd",
 580             &mblocknodes);
 581         if (n_mblocks <= 0) {
 582                 MPO_STATUS("lgrp_traverse: No mblock nodes detected in Machine "
 583                     "Descriptor\n");
 584                 ret_val = -1;
 585                 goto fail;
 586         }
 587 
 588         /*
 589          * Build the Memory Nodes.  Do this before any possibility of
 590          * bailing from this routine so we obtain ra_to_pa (needed for page
 591          * coloring) even when there are no lgroups defined.
 592          */
 593         if (mblock_alloc(&new_config, U_ADD_ALL, n_mblocks) < 0) {
 594                 ret_val = -1;
 595                 goto fail;
 596         }
 597 
 598         mblock_update(&new_config, md, mblocknodes);
 599         mblock_install(&new_config);
 600 
 601         /* Page coloring hook is required so we can iterate through mnodes */
 602         if (&page_next_pfn_for_color_cpu == NULL) {
 603                 MPO_STATUS("lgrp_traverse: No page coloring support\n");
 604                 ret_val = -1;
 605                 goto fail;
 606         }
 607 
 608         /* Global enable for mpo */
 609         if (sun4v_mpo_enable == 0) {
 610                 MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
 611                 ret_val = -1;
 612                 goto fail;
 613         }
 614 
 615         n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);
 616 
 617         if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
 618                 MPO_STATUS("lgrp_traverse: No CPU nodes detected "
 619                     "in MD\n");
 620                 ret_val = -1;
 621                 goto fail;
 622         }
 623 
 624         MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);
 625 
 626         if ((ret_val = lgrp_update(md, root)) == -1)
 627                 goto fail;
 628 
 629         /*
 630          * Use the address mask from the first lgroup node
 631          * to establish our home_mask.
 632          */
 633         home_mask = mpo_lgroup[0].addr_mask;
 634         home_mask_pfn = btop(home_mask);
 635         home_mask_shift = lowbit(home_mask) - 1;
 636         home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
 637         mnode_pages = btop(1ULL << home_mask_shift);
 638 
 639         /*
 640          * How many values are possible in home mask?  Assume the mask
 641          * bits are contiguous.
 642          */
 643         max_locality_groups =
 644             1 << highbit(home_mask_pfn >> home_mask_pfn_shift);
 645 
 646         stripe_shift = highbit(max_locality_groups) - 1;
 647         stripe = ptob(mnode_pages);
 648         stride = max_locality_groups * stripe;
 649         mnode_stride = btop(stride);
 650 
 651         /* Now verify the home mask bits are contiguous */
 652 
 653         if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
 654                 MPO_STATUS("lgrp_traverse: "
 655                     "home mask bits are not contiguous\n");
 656                 ret_val = -1;
 657                 goto fail;
 658         }
 659 
 660         /* Record all of the home bits */
 661 
 662         for (i = 0; i < n_lgrpnodes; i++) {
 663                 HOMESET_ADD(mem_lg_homeset,
 664                     mpo_lgroup[i].addr_match >> home_mask_shift);
 665         }
 666 
 667         /* Count the number different "home"  mem_lg's we've discovered */
 668 
 669         n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);
 670 
 671         /* If we have only 1 locality group then we can exit */
 672         if (n_locality_groups == 1) {
 673                 MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
 674                 ret_val = -1;
 675                 goto fail;
 676         }
 677 
 678         /*
 679          * Set the latencies.  A CPU's lgroup is defined by the lowest
 680          * latency found.  All other memory is considered remote, and the
 681          * remote latency is represented by the highest latency found.
 682          * Thus hierarchical lgroups, if any, are approximated by a
 683          * two level scheme.
 684          *
 685          * The Solaris MPO framework by convention wants to see latencies
 686          * in units of nano-sec/10. In the MD, the units are defined to be
 687          * pico-seconds.
 688          */
 689 
 690         lower_latency = mpo_lgroup[0].latency;
 691         higher_latency = mpo_lgroup[0].latency;
 692 
 693         for (i = 1; i < n_lgrpnodes; i++) {
 694                 if (mpo_lgroup[i].latency < lower_latency) {
 695                         lower_latency = mpo_lgroup[i].latency;
 696                 }
 697                 if (mpo_lgroup[i].latency > higher_latency) {
 698                         higher_latency = mpo_lgroup[i].latency;
 699                 }
 700         }
 701         lower_latency /= 10000;
 702         higher_latency /= 10000;
 703 
 704         /* Clear our CPU data */
 705 
 706         for (i = 0; i < NCPU; i++) {
 707                 mpo_cpu[i].home = 0;
 708                 mpo_cpu[i].lgrp_index = -1;
 709         }
 710 
 711         /* Build the CPU nodes */
 712         for (i = 0; i < n_cpunodes; i++) {
 713 
 714                 /* Read in the lgroup nodes */
 715                 result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
 716                 if (result < 0) {
 717                         MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
 718                         ret_val = -1;
 719                         goto fail;
 720                 }
 721 
 722                 o = mpo_cpu_to_lgroup(md, cpunodes[i]);
 723                 if (o == -1) {
 724                         ret_val = -1;
 725                         goto fail;
 726                 }
 727                 mpo_cpu[k].lgrp_index = o;
 728                 mpo_cpu[k].home = mpo_lgroup[o].addr_match >> home_mask_shift;
 729                 mpo_lgroup[o].ncpu++;
 730         }
 731         /* Validate that no large pages cross mnode boundaries. */
 732         if (valid_pages(md, cpunodes[0]) == 0) {
 733                 ret_val = -1;
 734                 goto fail;
 735         }
 736 
 737 fail:
 738         if (n_cpunodes > 0)
 739                 md_free_scan_dag(md, &cpunodes);
 740         if (n_mblocks > 0)
 741                 md_free_scan_dag(md, &mblocknodes);
 742         else
 743                 panic("lgrp_traverse: No memory blocks found");
 744 
 745         if (ret_val == 0) {
 746                 MPO_STATUS("MPO feature is enabled.\n");
 747         } else
 748                 sun4v_mpo_enable = 0;   /* set this for DR */
 749 
 750         return (ret_val);
 751 }
 752 
 753 /*
 754  *  Determine the number of unique mem_lg's present in our system
 755  */
 756 static  int
 757 unique_home_mem_lg_count(uint64_t mem_lg_homeset)
 758 {
 759         int homeid;
 760         int count = 0;
 761 
 762         /*
 763          * Scan the "home" bits of the mem_lgs, count
 764          * the number that are unique.
 765          */
 766 
 767         for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
 768                 if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
 769                         count++;
 770                 }
 771         }
 772 
 773         MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
 774             mem_lg_homeset);
 775         MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);
 776 
 777         /* Default must be at least one */
 778         if (count == 0)
 779                 count = 1;
 780 
 781         return (count);
 782 }
 783 
 784 /*
 785  * Platform specific lgroup initialization
 786  */
 787 void
 788 plat_lgrp_init(void)
 789 {
 790         md_t *md;
 791         int rc;
 792 
 793         /* Get the Machine Descriptor handle */
 794 
 795         md = md_get_handle();
 796 
 797         /* If not, we cannot continue */
 798 
 799         if (md == NULL) {
 800                 panic("cannot access machine descriptor\n");
 801         } else {
 802                 rc = lgrp_traverse(md);
 803                 (void) md_fini_handle(md);
 804         }
 805 
 806         /*
 807          * If we can't process the MD for lgroups then at least let the
 808          * system try to boot.  Assume we have one lgroup so that
 809          * when plat_build_mem_nodes is called, it will attempt to init
 810          * an mnode based on the supplied memory segment.
 811          */
 812 
 813         if (rc == -1) {
 814                 home_mask_pfn = 0;
 815                 max_locality_groups = 1;
 816                 n_locality_groups = 1;
 817                 return;
 818         }
 819 
 820         mem_node_pfn_shift = 0;
 821         mem_node_physalign = 0;
 822 
 823         /* Use lgroup-aware TSB allocations */
 824         tsb_lgrp_affinity = 1;
 825 
 826         /* Require that a home lgroup have some memory to be chosen */
 827         lgrp_mem_free_thresh = 1;
 828 
 829         /* Standard home-on-next-touch policy */
 830         lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;
 831 
 832         /* Disable option to choose root lgroup if all leaf lgroups are busy */
 833         lgrp_load_thresh = UINT32_MAX;
 834 
 835         mpo_update_tunables();
 836 }
 837 
 838 /*
 839  *  Helper routine for debugging calls to mem_node_add_slice()
 840  */
 841 static  void
 842 mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
 843 {
 844 #if defined(DEBUG) && !defined(lint)
 845         static int slice_count = 0;
 846 
 847         slice_count++;
 848         MPO_DEBUG("mem_add_slice(%d): basepfn: %lx  endpfn: %lx\n",
 849             slice_count, basepfn, endpfn);
 850 #endif
 851         mem_node_add_slice(basepfn, endpfn);
 852 }
 853 
 854 static  void
 855 mpo_mem_node_del_slice(pfn_t basepfn, pfn_t endpfn)
 856 {
 857 #if defined(DEBUG) && !defined(lint)
 858         static int slice_count = 0;
 859 
 860         slice_count++;
 861         MPO_DEBUG("mem_del_slice(%d): basepfn: %lx  endpfn: %lx\n",
 862             slice_count, basepfn, endpfn);
 863 #endif
 864         mem_node_del_slice(basepfn, endpfn);
 865 }
 866 
 867 /*
 868  *  Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
 869  */
 870 static  void
 871 mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
 872 {
 873         MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld, "
 874             "mnode index: %d\n", plathand, mnode);
 875         plat_assign_lgrphand_to_mem_node(plathand, mnode);
 876 }
 877 
 878 /*
 879  * plat_build_mem_nodes()
 880  *
 881  * Define the mem_nodes based on the modified boot memory list,
 882  * or based on info read from the MD in plat_lgrp_init().
 883  *
 884  * When the home mask lies in the middle of the address bits (as it does on
 885  * Victoria Falls), then the memory in one mem_node is no longer contiguous;
 886  * it is striped across an mblock in a repeating pattern of contiguous memory
 887  * followed by a gap.  The stripe width is the size of the contiguous piece.
 888  * The stride is the distance from the start of one contiguous piece to the
 889  * start of the next.  The gap is thus stride - stripe_width.
 890  *
 891  * The stripe of an mnode that falls within an mblock is described by the type
 892  * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock.  The
 893  * mem_stripe_t's are kept in a global array mem_stripes[].  The index into
 894  * this array is predetermined.  The mem_stripe_t that describes mnode m
 895  * within mpo_mblock[i] is stored at
 896  *       mem_stripes[ m + i * max_locality_groups ]
 897  *
 898  * max_locality_groups is the total number of possible locality groups,
 899  * as defined by the size of the home mask, even if the memory assigned
 900  * to the domain is small and does not cover all the lgroups.  Thus some
 901  * mem_stripe_t's may be empty.
 902  *
 903  * The members of mem_stripe_t are:
 904  *      physbase: First valid page in mem_node in the corresponding mblock
 905  *      physmax: Last valid page in mem_node in mblock
 906  *      offset:  The full stripe width starts at physbase - offset.
 907  *          Thus if offset is non-zero, this mem_node starts in the middle
 908  *          of a stripe width, and the second full stripe starts at
 909  *          physbase - offset + stride.  (even though physmax may fall in the
 910  *          middle of a stripe width, we do not save the ending fragment size
 911  *          in this data structure.)
 912  *      exists: Set to 1 if the mblock has memory in this mem_node stripe.
 913  *
 914  *      The stripe width is kept in the global mnode_pages.
 915  *      The stride is kept in the global mnode_stride.
 916  *      All the above use pfn's as the unit.
 917  *
 918  * As an example, the memory layout for a domain with 2 mblocks and 4
 919  * mem_nodes 0,1,2,3 could look like this:
 920  *
 921  *      123012301230 ...        012301230123 ...
 922  *        mblock 0                mblock 1
 923  */
 924 
 925 /*ARGSUSED*/
 926 void
 927 plat_build_mem_nodes(prom_memlist_t *list, size_t nelems)
 928 {
 929         int elem;
 930         uint64_t base, len;
 931 
 932         /* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
 933         max_mem_nodes = max_locality_groups;
 934 
 935         mstripe_update(&mpo_config);
 936 
 937         /* Check for non-MPO sun4v platforms */
 938         if (n_locality_groups <= 1) {
 939                 mpo_plat_assign_lgrphand_to_mem_node(LGRP_DEFAULT_HANDLE, 0);
 940                 for (elem = 0; elem < nelems; list++, elem++) {
 941                         base = list->addr;
 942                         len = list->size;
 943 
 944                         mpo_mem_node_add_slice(btop(base),
 945                             btop(base + len - 1));
 946                 }
 947                 mem_node_pfn_shift = 0;
 948                 mem_node_physalign = 0;
 949         } else
 950                 mnode_update(&mpo_config, 0, 0, U_ADD_ALL);
 951 
 952         /*
 953          * Indicate to vm_pagelist that the hpm_counters array
 954          * should be shared because the ranges overlap.
 955          */
 956         if (max_mem_nodes > 1) {
 957                 interleaved_mnodes = 1;
 958         }
 959 }
 960 
 961 /*
 962  * Return the locality group value for the supplied processor
 963  */
 964 lgrp_handle_t
 965 plat_lgrp_cpu_to_hand(processorid_t id)
 966 {
 967         lgrp_handle_t lgrphand;
 968 
 969         mpo_rd_lock();
 970         if (n_locality_groups > 1) {
 971                 lgrphand = (lgrp_handle_t)mpo_cpu[(int)id].home;
 972         } else {
 973                 lgrphand = (lgrp_handle_t)LGRP_DEFAULT_HANDLE; /* Default */
 974         }
 975         mpo_rd_unlock();
 976 
 977         return (lgrphand);
 978 }
 979 
 980 int
 981 plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
 982 {
 983         /*
 984          * Return min remote latency when there are more than two lgroups
 985          * (root and child) and getting latency between two different lgroups
 986          * or root is involved.
 987          */
 988         if (lgrp_optimizations() && (from != to ||
 989             from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
 990                 return ((int)higher_latency);
 991         } else {
 992                 return ((int)lower_latency);
 993         }
 994 }
 995 
 996 int
 997 plat_pfn_to_mem_node(pfn_t pfn)
 998 {
 999         int i, mnode;
1000         pfn_t ra_to_pa_pfn;
1001         struct mblock_md *mb;
1002 
1003         if (n_locality_groups <= 1)
1004                 return (0);
1005 
1006         /*
1007          * The mnode is defined to be 1:1 with the lgroup handle, which
1008          * is taken from from the home bits.  Find the mblock in which
1009          * the pfn falls to get the ra_to_pa adjustment, and extract
1010          * the home bits.
1011          */
1012         mpo_rd_lock();
1013         mb = &mpo_mblock[0];
1014         for (i = 0; i < n_mblocks; i++) {
1015                 if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
1016                         ra_to_pa_pfn = btop(mb->ra_to_pa);
1017                         mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
1018                             home_mask_pfn_shift);
1019                         ASSERT(mnode < max_mem_nodes);
1020                         mpo_rd_unlock();
1021                         return (mnode);
1022                 }
1023                 mb++;
1024         }
1025 
1026         panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
1027         return (pfn);
1028 }
1029 
1030 /*
1031  * plat_rapfn_to_papfn
1032  *
1033  * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
1034  * and home mask bits are correct.  The upper bits do not necessarily
1035  * match the actual PA, however.
1036  */
1037 pfn_t
1038 plat_rapfn_to_papfn(pfn_t pfn)
1039 {
1040         int i;
1041         pfn_t ra_to_pa_pfn;
1042         struct mblock_md *mb;
1043 
1044         ASSERT(n_mblocks > 0);
1045         if (n_mblocks == 1)
1046                 return (pfn + base_ra_to_pa_pfn);
1047 
1048         /*
1049          * Find the mblock in which the pfn falls
1050          * in order to get the ra_to_pa adjustment.
1051          */
1052         mpo_rd_lock();
1053         for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
1054                 if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
1055                         ra_to_pa_pfn = btop(mb->ra_to_pa);
1056                         mpo_rd_unlock();
1057                         return (pfn + ra_to_pa_pfn);
1058                 }
1059         }
1060 
1061         panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
1062         return (pfn);
1063 }
1064 
1065 /*
1066  * plat_mem_node_iterator_init()
1067  *      Initialize cookie "it" to iterate over pfn's in an mnode.  There is
1068  *      no additional iterator function.  The caller uses the info from
1069  *      the iterator structure directly.
1070  *
1071  *      pfn: starting pfn.
1072  *      mnode: desired mnode.
1073  *      szc: desired page size.
1074  *      init:
1075  *          if 1, start a new traversal, initialize "it", find first
1076  *              mblock containing pfn, and return its starting pfn
1077  *              within the mnode.
1078  *          if 0, continue the previous traversal using passed-in data
1079  *              from "it", advance to the next mblock, and return its
1080  *              starting pfn within the mnode.
1081  *      it: returns readonly data to the caller; see below.
1082  *
1083  *      The input pfn must be aligned for the page size szc.
1084  *
1085  *      Returns: starting pfn for the iteration for the mnode/mblock,
1086  *          which is aligned according to the page size,
1087  *          or returns (pfn_t)(-1) if the input pfn lies past the last
1088  *          valid pfn of the mnode.
1089  *      Returns misc values in the "it" struct that allows the caller
1090  *          to advance the pfn within an mblock using address arithmetic;
1091  *          see definition of mem_node_iterator_t in vm_dep.h.
1092  *          When the caller calculates a pfn that is greater than the
1093  *          returned value it->mi_mblock_end, the caller should again
1094  *          call plat_mem_node_iterator_init, passing init=0.
1095  *
1096  *          The last mblock in continuation case may be invalid because
1097  *          of memory DR.  To detect this situation mi_genid is checked
1098  *          against mpo_genid which is incremented after a memory DR
1099  *          operation.  See also plat_slice_add()/plat_slice_del().
1100  */
1101 pfn_t
1102 plat_mem_node_iterator_init(pfn_t pfn, int mnode, uchar_t szc,
1103     mem_node_iterator_t *it, int init)
1104 {
1105         int i;
1106         pgcnt_t szcpgcnt = PNUM_SIZE(szc);
1107         struct mblock_md *mblock;
1108         pfn_t base, end;
1109         mem_stripe_t *ms;
1110         uint64_t szcpagesize;
1111 
1112         ASSERT(it != NULL);
1113         ASSERT(mnode >= 0 && mnode < max_mem_nodes);
1114         ASSERT(n_mblocks > 0);
1115         ASSERT(P2PHASE(pfn, szcpgcnt) == 0);
1116 
1117         mpo_rd_lock();
1118 
1119         if (init || (it->mi_genid != mpo_genid)) {
1120                 it->mi_genid = mpo_genid;
1121                 it->mi_last_mblock = 0;
1122                 it->mi_init = 1;
1123         }
1124 
1125         /* Check if mpo is not enabled and we only have one mblock */
1126         if (n_locality_groups == 1 && n_mblocks == 1) {
1127                 if (P2PHASE(base_ra_to_pa_pfn, szcpgcnt)) {
1128                         pfn = (pfn_t)-1;
1129                         goto done;
1130                 }
1131                 it->mi_mnode = mnode;
1132                 it->mi_ra_to_pa = base_ra_to_pa_pfn;
1133                 it->mi_mnode_pfn_mask = 0;
1134                 it->mi_mnode_pfn_shift = 0;
1135                 it->mi_mnode_mask = 0;
1136                 it->mi_mblock_base = mem_node_config[mnode].physbase;
1137                 it->mi_mblock_end = mem_node_config[mnode].physmax;
1138                 if (pfn < it->mi_mblock_base)
1139                         pfn = P2ROUNDUP(it->mi_mblock_base, szcpgcnt);
1140                 if ((pfn + szcpgcnt - 1) > it->mi_mblock_end)
1141                         pfn = (pfn_t)-1;
1142                 goto done;
1143         }
1144 
1145         /* init=1 means begin iterator, init=0 means continue */
1146         if (init == 1) {
1147                 i = 0;
1148         } else {
1149                 ASSERT(it->mi_last_mblock < n_mblocks);
1150                 i = it->mi_last_mblock;
1151                 ASSERT(pfn >
1152                     mem_stripes[i * max_locality_groups + mnode].physmax);
1153                 if (++i == n_mblocks) {
1154                         pfn = (pfn_t)-1;
1155                         goto done;
1156                 }
1157         }
1158 
1159         /*
1160          * Find mblock that contains pfn for mnode's stripe, or first such an
1161          * mblock after pfn, else pfn is out of bound and we'll return -1.
1162          * mblocks and stripes are sorted in ascending address order.
1163          */
1164         szcpagesize = szcpgcnt << PAGESHIFT;
1165         for (; i < n_mblocks; i++) {
1166                 if (P2PHASE(mpo_mblock[i].ra_to_pa, szcpagesize))
1167                         continue;
1168                 ms = &mem_stripes[i * max_locality_groups + mnode];
1169                 if (ms->exists && (pfn + szcpgcnt - 1) <= ms->physmax &&
1170                     (P2ROUNDUP(ms->physbase, szcpgcnt) + szcpgcnt - 1) <=
1171                     ms->physmax)
1172                         break;
1173         }
1174         if (i == n_mblocks) {
1175                 it->mi_last_mblock = i - 1;
1176                 pfn = (pfn_t)-1;
1177                 goto done;
1178         }
1179 
1180         it->mi_last_mblock = i;
1181 
1182         mblock = &mpo_mblock[i];
1183         base = ms->physbase;
1184         end = ms->physmax;
1185 
1186         it->mi_mnode = mnode;
1187         it->mi_ra_to_pa = btop(mblock->ra_to_pa);
1188         it->mi_mblock_base = base;
1189         it->mi_mblock_end = end;
1190         it->mi_mnode_pfn_mask = home_mask_pfn;       /* is 0 for non-MPO case */
1191         it->mi_mnode_pfn_shift = home_mask_pfn_shift;
1192         it->mi_mnode_mask = max_locality_groups - 1;
1193         if (pfn < base) {
1194                 pfn = P2ROUNDUP(base, szcpgcnt);
1195                 ASSERT(pfn + szcpgcnt - 1 <= end);
1196         }
1197         ASSERT((pfn + szcpgcnt - 1) <= mpo_mblock[i].end_pfn);
1198 done:
1199         mpo_rd_unlock();
1200         return (pfn);
1201 }
1202 
1203 /*
1204  * plat_mem_node_intersect_range()
1205  *
1206  * Find the intersection between a memnode and a range of pfn's.
1207  */
1208 void
1209 plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
1210     int mnode, pgcnt_t *npages_out)
1211 {
1212         pfn_t offset, len, hole, base, end, test_end, frag;
1213         pfn_t nearest;
1214         mem_stripe_t *ms;
1215         int i, npages;
1216 
1217         *npages_out = 0;
1218 
1219         if (!mem_node_config[mnode].exists || test_len == 0)
1220                 return;
1221 
1222         base = mem_node_config[mnode].physbase;
1223         end = mem_node_config[mnode].physmax;
1224 
1225         test_end = test_base + test_len - 1;
1226         if (end < test_base || base > test_end)
1227                 return;
1228 
1229         if (n_locality_groups == 1) {
1230                 *npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
1231                 return;
1232         }
1233 
1234         hole = mnode_stride - mnode_pages;
1235         npages = 0;
1236 
1237         /*
1238          * Iterate over all the stripes for this mnode (one per mblock),
1239          * find the intersection with each, and accumulate the intersections.
1240          *
1241          * Determing the intersection with a stripe is tricky.  If base or end
1242          * fall outside the mem_node bounds, round them to physbase/physmax of
1243          * mem_node.  If base or end fall in a gap, round them to start of
1244          * nearest stripe.  If they fall within a stripe, keep base or end,
1245          * but calculate the fragment size that should be excluded from the
1246          * stripe.  Calculate how many strides fall in the adjusted range,
1247          * multiply by stripe width, and add the start and end fragments.
1248          */
1249 
1250         mpo_rd_lock();
1251         for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
1252                 ms = &mem_stripes[i];
1253                 if (ms->exists &&
1254                     test_base <= (end = ms->physmax) &&
1255                     test_end >= (base = ms->physbase)) {
1256 
1257                         offset = ms->offset;
1258 
1259                         if (test_base > base) {
1260                                 /* Round test_base to next multiple of stride */
1261                                 len = P2ROUNDUP(test_base - (base - offset),
1262                                     mnode_stride);
1263                                 nearest = base - offset + len;
1264                                 /*
1265                                  * Compute distance from test_base to the
1266                                  * stride boundary to see if test_base falls
1267                                  * in the stripe or in the hole.
1268                                  */
1269                                 if (nearest - test_base > hole) {
1270                                         /*
1271                                          * test_base lies in stripe,
1272                                          * and offset should be excluded.
1273                                          */
1274                                         offset = test_base -
1275                                             (nearest - mnode_stride);
1276                                         base = test_base;
1277                                 } else {
1278                                         /* round up to next stripe start */
1279                                         offset = 0;
1280                                         base = nearest;
1281                                         if (base > end)
1282                                                 continue;
1283                                 }
1284 
1285                         }
1286 
1287                         if (test_end < end)
1288                                 end = test_end;
1289                         end++;          /* adjust to an exclusive bound */
1290 
1291                         /* Round end to next multiple of stride */
1292                         len = P2ROUNDUP(end - (base - offset), mnode_stride);
1293                         nearest = (base - offset) + len;
1294                         if (nearest - end <= hole) {
1295                                 /* end falls in hole, use entire last stripe */
1296                                 frag = 0;
1297                         } else {
1298                                 /* end falls in stripe, compute fragment */
1299                                 frag = nearest - hole - end;
1300                         }
1301 
1302                         len = (len >> stripe_shift) - offset - frag;
1303                         npages += len;
1304                 }
1305         }
1306 
1307         *npages_out = npages;
1308         mpo_rd_unlock();
1309 }
1310 
1311 /*
1312  * valid_pages()
1313  *
1314  * Return 1 if pages are valid and do not cross mnode boundaries
1315  * (which would break page free list assumptions), and 0 otherwise.
1316  */
1317 
1318 #define MNODE(pa)       \
1319         ((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)
1320 
1321 static int
1322 valid_pages(md_t *md, mde_cookie_t cpu0)
1323 {
1324         int i, max_szc;
1325         uint64_t last_page_base, szc_mask;
1326         uint64_t max_page_len, max_coalesce_len;
1327         struct mblock_md *mb = mpo_mblock;
1328 
1329         /*
1330          * Find the smaller of the largest page possible and supported.
1331          * mmu_exported_pagesize_mask is not yet initialized, so read
1332          * it from the MD.  Apply minimal fixups in case of broken MDs
1333          * to get a sane mask.
1334          */
1335 
1336         if (cpu0 == NULL)
1337                 szc_mask = szc_mask0;
1338         else {
1339                 if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
1340                         szc_mask = 0;
1341                 /* largest in sun4v default support */
1342                 szc_mask |=  (1 << TTE4M);
1343                 szc_mask0 = szc_mask;
1344         }
1345         max_szc = highbit(szc_mask) - 1;
1346         if (max_szc > TTE256M)
1347                 max_szc = TTE256M;
1348         max_page_len = TTEBYTES(max_szc);
1349 
1350         /*
1351          * Page coalescing code coalesces all sizes up to 256M on sun4v, even
1352          * if mmu-page-size-list does not contain it, so 256M pages must fall
1353          * within one mnode to use MPO.
1354          */
1355         max_coalesce_len = TTEBYTES(TTE256M);
1356         ASSERT(max_coalesce_len >= max_page_len);
1357 
1358         if (ptob(mnode_pages) < max_coalesce_len) {
1359                 MPO_STATUS("Page too large; MPO disabled: page = %lx, "
1360                     "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
1361                 return (0);
1362         }
1363 
1364         for (i = 0; i < n_mblocks; i++) {
1365                 uint64_t base = mb->base;
1366                 uint64_t end = mb->base + mb->size - 1;
1367                 uint64_t ra_to_pa = mb->ra_to_pa;
1368 
1369                 /*
1370                  * If mblock is smaller than the max page size, then
1371                  * RA = PA mod MAXPAGE is not guaranteed, but it must
1372                  * not span mnodes.
1373                  */
1374                 if (mb->size < max_page_len) {
1375                         if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
1376                                 MPO_STATUS("Small mblock spans mnodes; "
1377                                     "MPO disabled: base = %lx, end = %lx, "
1378                                     "ra2pa = %lx\n", base, end, ra_to_pa);
1379                                 return (0);
1380                         }
1381                 } else {
1382                         /* Verify RA = PA mod MAXPAGE, using coalesce size */
1383                         uint64_t pa_base = base + ra_to_pa;
1384                         if ((base & (max_coalesce_len - 1)) !=
1385                             (pa_base & (max_coalesce_len - 1))) {
1386                                 MPO_STATUS("bad page alignment; MPO disabled: "
1387                                     "ra = %lx, pa = %lx, pagelen = %lx\n",
1388                                     base, pa_base, max_coalesce_len);
1389                                 return (0);
1390                         }
1391                 }
1392 
1393                 /*
1394                  * Find start of last large page in mblock in RA space.
1395                  * If page extends into the next mblock, verify the
1396                  * mnode does not change.
1397                  */
1398                 last_page_base = P2ALIGN(end, max_coalesce_len);
1399                 if (i + 1 < n_mblocks &&
1400                     last_page_base + max_coalesce_len > mb[1].base &&
1401                     MNODE(last_page_base + ra_to_pa) !=
1402                     MNODE(mb[1].base + mb[1].ra_to_pa)) {
1403                         MPO_STATUS("Large page spans mblocks; MPO disabled: "
1404                             "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
1405                             "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
1406                             mb[1].ra_to_pa, max_coalesce_len);
1407                         return (0);
1408                 }
1409 
1410                 mb++;
1411         }
1412         return (1);
1413 }
1414 
1415 
1416 /*
1417  * fix_interleave() - Find lgroups with sub-page sized memory interleave,
1418  * if any, and remove them.  This yields a config where the "coarse
1419  * grained" lgroups cover all of memory, even though part of that memory
1420  * is fine grain interleaved and does not deliver a purely local memory
1421  * latency.
1422  *
1423  * This function reads and modifies the globals:
1424  *      mpo_lgroup[], n_lgrpnodes
1425  *
1426  * Returns 1 if lgroup nodes were removed, 0 otherwise.
1427  */
1428 
1429 static int
1430 fix_interleave(void)
1431 {
1432         int i, j;
1433         uint64_t mask = 0;
1434 
1435         j = 0;
1436         for (i = 0; i < n_lgrpnodes; i++) {
1437                 if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
1438                         /* remove this lgroup */
1439                         mask = mpo_lgroup[i].addr_mask;
1440                 } else {
1441                         mpo_lgroup[j++] = mpo_lgroup[i];
1442                 }
1443         }
1444         n_lgrpnodes = j;
1445 
1446         if (mask != 0)
1447                 MPO_STATUS("sub-page interleave %lx found; "
1448                     "removing lgroup.\n", mask);
1449 
1450         return (mask != 0);
1451 }
1452 
1453 /*
1454  * mblock_alloc
1455  *
1456  * Allocate memory for mblock an stripe arrays from either static or
1457  * dynamic space depending on utype, and return the result in mc.
1458  * Returns 0 on success and -1 on error.
1459  */
1460 
1461 static int
1462 mblock_alloc(mpo_config_t *mc, update_t utype, int nmblocks)
1463 {
1464         mblock_md_t *mb = NULL;
1465         mem_stripe_t *ms = NULL;
1466         int nstripes = MAX_MEM_NODES * nmblocks;
1467         size_t mblocksz = nmblocks * sizeof (struct mblock_md);
1468         size_t mstripesz = nstripes * sizeof (mem_stripe_t);
1469         size_t allocsz = mmu_ptob(mmu_btopr(mblocksz + mstripesz));
1470 
1471         /*
1472          * Allocate space for mblocks and mstripes.
1473          *
1474          * For DR allocations, just use kmem_alloc(), and set
1475          * mc_alloc_sz to indicate it was used.
1476          *
1477          * For boot allocation:
1478          * If we have a small number of mblocks we will use the space
1479          * that we preallocated. Otherwise, we will dynamically
1480          * allocate the space from the prom and map it to the
1481          * reserved VA at MPOBUF_BASE.
1482          */
1483 
1484         if (utype == U_ADD || utype == U_DEL) {
1485                 mb = (struct mblock_md *)kmem_zalloc(allocsz, KM_SLEEP);
1486                 ms = (mem_stripe_t *)(mb + nmblocks);
1487                 mc->mc_alloc_sz = allocsz;
1488         } else if (nmblocks <= SMALL_MBLOCKS_COUNT) {
1489                 mb = &small_mpo_mblocks[0];
1490                 ms = &small_mem_stripes[0];
1491                 mc->mc_alloc_sz = 0;
1492         } else {
1493                 /* Ensure that we dont request more space than reserved */
1494                 if (allocsz > MPOBUF_SIZE) {
1495                         MPO_STATUS("mblock_alloc: Insufficient space "
1496                             "for mblock structures \n");
1497                         return (-1);
1498                 }
1499                 mb = (struct mblock_md *)
1500                     prom_alloc((caddr_t)MPOBUF_BASE, allocsz, PAGESIZE);
1501                 if (mb != (struct mblock_md *)MPOBUF_BASE) {
1502                         MPO_STATUS("mblock_alloc: Cannot allocate space "
1503                             "for mblocks \n");
1504                         return (-1);
1505                 }
1506                 mpo_heap32_buf = (caddr_t)MPOBUF_BASE;
1507                 mpo_heap32_bufsz = MPOBUF_SIZE;
1508                 ms = (mem_stripe_t *)(mb + nmblocks);
1509                 mc->mc_alloc_sz = 0;
1510         }
1511         mc->mc_mblocks = mb;
1512         mc->mc_stripes = ms;
1513         mc->mc_nmblocks = nmblocks;
1514         mc->mc_nstripes = nstripes;
1515         MPO_DEBUG("mblock_alloc: mblocks: %d\n", nmblocks);
1516         return (0);
1517 }
1518 
1519 /*
1520  * mblock_free
1521  *
1522  * Free memory in mc that was allocated by mblock_alloc.
1523  */
1524 
1525 static void
1526 mblock_free(mpo_config_t *mc)
1527 {
1528         if (mc->mc_alloc_sz > 0) {
1529                 ASSERT(mc->mc_mblocks != mpo_mblock);
1530                 kmem_free((caddr_t)mc->mc_mblocks, mc->mc_alloc_sz);
1531         }
1532         bzero(mc, sizeof (*mc));
1533 }
1534 
1535 /*
1536  * mblock_install
1537  *
1538  * Install mblock config passed in mc as the global configuration.
1539  * May only be called at boot or while holding mpo_wr_lock.
1540  */
1541 
1542 static void
1543 mblock_install(mpo_config_t *mc)
1544 {
1545         mpo_mblock = mc->mc_mblocks;
1546         n_mblocks = mc->mc_nmblocks;
1547         mem_stripes = mc->mc_stripes;
1548         n_mem_stripes = mc->mc_nstripes;
1549         base_ra_to_pa_pfn = btop(mc->mc_mblocks[0].ra_to_pa);
1550         mpo_config = *mc;
1551 }
1552 
1553 /*
1554  * mblock_update
1555  *
1556  * Traverse mblocknodes, read the mblock properties from the MD, and
1557  * save the mblocks in mc.
1558  */
1559 
1560 static void
1561 mblock_update(mpo_config_t *mc, md_t md, mde_cookie_t *mblocknodes)
1562 {
1563         uint64_t i, j;
1564         int result = 0;
1565         mblock_md_t *mblock = mc->mc_mblocks;
1566 
1567         for (i = 0, j = 0; j < mc->mc_nmblocks; j++) {
1568 
1569                 /* Without a base or size value we will fail */
1570                 result = get_int(md, mblocknodes[j], PROP_LG_BASE,
1571                     &mblock[i].base);
1572                 if (result < 0) {
1573                         MPO_STATUS("mblock_update: "
1574                             "PROP_LG_BASE is missing\n");
1575                         mc->mc_nmblocks = 0;
1576                         return;
1577                 }
1578 
1579                 result = get_int(md, mblocknodes[j], PROP_LG_SIZE,
1580                     &mblock[i].size);
1581                 if (result < 0) {
1582                         MPO_STATUS("mblock_update: "
1583                             "PROP_LG_SIZE is missing\n");
1584                         mc->mc_nmblocks = 0;
1585                         return;
1586                 }
1587 
1588                 result = get_int(md, mblocknodes[j],
1589                     PROP_LG_RA_PA_OFFSET, &mblock[i].ra_to_pa);
1590 
1591                 /* If we don't have an ra_pa_offset, just set it to 0 */
1592                 if (result < 0)
1593                         mblock[i].ra_to_pa = 0;
1594 
1595                 MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
1596                     "ra_to_pa = %lx\n", i,
1597                     mblock[i].base,
1598                     mblock[i].size,
1599                     mblock[i].ra_to_pa);
1600 
1601                 /* check for unsupportable values of base and size */
1602                 if (mblock[i].base > mblock[i].base + mblock[i].size) {
1603                         MPO_STATUS("mblock_update: "
1604                             "PROP_LG_BASE+PROP_LG_SIZE is invalid: "
1605                             "base = %lx, size = %lx\n",
1606                             mblock[i].base, mblock[i].size);
1607                         mc->mc_nmblocks = 0;
1608                         return;
1609                 }
1610 
1611                 /* eliminate size==0 blocks */
1612                 if (mblock[i].size != 0) {
1613                         uint64_t base = mblock[i].base;
1614                         uint64_t end = base + mblock[i].size;
1615                         ASSERT(end > base);
1616                         mblock[i].base_pfn = btop(base);
1617                         mblock[i].end_pfn = btop(end - 1);
1618                         i++;
1619                 }
1620         }
1621 
1622         if (i == 0) {
1623                 MPO_STATUS("mblock_update: "
1624                     "No non-empty mblock nodes were found "
1625                     "in the Machine Descriptor\n");
1626                 mc->mc_nmblocks = 0;
1627                 return;
1628         }
1629         ASSERT(i <= mc->mc_nmblocks);
1630         mc->mc_nmblocks = i;
1631 
1632         /* Must sort mblocks by address for mem_node_iterator_init() */
1633         mblock_sort(mblock, mc->mc_nmblocks);
1634 }
1635 
1636 /*
1637  * mblock_update_add
1638  *
1639  * Update mblock config after a memory DR add.  The added range is not
1640  * needed, as we read *all* mblock nodes from the MD.  Save the mblocks
1641  * in mc.
1642  */
1643 
1644 static void
1645 mblock_update_add(mpo_config_t *mc)
1646 {
1647         md_t *md;
1648         mde_cookie_t root, *mblocknodes;
1649         int nmblocks = 0;
1650 
1651         if ((md = md_get_handle()) == NULL) {
1652                 MPO_STATUS("Cannot access Machine Descriptor\n");
1653                 goto error;
1654         }
1655 
1656         if ((root = md_get_root(md)) == MDE_INVAL_ELEM_COOKIE)
1657                 goto error;
1658 
1659         nmblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK, "fwd",
1660             &mblocknodes);
1661         if (nmblocks <= 0) {
1662                 MPO_STATUS("No mblock nodes detected in Machine Descriptor\n");
1663                 goto error;
1664         }
1665 
1666         if (mblock_alloc(mc, U_ADD, nmblocks) < 0)
1667                 goto error;
1668 
1669         mblock_update(mc, md, mblocknodes);
1670         md_free_scan_dag(md, &mblocknodes);
1671         (void) md_fini_handle(md);
1672         return;
1673 error:
1674         panic("mblock_update_add: cannot process mblocks from MD.\n");
1675 }
1676 
1677 /*
1678  * mblock_update_del
1679  *
1680  * Update mblocks after a memory DR deletion of the range (ubase, uend).
1681  * Allocate a new mblock config, copy old config to the new, modify the new
1682  * mblocks to reflect the deletion.   The new mblocks are returned in
1683  * mc_new and are not yet installed as the active config.
1684  */
1685 
1686 static void
1687 mblock_update_del(mpo_config_t *mc_new, mpo_config_t *mc_old, pfn_t ubase,
1688     pfn_t uend)
1689 {
1690         int i, j;
1691         pfn_t base, end;
1692         mblock_md_t *mblock;
1693         int nmblocks = mc_old->mc_nmblocks;
1694 
1695         MPO_DEBUG("mblock_update_del(0x%lx, 0x%lx)\n", ubase, uend);
1696 
1697         /*
1698          * Allocate mblocks in mc_new and copy the old to the new.
1699          * Allocate one extra in case the deletion splits an mblock.
1700          */
1701         if (mblock_alloc(mc_new, U_DEL, nmblocks + 1) < 0)
1702                 return;
1703         mblock = mc_new->mc_mblocks;
1704         bcopy(mc_old->mc_mblocks, mblock, nmblocks * sizeof (mblock_md_t));
1705 
1706         /*
1707          * Find the mblock containing the deleted range and adjust it in
1708          * the new config.
1709          */
1710         for (i = 0; i < nmblocks; i++) {
1711 
1712                 base = btop(mblock[i].base);
1713                 end = base + btop(mblock[i].size) - 1;
1714 
1715                 /*
1716                  * Adjust the mblock based on the subset that was deleted.
1717                  *
1718                  * If the entire mblk was deleted, compact the table.
1719                  *
1720                  * If the middle of the mblk was deleted, extend
1721                  * the table.  Space for the new slot was already
1722                  * allocated.
1723                  *
1724                  * The memory to be deleted is a mblock or a subset of
1725                  * and does not span multiple mblocks.
1726                  */
1727                 if (base == ubase && end == uend) {
1728                         for (j = i; j < nmblocks - 1; j++)
1729                                 mblock[j] = mblock[j + 1];
1730                         nmblocks--;
1731                         bzero(&mblock[nmblocks], sizeof (*mblock));
1732                         break;
1733                 } else if (base < ubase && end > uend) {
1734                         for (j = nmblocks - 1; j >= i; j--)
1735                                 mblock[j + 1] = mblock[j];
1736                         mblock[i].size = ptob(ubase - base);
1737                         mblock[i].end_pfn = ubase - 1;
1738                         mblock[i + 1].base = ptob(uend + 1);
1739                         mblock[i + 1].size = ptob(end - uend);
1740                         mblock[i + 1].base_pfn = uend + 1;
1741                         nmblocks++;
1742                         break;
1743                 } else if (base == ubase) {
1744                         MPO_DEBUG("mblock_update_del: shrink>"
1745                             " i=%d base=0x%lx end=0x%lx", i, base, end);
1746                         mblock[i].base = ptob(uend + 1);
1747                         mblock[i].size -= ptob(uend - ubase + 1);
1748                         base = uend + 1;
1749                         mblock[i].base_pfn = base;
1750                         mblock[i].end_pfn = end;
1751                         MPO_DEBUG(" nbase=0x%lx nend=0x%lx\n", base, end);
1752                         break;
1753                 } else if (end == uend) {
1754                         MPO_DEBUG("mblock_update_del: shrink<"
1755                             " i=%d base=0x%lx end=0x%lx", i, base, end);
1756                         mblock[i].size -= ptob(uend - ubase + 1);
1757                         end = ubase - 1;
1758                         mblock[i].base_pfn = base;
1759                         mblock[i].end_pfn = end;
1760                         MPO_DEBUG(" nbase=0x%lx nend=0x%lx\n", base, end);
1761                         break;
1762                 }
1763         }
1764         mc_new->mc_nmblocks = nmblocks;
1765         ASSERT(end > base);
1766 }
1767 
/*
 * mstripe_update
 *
 * Read mblocks from mc and update mstripes in mc
 *
 * mc->mc_stripes must already point at storage for the stripe table.  The
 * table is indexed by (mblock index * max_locality_groups + mnode).
 *
 * With a single locality group there is one stripe per mblock, covering
 * the mblock's whole pfn range, unless there is only one mblock, in which
 * case the table is left untouched and mc_nstripes is set to 0.
 *
 * With several locality groups each mblock is walked one stripe at a time
 * and one stripe entry is written for every lgroup handle encountered
 * before the walk either runs off the end of the mblock or returns to the
 * lgroup it started in.
 */

static void
mstripe_update(mpo_config_t *mc)
{
        lgrp_handle_t lgrphand, lgrp_start;
        int i, mnode;
        uint64_t offset, stripe_end, base, end, ra_to_pa, stride;
        uint64_t stripe, frag, remove;
        mem_stripe_t *ms;
        mblock_md_t *mblock = mc->mc_mblocks;
        int nmblocks = mc->mc_nmblocks;
        /* Bytes of stripe table to clear: MAX_MEM_NODES entries per mblock. */
        int mstripesz = MAX_MEM_NODES * nmblocks * sizeof (mem_stripe_t);

        /* Check for non-MPO sun4v platforms or memory DR removal */
        if (n_locality_groups <= 1) {
                ASSERT(n_locality_groups == 1);
                ASSERT(max_locality_groups == 1 && max_mem_nodes == 1);

                if (nmblocks == 1) {
                        /* A lone mblock needs no stripes at all. */
                        mc->mc_nstripes = 0;
                } else {
                        /* One stripe per mblock, spanning the whole mblock. */
                        mc->mc_nstripes = nmblocks;
                        bzero(mc->mc_stripes, mstripesz);
                        for (i = 0; i < nmblocks; i++) {
                                mc->mc_stripes[i].exists = 1;
                                mc->mc_stripes[i].physbase = mblock[i].base_pfn;
                                mc->mc_stripes[i].physmax = mblock[i].end_pfn;
                        }
                }
                return;
        }

        bzero(mc->mc_stripes, mstripesz);
        mc->mc_nstripes = max_locality_groups * nmblocks;
        /*
         * stripe is mnode_pages expressed in bytes; stride is
         * max_locality_groups stripes.
         * NOTE(review): the "& (stripe - 1)" and "& (stride - 1)" masks
         * below presume both values are powers of two -- confirm.
         */
        stripe = ptob(mnode_pages);
        stride = max_locality_groups * stripe;

        for (i = 0; i < nmblocks; i++) {
                /* base and end are byte addresses; end is exclusive. */
                base = mblock[i].base;
                end = base + mblock[i].size;
                ra_to_pa = mblock[i].ra_to_pa;

                /* Find the offset from the prev stripe boundary in PA space. */
                offset = (base + ra_to_pa) & (stripe - 1);

                /* Set the next stripe boundary. */
                stripe_end = base - offset + stripe;

                /* The lgroup handle comes from the home_mask bits of the PA. */
                lgrp_start = (((base + ra_to_pa) & home_mask) >>
                    home_mask_shift);
                lgrphand = lgrp_start;

                /*
                 * Loop over all lgroups covered by the mblock, creating a
                 * stripe for each.  Stop when lgrp_start is visited again.
                 */
                do {
                        /* mblock may not span all lgroups */
                        if (base >= end)
                                break;

                        /* The lgroup handle is used directly as the mnode. */
                        mnode = lgrphand;
                        ASSERT(mnode < max_mem_nodes);

                        /*
                         * Calculate the size of the fragment that does not
                         * belong to the mnode in the last partial stride.
                         */
                        frag = (end - (base - offset)) & (stride - 1);
                        if (frag == 0) {
                                /* remove the gap */
                                remove = stride - stripe;
                        } else if (frag < stripe) {
                                /* fragment fits in stripe; keep it all */
                                remove = 0;
                        } else {
                                /* fragment is large; trim after whole stripe */
                                remove = frag - stripe;
                        }

                        /*
                         * Record the stripe in pages.  physmax is the pfn of
                         * the last byte kept once "remove" trailing bytes
                         * have been dropped from the end of the mblock.
                         */
                        ms = &mc->mc_stripes[i * max_locality_groups + mnode];
                        ms->physbase = btop(base);
                        ms->physmax = btop(end - 1 - remove);
                        ms->offset = btop(offset);
                        ms->exists = 1;

                        /*
                         * Step to the next stripe boundary.  From here on
                         * base sits on a boundary, so offset is zero.
                         */
                        base = stripe_end;
                        stripe_end += stripe;
                        offset = 0;
                        lgrphand = (((base + ra_to_pa) & home_mask) >>
                            home_mask_shift);
                } while (lgrphand != lgrp_start);
        }
}
1867 
/*
 * INTERSECT(a, b, c, d)
 *
 * Clip the inclusive range [c, d] to its overlap with the inclusive range
 * [a, b].  c and d are assigned to, so they must be lvalues, and every
 * argument is evaluated more than once.
 *
 * If the two ranges do not overlap, the macro executes "continue", so it
 * can only be used inside a loop body; the caller's remaining statements
 * for that iteration are skipped.  That is also why it is not wrapped in
 * do { } while (0): the continue would then apply to the wrapper rather
 * than to the caller's loop.
 */
#define INTERSECT(a, b, c, d)                           \
        if (((a) >= (c) && (a) <= (d)) ||         \
            ((c) >= (a) && (c) <= (b))) {         \
                (c) = MAX((a), (c));                    \
                (d) = MIN((b), (d));                    \
        } else {                                        \
                ASSERT((a) >= (d) || (b) <= (c)); \
                continue;                               \
        }                                               \
1877 
1878 /*
1879  * mnode_update
1880  *
1881  * Read stripes from mc and update mnode extents.  The mnode extents are
1882  * part of the live configuration, so this can only be done at boot time
1883  * or while holding the mpo_wr_lock.
1884  */
1885 
1886 static void
1887 mnode_update(mpo_config_t *mc, pfn_t ubase, pfn_t uend, update_t utype)
1888 {
1889         int i, j, mnode, found;
1890         pfn_t base, end;
1891         mem_stripe_t *ms;
1892 
1893         MPO_DEBUG("mnode_udpate: basepfn: %lx  endpfn: %lx\n", ubase, uend);
1894 
1895         if (n_locality_groups <= 1 && mc->mc_nmblocks == 1) {
1896                 if (utype == U_ADD)
1897                         mpo_mem_node_add_slice(ubase, uend);
1898                 else if (utype == U_DEL)
1899                         mpo_mem_node_del_slice(ubase, uend);
1900                 else
1901                         panic("mnode update: %d: invalid\n", utype);
1902                 return;
1903         }
1904 
1905         found = 0;
1906         for (i = 0; i < mc->mc_nmblocks; i++) {
1907                 for (mnode = 0; mnode < max_locality_groups; mnode++) {
1908 
1909                         j = i * max_locality_groups + mnode;
1910                         ms = &mc->mc_stripes[j];
1911                         if (!ms->exists)
1912                                 continue;
1913 
1914                         base = ms->physbase;
1915                         end = ms->physmax;
1916 
1917                         /*
1918                          * Look for the mstripes intersecting this slice.
1919                          *
1920                          * The mstripe and slice pairs may not be equal
1921                          * if a subset of a mblock is added/deleted.
1922                          */
1923                         switch (utype) {
1924                         case U_ADD:
1925                                 INTERSECT(ubase, uend, base, end);
1926                                 /*FALLTHROUGH*/
1927                         case U_ADD_ALL:
1928                                 if (n_locality_groups > 1)
1929                                         mpo_plat_assign_lgrphand_to_mem_node(
1930                                             mnode, mnode);
1931                                 mpo_mem_node_add_slice(base, end);
1932                                 break;
1933                         case U_DEL:
1934                                 INTERSECT(ubase, uend, base, end);
1935                                 mpo_mem_node_del_slice(base, end);
1936                                 break;
1937                         default:
1938                                 panic("mnode_update: %d: invalid\n", utype);
1939                                 break;
1940                         }
1941 
1942                         found++;
1943                 }
1944         }
1945 
1946         if (!found)
1947                 panic("mnode_update: mstripe not found");
1948 
1949 #ifdef  DEBUG
1950         if (utype == U_ADD_ALL || utype == U_DEL)
1951                 return;
1952         found = 0;
1953         for (i = 0; i < max_mem_nodes; i++) {
1954                 if (!mem_node_config[i].exists)
1955                         continue;
1956                 if (ubase >= mem_node_config[i].physbase &&
1957                     ubase <= mem_node_config[i].physmax)
1958                         found |= 1;
1959                 if (uend >= mem_node_config[i].physbase &&
1960                     uend <= mem_node_config[i].physmax)
1961                         found |= 2;
1962         }
1963         ASSERT(found == 3);
1964         {
1965                 pfn_t minpfn, maxpfn;
1966 
1967                 mem_node_max_range(&minpfn, &maxpfn);
1968                 ASSERT(minpfn <= ubase);
1969                 ASSERT(maxpfn >= uend);
1970         }
1971 #endif
1972 }
1973 
1974 /*
1975  * Plat_slice_add()/plat_slice_del() are the platform hooks
1976  * for adding/deleting a pfn range to/from the system.
1977  *
 * Plat_slice_add() is used in both the boot and DR cases.
1979  *
1980  * - Zeus has already added the mblocks to the MD, so read the updated
1981  *   MD and allocate all data structures required to manage the new memory
1982  *   configuration.
1983  *
1984  * - Recompute the stripes which are derived from the mblocks.
1985  *
1986  * - Update (expand) the mnode extents and install the modified mblocks as
1987  *   the new mpo config.  This must be done while holding the mpo_wr_lock
1988  *   to guarantee that no other threads access the mpo meta-data.
1989  *
1990  * - Unlock MPO data structures; the new config is live.  Free the old config.
1991  *
1992  * Plat_slice_del() is used for DR only.
1993  *
1994  * - Zeus has not yet modified the MD to reflect the deletion, so copy
1995  *   the old mpo mblocks and delete the range from the copy.
1996  *
1997  * - Recompute the stripes which are derived from the mblocks.
1998  *
1999  * - Update (shrink) the mnode extents and install the modified mblocks as
2000  *   the new mpo config.  This must be done while holding the mpo_wr_lock
2001  *   to guarantee that no other threads access the mpo meta-data.
2002  *
2003  * - Unlock MPO data structures; the new config is live.  Free the old config.
2004  */
2005 
2006 void
2007 plat_slice_add(pfn_t base, pfn_t end)
2008 {
2009         mpo_config_t old_config = mpo_config;
2010         mpo_config_t new_config;
2011 
2012         VALIDATE_SLICE(base, end);
2013         mblock_update_add(&new_config);
2014         mstripe_update(&new_config);
2015         mpo_wr_lock();
2016         mblock_install(&new_config);
2017         /* Use new config to add all ranges for mnode_update */
2018         mnode_update(&new_config, base, end, U_ADD);
2019         mpo_genid++;
2020         mpo_wr_unlock();
2021         mblock_free(&old_config);
2022 }
2023 
2024 void
2025 plat_slice_del(pfn_t base, pfn_t end)
2026 {
2027         mpo_config_t old_config = mpo_config;
2028         mpo_config_t new_config;
2029 
2030         VALIDATE_SLICE(base, end);
2031         mblock_update_del(&new_config, &old_config, base, end);
2032         mstripe_update(&new_config);
2033         mpo_wr_lock();
2034         /* Use old config to find deleted range for mnode_update */
2035         mnode_update(&old_config, base, end, U_DEL);
2036         mblock_install(&new_config);
2037         mpo_genid++;
2038         mpo_wr_unlock();
2039         mblock_free(&old_config);
2040 }