1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  * OPL platform specific functions for
  29  * CPU/Memory error diagnosis engine.
  30  */
  31 #include <cmd.h>
  32 #include <cmd_dimm.h>
  33 #include <cmd_bank.h>
  34 #include <cmd_page.h>
  35 #include <cmd_opl.h>
  36 #include <string.h>
  37 #include <errno.h>
  38 #include <fcntl.h>
  39 #include <unistd.h>
  40 #include <dirent.h>
  41 #include <sys/stat.h>
  42 
  43 #include <sys/fm/protocol.h>
  44 #include <sys/fm/io/opl_mc_fm.h>
  45 #include <sys/async.h>
  46 #include <sys/opl_olympus_regs.h>
  47 #include <sys/fm/cpu/SPARC64-VI.h>
  48 #include <sys/int_const.h>
  49 #include <sys/mutex.h>
  50 #include <sys/dditypes.h>
  51 #include <opl/sys/mc-opl.h>
  52 
  53 /*
  54  * The following is the common function for handling
  55  * memory UE with EID=MEM.
  56  * The error could be detected by either CPU/IO.
  57  */
  58 cmd_evdisp_t
  59 opl_ue_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
  60     int hdlr_type)
  61 {
  62         nvlist_t *rsrc = NULL, *asru = NULL, *fru = NULL;
  63         uint64_t ubc_ue_log_reg, pa;
  64         cmd_page_t *page;
  65 
  66         if (nvlist_lookup_nvlist(nvl,
  67             FM_EREPORT_PAYLOAD_NAME_RESOURCE, &rsrc) != 0)
  68                 return (CMD_EVD_BAD);
  69 
  70         switch (hdlr_type) {
  71         case CMD_OPL_HDLR_CPU:
  72 
  73                 if (nvlist_lookup_uint64(nvl,
  74                     FM_EREPORT_PAYLOAD_NAME_SFAR, &pa) != 0)
  75                         return (CMD_EVD_BAD);
  76 
  77                 fmd_hdl_debug(hdl, "cmd_ue_mem: pa=%llx\n",
  78                     (u_longlong_t)pa);
  79                 break;
  80 
  81         case CMD_OPL_HDLR_IO:
  82 
  83                 if (nvlist_lookup_uint64(nvl, OBERON_UBC_MUE,
  84                     &ubc_ue_log_reg) != 0)
  85                         return (CMD_EVD_BAD);
  86 
  87                 pa = (ubc_ue_log_reg & UBC_UE_ADR_MASK);
  88 
  89                 fmd_hdl_debug(hdl, "cmd_ue_mem: ue_log_reg=%llx\n",
  90                     (u_longlong_t)ubc_ue_log_reg);
  91                 fmd_hdl_debug(hdl, "cmd_ue_mem: pa=%llx\n",
  92                     (u_longlong_t)pa);
  93                 break;
  94 
  95         default:
  96 
  97                 return (CMD_EVD_BAD);
  98         }
  99 
 100         if ((page = cmd_page_lookup(pa)) != NULL &&
 101             page->page_case.cc_cp != NULL &&
 102             fmd_case_solved(hdl, page->page_case.cc_cp))
 103                 return (CMD_EVD_REDUND);
 104 
 105         if (nvlist_dup(rsrc, &asru, 0) != 0) {
 106                 fmd_hdl_debug(hdl, "opl_ue_mem nvlist dup failed\n");
 107                 return (CMD_EVD_BAD);
 108         }
 109 
 110         if (fmd_nvl_fmri_expand(hdl, asru) < 0) {
 111                 nvlist_free(asru);
 112                 CMD_STAT_BUMP(bad_mem_asru);
 113                 return (CMD_EVD_BAD);
 114         }
 115 
 116         if ((fru = opl_mem_fru_create(hdl, asru)) == NULL) {
 117                 nvlist_free(asru);
 118                 return (CMD_EVD_BAD);
 119         }
 120 
 121         cmd_page_fault(hdl, asru, fru, ep, pa);
 122         nvlist_free(asru);
 123         nvlist_free(fru);
 124         return (CMD_EVD_OK);
 125 }
 126 
 127 /*
 128  * The following is the main function to handle generating
 129  * the sibling cpu suspect list for the CPU detected UE
 130  * error cases.  This is to handle the
 131  * multiple strand/core architecture on the OPL platform.
 132  */
 133 cmd_evdisp_t
 134 cmd_opl_ue_cpu(fmd_hdl_t *hdl, fmd_event_t *ep,
 135     const char *class, const char *fltname,
 136     cmd_ptrsubtype_t ptr, cmd_cpu_t *cpu,
 137     cmd_case_t *cc, uint8_t cpumask)
 138 {
 139         const char *uuid;
 140         cmd_cpu_t *main_cpu, *sib_cpu;
 141         nvlist_t *fmri;
 142         cmd_list_t *cpu_list;
 143         opl_cpu_t *opl_cpu;
 144         uint32_t main_cpuid, nsusp = 1;
 145         uint8_t cert;
 146 
 147         fmd_hdl_debug(hdl,
 148             "Enter OPL_CPUUE_HANDLER for class %x\n", class);
 149 
 150         main_cpu = cpu;
 151         main_cpuid = cpu->cpu_cpuid;
 152 
 153         if (strcmp(fltname, "core") == 0)
 154                 cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid,
 155                     IS_CORE);
 156         else if (strcmp(fltname, "chip") == 0)
 157                 cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid,
 158                     IS_CHIP);
 159         else
 160                 cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid,
 161                     IS_STRAND);
 162 
 163         for (opl_cpu = cmd_list_next(cpu_list); opl_cpu != NULL;
 164             opl_cpu = cmd_list_next(opl_cpu)) {
 165                 if (opl_cpu->oc_cpuid == main_cpuid) {
 166                         sib_cpu = main_cpu;
 167                         opl_cpu->oc_cmd_cpu = main_cpu;
 168                 } else {
 169                         fmri = cmd_cpu_fmri_create(opl_cpu->oc_cpuid, cpumask);
 170                         if (fmri == NULL) {
 171                                 opl_cpu->oc_cmd_cpu = NULL;
 172                                 fmd_hdl_debug(hdl,
 173                                     "missing asru, cpuid %u excluded\n",
 174                                     opl_cpu->oc_cpuid);
 175                                 continue;
 176                         }
 177 
 178                         sib_cpu = cmd_cpu_lookup(hdl, fmri, class,
 179                             CMD_CPU_LEVEL_THREAD);
 180                         if (sib_cpu == NULL || sib_cpu->cpu_faulting) {
 181                                 nvlist_free(fmri);
 182                                 opl_cpu->oc_cmd_cpu = NULL;
 183                                 fmd_hdl_debug(hdl,
 184                                 "cpu not present, cpuid %u excluded\n",
 185                                     opl_cpu->oc_cpuid);
 186                                 continue;
 187                         }
 188                         opl_cpu->oc_cmd_cpu = sib_cpu;
 189                         nvlist_free(fmri);
 190                         nsusp++;
 191                 }
 192                 if (cpu->cpu_cpuid == main_cpuid) {
 193                         if (cc->cc_cp != NULL &&
 194                             fmd_case_solved(hdl, cc->cc_cp)) {
 195                                 if (cpu_list != NULL)
 196                                         opl_cpulist_free(hdl, cpu_list);
 197                                 return (CMD_EVD_REDUND);
 198                         }
 199 
 200                         if (cc->cc_cp == NULL)
 201                                 cc->cc_cp = cmd_case_create(hdl,
 202                                     &cpu->cpu_header, ptr, &uuid);
 203 
 204                         if (cc->cc_serdnm != NULL) {
 205                                 fmd_hdl_debug(hdl,
 206                         "destroying existing %s state for class %x\n",
 207                                     cc->cc_serdnm, class);
 208                                 fmd_serd_destroy(hdl, cc->cc_serdnm);
 209                                 fmd_hdl_strfree(hdl, cc->cc_serdnm);
 210                                 cc->cc_serdnm = NULL;
 211                                 fmd_case_reset(hdl, cc->cc_cp);
 212                         }
 213                         fmd_case_add_ereport(hdl, cc->cc_cp, ep);
 214                 }
 215         }
 216         cert = opl_avg(100, nsusp);
 217         for (opl_cpu = cmd_list_next(cpu_list); opl_cpu != NULL;
 218             opl_cpu = cmd_list_next(opl_cpu)) {
 219                 if (opl_cpu->oc_cmd_cpu != NULL) {
 220                         nvlist_t *cpu_rsrc;
 221 
 222                         cpu_rsrc = opl_cpursrc_create(hdl, opl_cpu->oc_cpuid);
 223                         if (cpu_rsrc == NULL) {
 224                                 fmd_hdl_debug(hdl,
 225                                 "missing rsrc, cpuid %u excluded\n",
 226                                     opl_cpu->oc_cpuid);
 227                                 continue;
 228                         }
 229                         cmd_cpu_create_faultlist(hdl, cc->cc_cp,
 230                             opl_cpu->oc_cmd_cpu, fltname, cpu_rsrc, cert);
 231                         nvlist_free(cpu_rsrc);
 232                 }
 233         }
 234         fmd_case_solve(hdl, cc->cc_cp);
 235         if (cpu_list != NULL)
 236                 opl_cpulist_free(hdl, cpu_list);
 237         return (CMD_EVD_OK);
 238 }
 239 
 240 /*
 241  * Generates DIMM fault if the number of Permanent CE
 242  * threshold is exceeded.
 243  */
 244 static void
 245 opl_ce_thresh_check(fmd_hdl_t *hdl, cmd_dimm_t *dimm)
 246 {
 247         nvlist_t *dflt;
 248         fmd_case_t *cp;
 249 
 250         fmd_hdl_debug(hdl,
 251             "Permanent CE event threshold checking.\n");
 252 
 253         if (dimm->dimm_flags & CMD_MEM_F_FAULTING) {
 254                 /* We've already complained about this DIMM */
 255                 return;
 256         }
 257 
 258         if (dimm->dimm_nretired >= fmd_prop_get_int32(hdl,
 259             "max_perm_ce_dimm")) {
 260                 dimm->dimm_flags |= CMD_MEM_F_FAULTING;
 261                 cp = fmd_case_open(hdl, NULL);
 262                 dflt = cmd_dimm_create_fault(hdl, dimm, "fault.memory.dimm",
 263                     CMD_FLTMAXCONF);
 264                 fmd_case_add_suspect(hdl, cp, dflt);
 265                 fmd_case_solve(hdl, cp);
 266         }
 267 }
 268 
 269 /*
 270  * Notify fault page information (pa and errlog) to XSCF via mc-opl
 271  */
 272 #define MC_PHYDEV_DIR   "/devices"
 273 #define MC_PHYPREFIX    "pseudo-mc@"
 274 static int
 275 opl_scf_log(fmd_hdl_t *hdl, nvlist_t *nvl)
 276 {
 277         uint32_t *eadd, *elog;
 278         uint_t n;
 279         uint64_t pa;
 280         char path[MAXPATHLEN];
 281         char *unum;
 282         nvlist_t *rsrc;
 283         DIR *mcdir;
 284         struct dirent *dp;
 285         mc_flt_page_t flt_page;
 286         cmd_page_t *page;
 287         struct stat statbuf;
 288 
 289         /*
 290          * Extract ereport.
 291          * Sanity check of pa is already done at cmd_opl_mac_common().
 292          * mc-opl sets only one entry for MC_OPL_ERR_ADD, MC_OPL_ERR_LOG,
 293          * and MC_OPL_BANK.
 294          */
 295         if ((nvlist_lookup_uint64(nvl, MC_OPL_PA, &pa) != 0) ||
 296             (nvlist_lookup_uint32_array(nvl, MC_OPL_ERR_ADD, &eadd, &n) != 0) ||
 297             (nvlist_lookup_uint32_array(nvl, MC_OPL_ERR_LOG, &elog, &n) != 0)) {
 298                 fmd_hdl_debug(hdl, "opl_scf_log failed to extract ereport.\n");
 299                 return (-1);
 300         }
 301         if (nvlist_lookup_nvlist(nvl, FM_EREPORT_PAYLOAD_NAME_RESOURCE,
 302             &rsrc) != 0) {
 303                 fmd_hdl_debug(hdl, "opl_scf_log failed to get resource.\n");
 304                 return (-1);
 305         }
 306         if (nvlist_lookup_string(rsrc, FM_FMRI_MEM_UNUM, &unum) != 0) {
 307                 fmd_hdl_debug(hdl, "opl_scf_log failed to get unum.\n");
 308                 return (-1);
 309         }
 310 
 311         page = cmd_page_lookup(pa);
 312         if (page != NULL && page->page_flags & CMD_MEM_F_FAULTING) {
 313                 /*
 314                  * fault.memory.page will not be created.
 315                  */
 316                 return (0);
 317         }
 318 
 319         flt_page.err_add = eadd[0];
 320         flt_page.err_log = elog[0];
 321         flt_page.fmri_addr = (uint64_t)(uint32_t)unum;
 322         flt_page.fmri_sz = strlen(unum) + 1;
 323 
 324         fmd_hdl_debug(hdl, "opl_scf_log DIMM: %s (%d)\n",
 325             unum, strlen(unum) + 1);
 326         fmd_hdl_debug(hdl, "opl_scf_log pa:%llx add:%x log:%x\n",
 327             pa, eadd[0], elog[0]);
 328 
 329         if ((mcdir = opendir(MC_PHYDEV_DIR)) != NULL) {
 330                 while ((dp = readdir(mcdir)) != NULL) {
 331                         int fd;
 332 
 333                         if (strncmp(dp->d_name, MC_PHYPREFIX,
 334                             strlen(MC_PHYPREFIX)) != 0)
 335                                 continue;
 336 
 337                         (void) snprintf(path, sizeof (path),
 338                             "%s/%s", MC_PHYDEV_DIR, dp->d_name);
 339 
 340                         if (stat(path, &statbuf) != 0 ||
 341                             (statbuf.st_mode & S_IFCHR) == 0) {
 342                                 /* skip if not a character device */
 343                                 continue;
 344                         }
 345 
 346                         if ((fd = open(path, O_RDONLY)) < 0)
 347                                 continue;
 348 
 349                         if (ioctl(fd, MCIOC_FAULT_PAGE, &flt_page) == 0) {
 350                                 fmd_hdl_debug(hdl, "opl_scf_log ioctl(%s)\n",
 351                                     path);
 352                                 (void) close(fd);
 353                                 (void) closedir(mcdir);
 354                                 return (0);
 355                         }
 356                         (void) close(fd);
 357                 }
 358                 (void) closedir(mcdir);
 359         }
 360 
 361         fmd_hdl_debug(hdl, "opl_scf_log failed ioctl().\n");
 362 
 363         return (-1);
 364 }
 365 
 366 /*
 367  * This is the common function for processing MAC detected
 368  * Intermittent and Permanent CEs.
 369  */
 370 
 371 cmd_evdisp_t
 372 cmd_opl_mac_ce(fmd_hdl_t *hdl, fmd_event_t *ep, const char *class,
 373     nvlist_t *asru, nvlist_t *fru, uint64_t pa, nvlist_t *nvl)
 374 {
 375         cmd_dimm_t *dimm;
 376         const char *uuid;
 377 
 378         fmd_hdl_debug(hdl,
 379             "Processing CE ereport\n");
 380 
 381         if ((dimm = cmd_dimm_lookup(hdl, asru)) == NULL &&
 382             (dimm = cmd_dimm_create(hdl, asru)) == NULL)
 383                 return (CMD_EVD_UNUSED);
 384 
 385         if (dimm->dimm_case.cc_cp == NULL) {
 386                 dimm->dimm_case.cc_cp = cmd_case_create(hdl,
 387                     &dimm->dimm_header, CMD_PTR_DIMM_CASE, &uuid);
 388         }
 389 
 390         if (strcmp(class, "ereport.asic.mac.ptrl-ice") == 0) {
 391                 CMD_STAT_BUMP(ce_interm);
 392                 fmd_hdl_debug(hdl, "adding FJ-Intermittent event "
 393                     "to CE serd engine\n");
 394 
 395                 if (dimm->dimm_case.cc_serdnm == NULL) {
 396                         dimm->dimm_case.cc_serdnm =
 397                             cmd_mem_serdnm_create(hdl,
 398                             "dimm", dimm->dimm_unum);
 399                         fmd_serd_create(hdl, dimm->dimm_case.cc_serdnm,
 400                             fmd_prop_get_int32(hdl, "ce_n"),
 401                             fmd_prop_get_int64(hdl, "ce_t"));
 402                 }
 403 
 404                 if (fmd_serd_record(hdl, dimm->dimm_case.cc_serdnm, ep) ==
 405                     FMD_B_FALSE) {
 406                         return (CMD_EVD_OK); /* engine hasn't fired */
 407                 }
 408                 fmd_hdl_debug(hdl, "ce serd fired\n");
 409                 fmd_case_add_serd(hdl, dimm->dimm_case.cc_cp,
 410                     dimm->dimm_case.cc_serdnm);
 411                 fmd_serd_reset(hdl, dimm->dimm_case.cc_serdnm);
 412 
 413                 (void) opl_scf_log(hdl, nvl);
 414         } else {
 415                 CMD_STAT_BUMP(ce_sticky);
 416         }
 417 
 418         dimm->dimm_nretired++;
 419         dimm->dimm_retstat.fmds_value.ui64++;
 420         cmd_dimm_dirty(hdl, dimm);
 421 
 422         cmd_page_fault(hdl, asru, fru, ep, pa);
 423         opl_ce_thresh_check(hdl, dimm);
 424 
 425         return (CMD_EVD_OK);
 426 }
 427 
 428 /*
 429  * This is the common entry for processing MAC detected errors.
 430  * It is responsible for generating the memory page fault event.
 431  * The permanent CE (sticky) in normal mode is handled here also
 432  * in the same way as in the UE case.
 433  */
 434 /*ARGSUSED*/
 435 cmd_evdisp_t
 436 cmd_opl_mac_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
 437     const char *class, cmd_errcl_t clcode)
 438 {
 439         uint64_t pa;
 440         nvlist_t *rsrc = NULL, *asru = NULL, *fru = NULL;
 441         cmd_page_t *page;
 442 
 443         fmd_hdl_debug(hdl, "cmd_mac_common: clcode=%ll\n", clcode);
 444 
 445         if (nvlist_lookup_nvlist(nvl, MC_OPL_RESOURCE, &rsrc) != 0)
 446                 return (CMD_EVD_BAD);
 447 
 448         if (nvlist_lookup_uint64(nvl, MC_OPL_PA, &pa)
 449             != 0)
 450                 return (CMD_EVD_BAD);
 451 
 452         /*
 453          * Check for invalid pa.
 454          * The most sig. bit should not be on.
 455          * It would be out of the range of possible pa
 456          * in MAC's view.
 457          */
 458         if (((uint64_t)1 << 63) & pa)
 459                 return (CMD_EVD_BAD);
 460 
 461         if ((page = cmd_page_lookup(pa)) != NULL &&
 462             page->page_case.cc_cp != NULL &&
 463             fmd_case_solved(hdl, page->page_case.cc_cp))
 464                 return (CMD_EVD_REDUND);
 465 
 466         if (nvlist_dup(rsrc, &asru, 0) != 0) {
 467                 fmd_hdl_debug(hdl, "cmd_opl_mac_common nvlist dup failed\n");
 468                 return (CMD_EVD_BAD);
 469         }
 470 
 471         if (fmd_nvl_fmri_expand(hdl, asru) < 0) {
 472                 fmd_hdl_debug(hdl, "cmd_opl_mac_common expand failed\n");
 473                 nvlist_free(asru);
 474                 CMD_STAT_BUMP(bad_mem_asru);
 475                 return (CMD_EVD_BAD);
 476         }
 477 
 478         if ((fru = opl_mem_fru_create(hdl, asru)) == NULL) {
 479                 fmd_hdl_debug(hdl, "cmd_opl_mac_common fru_create failed\n");
 480                 nvlist_free(asru);
 481                 return (CMD_EVD_BAD);
 482         }
 483 
 484         /*
 485          * process PCE and ICE to create DIMM fault
 486          */
 487         if (strcmp(class, "ereport.asic.mac.mi-ce") == 0 ||
 488             strcmp(class, "ereport.asic.mac.ptrl-ce") == 0 ||
 489             strcmp(class, "ereport.asic.mac.ptrl-ice") == 0) {
 490                 cmd_evdisp_t ret;
 491 
 492                 ret = cmd_opl_mac_ce(hdl, ep, class, asru, fru, pa, nvl);
 493                 nvlist_free(asru);
 494                 nvlist_free(fru);
 495                 if (ret != CMD_EVD_OK) {
 496                         fmd_hdl_debug(hdl,
 497                             "cmd_opl_mac_common: mac_ce failed\n");
 498                         return (CMD_EVD_BAD);
 499                 } else
 500                         return (CMD_EVD_OK);
 501         }
 502 
 503         /* The following code handles page retires for UEs and CMPEs.  */
 504 
 505         cmd_page_fault(hdl, asru, fru, ep, pa);
 506         nvlist_free(asru);
 507         nvlist_free(fru);
 508         return (CMD_EVD_OK);
 509 }
 510 
 511 /*
 512  * Common entry points for handling CPU/IO detected UE with
 513  * respect to EID=MEM.
 514  */
 515 /*ARGSUSED*/
 516 cmd_evdisp_t
 517 cmd_opl_cpu_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
 518     const char *class, cmd_errcl_t clcode)
 519 {
 520         return (opl_ue_mem(hdl, ep, nvl, CMD_OPL_HDLR_CPU));
 521 }
 522 
 523 /*ARGSUSED*/
 524 cmd_evdisp_t
 525 cmd_opl_io_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
 526     const char *class, cmd_errcl_t clcode)
 527 {
 528         return (opl_ue_mem(hdl, ep, nvl, CMD_OPL_HDLR_IO));
 529 }