/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * PCI ECC support
 */

#include <sys/types.h>
#include <sys/systm.h>		/* for strrchr */
#include <sys/kmem.h>
#include <sys/sunddi.h>
#include <sys/intr.h>
#include <sys/async.h>		/* struct async_flt */
#include <sys/ddi_impldefs.h>
#include <sys/machsystm.h>
#include <sys/sysmacros.h>
#include <sys/fm/protocol.h>
#include <sys/fm/util.h>
#include <sys/fm/io/pci.h>
#include <sys/fm/io/sun4upci.h>
#include <sys/fm/io/ddi.h>
#include <sys/pci/pci_obj.h>	/* ld/st physio */
#include <sys/cpuvar.h>
#include <sys/errclassify.h>
#include <sys/cpu_module.h>
#include <sys/async.h>

/*LINTLIBRARY*/

/* File-local helpers, defined below. */
static void ecc_disable(ecc_t *, int);
static void ecc_delayed_ce(void *);
static uint64_t ecc_read_afsr(ecc_intr_info_t *);
static void ecc_ereport_post(dev_info_t *dip, ecc_errstate_t *ecc_err);

/* Delay passed to DELAY() before fm_panic() on a fatal UE (see ecc_intr). */
clock_t pci_ecc_panic_delay = 200;
int ecc_ce_delay_secs = 6;	/* number of sec to delay reenabling of CEs */
int ecc_ce_delayed = 1;		/* global for enabling/disabling CE delay */

/*
 * ecc_create: allocate and initialize the per-chip ECC soft state,
 * record the physical addresses of the ECC control and fault registers,
 * enable error detection (via ecc_configure()) and register the
 * error-disable callback used by the system error handling code.
 */
void
ecc_create(pci_t *pci_p)
{
#ifdef DEBUG
	dev_info_t *dip = pci_p->pci_dip;
#endif
	uint64_t cb_base_pa = pci_p->pci_cb_p->cb_base_pa;
	ecc_t *ecc_p;

	ecc_p = (ecc_t *)kmem_zalloc(sizeof (ecc_t), KM_SLEEP);
	ecc_p->ecc_pci_cmn_p = pci_p->pci_common_p;
	pci_p->pci_ecc_p = ecc_p;

	/* Each intr-info struct points back at its owning ecc_t. */
	ecc_p->ecc_ue.ecc_p = ecc_p;
	ecc_p->ecc_ue.ecc_type = CBNINTR_UE;
	ecc_p->ecc_ce.ecc_p = ecc_p;
	ecc_p->ecc_ce.ecc_type = CBNINTR_CE;

	pci_ecc_setup(ecc_p);

	/*
	 * Determine the physical addresses of the ECC control/status
	 * register and the UE/CE AFSR and AFAR fault registers.
	 */
	ecc_p->ecc_csr_pa = cb_base_pa + COMMON_ECC_CSR_OFFSET;
	ecc_p->ecc_ue.ecc_afsr_pa = cb_base_pa + COMMON_UE_AFSR_OFFSET;
	ecc_p->ecc_ue.ecc_afar_pa = cb_base_pa + COMMON_UE_AFAR_OFFSET;
	ecc_p->ecc_ce.ecc_afsr_pa = cb_base_pa + COMMON_CE_AFSR_OFFSET;
	ecc_p->ecc_ce.ecc_afar_pa = cb_base_pa + COMMON_CE_AFAR_OFFSET;

	DEBUG1(DBG_ATTACH, dip, "ecc_create: csr=%x\n", ecc_p->ecc_csr_pa);
	DEBUG2(DBG_ATTACH, dip, "ecc_create: ue_afsr=%x, ue_afar=%x\n",
	    ecc_p->ecc_ue.ecc_afsr_pa, ecc_p->ecc_ue.ecc_afar_pa);
	DEBUG2(DBG_ATTACH, dip, "ecc_create: ce_afsr=%x, ce_afar=%x\n",
	    ecc_p->ecc_ce.ecc_afsr_pa, ecc_p->ecc_ce.ecc_afar_pa);

	ecc_configure(pci_p);

	/*
	 * Register routines to be called from system error handling code.
	 */
	bus_func_register(BF_TYPE_ERRDIS, (busfunc_t)ecc_disable_nowait, ecc_p);
}

/*
 * ecc_register_intr: install the UE and CE error interrupt handlers.
 * Returns DDI_SUCCESS, or the failure code from pci_ecc_add_intr().
 */
int
ecc_register_intr(pci_t *pci_p)
{
	ecc_t *ecc_p = pci_p->pci_ecc_p;
	int ret;

	/*
	 * Install the UE and CE error interrupt handlers.
	 */
	if ((ret = pci_ecc_add_intr(pci_p, CBNINTR_UE, &ecc_p->ecc_ue)) !=
	    DDI_SUCCESS)
		return (ret);
	if ((ret = pci_ecc_add_intr(pci_p, CBNINTR_CE, &ecc_p->ecc_ce)) !=
	    DDI_SUCCESS)
		return (ret);

	return (DDI_SUCCESS);
}

/*
 * ecc_destroy: tear down what ecc_create()/ecc_register_intr() set up —
 * disable ECC interrupts, remove the handlers, unregister the bus
 * function, cancel any pending delayed-CE timeout and free soft state.
 */
void
ecc_destroy(pci_t *pci_p)
{
	ecc_t *ecc_p = pci_p->pci_ecc_p;

	DEBUG0(DBG_DETACH, pci_p->pci_dip, "ecc_destroy:\n");

	/*
	 * Disable UE and CE ECC error interrupts.
	 */
	ecc_disable_wait(ecc_p);

	/*
	 * Remove the ECC interrupt handlers.
	 */
	pci_ecc_rem_intr(pci_p, CBNINTR_UE, &ecc_p->ecc_ue);
	pci_ecc_rem_intr(pci_p, CBNINTR_CE, &ecc_p->ecc_ce);

	/*
	 * Unregister our error handling functions.
	 */
	bus_func_unregister(BF_TYPE_ERRDIS,
	    (busfunc_t)ecc_disable_nowait, ecc_p);
	/*
	 * If a timer has been set, unset it.
	 */
	(void) untimeout(ecc_p->ecc_to_id);

	kmem_free(ecc_p, sizeof (ecc_t));
	pci_p->pci_ecc_p = NULL;
}

/*
 * ecc_configure: clear any stale UE/CE fault status (both primary and
 * secondary bits) and turn on ECC detection — and, if
 * ecc_error_intr_enable is set, UE/CE interrupts — via the ECC control
 * register.
 */
void
ecc_configure(pci_t *pci_p)
{
	ecc_t *ecc_p = pci_p->pci_ecc_p;
	dev_info_t *dip = pci_p->pci_dip;
	uint64_t l;

	/*
	 * Clear any pending ECC errors.
	 */
	DEBUG0(DBG_ATTACH, dip, "ecc_configure: clearing UE and CE errors\n");
	l = (COMMON_ECC_UE_AFSR_E_MASK << COMMON_ECC_UE_AFSR_PE_SHIFT) |
	    (COMMON_ECC_UE_AFSR_E_MASK << COMMON_ECC_UE_AFSR_SE_SHIFT);
	stdphysio(ecc_p->ecc_ue.ecc_afsr_pa, l);

	l = (COMMON_ECC_CE_AFSR_E_MASK << COMMON_ECC_CE_AFSR_PE_SHIFT) |
	    (COMMON_ECC_CE_AFSR_E_MASK << COMMON_ECC_CE_AFSR_SE_SHIFT);
	stdphysio(ecc_p->ecc_ce.ecc_afsr_pa, l);

	/*
	 * Enable ECC error detections via the control register.
	 */
	DEBUG0(DBG_ATTACH, dip, "ecc_configure: enabling UE CE detection\n");
	l = COMMON_ECC_CTRL_ECC_EN;
	if (ecc_error_intr_enable)
		l |= COMMON_ECC_CTRL_UE_INTEN | COMMON_ECC_CTRL_CE_INTEN;
	stdphysio(ecc_p->ecc_csr_pa, l);
}

/* Enable the UE and CE interrupt mappings at the control block. */
void
ecc_enable_intr(pci_t *pci_p)
{
	cb_enable_nintr(pci_p, CBNINTR_UE);
	cb_enable_nintr(pci_p, CBNINTR_CE);
}

/* Disable ECC error interrupts, waiting for any in-flight ones. */
void
ecc_disable_wait(ecc_t *ecc_p)
{
	ecc_disable(ecc_p, IB_INTR_WAIT);
}

/*
 * Non-blocking disable variant, registered via bus_func_register() so
 * the system error-disable path can call it; always returns BF_NONE.
 */
uint_t
ecc_disable_nowait(ecc_t *ecc_p)
{
	ecc_disable(ecc_p, IB_INTR_NOWAIT);
	return (BF_NONE);
}

/*
 * ecc_disable: common code for the wait/nowait variants above — mask
 * the UE/CE interrupt-enable bits in the ECC control register, then
 * disable the interrupt mappings at the control block.
 */
static void
ecc_disable(ecc_t *ecc_p, int wait)
{
	cb_t *cb_p = ecc_p->ecc_pci_cmn_p->pci_common_cb_p;
	uint64_t csr_pa = ecc_p->ecc_csr_pa;
	uint64_t csr = lddphysio(csr_pa);

	csr &= ~(COMMON_ECC_CTRL_UE_INTEN | COMMON_ECC_CTRL_CE_INTEN);
	stdphysio(csr_pa, csr);

	cb_disable_nintr(cb_p, CBNINTR_UE, wait);
	cb_disable_nintr(cb_p, CBNINTR_CE, wait);
}

/*
 * I/O ECC error handling:
 *
 * Below are the generic functions that handle PCI(pcisch, pcipsy) detected
 * ECC errors.
 *
 * The registered interrupt handler for both pcisch and pcipsy is ecc_intr(),
 * it's function is to receive the error, capture some state, and pass that on
 * to the ecc_err_handler() for reporting purposes.
 *
 * ecc_err_handler() gathers more state(via ecc_errstate_get) and attempts
 * to handle and report the error. ecc_err_handler() must determine if we need
 * to panic due to this error (via pci_ecc_classify, which also decodes the
 * ECC afsr), and if any side effects exist that may have caused or are due
 * to this error. PBM errors related to the ECC error may exist, to report
 * them we call pci_pbm_err_handler() and call ndi_fm_handler_dispatch() so
 * that the child devices can log their pci errors.
235 * 236 * To report the error we must also get the syndrome and unum, which can not 237 * be done in high level interrupted context. Therefore we have an error 238 * queue(pci_ecc_queue) which we dispatch errors to, to report the errors 239 * (ecc_err_drain()). 240 * 241 * ecc_err_drain() will be called when either the softint is triggered 242 * or the system is panicing. Either way it will gather more information 243 * about the error from the CPU(via ecc_cpu_call(), ecc.c), attempt to 244 * retire the faulty page(if error is a UE), and report the detected error. 245 * 246 * ecc_delayed_ce() is called via timeout from ecc_err_handler() following 247 * the receipt of a CE interrupt. It will be called after 6ms and check to 248 * see if any new CEs are present, if so we will log and another timeout will 249 * be set by(ecc_err_handler()). If no CEs are present then it will re-enable 250 * CEs by clearing the previous interrupt. This is to keep the system going 251 * in the event of a CE storm. 252 */ 253 254 /* 255 * Function used to get ECC AFSR register 256 */ 257 static uint64_t 258 ecc_read_afsr(ecc_intr_info_t *ecc_ii_p) 259 { 260 uint_t i; 261 uint64_t afsr = 0ull; 262 263 ASSERT((ecc_ii_p->ecc_type == CBNINTR_UE) || 264 (ecc_ii_p->ecc_type == CBNINTR_CE)); 265 if (!ecc_ii_p->ecc_errpndg_mask) 266 return (lddphysio(ecc_ii_p->ecc_afsr_pa)); 267 268 for (i = 0; i < pci_ecc_afsr_retries; i++) { 269 270 /* 271 * If we timeout, the logging routine will 272 * know because it will see the ERRPNDG bits 273 * set in the AFSR. 274 */ 275 afsr = lddphysio(ecc_ii_p->ecc_afsr_pa); 276 if ((afsr & ecc_ii_p->ecc_errpndg_mask) == 0) 277 break; 278 } 279 return (afsr); 280 } 281 282 /* 283 * IO detected ECC error interrupt handler, calls ecc_err_handler to post 284 * error reports and handle the interrupt. Re-entry into ecc_err_handler 285 * is protected by the per-chip mutex pci_fm_mutex. 
 */
uint_t
ecc_intr(caddr_t a)
{
	ecc_intr_info_t *ecc_ii_p = (ecc_intr_info_t *)a;
	ecc_t *ecc_p = ecc_ii_p->ecc_p;
	pci_common_t *cmn_p = ecc_p->ecc_pci_cmn_p;
	ecc_errstate_t ecc_err;
	int ret = DDI_FM_OK;

	/* Capture a fresh error-state snapshot for this interrupt. */
	bzero(&ecc_err, sizeof (ecc_errstate_t));
	ecc_err.ecc_ena = fm_ena_generate(0, FM_ENA_FMT1);
	ecc_err.ecc_ii_p = *ecc_ii_p;
	ecc_err.ecc_p = ecc_p;
	ecc_err.ecc_caller = PCI_ECC_CALL;

	mutex_enter(&cmn_p->pci_fm_mutex);
	ret = ecc_err_handler(&ecc_err);
	mutex_exit(&cmn_p->pci_fm_mutex);
	if (ret == DDI_FM_FATAL) {
		/*
		 * Need delay here to allow CPUs to handle related traps,
		 * such as FRUs for USIIIi systems.
		 */
		DELAY(pci_ecc_panic_delay);
		fm_panic("Fatal PCI UE Error");
	}

	return (DDI_INTR_CLAIMED);
}

/*
 * Function used to gather IO ECC error state.
 * Reads the AFSR/AFAR fault registers and fills in the async_flt
 * structure embedded in *ecc_err_p; caller holds pci_fm_mutex.
 */
static void
ecc_errstate_get(ecc_errstate_t *ecc_err_p)
{
	ecc_t *ecc_p;
	uint_t bus_id;

	ASSERT(ecc_err_p);

	ecc_p = ecc_err_p->ecc_ii_p.ecc_p;
	bus_id = ecc_p->ecc_pci_cmn_p->pci_common_id;

	ASSERT(MUTEX_HELD(&ecc_p->ecc_pci_cmn_p->pci_fm_mutex));
	/*
	 * Read the fault registers.
	 */
	ecc_err_p->ecc_afsr = ecc_read_afsr(&ecc_err_p->ecc_ii_p);
	ecc_err_p->ecc_afar = lddphysio(ecc_err_p->ecc_ii_p.ecc_afar_pa);

	/*
	 * Extract the dword offset field from the AFSR and scale it by
	 * the per-chip access size to get a byte offset into the
	 * 64-byte-aligned AFAR region.
	 */
	ecc_err_p->ecc_offset = ((ecc_err_p->ecc_afsr &
	    ecc_err_p->ecc_ii_p.ecc_offset_mask) >>
	    ecc_err_p->ecc_ii_p.ecc_offset_shift) <<
	    ecc_err_p->ecc_ii_p.ecc_size_log2;

	ecc_err_p->ecc_aflt.flt_id = gethrtime();
	ecc_err_p->ecc_aflt.flt_stat = ecc_err_p->ecc_afsr;
	/* Fault address = aligned AFAR plus the offset decoded above. */
	ecc_err_p->ecc_aflt.flt_addr = P2ALIGN(ecc_err_p->ecc_afar, 64) +
	    ecc_err_p->ecc_offset;
	ecc_err_p->ecc_aflt.flt_bus_id = bus_id;
	ecc_err_p->ecc_aflt.flt_inst = CPU->cpu_id;
	ecc_err_p->ecc_aflt.flt_status = ECC_IOBUS;
	ecc_err_p->ecc_aflt.flt_in_memory =
	    (pf_is_memory(ecc_err_p->ecc_afar >> MMU_PAGESHIFT))? 1: 0;
	ecc_err_p->ecc_aflt.flt_class = BUS_FAULT;
}

/*
 * ecc_pci_check: Called by ecc_err_handler() this function is responsible
 * for calling pci_pbm_err_handler() for both sides of the schizo/psycho
 * and calling their children error handlers(via ndi_fm_handler_dispatch()).
 */
static int
ecc_pci_check(ecc_t *ecc_p, uint64_t fme_ena)
{
	ddi_fm_error_t derr;
	int i;
	int ret;

	ASSERT(MUTEX_HELD(&ecc_p->ecc_pci_cmn_p->pci_fm_mutex));

	bzero(&derr, sizeof (ddi_fm_error_t));
	derr.fme_version = DDI_FME_VERSION;
	derr.fme_ena = fme_ena;
	ret = DDI_FM_NONFATAL;

	/*
	 * Need to report any PBM errors which may have caused or
	 * resulted from this error.
	 *
	 * Each psycho or schizo is represented by a pair of pci nodes
	 * in the device tree.
	 */
	for (i = 0; i < 2; i++) {
		dev_info_t *dip;
		pci_t *pci_p;

		/* Make sure PBM PCI node exists */
		pci_p = ecc_p->ecc_pci_cmn_p->pci_p[i];
		if (pci_p == NULL)
			continue;

		dip = pci_p->pci_dip;
		/* A fatal verdict from either side makes the whole fatal. */
		if (pci_pbm_err_handler(dip, &derr, (void *)pci_p,
		    PCI_ECC_CALL) == DDI_FM_FATAL)
			ret = DDI_FM_FATAL;
	}
	if (ret == DDI_FM_FATAL)
		return (DDI_FM_FATAL);
	else
		return (DDI_FM_NONFATAL);
}

/*
 * Function used to handle and log IO detected ECC errors, can be called by
 * ecc_intr and pci_err_callback(trap callback). Protected by pci_fm_mutex.
 */
int
ecc_err_handler(ecc_errstate_t *ecc_err_p)
{
	uint64_t pri_err, sec_err;
	ecc_intr_info_t *ecc_ii_p = &ecc_err_p->ecc_ii_p;
	ecc_t *ecc_p = ecc_ii_p->ecc_p;
	pci_t *pci_p;
	cb_t *cb_p;
	int fatal = 0;
	int nonfatal = 0;
	ecc_errstate_t ecc_sec_err;
	uint64_t sec_tmp;
	int i;
	uint64_t afsr_err[] = { COMMON_ECC_AFSR_E_PIO,
				COMMON_ECC_AFSR_E_DRD,
				COMMON_ECC_AFSR_E_DWR };


	ASSERT(MUTEX_HELD(&ecc_p->ecc_pci_cmn_p->pci_fm_mutex));

	/*
	 * Use whichever PBM side is present for ereport dip lookups.
	 * NOTE(review): pci_p stays NULL if neither side exists —
	 * presumably impossible once attached; confirm before relying
	 * on pci_p->pci_dip below.
	 */
	pci_p = ecc_p->ecc_pci_cmn_p->pci_p[0];
	if (pci_p == NULL)
		pci_p = ecc_p->ecc_pci_cmn_p->pci_p[1];

	cb_p = ecc_p->ecc_pci_cmn_p->pci_common_cb_p;

	/* Snapshot AFSR/AFAR and split into primary/secondary bits. */
	ecc_errstate_get(ecc_err_p);
	pri_err = (ecc_err_p->ecc_afsr >> COMMON_ECC_UE_AFSR_PE_SHIFT) &
	    COMMON_ECC_UE_AFSR_E_MASK;

	sec_err = (ecc_err_p->ecc_afsr >> COMMON_ECC_UE_AFSR_SE_SHIFT) &
	    COMMON_ECC_UE_AFSR_E_MASK;

	switch (ecc_ii_p->ecc_type) {
	case CBNINTR_UE:
		if (pri_err) {
			ecc_err_p->ecc_aflt.flt_synd =
			    pci_ecc_get_synd(ecc_err_p->ecc_afsr);
			ecc_err_p->ecc_pri = 1;
			pci_ecc_classify(pri_err, ecc_err_p);
			/* Queued for logging by ecc_err_drain(). */
			errorq_dispatch(pci_ecc_queue, (void *)ecc_err_p,
			    sizeof (ecc_errstate_t),
			    ecc_err_p->ecc_aflt.flt_panic);
		}
		if (sec_err) {
			ecc_sec_err = *ecc_err_p;
			ecc_sec_err.ecc_pri = 0;
			/*
			 * Secondary errors are cumulative so we need to loop
			 * through to capture them all.
			 */
			for (i = 0; i < 3; i++) {
				sec_tmp = sec_err & afsr_err[i];
				if (sec_tmp) {
					pci_ecc_classify(sec_tmp, &ecc_sec_err);
					ecc_ereport_post(pci_p->pci_dip,
					    &ecc_sec_err);
				}
			}
		}
		/*
		 * Check for PCI bus errors that may have resulted from or
		 * caused this UE.
		 */
		if (ecc_err_p->ecc_caller == PCI_ECC_CALL &&
		    ecc_pci_check(ecc_p, ecc_err_p->ecc_ena) == DDI_FM_FATAL)
			ecc_err_p->ecc_aflt.flt_panic = 1;

		/* Preserve the fault for the panic path. */
		if (ecc_err_p->ecc_aflt.flt_panic &&
		    ecc_err_p->ecc_aflt.flt_in_memory)
			panic_aflt = ecc_err_p->ecc_aflt;

		if (ecc_err_p->ecc_aflt.flt_panic) {
			/*
			 * Disable all further errors since this will be
			 * treated as a fatal error.
			 */
			(void) ecc_disable_nowait(ecc_p);
			fatal++;
		}
		break;

	case CBNINTR_CE:
		if (pri_err) {
			ecc_err_p->ecc_pri = 1;
			pci_ecc_classify(pri_err, ecc_err_p);
			ecc_err_p->ecc_aflt.flt_synd =
			    pci_ecc_get_synd(ecc_err_p->ecc_afsr);
			/* Scrub the CE before queueing it for logging. */
			ce_scrub(&ecc_err_p->ecc_aflt);
			errorq_dispatch(pci_ecc_queue, (void *)ecc_err_p,
			    sizeof (ecc_errstate_t), ERRORQ_ASYNC);
			nonfatal++;
		}
		if (sec_err) {
			ecc_sec_err = *ecc_err_p;
			ecc_sec_err.ecc_pri = 0;
			/*
			 * Secondary errors are cumulative so we need to loop
			 * through to capture them all.
			 */
			for (i = 0; i < 3; i++) {
				sec_tmp = sec_err & afsr_err[i];
				if (sec_tmp) {
					pci_ecc_classify(sec_tmp, &ecc_sec_err);
					ecc_ereport_post(pci_p->pci_dip,
					    &ecc_sec_err);
				}
			}
			nonfatal++;
		}
		break;

	default:
		return (DDI_FM_OK);
	}
	/* Clear the errors */
	stdphysio(ecc_ii_p->ecc_afsr_pa, ecc_err_p->ecc_afsr);
	/*
	 * Clear the interrupt if called by ecc_intr and UE error or if called
	 * by ecc_intr and CE error and delayed CE interrupt handling is
	 * turned off.
	 */
	if ((ecc_err_p->ecc_caller == PCI_ECC_CALL &&
	    ecc_ii_p->ecc_type == CBNINTR_UE && !fatal) ||
	    (ecc_err_p->ecc_caller == PCI_ECC_CALL &&
	    ecc_ii_p->ecc_type == CBNINTR_CE && !ecc_ce_delayed))
		cb_clear_nintr(cb_p, ecc_ii_p->ecc_type);
	if (!fatal && !nonfatal)
		return (DDI_FM_OK);
	else if (fatal)
		return (DDI_FM_FATAL);
	return (DDI_FM_NONFATAL);
}

/*
 * Called from ecc_err_drain below for CBNINTR_CE case.
 * Returns nonzero when the CE has been recirculated through the queue
 * for extended diagnosis; 0 when a skip code was recorded instead.
 */
static int
ecc_err_cexdiag(ecc_errstate_t *ecc_err, errorq_elem_t *eqep)
{
	struct async_flt *ecc = &ecc_err->ecc_aflt;
	uint64_t errors;

	if (page_retire_check(ecc->flt_addr, &errors) == EINVAL) {
		/* Address is not backed by a retirable page. */
		CE_XDIAG_SETSKIPCODE(ecc->flt_disp, CE_XDIAG_SKIP_NOPP);
		return (0);
	} else if (errors != PR_OK) {
		/* Page already retired or pending retirement. */
		CE_XDIAG_SETSKIPCODE(ecc->flt_disp, CE_XDIAG_SKIP_PAGEDET);
		return (0);
	} else {
		return (ce_scrub_xdiag_recirc(ecc, pci_ecc_queue, eqep,
		    offsetof(ecc_errstate_t, ecc_aflt)));
	}
}

/*
 * Function used to drain pci_ecc_queue, either during panic or after softint
 * is generated, to log IO detected ECC errors.
 */
/*ARGSUSED*/
void
ecc_err_drain(void *not_used, ecc_errstate_t *ecc_err, errorq_elem_t *eqep)
{
	struct async_flt *ecc = &ecc_err->ecc_aflt;
	pci_t *pci_p = ecc_err->ecc_p->ecc_pci_cmn_p->pci_p[0];
	int ecc_type = ecc_err->ecc_ii_p.ecc_type;

	/* Fall back to the other PBM side if side 0 is absent. */
	if (pci_p == NULL)
		pci_p = ecc_err->ecc_p->ecc_pci_cmn_p->pci_p[1];

	if (ecc->flt_class == RECIRC_BUS_FAULT) {
		/*
		 * Perform any additional actions that occur after the
		 * ecc_err_cexdiag below and post the ereport.
		 */
		ecc->flt_class = BUS_FAULT;
		ecc_err->ecc_err_type = flt_to_error_type(ecc);
		ecc_ereport_post(pci_p->pci_dip, ecc_err);
		return;
	}

	/* Get syndrome/unum details from the CPU module. */
	ecc_cpu_call(ecc, ecc_err->ecc_unum, (ecc_type == CBNINTR_UE) ?
	    ECC_IO_UE : ECC_IO_CE);

	switch (ecc_type) {
	case CBNINTR_UE:
		/* Retire the faulted page if classification asked for it. */
		if (ecc_err->ecc_pg_ret == 1) {
			(void) page_retire(ecc->flt_addr, PR_UE);
		}
		ecc_err->ecc_err_type = flt_to_error_type(ecc);
		break;

	case CBNINTR_CE:
		/*
		 * Setup timeout (if CE detected via interrupt) to
		 * re-enable CE interrupts if no more CEs are detected.
		 * This is to protect against CE storms.
		 */
		if (ecc_ce_delayed &&
		    ecc_err->ecc_caller == PCI_ECC_CALL &&
		    ecc_err->ecc_p->ecc_to_id == 0) {
			ecc_err->ecc_p->ecc_to_id = timeout(ecc_delayed_ce,
			    (void *)ecc_err->ecc_p,
			    drv_usectohz((clock_t)ecc_ce_delay_secs *
			    MICROSEC));
		}

		/* ecc_err_cexdiag returns nonzero to recirculate */
		if (CE_XDIAG_EXT_ALG_APPLIED(ecc->flt_disp) &&
		    ecc_err_cexdiag(ecc_err, eqep))
			return;
		ecc_err->ecc_err_type = flt_to_error_type(ecc);
		break;
	}

	ecc_ereport_post(pci_p->pci_dip, ecc_err);
}

/*
 * Delayed-CE timeout handler (armed in ecc_err_drain).  If no new CE
 * is pending it clears the previous interrupt, re-enabling CEs;
 * otherwise it runs the error through ecc_err_handler() again.
 */
static void
ecc_delayed_ce(void *arg)
{
	ecc_t *ecc_p = (ecc_t *)arg;
	pci_common_t *cmn_p;
	cb_t *cb_p;

	ASSERT(ecc_p);

	cmn_p = ecc_p->ecc_pci_cmn_p;
	cb_p = cmn_p->pci_common_cb_p;
	/*
	 * If no more CE errors are found then enable interrupts(by
	 * clearing the previous interrupt), else send in for logging
	 * and the timeout should be set again.
	 */
	ecc_p->ecc_to_id = 0;
	if (!((ecc_read_afsr(&ecc_p->ecc_ce) >>
	    COMMON_ECC_UE_AFSR_PE_SHIFT) & COMMON_ECC_UE_AFSR_E_MASK)) {
		cb_clear_nintr(cb_p, ecc_p->ecc_ce.ecc_type);
	} else {
		ecc_errstate_t ecc_err;

		bzero(&ecc_err, sizeof (ecc_errstate_t));
		ecc_err.ecc_ena = fm_ena_generate(0, FM_ENA_FMT1);
		ecc_err.ecc_ii_p = ecc_p->ecc_ce;
		ecc_err.ecc_p = ecc_p;
		ecc_err.ecc_caller = PCI_ECC_CALL;

		mutex_enter(&cmn_p->pci_fm_mutex);
		(void) ecc_err_handler(&ecc_err);
		mutex_exit(&cmn_p->pci_fm_mutex);
	}
}

/*
 * Function used to post IO detected ECC ereports.
 */
static void
ecc_ereport_post(dev_info_t *dip, ecc_errstate_t *ecc_err)
{
	char buf[FM_MAX_CLASS], dev_path[MAXPATHLEN], *ptr;
	struct i_ddi_fmhdl *fmhdl = DEVI(dip)->devi_fmhdl;
	nvlist_t *ereport, *detector;
	nv_alloc_t *nva;
	errorq_elem_t *eqep;

	/*
	 * We do not use ddi_fm_ereport_post because we need to set a
	 * special detector here. Since we do not have a device path for
	 * the bridge chip we use what we think it should be to aid in
	 * diagnosis. This path fmri is created by pci_fmri_create()
	 * during initialization.
	 */
	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s.%s", DDI_IO_CLASS,
	    ecc_err->ecc_bridge_type, ecc_err->ecc_aflt.flt_erpt_class);

	/* Keep the caller-supplied ENA if one was set. */
	ecc_err->ecc_ena = ecc_err->ecc_ena ? ecc_err->ecc_ena :
	    fm_ena_generate(0, FM_ENA_FMT1);

	/* If no queue element is available the report is dropped. */
	eqep = errorq_reserve(fmhdl->fh_errorq);
	if (eqep == NULL)
		return;

	ereport = errorq_elem_nvl(fmhdl->fh_errorq, eqep);
	nva = errorq_elem_nva(fmhdl->fh_errorq, eqep);
	detector = fm_nvlist_create(nva);

	ASSERT(ereport);
	ASSERT(nva);
	ASSERT(detector);

	/* Trim everything from the last ',' of the device path. */
	ddi_pathname(dip, dev_path);
	ptr = strrchr(dev_path, (int)',');

	if (ptr)
		*ptr = '\0';

	fm_fmri_dev_set(detector, FM_DEV_SCHEME_VERSION, NULL, dev_path,
	    NULL, NULL);

	if (ecc_err->ecc_pri) {
		/* Primary error: include a mem-scheme resource FMRI. */
		if ((ecc_err->ecc_fmri = fm_nvlist_create(nva)) != NULL) {
			char sid[DIMM_SERIAL_ID_LEN] = "";
			uint64_t offset = (uint64_t)-1;
			int len;
			int ret;

			ret = cpu_get_mem_sid(ecc_err->ecc_unum, sid,
			    DIMM_SERIAL_ID_LEN, &len);

			if (ret == 0) {
				(void) cpu_get_mem_offset(
				    ecc_err->ecc_aflt.flt_addr, &offset);
			}

			fm_fmri_mem_set(ecc_err->ecc_fmri,
			    FM_MEM_SCHEME_VERSION, NULL, ecc_err->ecc_unum,
			    (ret == 0) ? sid : NULL, offset);
		}
		fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
		    ecc_err->ecc_ena, detector,
		    PCI_ECC_AFSR, DATA_TYPE_UINT64, ecc_err->ecc_afsr,
		    PCI_ECC_AFAR, DATA_TYPE_UINT64, ecc_err->ecc_aflt.flt_addr,
		    PCI_ECC_CTRL, DATA_TYPE_UINT64, ecc_err->ecc_ctrl,
		    PCI_ECC_SYND, DATA_TYPE_UINT16, ecc_err->ecc_aflt.flt_synd,
		    PCI_ECC_TYPE, DATA_TYPE_STRING, ecc_err->ecc_err_type,
		    PCI_ECC_DISP, DATA_TYPE_UINT64, ecc_err->ecc_aflt.flt_disp,
		    PCI_ECC_RESOURCE, DATA_TYPE_NVLIST, ecc_err->ecc_fmri,
		    NULL);
	} else {
		/* Secondary error: AFAR/syndrome are not valid. */
		fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
		    ecc_err->ecc_ena, detector,
		    PCI_ECC_AFSR, DATA_TYPE_UINT64, ecc_err->ecc_afsr,
		    PCI_ECC_CTRL, DATA_TYPE_UINT64, ecc_err->ecc_ctrl,
		    NULL);
	}
	errorq_commit(fmhdl->fh_errorq, eqep, ERRORQ_ASYNC);
}