2423 * on_trap protection)
2424 * 4. The error is on a retired page
2425 *
2426	 * Note: AFLT_PROT_EC is used in places other than the memory
2427 * scrubber. However, none of those errors should occur
2428 * on a retired page.
2429 */
2430 if ((ch_flt->afsr_errs &
2431 (C_AFSR_ALL_ERRS | C_AFSR_EXT_ALL_ERRS)) == C_AFSR_CE &&
2432 aflt->flt_prot == AFLT_PROT_EC) {
2433
2434 if (page_retire_check(aflt->flt_addr, NULL) == 0) {
2435 if (ch_flt->flt_trapped_ce & CE_CEEN_DEFER) {
2436
2437 /*
2438 * Since we're skipping logging, we'll need
2439 * to schedule the re-enabling of CEEN
2440 */
2441 (void) timeout(cpu_delayed_check_ce_errors,
2442 (void *)(uintptr_t)aflt->flt_inst,
2443 drv_usectohz((clock_t)cpu_ceen_delay_secs
2444 * MICROSEC));
2445 }
2446
2447 /*
2448 * Inform memscrubber - scrubbing induced
2449 * CE on a retired page.
2450 */
2451 memscrub_induced_error();
2452 return (0);
2453 }
2454 }
2455
2456 /*
2457 * Perform/schedule further classification actions, but
2458 * only if the page is healthy (we don't want bad
2459 * pages inducing too much diagnostic activity). If we could
2460 * not find a page pointer then we also skip this. If
2461 * ce_scrub_xdiag_recirc returns nonzero then it has chosen
2462 * to copy and recirculate the event (for further diagnostics)
2463 * and we should not proceed to log it here.
2464 *
2651 ch_flt->flt_bit);
2652 }
2653
2654 if (aflt->flt_func != NULL)
2655 aflt->flt_func(aflt, unum);
2656
2657 if (afar_status != AFLT_STAT_INVALID)
2658 cpu_log_diag_info(ch_flt);
2659
2660 /*
2661	 * If we have a CEEN error, we do not reenable CEEN until after
2662 * we exit the trap handler. Otherwise, another error may
2663 * occur causing the handler to be entered recursively.
2664 * We set a timeout to trigger in cpu_ceen_delay_secs seconds,
2665 * to try and ensure that the CPU makes progress in the face
2666 * of a CE storm.
2667 */
2668 if (ch_flt->flt_trapped_ce & CE_CEEN_DEFER) {
2669 (void) timeout(cpu_delayed_check_ce_errors,
2670 (void *)(uintptr_t)aflt->flt_inst,
2671 drv_usectohz((clock_t)cpu_ceen_delay_secs * MICROSEC));
2672 }
2673 }
2674
2675 /*
2676 * Invoked by error_init() early in startup and therefore before
2677 * startup_errorq() is called to drain any error Q -
2678 *
2679 * startup()
2680 * startup_end()
2681 * error_init()
2682 * cpu_error_init()
2683 * errorq_init()
2684 * errorq_drain()
2685 * start_other_cpus()
2686 *
2687 * The purpose of this routine is to create error-related taskqs. Taskqs
2688 * are used for this purpose because cpu_lock can't be grabbed from interrupt
2689 * context.
2690 */
2691 void
6055
6056 do {
6057 outstanding = *countp;
6058 for (i = 0; i < outstanding; i++) {
6059 scrub_ecache(how_many);
6060 }
6061 } while (atomic_add_32_nv(countp, -outstanding));
6062
6063 return (DDI_INTR_CLAIMED);
6064 }
6065
6066 /*
6067 * Timeout function to reenable CE
6068 */
6069 static void
6070 cpu_delayed_check_ce_errors(void *arg)
6071 {
6072 if (!taskq_dispatch(ch_check_ce_tq, cpu_check_ce_errors, arg,
6073 TQ_NOSLEEP)) {
6074 (void) timeout(cpu_delayed_check_ce_errors, arg,
6075 drv_usectohz((clock_t)cpu_ceen_delay_secs * MICROSEC));
6076 }
6077 }
6078
6079 /*
6080 * CE Deferred Re-enable after trap.
6081 *
6082 * When the CPU gets a disrupting trap for any of the errors
6083 * controlled by the CEEN bit, CEEN is disabled in the trap handler
6084 * immediately. To eliminate the possibility of multiple CEs causing
6085 * recursive stack overflow in the trap handler, we cannot
6086 * reenable CEEN while still running in the trap handler. Instead,
6087 * after a CE is logged on a CPU, we schedule a timeout function,
6088 * cpu_check_ce_errors(), to trigger after cpu_ceen_delay_secs
6089 * seconds. This function will check whether any further CEs
6090 * have occurred on that CPU, and if none have, will reenable CEEN.
6091 *
6092 * If further CEs have occurred while CEEN is disabled, another
6093 * timeout will be scheduled. This is to ensure that the CPU can
6094 * make progress in the face of CE 'storms', and that it does not
6095 * spend all its time logging CE errors.
6144 TIMEOUT_CEEN_CHECK, 0);
6145 mutex_exit(&cpu_lock);
6146 } else {
6147 /*
6148 * When the CPU is not accepting xcalls, or
6149 * the processor is offlined, we don't want to
6150 * incur the extra overhead of trying to schedule the
6151 * CE timeout indefinitely. However, we don't want to lose
6152 * CE checking forever.
6153 *
6154 * Keep rescheduling the timeout, accepting the additional
6155 * overhead as the cost of correctness in the case where we get
6156	 * a CE, disable CEEN, offline the CPU during
6157	 * the timeout interval, and then online it at some
6158 * point in the future. This is unlikely given the short
6159 * cpu_ceen_delay_secs.
6160 */
6161 mutex_exit(&cpu_lock);
6162 (void) timeout(cpu_delayed_check_ce_errors,
6163 (void *)(uintptr_t)cp->cpu_id,
6164 drv_usectohz((clock_t)cpu_ceen_delay_secs * MICROSEC));
6165 }
6166 }
6167
6168 /*
6169 * This routine will check whether CEs have occurred while
6170 * CEEN is disabled. Any CEs detected will be logged and, if
6171 * possible, scrubbed.
6172 *
6173 * The memscrubber will also use this routine to clear any errors
6174 * caused by its scrubbing with CEEN disabled.
6175 *
6176 * flag == SCRUBBER_CEEN_CHECK
6177 * called from memscrubber, just check/scrub, no reset
6178 * paddr physical addr. for start of scrub pages
6179 * vaddr virtual addr. for scrub area
6180 * psz page size of area to be scrubbed
6181 *
6182 * flag == TIMEOUT_CEEN_CHECK
6183 * timeout function has triggered, reset timeout or CEEN
6184 *
|
2423 * on_trap protection)
2424 * 4. The error is on a retired page
2425 *
2426	 * Note: AFLT_PROT_EC is used in places other than the memory
2427 * scrubber. However, none of those errors should occur
2428 * on a retired page.
2429 */
2430 if ((ch_flt->afsr_errs &
2431 (C_AFSR_ALL_ERRS | C_AFSR_EXT_ALL_ERRS)) == C_AFSR_CE &&
2432 aflt->flt_prot == AFLT_PROT_EC) {
2433
2434 if (page_retire_check(aflt->flt_addr, NULL) == 0) {
2435 if (ch_flt->flt_trapped_ce & CE_CEEN_DEFER) {
2436
2437 /*
2438 * Since we're skipping logging, we'll need
2439 * to schedule the re-enabling of CEEN
2440 */
2441 (void) timeout(cpu_delayed_check_ce_errors,
2442 (void *)(uintptr_t)aflt->flt_inst,
2443 drv_sectohz((clock_t)cpu_ceen_delay_secs));
2444 }
2445
2446 /*
2447 * Inform memscrubber - scrubbing induced
2448 * CE on a retired page.
2449 */
2450 memscrub_induced_error();
2451 return (0);
2452 }
2453 }
2454
2455 /*
2456 * Perform/schedule further classification actions, but
2457 * only if the page is healthy (we don't want bad
2458 * pages inducing too much diagnostic activity). If we could
2459 * not find a page pointer then we also skip this. If
2460 * ce_scrub_xdiag_recirc returns nonzero then it has chosen
2461 * to copy and recirculate the event (for further diagnostics)
2462 * and we should not proceed to log it here.
2463 *
2650 ch_flt->flt_bit);
2651 }
2652
2653 if (aflt->flt_func != NULL)
2654 aflt->flt_func(aflt, unum);
2655
2656 if (afar_status != AFLT_STAT_INVALID)
2657 cpu_log_diag_info(ch_flt);
2658
2659 /*
2660	 * If we have a CEEN error, we do not reenable CEEN until after
2661 * we exit the trap handler. Otherwise, another error may
2662 * occur causing the handler to be entered recursively.
2663 * We set a timeout to trigger in cpu_ceen_delay_secs seconds,
2664 * to try and ensure that the CPU makes progress in the face
2665 * of a CE storm.
2666 */
2667 if (ch_flt->flt_trapped_ce & CE_CEEN_DEFER) {
2668 (void) timeout(cpu_delayed_check_ce_errors,
2669 (void *)(uintptr_t)aflt->flt_inst,
2670 drv_sectohz((clock_t)cpu_ceen_delay_secs));
2671 }
2672 }
2673
2674 /*
2675 * Invoked by error_init() early in startup and therefore before
2676 * startup_errorq() is called to drain any error Q -
2677 *
2678 * startup()
2679 * startup_end()
2680 * error_init()
2681 * cpu_error_init()
2682 * errorq_init()
2683 * errorq_drain()
2684 * start_other_cpus()
2685 *
2686 * The purpose of this routine is to create error-related taskqs. Taskqs
2687 * are used for this purpose because cpu_lock can't be grabbed from interrupt
2688 * context.
2689 */
2690 void
6054
6055 do {
6056 outstanding = *countp;
6057 for (i = 0; i < outstanding; i++) {
6058 scrub_ecache(how_many);
6059 }
6060 } while (atomic_add_32_nv(countp, -outstanding));
6061
6062 return (DDI_INTR_CLAIMED);
6063 }
6064
6065 /*
6066 * Timeout function to reenable CE
6067 */
6068 static void
6069 cpu_delayed_check_ce_errors(void *arg)
6070 {
6071 if (!taskq_dispatch(ch_check_ce_tq, cpu_check_ce_errors, arg,
6072 TQ_NOSLEEP)) {
6073 (void) timeout(cpu_delayed_check_ce_errors, arg,
6074 drv_sectohz((clock_t)cpu_ceen_delay_secs));
6075 }
6076 }
6077
6078 /*
6079 * CE Deferred Re-enable after trap.
6080 *
6081 * When the CPU gets a disrupting trap for any of the errors
6082 * controlled by the CEEN bit, CEEN is disabled in the trap handler
6083 * immediately. To eliminate the possibility of multiple CEs causing
6084 * recursive stack overflow in the trap handler, we cannot
6085 * reenable CEEN while still running in the trap handler. Instead,
6086 * after a CE is logged on a CPU, we schedule a timeout function,
6087 * cpu_check_ce_errors(), to trigger after cpu_ceen_delay_secs
6088 * seconds. This function will check whether any further CEs
6089 * have occurred on that CPU, and if none have, will reenable CEEN.
6090 *
6091 * If further CEs have occurred while CEEN is disabled, another
6092 * timeout will be scheduled. This is to ensure that the CPU can
6093 * make progress in the face of CE 'storms', and that it does not
6094 * spend all its time logging CE errors.
6143 TIMEOUT_CEEN_CHECK, 0);
6144 mutex_exit(&cpu_lock);
6145 } else {
6146 /*
6147 * When the CPU is not accepting xcalls, or
6148 * the processor is offlined, we don't want to
6149 * incur the extra overhead of trying to schedule the
6150 * CE timeout indefinitely. However, we don't want to lose
6151 * CE checking forever.
6152 *
6153 * Keep rescheduling the timeout, accepting the additional
6154 * overhead as the cost of correctness in the case where we get
6155	 * a CE, disable CEEN, offline the CPU during
6156	 * the timeout interval, and then online it at some
6157 * point in the future. This is unlikely given the short
6158 * cpu_ceen_delay_secs.
6159 */
6160 mutex_exit(&cpu_lock);
6161 (void) timeout(cpu_delayed_check_ce_errors,
6162 (void *)(uintptr_t)cp->cpu_id,
6163 drv_sectohz((clock_t)cpu_ceen_delay_secs));
6164 }
6165 }
6166
6167 /*
6168 * This routine will check whether CEs have occurred while
6169 * CEEN is disabled. Any CEs detected will be logged and, if
6170 * possible, scrubbed.
6171 *
6172 * The memscrubber will also use this routine to clear any errors
6173 * caused by its scrubbing with CEEN disabled.
6174 *
6175 * flag == SCRUBBER_CEEN_CHECK
6176 * called from memscrubber, just check/scrub, no reset
6177 * paddr physical addr. for start of scrub pages
6178 * vaddr virtual addr. for scrub area
6179 * psz page size of area to be scrubbed
6180 *
6181 * flag == TIMEOUT_CEEN_CHECK
6182 * timeout function has triggered, reset timeout or CEEN
6183 *
|