182 if (hcl == NULL)
183 return;
184
185 for (i = 0; i < n; i++) {
186 (void) nvlist_alloc(&hcl[i],
187 NV_UNIQUE_NAME|NV_UNIQUE_NAME_TYPE, 0);
188 }
189
190 for (i = 0, j = 0; i < n; i++) {
191 (void) nvlist_lookup_string(hcl1[i], FM_FMRI_HC_NAME, &name);
192 (void) nvlist_lookup_string(hcl1[i], FM_FMRI_HC_ID, &id);
193 (void) nvlist_add_string(hcl[j], FM_FMRI_HC_NAME, name);
194 (void) nvlist_add_string(hcl[j], FM_FMRI_HC_ID, id);
195 j++;
196 if (strcmp(name, "chip") == 0)
197 break;
198 }
199
200 if (nvlist_alloc(&rsrc, NV_UNIQUE_NAME|NV_UNIQUE_NAME_TYPE, 0) != 0) {
201 for (i = 0; i < n; i++) {
202 if (hcl[i] != NULL)
203 nvlist_free(hcl[i]);
204 }
205 fmd_hdl_free(hdl, hcl, sizeof (nvlist_t *) * n);
206 }
207
208 if (nvlist_add_uint8(rsrc, FM_VERSION, FM_HC_SCHEME_VERSION) != 0 ||
209 nvlist_add_string(rsrc, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0 ||
210 nvlist_add_string(rsrc, FM_FMRI_HC_ROOT, "") != 0 ||
211 nvlist_add_uint32(rsrc, FM_FMRI_HC_LIST_SZ, n) != 0 ||
212 nvlist_add_nvlist_array(rsrc, FM_FMRI_HC_LIST, hcl, n) != 0) {
213 for (i = 0; i < n; i++) {
214 if (hcl[i] != NULL)
215 nvlist_free(hcl[i]);
216 }
217 fmd_hdl_free(hdl, hcl, sizeof (nvlist_t *) * n);
218 nvlist_free(rsrc);
219 }
220
221 fru = gmem_find_fault_fru(hdl, rsrc);
222 if (fru != NULL) {
223 cp = fmd_case_open(hdl, NULL);
224 fltlist = fmd_nvl_create_fault(hdl, "fault.memory.datapath",
225 100, fru, fru, fru);
226 fmd_case_add_suspect(hdl, cp, fltlist);
227 fmd_case_solve(hdl, cp);
228 nvlist_free(fru);
229 }
230
231 for (i = 0; i < n; i++) {
232 if (hcl[i] != NULL)
233 nvlist_free(hcl[i]);
234 }
235
236 fmd_hdl_free(hdl, hcl, sizeof (nvlist_t *) * n);
237 nvlist_free(rsrc);
238 }
239
240 /*
241 * formula to conver an unhashed address to hashed address
242 * PA[17:11] = (PA[32:28] xor PA[17:13]) :: ((PA[19:18] xor PA[12:11])
243 */
244 static void
245 gmem_to_hashed_addr(uint64_t *addr, uint64_t afar)
246 {
247
248 *addr = (afar & OFFBIT) | ((afar & BIT28_32) >> 15) ^ (afar & BIT13_17)
249 | ((afar & BIT18_19) >> 7) ^ (afar & BIT11_12);
250 }
251
252 /*
382 if (nret < gmem.gm_low_ce_thresh)
383 return;
384
385 if (dimm->dimm_phys_addr_hi >= dimm->dimm_phys_addr_low)
386 delta_addr =
387 (dimm->dimm_phys_addr_hi - dimm->dimm_phys_addr_low) /
388 (nret - 1);
389
390 if (nret >= gmem.gm_max_retired_pages || delta_addr > GMEM_MQ_512KB) {
391
392 fmd_hdl_debug(hdl, "ce_thresh_check succeeded nret=%d", nret);
393 dimm->dimm_flags |= GMEM_F_FAULTING;
394 gmem_dimm_dirty(hdl, dimm);
395
396 cp = fmd_case_open(hdl, NULL);
397 rsrc = gmem_find_dimm_rsc(hdl, dimm->dimm_serial);
398 flt = fmd_nvl_create_fault(hdl, GMEM_FAULT_DIMM_PAGES,
399 GMEM_FLTMAXCONF, NULL, gmem_dimm_fru(dimm), rsrc);
400 fmd_case_add_suspect(hdl, cp, flt);
401 fmd_case_solve(hdl, cp);
402 if (rsrc != NULL)
403 nvlist_free(rsrc);
404 }
405 }
406
407 /*
408 * rule 5b checking. The check succeeds if more than 120
409 * non-intermittent CEs are reported against one symbol
410 * position of one afar in 72 hours
411 */
412 static void
413 mq_5b_check(fmd_hdl_t *hdl, gmem_dimm_t *dimm)
414 {
415 nvlist_t *flt, *rsrc;
416 fmd_case_t *cp;
417 gmem_mq_t *ip, *next;
418 int cw;
419
420 for (cw = 0; cw < GMEM_MAX_CKWDS; cw++) {
421 for (ip = gmem_list_next(&dimm->mq_root[cw]);
422 ip != NULL; ip = next) {
423 next = gmem_list_next(ip);
424 if (ip->mq_dupce_count >= gmem.gm_dupce) {
425 fmd_hdl_debug(hdl,
426 "mq_5b_check succeeded: duplicate CE=%d",
427 ip->mq_dupce_count);
428 cp = fmd_case_open(hdl, NULL);
429 rsrc = gmem_find_dimm_rsc(hdl,
430 dimm->dimm_serial);
431 flt = fmd_nvl_create_fault(hdl,
432 GMEM_FAULT_DIMM_PAGES, GMEM_FLTMAXCONF,
433 NULL, gmem_dimm_fru(dimm), rsrc);
434 dimm->dimm_flags |= GMEM_F_FAULTING;
435 gmem_dimm_dirty(hdl, dimm);
436 fmd_case_add_suspect(hdl, cp, flt);
437 fmd_case_solve(hdl, cp);
438 if (rsrc != NULL)
439 nvlist_free(rsrc);
440 return;
441 }
442 }
443 }
444 }
445
446 /*
447 * delete the expired duplicate CE time stamps
448 */
449 static void
450 mq_prune_dup(fmd_hdl_t *hdl, gmem_mq_t *ip, uint64_t now)
451 {
452 tstamp_t *tsp, *next;
453
454 for (tsp = gmem_list_next(&ip->mq_dupce_tstamp); tsp != NULL;
455 tsp = next) {
456 next = gmem_list_next(tsp);
457 if (tsp->tstamp < now - GMEM_MQ_TIMELIM) {
458 gmem_list_delete(&ip->mq_dupce_tstamp, &tsp->ts_l);
688 upos_array[++i].mq1 = NULL;
689 }
690 }
691 if (i - upos_pairs >= 2) {
692 /* Rule 4A violation */
693 rsc = gmem_find_dimm_rsc(hdl, dimm->dimm_serial);
694 flt = fmd_nvl_create_fault(hdl, GMEM_FAULT_DIMM_4A,
695 GMEM_FLTMAXCONF, NULL, gmem_dimm_fru(dimm), rsc);
696 for (j = upos_pairs; j < i; j++) {
697 fmd_case_add_ereport(hdl,
698 dimm->dimm_case.cc_cp,
699 upos_array[j].mq1->mq_ep);
700 fmd_case_add_ereport(hdl,
701 dimm->dimm_case.cc_cp,
702 upos_array[j].mq2->mq_ep);
703 }
704 dimm->dimm_flags |= GMEM_F_FAULTING;
705 gmem_dimm_dirty(hdl, dimm);
706 fmd_case_add_suspect(hdl, dimm->dimm_case.cc_cp, flt);
707 fmd_case_solve(hdl, dimm->dimm_case.cc_cp);
708 if (rsc != NULL)
709 nvlist_free(rsc);
710 return;
711 }
712 upos_pairs = i;
713 assert(upos_pairs < 16);
714 }
715 }
716
717 /*ARGSUSED*/
718 gmem_evdisp_t
719 gmem_ce(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
720 {
721 uint16_t symbol_pos, cw;
722 uint64_t phyaddr, offset, addr;
723 uint32_t filter_ratio = 0;
724 gmem_dimm_t *dimm;
725 gmem_page_t *page;
726 nvlist_t *fru = NULL;
727 nvlist_t *topo_rsc = NULL;
728 nvlist_t *rsrc, *det;
|
182 if (hcl == NULL)
183 return;
184
185 for (i = 0; i < n; i++) {
186 (void) nvlist_alloc(&hcl[i],
187 NV_UNIQUE_NAME|NV_UNIQUE_NAME_TYPE, 0);
188 }
189
190 for (i = 0, j = 0; i < n; i++) {
191 (void) nvlist_lookup_string(hcl1[i], FM_FMRI_HC_NAME, &name);
192 (void) nvlist_lookup_string(hcl1[i], FM_FMRI_HC_ID, &id);
193 (void) nvlist_add_string(hcl[j], FM_FMRI_HC_NAME, name);
194 (void) nvlist_add_string(hcl[j], FM_FMRI_HC_ID, id);
195 j++;
196 if (strcmp(name, "chip") == 0)
197 break;
198 }
199
200 if (nvlist_alloc(&rsrc, NV_UNIQUE_NAME|NV_UNIQUE_NAME_TYPE, 0) != 0) {
201 for (i = 0; i < n; i++) {
202 nvlist_free(hcl[i]);
203 }
204 fmd_hdl_free(hdl, hcl, sizeof (nvlist_t *) * n);
205 }
206
207 if (nvlist_add_uint8(rsrc, FM_VERSION, FM_HC_SCHEME_VERSION) != 0 ||
208 nvlist_add_string(rsrc, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0 ||
209 nvlist_add_string(rsrc, FM_FMRI_HC_ROOT, "") != 0 ||
210 nvlist_add_uint32(rsrc, FM_FMRI_HC_LIST_SZ, n) != 0 ||
211 nvlist_add_nvlist_array(rsrc, FM_FMRI_HC_LIST, hcl, n) != 0) {
212 for (i = 0; i < n; i++) {
213 nvlist_free(hcl[i]);
214 }
215 fmd_hdl_free(hdl, hcl, sizeof (nvlist_t *) * n);
216 nvlist_free(rsrc);
217 }
218
219 fru = gmem_find_fault_fru(hdl, rsrc);
220 if (fru != NULL) {
221 cp = fmd_case_open(hdl, NULL);
222 fltlist = fmd_nvl_create_fault(hdl, "fault.memory.datapath",
223 100, fru, fru, fru);
224 fmd_case_add_suspect(hdl, cp, fltlist);
225 fmd_case_solve(hdl, cp);
226 nvlist_free(fru);
227 }
228
229 for (i = 0; i < n; i++) {
230 nvlist_free(hcl[i]);
231 }
232
233 fmd_hdl_free(hdl, hcl, sizeof (nvlist_t *) * n);
234 nvlist_free(rsrc);
235 }
236
237 /*
238 * formula to conver an unhashed address to hashed address
239 * PA[17:11] = (PA[32:28] xor PA[17:13]) :: ((PA[19:18] xor PA[12:11])
240 */
241 static void
242 gmem_to_hashed_addr(uint64_t *addr, uint64_t afar)
243 {
244
245 *addr = (afar & OFFBIT) | ((afar & BIT28_32) >> 15) ^ (afar & BIT13_17)
246 | ((afar & BIT18_19) >> 7) ^ (afar & BIT11_12);
247 }
248
249 /*
379 if (nret < gmem.gm_low_ce_thresh)
380 return;
381
382 if (dimm->dimm_phys_addr_hi >= dimm->dimm_phys_addr_low)
383 delta_addr =
384 (dimm->dimm_phys_addr_hi - dimm->dimm_phys_addr_low) /
385 (nret - 1);
386
387 if (nret >= gmem.gm_max_retired_pages || delta_addr > GMEM_MQ_512KB) {
388
389 fmd_hdl_debug(hdl, "ce_thresh_check succeeded nret=%d", nret);
390 dimm->dimm_flags |= GMEM_F_FAULTING;
391 gmem_dimm_dirty(hdl, dimm);
392
393 cp = fmd_case_open(hdl, NULL);
394 rsrc = gmem_find_dimm_rsc(hdl, dimm->dimm_serial);
395 flt = fmd_nvl_create_fault(hdl, GMEM_FAULT_DIMM_PAGES,
396 GMEM_FLTMAXCONF, NULL, gmem_dimm_fru(dimm), rsrc);
397 fmd_case_add_suspect(hdl, cp, flt);
398 fmd_case_solve(hdl, cp);
399 nvlist_free(rsrc);
400 }
401 }
402
403 /*
404 * rule 5b checking. The check succeeds if more than 120
405 * non-intermittent CEs are reported against one symbol
406 * position of one afar in 72 hours
407 */
408 static void
409 mq_5b_check(fmd_hdl_t *hdl, gmem_dimm_t *dimm)
410 {
411 nvlist_t *flt, *rsrc;
412 fmd_case_t *cp;
413 gmem_mq_t *ip, *next;
414 int cw;
415
416 for (cw = 0; cw < GMEM_MAX_CKWDS; cw++) {
417 for (ip = gmem_list_next(&dimm->mq_root[cw]);
418 ip != NULL; ip = next) {
419 next = gmem_list_next(ip);
420 if (ip->mq_dupce_count >= gmem.gm_dupce) {
421 fmd_hdl_debug(hdl,
422 "mq_5b_check succeeded: duplicate CE=%d",
423 ip->mq_dupce_count);
424 cp = fmd_case_open(hdl, NULL);
425 rsrc = gmem_find_dimm_rsc(hdl,
426 dimm->dimm_serial);
427 flt = fmd_nvl_create_fault(hdl,
428 GMEM_FAULT_DIMM_PAGES, GMEM_FLTMAXCONF,
429 NULL, gmem_dimm_fru(dimm), rsrc);
430 dimm->dimm_flags |= GMEM_F_FAULTING;
431 gmem_dimm_dirty(hdl, dimm);
432 fmd_case_add_suspect(hdl, cp, flt);
433 fmd_case_solve(hdl, cp);
434 nvlist_free(rsrc);
435 return;
436 }
437 }
438 }
439 }
440
441 /*
442 * delete the expired duplicate CE time stamps
443 */
444 static void
445 mq_prune_dup(fmd_hdl_t *hdl, gmem_mq_t *ip, uint64_t now)
446 {
447 tstamp_t *tsp, *next;
448
449 for (tsp = gmem_list_next(&ip->mq_dupce_tstamp); tsp != NULL;
450 tsp = next) {
451 next = gmem_list_next(tsp);
452 if (tsp->tstamp < now - GMEM_MQ_TIMELIM) {
453 gmem_list_delete(&ip->mq_dupce_tstamp, &tsp->ts_l);
683 upos_array[++i].mq1 = NULL;
684 }
685 }
686 if (i - upos_pairs >= 2) {
687 /* Rule 4A violation */
688 rsc = gmem_find_dimm_rsc(hdl, dimm->dimm_serial);
689 flt = fmd_nvl_create_fault(hdl, GMEM_FAULT_DIMM_4A,
690 GMEM_FLTMAXCONF, NULL, gmem_dimm_fru(dimm), rsc);
691 for (j = upos_pairs; j < i; j++) {
692 fmd_case_add_ereport(hdl,
693 dimm->dimm_case.cc_cp,
694 upos_array[j].mq1->mq_ep);
695 fmd_case_add_ereport(hdl,
696 dimm->dimm_case.cc_cp,
697 upos_array[j].mq2->mq_ep);
698 }
699 dimm->dimm_flags |= GMEM_F_FAULTING;
700 gmem_dimm_dirty(hdl, dimm);
701 fmd_case_add_suspect(hdl, dimm->dimm_case.cc_cp, flt);
702 fmd_case_solve(hdl, dimm->dimm_case.cc_cp);
703 nvlist_free(rsc);
704 return;
705 }
706 upos_pairs = i;
707 assert(upos_pairs < 16);
708 }
709 }
710
711 /*ARGSUSED*/
712 gmem_evdisp_t
713 gmem_ce(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
714 {
715 uint16_t symbol_pos, cw;
716 uint64_t phyaddr, offset, addr;
717 uint32_t filter_ratio = 0;
718 gmem_dimm_t *dimm;
719 gmem_page_t *page;
720 nvlist_t *fru = NULL;
721 nvlist_t *topo_rsc = NULL;
722 nvlist_t *rsrc, *det;
|