Skip to content

Commit d334a49

Browse files
yhuang-intellenb
authored and committed
ACPI, APEI, Generic Hardware Error Source memory error support
Generic Hardware Error Source provides a way to report platform hardware errors (such as those from the chipset). It works in so-called "Firmware First" mode, that is, hardware errors are reported to firmware first, then reported to Linux by firmware. This way, some non-standard hardware error registers or non-standard hardware links can be checked by firmware to produce more valuable hardware error information for Linux. For now, only the SCI notification type and memory errors are supported. More notification types and hardware error types will be added later. These memory errors are reported to user space through /dev/mcelog by faking a corrected Machine Check, so that the erroneous memory page can be offlined by /sbin/mcelog if the error count for one page exceeds the threshold. On some machines, Machine Check cannot report the physical address for some corrected memory errors, but GHES can. So this simplified GHES is implemented first. Signed-off-by: Huang Ying <[email protected]> Signed-off-by: Andi Kleen <[email protected]> Signed-off-by: Len Brown <[email protected]>
1 parent 06d65de commit d334a49

File tree

6 files changed

+504
-0
lines changed

6 files changed

+504
-0
lines changed

arch/x86/include/asm/mce.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,5 +225,13 @@ extern void mcheck_intel_therm_init(void);
225225
static inline void mcheck_intel_therm_init(void) { }
226226
#endif
227227

228+
/*
229+
* Used by APEI to report memory errors via /dev/mcelog
230+
*/
231+
232+
struct cper_sec_mem_err;
233+
extern void apei_mce_report_mem_error(int corrected,
234+
struct cper_sec_mem_err *mem_err);
235+
228236
#endif /* __KERNEL__ */
229237
#endif /* _ASM_X86_MCE_H */

arch/x86/kernel/cpu/mcheck/Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,5 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
77
obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
88

99
obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
10+
11+
obj-$(CONFIG_ACPI_APEI) += mce-apei.o
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
/*
2+
* Bridge between MCE and APEI
3+
*
4+
* On some machines, corrected memory errors are reported via APEI
5+
* generic hardware error source (GHES) instead of corrected Machine
6+
* Check. These corrected memory errors can be reported to user space
7+
* through /dev/mcelog via faking a corrected Machine Check, so that
8+
* the error memory page can be offlined by /sbin/mcelog if the error
9+
* count for one page is beyond the threshold.
10+
*
11+
* Copyright 2010 Intel Corp.
12+
* Author: Huang Ying <[email protected]>
13+
*
14+
* This program is free software; you can redistribute it and/or
15+
* modify it under the terms of the GNU General Public License version
16+
* 2 as published by the Free Software Foundation.
17+
*
18+
* This program is distributed in the hope that it will be useful,
19+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
20+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21+
* GNU General Public License for more details.
22+
*
23+
* You should have received a copy of the GNU General Public License
24+
* along with this program; if not, write to the Free Software
25+
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
26+
*/
27+
28+
#include <linux/kernel.h>
29+
#include <linux/acpi.h>
30+
#include <linux/cper.h>
31+
#include <acpi/apei.h>
32+
#include <asm/mce.h>
33+
34+
#include "mce-internal.h"
35+
36+
/*
 * apei_mce_report_mem_error - log an APEI-reported memory error as a fake MCE
 * @corrected: non-zero if the error was corrected; when zero the function
 *             returns immediately without logging anything
 * @mem_err:   CPER memory error section; only its physical_addr field is
 *             read here
 *
 * Builds a struct mce on the stack, initialises it with mce_setup(), tags it
 * as bank 1 with a synthetic status word, copies the physical address from
 * @mem_err into it, then hands it to mce_log() and calls mce_notify_irq().
 * Nothing is returned to the caller.
 *
 * NOTE(review): @mem_err is dereferenced without a NULL check, and
 * physical_addr is consumed (with MCI_STATUS_ADDRV set unconditionally)
 * without checking whether the section marks that field as valid. If the
 * section carries such validity information (not visible from here), confirm
 * that callers check it before calling this function.
 */
void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
37+
{
38+
struct mce m;
39+
40+
/* Only corrected MC is reported */
41+
if (!corrected)
42+
return;
43+
44+
/* NOTE(review): presumably fills in default fields of the record - confirm */
mce_setup(&m);
45+
/* The bank number of the synthetic record is hard-coded to 1 */
m.bank = 1;
46+
/* Fake a memory read corrected error with unknown channel */
47+
m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
48+
/* The address comes straight from the CPER section, unmodified */
m.addr = mem_err->physical_addr;
49+
/* Queue the record, then notify; per the file header this reaches /dev/mcelog */
mce_log(&m);
50+
mce_notify_irq();
51+
}
52+
/* Exported (GPL-only) so that code outside this file can call it */
EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);

drivers/acpi/apei/Kconfig

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,20 @@ config ACPI_APEI
77
especially. In addition it supports error serialization and
88
error injection.
99

10+
config ACPI_APEI_GHES
11+
tristate "APEI Generic Hardware Error Source"
12+
depends on ACPI_APEI && X86
13+
select ACPI_HED
14+
help
15+
Generic Hardware Error Source provides a way to report
16+
platform hardware errors (such as those from the chipset). It
17+
works in so-called "Firmware First" mode, that is, hardware
18+
errors are reported to firmware first, then reported to
19+
Linux by firmware. This way, some non-standard hardware
20+
error registers or non-standard hardware link can be checked
21+
by firmware to produce more valuable hardware error
22+
information for Linux.
23+
1024
config ACPI_APEI_EINJ
1125
tristate "APEI Error INJection (EINJ)"
1226
depends on ACPI_APEI && DEBUG_FS

drivers/acpi/apei/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
obj-$(CONFIG_ACPI_APEI) += apei.o
2+
obj-$(CONFIG_ACPI_APEI_GHES) += ghes.o
23
obj-$(CONFIG_ACPI_APEI_EINJ) += einj.o
34

45
apei-y := apei-base.o hest.o cper.o

0 commit comments

Comments
 (0)