Skip to content

Commit c0d1217

Browse files
Dave Jiang authored and Linus Torvalds committed
drivers/edac: add new nmi rescan
Provides a way for NMI reported errors on x86 to notify the EDAC subsystem of pending ECC errors by writing to a software state variable. Here's the reworked patch. I added an EDAC stub to the kernel so we can have variables that are in the kernel even if EDAC is a module. I also implemented the idea of using the chip driver to select error detection mode via module parameter and eliminate the kernel compile option. Please review/test. Thx! Also, I only made changes to some of the chipset drivers since I am unfamiliar with the other ones. We can add similar changes as we go. Signed-off-by: Dave Jiang <[email protected]> Signed-off-by: Douglas Thompson <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent 28f96ee commit c0d1217

File tree

11 files changed

+160
-15
lines changed

11 files changed

+160
-15
lines changed

arch/i386/kernel/traps.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,10 @@
4141
#include <linux/mca.h>
4242
#endif
4343

44+
#if defined(CONFIG_EDAC)
45+
#include <linux/edac.h>
46+
#endif
47+
4448
#include <asm/processor.h>
4549
#include <asm/system.h>
4650
#include <asm/io.h>
@@ -638,6 +642,14 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
638642
printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
639643
"CPU %d.\n", reason, smp_processor_id());
640644
printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
645+
646+
#if defined(CONFIG_EDAC)
647+
if(edac_handler_set()) {
648+
edac_atomic_assert_error();
649+
return;
650+
}
651+
#endif
652+
641653
if (panic_on_unrecovered_nmi)
642654
panic("NMI: Not continuing");
643655

arch/x86_64/kernel/traps.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@
3434
#include <linux/bug.h>
3535
#include <linux/kdebug.h>
3636

37+
#if defined(CONFIG_EDAC)
38+
#include <linux/edac.h>
39+
#endif
40+
3741
#include <asm/system.h>
3842
#include <asm/io.h>
3943
#include <asm/atomic.h>
@@ -719,6 +723,13 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
719723
reason);
720724
printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
721725

726+
#if defined(CONFIG_EDAC)
727+
if(edac_handler_set()) {
728+
edac_atomic_assert_error();
729+
return;
730+
}
731+
#endif
732+
722733
if (panic_on_unrecovered_nmi)
723734
panic("NMI: Not continuing");
724735

drivers/edac/Kconfig

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -109,15 +109,4 @@ config EDAC_I5000
109109
Support for error detection and correction on the Intel
110110
Greencreek/Blackford chipsets.
111111

112-
choice
113-
prompt "Error detecting method"
114-
default EDAC_POLL
115-
116-
config EDAC_POLL
117-
bool "Poll for errors"
118-
help
119-
Poll the chipset periodically to detect errors.
120-
121-
endchoice
122-
123112
endif # EDAC

drivers/edac/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@
55
# This file may be distributed under the terms of the
66
# GNU General Public License.
77
#
8-
# $Id: Makefile,v 1.4.2.3 2005/07/08 22:05:38 dsp_llnl Exp $
98

109

10+
obj-$(CONFIG_EDAC) := edac_stub.o
1111
obj-$(CONFIG_EDAC_MM_EDAC) += edac_core.o
1212

1313
edac_core-objs := edac_mc.o edac_device.o edac_mc_sysfs.o edac_pci_sysfs.o

drivers/edac/e752x_edac.c

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <linux/pci.h>
2323
#include <linux/pci_ids.h>
2424
#include <linux/slab.h>
25+
#include <linux/edac.h>
2526
#include "edac_mc.h"
2627

2728
#define E752X_REVISION " Ver: 2.0.1 " __DATE__
@@ -948,6 +949,16 @@ static int e752x_probe1(struct pci_dev *pdev, int dev_idx)
948949
debugf0("%s(): mci\n", __func__);
949950
debugf0("Starting Probe1\n");
950951

952+
/* make sure error reporting method is sane */
953+
switch(edac_op_state) {
954+
case EDAC_OPSTATE_POLL:
955+
case EDAC_OPSTATE_NMI:
956+
break;
957+
default:
958+
edac_op_state = EDAC_OPSTATE_POLL;
959+
break;
960+
}
961+
951962
/* check to see if device 0 function 1 is enabled; if it isn't, we
952963
* assume the BIOS has reserved it for a reason and is expecting
953964
* exclusive access, we take care not to violate that assumption and
@@ -1123,4 +1134,5 @@ MODULE_DESCRIPTION("MC support for Intel e752x memory controllers");
11231134
module_param(force_function_unhide, int, 0444);
11241135
MODULE_PARM_DESC(force_function_unhide, "if BIOS sets Dev0:Fun1 up as hidden:"
11251136
" 1=force unhide and hope BIOS doesn't fight driver for Dev0:Fun1 access");
1126-
1137+
module_param(edac_op_state, int, 0444);
1138+
MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");

drivers/edac/e7xxx_edac.c

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include <linux/pci.h>
2828
#include <linux/pci_ids.h>
2929
#include <linux/slab.h>
30+
#include <linux/edac.h>
3031
#include "edac_mc.h"
3132

3233
#define E7XXX_REVISION " Ver: 2.0.1 " __DATE__
@@ -419,6 +420,17 @@ static int e7xxx_probe1(struct pci_dev *pdev, int dev_idx)
419420
struct e7xxx_error_info discard;
420421

421422
debugf0("%s(): mci\n", __func__);
423+
424+
/* make sure error reporting method is sane */
425+
switch(edac_op_state) {
426+
case EDAC_OPSTATE_POLL:
427+
case EDAC_OPSTATE_NMI:
428+
break;
429+
default:
430+
edac_op_state = EDAC_OPSTATE_POLL;
431+
break;
432+
}
433+
422434
pci_read_config_dword(pdev, E7XXX_DRC, &drc);
423435

424436
drc_chan = dual_channel_active(drc, dev_idx);
@@ -565,3 +577,5 @@ MODULE_LICENSE("GPL");
565577
MODULE_AUTHOR("Linux Networx (http://lnxi.com) Thayne Harbaugh et al\n"
566578
"Based on.work by Dan Hollis et al");
567579
MODULE_DESCRIPTION("MC support for Intel e7xxx memory controllers");
580+
module_param(edac_op_state, int, 0444);
581+
MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");

drivers/edac/edac_mc.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include <linux/list.h>
2828
#include <linux/sysdev.h>
2929
#include <linux/ctype.h>
30+
#include <linux/edac.h>
3031
#include <asm/uaccess.h>
3132
#include <asm/page.h>
3233
#include <asm/edac.h>
@@ -241,6 +242,7 @@ static int add_mc_to_global_list (struct mem_ctl_info *mci)
241242
}
242243

243244
list_add_tail_rcu(&mci->link, insert_before);
245+
atomic_inc(&edac_handlers);
244246
return 0;
245247

246248
fail0:
@@ -267,6 +269,7 @@ static void complete_mc_list_del(struct rcu_head *head)
267269

268270
static void del_mc_from_global_list(struct mem_ctl_info *mci)
269271
{
272+
atomic_dec(&edac_handlers);
270273
list_del_rcu(&mci->link);
271274
init_completion(&mci->complete);
272275
call_rcu(&mci->rcu, complete_mc_list_del);

drivers/edac/edac_module.c

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11

22
#include <linux/freezer.h>
33
#include <linux/kthread.h>
4+
#include <linux/edac.h>
45

56
#include "edac_mc.h"
67
#include "edac_module.h"
@@ -101,6 +102,25 @@ static void do_edac_check(void)
101102
edac_pci_do_parity_check();
102103
}
103104

105+
/*
106+
* handler for EDAC to check if NMI type handler has asserted interrupt
107+
*/
108+
static int edac_assert_error_check_and_clear(void)
109+
{
110+
int vreg;
111+
112+
if(edac_op_state == EDAC_OPSTATE_POLL)
113+
return 1;
114+
115+
vreg = atomic_read(&edac_err_assert);
116+
if(vreg) {
117+
atomic_set(&edac_err_assert, 0);
118+
return 1;
119+
}
120+
121+
return 0;
122+
}
123+
104124
/*
105125
* Action thread for EDAC to perform the POLL operations
106126
*/
@@ -109,8 +129,8 @@ static int edac_kernel_thread(void *arg)
109129
int msec;
110130

111131
while (!kthread_should_stop()) {
112-
113-
do_edac_check();
132+
if(edac_assert_error_check_and_clear())
133+
do_edac_check();
114134

115135
/* goto sleep for the interval */
116136
msec = (HZ * edac_get_poll_msec()) / 1000;

drivers/edac/edac_stub.c

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
/*
2+
* common EDAC components that must be in kernel
3+
*
4+
* Author: Dave Jiang <[email protected]>
5+
*
6+
* 2007 (c) MontaVista Software, Inc. This file is licensed under
7+
* the terms of the GNU General Public License version 2. This program
8+
* is licensed "as is" without any warranty of any kind, whether express
9+
* or implied.
10+
*
11+
*/
12+
#include <linux/module.h>
13+
#include <linux/edac.h>
14+
#include <asm/atomic.h>
15+
#include <asm/edac.h>
16+
17+
/*
 * Selected error reporting mode. Starts out as EDAC_OPSTATE_INVAL; the
 * chipset drivers expose it as the "edac_op_state" module parameter
 * (0=Poll, 1=NMI) and fall back to EDAC_OPSTATE_POLL in their probe
 * routine when it holds anything other than POLL or NMI.
 */
int edac_op_state = EDAC_OPSTATE_INVAL;
18+
EXPORT_SYMBOL(edac_op_state);
19+
20+
/*
 * Counter incremented when a memory controller is added to the global
 * list and decremented when one is removed (see edac_mc.c).
 */
atomic_t edac_handlers = ATOMIC_INIT(0);
21+
EXPORT_SYMBOL(edac_handlers);
22+
23+
/*
 * Software "error pending" flag: set to 1 by edac_atomic_assert_error()
 * and read/cleared by the EDAC kernel thread before it runs a check.
 */
atomic_t edac_err_assert = ATOMIC_INIT(0);
24+
EXPORT_SYMBOL(edac_err_assert);
25+
26+
inline int edac_handler_set(void)
27+
{
28+
if (edac_op_state == EDAC_OPSTATE_POLL)
29+
return 0;
30+
31+
return atomic_read(&edac_handlers);
32+
}
33+
EXPORT_SYMBOL(edac_handler_set);
34+
35+
/*
36+
* handler for NMI type of interrupts to assert error
37+
*/
38+
inline void edac_atomic_assert_error(void)
39+
{
40+
atomic_set(&edac_err_assert, 1);
41+
}
42+
EXPORT_SYMBOL(edac_atomic_assert_error);

drivers/edac/i5000_edac.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <linux/pci.h>
2020
#include <linux/pci_ids.h>
2121
#include <linux/slab.h>
22+
#include <linux/edac.h>
2223
#include <asm/mmzone.h>
2324

2425
#include "edac_mc.h"
@@ -1285,6 +1286,16 @@ static int i5000_probe1(struct pci_dev *pdev, int dev_idx)
12851286
if (PCI_FUNC(pdev->devfn) != 0)
12861287
return -ENODEV;
12871288

1289+
/* make sure error reporting method is sane */
1290+
switch(edac_op_state) {
1291+
case EDAC_OPSTATE_POLL:
1292+
case EDAC_OPSTATE_NMI:
1293+
break;
1294+
default:
1295+
edac_op_state = EDAC_OPSTATE_POLL;
1296+
break;
1297+
}
1298+
12881299
/* Ask the devices for the number of CSROWS and CHANNELS so
12891300
* that we can calculate the memory resources, etc
12901301
*
@@ -1475,3 +1486,5 @@ MODULE_AUTHOR
14751486
("Linux Networx (http://lnxi.com) Doug Thompson <[email protected]>");
14761487
MODULE_DESCRIPTION("MC Driver for Intel I5000 memory controllers - "
14771488
I5000_REVISION);
1489+
module_param(edac_op_state, int, 0444);
1490+
MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");

0 commit comments

Comments
 (0)