x86/MCE: allow overriding the CMCI threshold We've had reports of systems where CMCIs would surface at a relatively high rate during certain periods of time, without them apparently causing subsequent more severe problems (see Xeon E7-8800/4800/2800 specification clarification SC1). Give the admin a knob to lower the impact on the system logs. Signed-off-by: Jan Beulich --- a/docs/misc/xen-command-line.markdown +++ b/docs/misc/xen-command-line.markdown @@ -242,6 +242,14 @@ the NMI watchdog is also enabled. If set, override Xen's default choice for the platform timer. +### cmci-threshold +> `= <integer>` + +> Default: `2` + +Specify the event count threshold for raising Corrected Machine Check +Interrupts. Specifying zero disables CMCI handling. + ### cmos-rtc-probe > `= ` --- a/xen/arch/x86/cpu/mcheck/mce_intel.c +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c @@ -492,6 +492,9 @@ static int do_cmci_discover(int i) { unsigned msr = MSR_IA32_MCx_CTL2(i); u64 val; + unsigned int threshold, max_threshold; + static unsigned int cmci_threshold = 2; + integer_param("cmci-threshold", cmci_threshold); rdmsrl(msr, val); /* Some other CPU already owns this bank. */ @@ -500,15 +503,28 @@ static int do_cmci_discover(int i) goto out; } - val &= ~CMCI_THRESHOLD_MASK; - wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD); - rdmsrl(msr, val); + if ( cmci_threshold ) + { + wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD_MASK); + rdmsrl(msr, val); + } if (!(val & CMCI_EN)) { /* This bank does not support CMCI. Polling timer has to handle it. 
*/ mcabanks_set(i, __get_cpu_var(no_cmci_banks)); + wrmsrl(msr, val & ~CMCI_THRESHOLD_MASK); return 0; } + max_threshold = MASK_EXTR(val, CMCI_THRESHOLD_MASK); + threshold = cmci_threshold; + if ( threshold > max_threshold ) + { + mce_printk(MCE_QUIET, + "CMCI: threshold %#x too large for CPU%u bank %u, using %#x\n", + threshold, smp_processor_id(), i, max_threshold); + threshold = max_threshold; + } + wrmsrl(msr, (val & ~CMCI_THRESHOLD_MASK) | CMCI_EN | threshold); mcabanks_set(i, __get_cpu_var(mce_banks_owned)); out: mcabanks_clear(i, __get_cpu_var(no_cmci_banks)); --- a/xen/arch/x86/cpu/mcheck/x86_mca.h +++ b/xen/arch/x86/cpu/mcheck/x86_mca.h @@ -86,9 +86,6 @@ /* Bitfield of MSR_K8_HWCR register */ #define K8_HWCR_MCi_STATUS_WREN (1ULL << 18) -/*Intel Specific bitfield*/ -#define CMCI_THRESHOLD 0x2 - #define MCi_MISC_ADDRMOD_MASK (0x7UL << 6) #define MCi_MISC_PHYSMOD (0x2UL << 6)