@@ -40,6 +40,8 @@ struct dentry *mce_get_debugfs_dir(void);
extern mce_banks_t mce_banks_ce_disabled;
#ifdef CONFIG_X86_MCE_INTEL
void cmci_disable_bank(int bank);
void intel_init_cmci(void);
+void track_cmci_storm(int bank, u64 status);
@@ -54,7 +56,7 @@ static inline void intel_clear_lmce(void) { }
static inline bool intel_filter_mce(struct mce *m) { return false; }
+static inline void track_cmci_storm(int bank, u64 status) { }
#endif
-void mce_timer_kick(unsigned long interval);
+void mce_timer_kick(bool storm);
#ifdef CONFIG_ACPI_APEI
int apei_write_mce(struct mce *m);
@@ -694,6 +694,8 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
barrier();
m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
+ track_cmci_storm(i, m.status);
+
/* If this entry is not valid, ignore it */
if (!(m.status & MCI_STATUS_VAL))
continue;
@@ -1597,6 +1599,7 @@ static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
+static DEFINE_PER_CPU(bool, storm_poll_mode);
static void __start_timer(struct timer_list *t, unsigned long interval)
{
@@ -1632,22 +1635,29 @@ static void mce_timer_fn(struct timer_list *t)
else
iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
- __this_cpu_write(mce_next_interval, iv);
- __start_timer(t, iv);
+ if (__this_cpu_read(storm_poll_mode)) {
+ __start_timer(t, HZ);
+ } else {
+ __this_cpu_write(mce_next_interval, iv);
+ __start_timer(t, iv);
+ }
}
/*
- * Ensure that the timer is firing in @interval from now.
+ * When a storm starts on any bank on this CPU, switch to polling
+ * once per second. When the storm ends, revert to the default
+ * polling interval.
*/
-void mce_timer_kick(unsigned long interval)
+void mce_timer_kick(bool storm)
{
struct timer_list *t = this_cpu_ptr(&mce_timer);
- unsigned long iv = __this_cpu_read(mce_next_interval);
- __start_timer(t, interval);
+ __this_cpu_write(storm_poll_mode, storm);
- if (interval < iv)
- __this_cpu_write(mce_next_interval, interval);
+ if (storm)
+ __start_timer(t, HZ);
+ else
+ __this_cpu_write(mce_next_interval, check_interval * HZ);
}
/* Must not be called in IRQ context where del_timer_sync() can deadlock */
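For reference, here is a minimal userspace sketch of the re-arm decision this hunk introduces (the harness, the HZ value, and the omission of the error-driven interval halving are simplifications, not part of the patch): when storm_poll_mode is set the timer fires at a fixed one-second cadence, otherwise the existing exponential backoff capped at check_interval is kept.

#include <stdbool.h>
#include <stdio.h>

#define HZ 100					/* assumed jiffies per second for this sketch */
static unsigned long check_interval = 300;	/* seconds, the MCE polling default */

/* Toy model of the re-arm choice in mce_timer_fn() after this change. */
static unsigned long next_poll_delay(unsigned long iv, bool storm_poll_mode)
{
	if (storm_poll_mode)
		return HZ;			/* poll once per second during a storm */

	iv *= 2;				/* keep the existing backoff, capped */
	if (iv > check_interval * HZ)
		iv = check_interval * HZ;
	return iv;
}

int main(void)
{
	unsigned long iv = HZ;
	int i;

	for (i = 0; i < 4; i++) {
		printf("quiet: next poll in %lu jiffies\n", iv);
		iv = next_poll_delay(iv, false);
	}
	printf("storm: next poll in %lu jiffies\n", next_poll_delay(iv, true));
	return 0;
}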
@@ -47,8 +47,40 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
*/
static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
+/*
+ * CMCI storm tracking state
+ * stormy_bank_count: per-cpu count of MC banks in storm state
+ * bank_history: bitmask tracking of corrected errors seen in each bank
+ * bank_storm: whether a bank is currently in storm mode
+ * bank_time_stamp: last time (in jiffies) that each bank was polled
+ * cmci_threshold: MCi_CTL2 threshold for each bank when there is no storm
+ */
+static DEFINE_PER_CPU(int, stormy_bank_count);
+static DEFINE_PER_CPU(u64 [MAX_NR_BANKS], bank_history);
+static DEFINE_PER_CPU(bool [MAX_NR_BANKS], bank_storm);
+static DEFINE_PER_CPU(unsigned long [MAX_NR_BANKS], bank_time_stamp);
+static int cmci_threshold[MAX_NR_BANKS];
+
+/* Linux non-storm CMCI threshold (may be overridden by BIOS) */
#define CMCI_THRESHOLD 1
+/*
+ * High threshold to limit CMCI rate during storms. Max supported is
+ * 0x7FFF. Use this slightly smaller value so it has a distinctive
+ * signature when someone asks "Why am I not seeing all corrected errors?"
+ */
+#define CMCI_STORM_THRESHOLD 32749
+
+/*
+ * How many errors within the history buffer mark the start of a storm
+ */
+#define STORM_BEGIN_THRESHOLD 5
+
+/*
+ * How many polls of a machine check bank without seeing an error before
+ * declaring the storm is over
+ */
+#define STORM_END_POLL_THRESHOLD 30
+
static int cmci_supported(int *banks)
{
u64 cap;
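As a quick sanity check on the constants above, a standalone sketch (the 15-bit width of the MCi_CTL2 threshold field, maximum 0x7FFF, is the assumption being encoded; build with -std=c11 or later):

#include <assert.h>

#define CMCI_THRESHOLD			1
#define CMCI_STORM_THRESHOLD		32749
#define STORM_BEGIN_THRESHOLD		5
#define STORM_END_POLL_THRESHOLD	30

int main(void)
{
	/* Must fit in the 15-bit MCi_CTL2 threshold field (max 0x7FFF = 32767). */
	static_assert(CMCI_STORM_THRESHOLD <= 0x7FFF, "threshold too large");
	/* Deliberately below the maximum so MSR dumps show storm mode at a glance. */
	static_assert(CMCI_STORM_THRESHOLD != 0x7FFF, "keep the distinctive value");
	/* The storm-end window must fit inside the 64-bit per-bank history. */
	static_assert(STORM_END_POLL_THRESHOLD < 64, "history holds only 64 polls");
	return 0;
}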
@@ -103,6 +135,93 @@ static bool lmce_supported(void)
return tmp & FEAT_CTL_LMCE_ENABLED;
}
+/*
+ * Set a new CMCI threshold value. Preserve the state of the
+ * MCI_CTL2_CMCI_EN bit in case this happens during a
+ * cmci_rediscover() operation.
+ */
+static void cmci_set_threshold(int bank, int thresh)
+{
+ unsigned long flags;
+ u64 val;
+
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ wrmsrl(MSR_IA32_MCx_CTL2(bank), val | thresh);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
+static void cmci_storm_begin(int bank)
+{
+ __set_bit(bank, this_cpu_ptr(mce_poll_banks));
+ this_cpu_write(bank_storm[bank], true);
+
+ /*
+ * If this is the first bank on this CPU to enter storm mode,
+ * start polling once per second
+ */
+ if (this_cpu_inc_return(stormy_bank_count) == 1)
+ mce_timer_kick(true);
+}
+
+static void cmci_storm_end(int bank)
+{
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ this_cpu_write(bank_history[bank], 0ull);
+ this_cpu_write(bank_storm[bank], false);
+
+ /* If no banks are left in storm mode, return to the default polling interval */
+ if (!this_cpu_dec_return(stormy_bank_count))
+ mce_timer_kick(false);
+}
+
+void track_cmci_storm(int bank, u64 status)
+{
+ unsigned long now = jiffies, delta;
+ unsigned int shift = 1;
+ u64 history;
+
+ /*
+ * When a bank is in storm mode it is polled once per second and
+ * the history mask will record about the last minute of poll results.
+ * If it is not in storm mode, then the bank is only checked when
+ * there is a CMCI interrupt. Check how long it has been since
+ * this bank was last checked, and adjust the amount of "shift"
+ * to apply to history.
+ */
+ if (!this_cpu_read(bank_storm[bank])) {
+ delta = now - this_cpu_read(bank_time_stamp[bank]);
+ shift = (delta + HZ) / HZ;
+ }
+
+ /* If it has been a long time since the last poll, clear history */
+ if (shift >= 64)
+ history = 0;
+ else
+ history = this_cpu_read(bank_history[bank]) << shift;
+ this_cpu_write(bank_time_stamp[bank], now);
+
+ /* History keeps track of corrected errors. VAL=1 && UC=0 */
+ if ((status & (MCI_STATUS_VAL | MCI_STATUS_UC)) == MCI_STATUS_VAL)
+ history |= 1;
+ this_cpu_write(bank_history[bank], history);
+
+ if (this_cpu_read(bank_storm[bank])) {
+ if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD - 1, 0))
+ return;
+ pr_notice("CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), bank);
+ cmci_set_threshold(bank, cmci_threshold[bank]);
+ cmci_storm_end(bank);
+ } else {
+ if (hweight64(history) < STORM_BEGIN_THRESHOLD)
+ return;
+ pr_notice("CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), bank);
+ cmci_set_threshold(bank, CMCI_STORM_THRESHOLD);
+ cmci_storm_begin(bank);
+ }
+}
+
/*
* The interrupt handler. This is called on every event.
* Just call the poller directly to log any events.
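To make the bookkeeping above easier to follow, here is a self-contained userspace sketch of the per-bank history logic (the harness, the local GENMASK_ULL definition, and __builtin_popcountll standing in for hweight64() are assumptions of the sketch): every poll shifts the 64-bit history left by the number of elapsed polling intervals and records a corrected error in bit 0; STORM_BEGIN_THRESHOLD set bits start a storm, and the storm ends once the low STORM_END_POLL_THRESHOLD bits are all clear.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define STORM_BEGIN_THRESHOLD		5
#define STORM_END_POLL_THRESHOLD	30
#define GENMASK_ULL(h, l)	((~0ULL << (l)) & (~0ULL >> (63 - (h))))

static uint64_t history;	/* stands in for this bank's bank_history */
static bool storm;		/* stands in for this bank's bank_storm flag */

/* One poll of one bank: ce says whether a corrected error was logged. */
static void poll_bank(bool ce, unsigned int shift)
{
	history = (shift >= 64) ? 0 : history << shift;
	if (ce)
		history |= 1;

	if (storm) {
		if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD - 1, 0))
			return;
		printf("storm subsided\n");
		storm = false;
		history = 0;
	} else if (__builtin_popcountll(history) >= STORM_BEGIN_THRESHOLD) {
		printf("storm detected\n");
		storm = true;
	}
}

int main(void)
{
	int i;

	for (i = 0; i < STORM_BEGIN_THRESHOLD; i++)
		poll_bank(true, 1);	/* five errors in a row -> storm begins */
	for (i = 0; i < STORM_END_POLL_THRESHOLD; i++)
		poll_bank(false, 1);	/* thirty clean polls -> storm ends */
	return 0;
}

At one poll per second during a storm, the low 30 bits of the history cover roughly the last 30 seconds, so a stormy bank must stay error free for about half a minute before its threshold is restored.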
@@ -147,6 +266,9 @@ static void cmci_discover(int banks)
continue;
}
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD)
+ goto storm;
+
if (!mca_cfg.bios_cmci_threshold) {
val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
val |= CMCI_THRESHOLD;
@@ -159,7 +281,7 @@ static void cmci_discover(int banks)
bios_zero_thresh = 1;
val |= CMCI_THRESHOLD;
}
-
+storm:
val |= MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(i), val);
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
@@ -167,7 +289,14 @@ static void cmci_discover(int banks)
/* Did the enable bit stick? -- the bank supports CMCI */
if (val & MCI_CTL2_CMCI_EN) {
set_bit(i, owned);
- __clear_bit(i, this_cpu_ptr(mce_poll_banks));
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD) {
+ pr_notice("CPU%d BANK%d CMCI inherited storm\n", smp_processor_id(), i);
+ this_cpu_write(bank_history[i], ~0ull);
+ this_cpu_write(bank_time_stamp[i], jiffies);
+ cmci_storm_begin(i);
+ } else {
+ __clear_bit(i, this_cpu_ptr(mce_poll_banks));
+ }
/*
* We are able to set thresholds for some banks that
* had a threshold of 0. This means the BIOS has not
@@ -177,6 +306,10 @@ static void cmci_discover(int banks)
if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
(val & MCI_CTL2_CMCI_THRESHOLD_MASK))
bios_wrong_thresh = 1;
+
+ /* Save default threshold for each bank */
+ if (cmci_threshold[i] == 0)
+ cmci_threshold[i] = val & MCI_CTL2_CMCI_THRESHOLD_MASK;
} else {
WARN_ON(!test_bit(i, this_cpu_ptr(mce_poll_banks)));
}
@@ -218,6 +351,8 @@ static void __cmci_disable_bank(int bank)
val &= ~MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
__clear_bit(bank, this_cpu_ptr(mce_banks_owned));
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD)
+ cmci_storm_end(bank);
}
/*