[PATCHv2,0/2] *** Detect interrupt storm in softlockup ***

Message ID 20240130074744.45759-1-yaoma@linux.alibaba.com

Message

Bitao Hu Jan. 30, 2024, 7:47 a.m. UTC
  I have implemented a low-overhead method for detecting interrupt storm
in softlockup. Please review it; all comments are welcome.

Changes from v1 to v2:

- From Douglas, optimize the memory footprint of cpustat. With the maximum
number of CPUs (8192), the memory consumption is now:
2 * 8192 * 4 + 1 * 8192 * 5 * 4 + 1 * 8192 = 237,568 bytes.
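One plausible reading of the three terms, taking each as
sizeof(type) * NR_CPUS * element count (my interpretation; the types are
not spelled out in this cover letter):
2 * 8192 * 4     =  65,536 bytes (16-bit snapshot of 4 cputime stats per CPU)
1 * 8192 * 5 * 4 = 163,840 bytes (u8 utilization[5 samples][4 stats] per CPU)
1 * 8192         =   8,192 bytes (u8 ring-buffer tail index per CPU)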

- From Liu Song, clean up the code formatting and add necessary comments.

- From Douglas, use interrupt counts instead of interrupt time to
determine the cause of softlockup. 

- Remove the cmdline parameter added in PATCHv1.

Bitao Hu (2):
  watchdog/softlockup: low-overhead detection of interrupt storm
  watchdog/softlockup: report the most frequent interrupts

 kernel/watchdog.c | 222 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 222 insertions(+)
  

Comments

Liu Song Jan. 31, 2024, 1:50 a.m. UTC | #1
On 2024/1/30 15:47, Bitao Hu wrote:
> When the watchdog determines that the current soft lockup is due
> to an interrupt storm based on CPU utilization, reporting the
> most frequent interrupts could be good enough for further
> troubleshooting.
>
> Below is an example of interrupt storm. The call tree does not
> provide useful information, but we can analyze which interrupt
> caused the soft lockup by comparing the counts of interrupts.
>
> [ 2987.488075] watchdog: BUG: soft lockup - CPU#9 stuck for 23s! [kworker/9:1:214]
> [ 2987.488607] CPU#9 Utilization every 4s during lockup:
> [ 2987.488941]  #1:   0% system,          0% softirq,   100% hardirq,     0% idle
> [ 2987.489357]  #2:   0% system,          0% softirq,   100% hardirq,     0% idle
> [ 2987.489771]  #3:   0% system,          0% softirq,   100% hardirq,     0% idle
> [ 2987.490186]  #4:   0% system,          0% softirq,   100% hardirq,     0% idle
> [ 2987.490601]  #5:   0% system,          0% softirq,   100% hardirq,     0% idle
> [ 2987.491034] CPU#9 Detect HardIRQ Time exceeds 50%. Most frequent HardIRQs:
> [ 2987.491493]  #1: 330985      irq#7(IPI)
> [ 2987.491743]  #2: 5000        irq#10(arch_timer)
> [ 2987.492039]  #3: 9           irq#91(nvme0q2)
> [ 2987.492318]  #4: 3           irq#118(virtio1-output.12)
> ...
> [ 2987.492728] Call trace:
> [ 2987.492729]  __do_softirq+0xa8/0x364
>
> Signed-off-by: Bitao Hu <yaoma@linux.alibaba.com>
> ---
>   kernel/watchdog.c | 150 ++++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 150 insertions(+)
>
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 0efe9604c3c2..38fb18e17d71 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -25,6 +25,9 @@
>   #include <linux/stop_machine.h>
>   #include <linux/kernel_stat.h>
>   #include <linux/math64.h>
> +#include <linux/irq.h>
> +#include <linux/bitops.h>
> +#include <linux/irqdesc.h>
>   
>   #include <asm/irq_regs.h>
>   #include <linux/kvm_para.h>
> @@ -431,11 +434,15 @@ void touch_softlockup_watchdog_sync(void)
>   	__this_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT);
>   }
>   
> +static void set_potential_softlockup(unsigned long now, unsigned long touch_ts);
> +
>   static int is_softlockup(unsigned long touch_ts,
>   			 unsigned long period_ts,
>   			 unsigned long now)
>   {
>   	if ((watchdog_enabled & WATCHDOG_SOFTOCKUP_ENABLED) && watchdog_thresh) {
> +		/* Softlockup may occur in the current period */
> +		set_potential_softlockup(now, period_ts);
>   		/* Warn about unreasonable delays. */
>   		if (time_after(now, period_ts + get_softlockup_thresh()))
>   			return now - touch_ts;
> @@ -457,6 +464,8 @@ static enum cpu_usage_stat idx_to_stat[NUM_STATS_PER_GROUP] = {
>   	CPUTIME_SYSTEM, CPUTIME_SOFTIRQ, CPUTIME_IRQ, CPUTIME_IDLE
>   };
>   
> +static void print_hardirq_counts(void);
> +
>   static void update_cpustat(void)
>   {
>   	u8 i;
> @@ -504,10 +513,150 @@ static void print_cpustat(void)
>   			utilization[i][STATS_SYSTEM], utilization[i][STATS_SOFTIRQ],
>   			utilization[i][STATS_HARDIRQ], utilization[i][STATS_IDLE]);
>   	}
> +	print_hardirq_counts();
> +}
> +
> +#define HARDIRQ_PERCENT_THRESH		50
> +#define NUM_HARDIRQ_REPORT		5
> +static DECLARE_BITMAP(softlockup_hardirq_cpus, CONFIG_NR_CPUS);
> +static DEFINE_PER_CPU(u32 *, hardirq_counts);
> +
> +static void find_counts_top(u32 *irq_counts, int *irq, u32 perirq_counts, int perirq_id, int range)
> +{
> +	unsigned int i, j;
> +
> +	for (i = 0; i < range; i++) {
> +		if (perirq_counts > irq_counts[i]) {
> +			for (j = range - 1; j > i; j--) {
> +				irq_counts[j] = irq_counts[j - 1];
> +				irq[j] = irq[j - 1];
> +			}
> +			irq_counts[j] = perirq_counts;
> +			irq[j] = perirq_id;
> +			break;
> +		}
> +	}
> +}
> +
> +/*
> + * If the proportion of time spent handling irq exceeds HARDIRQ_PERCENT_THRESH%
> + * during sample_period, then it is necessary to record the counts of each irq.
> + */
> +static inline bool need_record_irq_counts(int type)
> +{
> +	int tail = this_cpu_read(cpustat_tail);
> +	u8 utilization;
> +
> +	if (--tail == -1)
> +		tail = 4;
> +	utilization = this_cpu_read(cpustat_utilization[tail][type]);
> +	return utilization > HARDIRQ_PERCENT_THRESH;
>   }
> +
> +/*
> + * Mark softlockup as potentially caused by hardirq
> + */
> +static void set_potential_softlockup_hardirq(void)
> +{
> +	u32 i;
> +	u32 *counts = __this_cpu_read(hardirq_counts);
> +	int cpu = smp_processor_id();
> +	struct irq_desc *desc;
> +
> +	if (!need_record_irq_counts(STATS_HARDIRQ))
> +		return;
> +
> +	if (!test_bit(cpu, softlockup_hardirq_cpus)) {
> +		counts = kmalloc_array(nr_irqs, sizeof(u32), GFP_ATOMIC);
> +		if (!counts)
> +			return;
> +		for_each_irq_desc(i, desc) {
> +			if (!desc)
> +				continue;
> +			counts[i] = desc->kstat_irqs ?
> +				*this_cpu_ptr(desc->kstat_irqs) : 0;
> +		}
> +		__this_cpu_write(hardirq_counts, counts);
> +		set_bit(cpu, softlockup_hardirq_cpus);
> +	}
> +}
> +
> +static void clear_potential_softlockup_hardirq(void)
> +{
> +	u32 *counts = __this_cpu_read(hardirq_counts);
> +	int cpu = smp_processor_id();
> +
> +	if (test_bit(cpu, softlockup_hardirq_cpus)) {
> +		kfree(counts);
> +		counts = NULL;
> +		__this_cpu_write(hardirq_counts, counts);
> +		clear_bit(cpu, softlockup_hardirq_cpus);
> +	}
> +}
> +
> +/*
> + * Mark that softlockup may occur
> + */
> +static void set_potential_softlockup(unsigned long now, unsigned long period_ts)
> +{
> +	if (time_after_eq(now, period_ts + get_softlockup_thresh() / 5))
> +		set_potential_softlockup_hardirq();
> +}
> +
> +static void clear_potential_softlockup(void)
> +{
> +	clear_potential_softlockup_hardirq();
> +}
> +
> +static void print_hardirq_counts(void)
> +{
> +	u32 i;
> +	struct irq_desc *desc;
> +	u32 counts_diff;
> +	u32 *counts = __this_cpu_read(hardirq_counts);
> +	int cpu = smp_processor_id();
> +	u32 hardirq_counts_top[NUM_HARDIRQ_REPORT] = {0, 0, 0, 0, 0};
> +	int hardirq_top[NUM_HARDIRQ_REPORT] = {-1, -1, -1, -1, -1};
"hardirq_counts_top" and "hardirq_top" seem like two members of a struct,
working together to record the most suspicious irq, so wouldn't using a 
struct make it clearer?
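
For illustration, a minimal sketch of such a struct (hypothetical, not
code from the posted patch):

	/* One slot in the sorted top-N table: an irq number and its count */
	struct irq_counts {
		int irq;	/* -1 marks an unused slot */
		u32 counts;
	};

A single "struct irq_counts top[NUM_HARDIRQ_REPORT]" array would then
replace the two parallel arrays, and find_counts_top() could shift whole
slots instead of keeping two arrays in sync.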
> +
> +	if (test_bit(cpu, softlockup_hardirq_cpus)) {
> +		/* Find the top NUM_HARDIRQ_REPORT most frequent interrupts */
> +		for_each_irq_desc(i, desc) {
> +			if (!desc)
> +				continue;
> +			counts_diff = desc->kstat_irqs ?
> +				*this_cpu_ptr(desc->kstat_irqs) - counts[i] : 0;
> +			find_counts_top(hardirq_counts_top, hardirq_top,
> +					counts_diff, i, NUM_HARDIRQ_REPORT);
> +		}
> +		/*
> +		 * We do not want the "watchdog: " prefix on every line,
> +		 * hence we use "printk" instead of "pr_crit".
> +		 */
> +		printk(KERN_CRIT "CPU#%d Detect HardIRQ Time exceeds %d%%. Most frequent HardIRQs:\n",
> +			smp_processor_id(), HARDIRQ_PERCENT_THRESH);
> +		for (i = 0; i < NUM_HARDIRQ_REPORT; i++) {
> +			if (hardirq_top[i] == -1)
> +				break;
> +			desc = irq_to_desc(hardirq_top[i]);
> +			if (desc && desc->action)
> +				printk(KERN_CRIT "\t#%u: %-10u\tirq#%d(%s)\n",
> +					i+1, hardirq_counts_top[i],
> +					hardirq_top[i], desc->action->name);
> +			else
> +				printk(KERN_CRIT "\t#%u: %-10u\tirq#%d\n",
> +					i+1, hardirq_counts_top[i],
> +					hardirq_top[i]);
> +		}
> +		if (!need_record_irq_counts(STATS_HARDIRQ))
> +			clear_potential_softlockup_hardirq();
> +	}
> +}
> +
>   #else
>   static inline void update_cpustat(void) { }
>   static inline void print_cpustat(void) { }
> +static inline void set_potential_softlockup(unsigned long now, unsigned long period_ts) { }
> +static inline void clear_potential_softlockup(void) { }
>   #endif
>   
>   /* watchdog detector functions */
> @@ -525,6 +674,7 @@ static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
>   static int softlockup_fn(void *data)
>   {
>   	update_touch_ts();
> +	clear_potential_softlockup();
>   	complete(this_cpu_ptr(&softlockup_completion));
>   
>   	return 0;
  
Bitao Hu Jan. 31, 2024, 11:08 a.m. UTC | #2
On 2024/1/31 09:50, Liu Song wrote:
> 
> On 2024/1/30 15:47, Bitao Hu wrote:
[...]
>> +    u32 hardirq_counts_top[NUM_HARDIRQ_REPORT] = {0, 0, 0, 0, 0};
>> +    int hardirq_top[NUM_HARDIRQ_REPORT] = {-1, -1, -1, -1, -1};
> "hardirq_counts_top" and "hardirq_top" seem like two members of a struct,
> working together to record the most suspicious irq, so wouldn't using a
> struct make it clearer?
OK, I will define a struct.