Message ID | 20230509221700.859865-1-song@kernel.org |
---|---|
State | New |
Headers | From: Song Liu <song@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com, Andrew Morton <akpm@linux-foundation.org>, Peter Zijlstra <peterz@infradead.org>
Subject: [PATCH] watchdog: Prefer use "ref-cycles" for NMI watchdog
Date: Tue, 9 May 2023 15:17:00 -0700
Message-ID: <20230509221700.859865-1-song@kernel.org> |
Series | watchdog: Prefer use "ref-cycles" for NMI watchdog |
Commit Message
Song Liu
May 9, 2023, 10:17 p.m. UTC
The NMI watchdog permanently consumes one hardware counter per CPU on the
system. For systems that use many hardware counters, this causes more
aggressive time multiplexing of perf events.

OTOH, some CPUs (mostly Intel) support the "ref-cycles" event, which is
rarely used. Try to use "ref-cycles" for the watchdog: if the CPU supports
it, one more hardware counter becomes available to the user. If the CPU
doesn't support "ref-cycles", fall back to "cycles".

The downside of this change is that users of "ref-cycles" need to disable
the NMI watchdog.
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Song Liu <song@kernel.org>
---
kernel/watchdog_hld.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
Comments
On Tue, May 09, 2023 at 03:17:00PM -0700, Song Liu wrote:
> NMI watchdog permanently consumes one hardware counters per CPU on the
> system. For systems that use many hardware counters, this causes more
> aggressive time multiplexing of perf events.
>
> OTOH, some CPUs (mostly Intel) support "ref-cycles" event, which is rarely
> used. Try use "ref-cycles" for the watchdog. If the CPU supports it, so
> that one more hardware counter is available to the user. If the CPU doesn't
> support "ref-cycles", fall back to "cycles".
>
> The downside of this change is that users of "ref-cycles" need to disable
> nmi_watchdog.

Urgh..

how about something like so instead; then you can use whatever event you
like...

---
 include/linux/nmi.h   |  2 ++
 kernel/watchdog.c     | 45 ++++++++++++++++++++++++++++++++++++---------
 kernel/watchdog_hld.c |  4 ++--
 3 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 048c0b9aa623..8b6307837346 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -19,6 +19,8 @@ bool is_hardlockup(void);
 
 extern int watchdog_user_enabled;
 extern int nmi_watchdog_user_enabled;
+extern int nmi_watchdog_type;
+extern u64 nmi_watchdog_config;
 extern int soft_watchdog_user_enabled;
 extern int watchdog_thresh;
 extern unsigned long watchdog_enabled;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 8e61f21e7e33..b3c09e0f96a3 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -40,6 +40,8 @@ static DEFINE_MUTEX(watchdog_mutex);
 unsigned long __read_mostly watchdog_enabled;
 int __read_mostly watchdog_user_enabled = 1;
 int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT;
+int __ro_after_init nmi_watchdog_type = PERF_TYPE_HARDWARE;
+u64 __ro_after_init nmi_watchdog_config = PERF_COUNT_HW_CPU_CYCLES;
 int __read_mostly soft_watchdog_user_enabled = 1;
 int __read_mostly watchdog_thresh = 10;
 static int __read_mostly nmi_watchdog_available;
@@ -73,15 +75,40 @@ void __init hardlockup_detector_disable(void)
 
 static int __init hardlockup_panic_setup(char *str)
 {
-	if (!strncmp(str, "panic", 5))
-		hardlockup_panic = 1;
-	else if (!strncmp(str, "nopanic", 7))
-		hardlockup_panic = 0;
-	else if (!strncmp(str, "0", 1))
-		nmi_watchdog_user_enabled = 0;
-	else if (!strncmp(str, "1", 1))
-		nmi_watchdog_user_enabled = 1;
-	return 1;
+	int ret = 1;
+
+	if (!str)
+		return -EINVAL;
+
+	while (str) {
+		char *next = strchr(str, ',');
+		if (next) {
+			*next = 0;
+			next++;
+		}
+
+		if (!strcmp(str, "panic"))
+			hardlockup_panic = 1;
+		else if (!strcmp(str, "nopanic"))
+			hardlockup_panic = 0;
+		else if (!strcmp(str, "0"))
+			nmi_watchdog_user_enabled = 0;
+		else if (!strcmp(str, "1"))
+			nmi_watchdog_user_enabled = 1;
+		else if (str[0] == 'r') {
+			str++;
+			ret = kstrtou64(str, 16, &nmi_watchdog_config);
+			if (ret)
+				break;
+			nmi_watchdog_type = PERF_TYPE_RAW;
+			nmi_watchdog_user_enabled = 1;
+		}
+
+		str = next;
+	}
+
+	return ret;
+
 }
 __setup("nmi_watchdog=", hardlockup_panic_setup);
 
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 247bf0b1582c..27bc15f9a92a 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -99,8 +99,6 @@ static inline bool watchdog_check_timestamp(void)
 #endif
 
 static struct perf_event_attr wd_hw_attr = {
-	.type		= PERF_TYPE_HARDWARE,
-	.config		= PERF_COUNT_HW_CPU_CYCLES,
 	.size		= sizeof(struct perf_event_attr),
 	.pinned		= 1,
 	.disabled	= 1,
@@ -170,6 +168,8 @@ static int hardlockup_detector_event_create(void)
 	struct perf_event *evt;
 
 	wd_attr = &wd_hw_attr;
+	wd_attr->type = nmi_watchdog_type;
+	wd_attr->config = nmi_watchdog_config;
 	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
 
 	/* Try to register using hardware perf events */
On Fri, May 12, 2023 at 5:47 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> Urgh..
>
> how about something like so instead; then you can use whatever event you
> like...

Configuring this at boot time is not ideal for our use case. Currently, we
have some systems that support ref-cycles and some that don't, so this is
one more kernel argument we need to make sure to get right. It also means
we cannot change the setting without a reboot.

Another idea I have is to use sysctl kernel.nmi_watchdog, so we can change
the event after boot. Would this work?

Btw, the limitation here (ref-cycles users need to disable the NMI watchdog)
comes from the limitation that the programmable counters cannot count
ref-cycles. Is this something we may change (or have already changed)?
Thanks,
Song
On Tue, 9 May 2023 15:17:00 -0700 Song Liu <song@kernel.org> wrote:

> NMI watchdog permanently consumes one hardware counters per CPU on the
> system. For systems that use many hardware counters, this causes more
> aggressive time multiplexing of perf events.
>
> OTOH, some CPUs (mostly Intel) support "ref-cycles" event, which is rarely
> used. Try use "ref-cycles" for the watchdog. If the CPU supports it, so
> that one more hardware counter is available to the user. If the CPU doesn't
> support "ref-cycles", fall back to "cycles".
>
> The downside of this change is that users of "ref-cycles" need to disable
> nmi_watchdog.
>
> ...
>
> @@ -286,6 +286,12 @@ int __init hardlockup_detector_perf_init(void)
>  {
>  	int ret = hardlockup_detector_event_create();
>
> +	if (ret) {

If we get here, hardlockup_detector_event_create() has sent a scary
pr_debug message.

> +		/* Failed to create "ref-cycles", try "cycles" instead */
> +		wd_hw_attr.config = PERF_COUNT_HW_CPU_CYCLES;
> +		ret = hardlockup_detector_event_create();

So it would be good to emit a followup message here telling users that
things are OK. Or tell the user we're retrying with a different
counter, etc.

> +		/* Failed to create "ref-cycles", try "cycles" instead */
> +		wd_hw_attr.config = PERF_COUNT_HW_CPU_CYCLES;
> +		ret = hardlockup_detector_event_create();
> +	}
> +
>  	if (ret) {
>  		pr_info("Perf NMI watchdog permanently disabled\n");
>  	} else {
> --
> 2.34.1
> On May 12, 2023, at 4:40 PM, Andrew Morton <akpm@linux-foundation.org> wrote:
>
> If we get here, hardlockup_detector_event_create() has sent a scary
> pr_debug message.
>
> So it would be good to emit a followup message here telling users that
> things are OK. Or tell the user we're retrying with a different
> counter, etc.

How about we ask hardlockup_detector_event_create() not to send the pr_debug
message on the first try (something like below)?

Also, I think Peter's concern is valid. If some user daemon monitors
ref-cycles in the background (I am not aware of such a use case though),
this could be a real issue.
Thanks,
Song

diff --git i/kernel/watchdog_hld.c w/kernel/watchdog_hld.c
index f77109d98641..a1d2a43ea31f 100644
--- i/kernel/watchdog_hld.c
+++ w/kernel/watchdog_hld.c
@@ -163,7 +163,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
 	return;
 }
 
-static int hardlockup_detector_event_create(void)
+static int hardlockup_detector_event_create(bool send_warning)
 {
 	unsigned int cpu = smp_processor_id();
 	struct perf_event_attr *wd_attr;
@@ -176,8 +176,10 @@ static int hardlockup_detector_event_create(void)
 	evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
 					       watchdog_overflow_callback, NULL);
 	if (IS_ERR(evt)) {
-		pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
-			 PTR_ERR(evt));
+		if (send_warning) {
+			pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
+				 PTR_ERR(evt));
+		}
 		return PTR_ERR(evt);
 	}
 	this_cpu_write(watchdog_ev, evt);
@@ -189,7 +191,7 @@ static int hardlockup_detector_event_create(void)
  */
 void hardlockup_detector_perf_enable(void)
 {
-	if (hardlockup_detector_event_create())
+	if (hardlockup_detector_event_create(true))
 		return;
 
 	/* use original value for check */
@@ -284,12 +286,12 @@ void __init hardlockup_detector_perf_restart(void)
  */
 int __init hardlockup_detector_perf_init(void)
 {
-	int ret = hardlockup_detector_event_create();
+	int ret = hardlockup_detector_event_create(false);
 
 	if (ret) {
 		/* Failed to create "ref-cycles", try "cycles" instead */
 		wd_hw_attr.config = PERF_COUNT_HW_CPU_CYCLES;
-		ret = hardlockup_detector_event_create();
+		ret = hardlockup_detector_event_create(true);
 	}
 
 	if (ret) {
On Fri, May 12, 2023 at 09:43:48AM -0700, Song Liu wrote:

> Configuring this at boot time is not ideal for our use case. Currently, we have
> some systems support ref-cycles and some don't. So this is one more kernel
> argument we need to make sure to get correctly. This also means we cannot
> change this setting without reboot.

You can still add the fallback (with a suitable pr_warn() that the
requested config is not valid or so).

> Another idea I have is to use sysctl kernel.nmi_watchdog, so we can change
> the event after boot. Would this work?

Yeah, I suppose you can also extend the thing to allow runtime changes
to the values, provided the NMI watchdog is disabled at the time or
somesuch.

> Btw, the limitation here (ref-cycles users need to disable NMI watchdog) comes
> from the limitation that the programmable counters cannot do ref-cycles. Is this
> something we may change (or already changed)?

I really don't know .. and if it's not in the SDM I probably couldn't
tell you anyway :/
> On May 15, 2023, at 3:33 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>
> You can still add the fallback (with a suitable pr_warn() that the
> requested config is not valid or so).
>
> Yeah, I suppose you can also extend the thing to allow runtime changes
> to the values, provided the NMI watchdog is disabled at the time or
> somesuch.

How about something like:

  sysctl kernel.nmi_watchdog=0  =>  no NMI watchdog
  sysctl kernel.nmi_watchdog=1  =>  use "cycles" for the NMI watchdog
  sysctl kernel.nmi_watchdog=2  =>  use "ref-cycles" for the NMI watchdog

I know the values are arbitrary. But using a raw event encoding seems
overkill (for the command line or sysctl).
By the way, the current code for "sysctl kernel.nmi_watchdog" doesn't report
errors properly. For example, on an Intel host, with:

  sysctl kernel.nmi_watchdog=0
  perf stat -e cycles:D,cycles:D,cycles:D,cycles:D,cycles:D -a &
  sysctl kernel.nmi_watchdog=1
  kill $(pidof perf)

At this point, the watchdog is effectively disabled, because the perf events
failed to enable. But sysctl still shows:

  kernel.nmi_watchdog = 1

How about we use the current version (plus the error reporting fix suggested
by Andrew)? If this causes any user-visible issues, we can add the option to
configure it via sysctl (and fix the sysctl reporting at the same time).

Thanks,
Song
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 247bf0b1582c..f77109d98641 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -100,7 +100,7 @@ static inline bool watchdog_check_timestamp(void)
 
 static struct perf_event_attr wd_hw_attr = {
 	.type		= PERF_TYPE_HARDWARE,
-	.config		= PERF_COUNT_HW_CPU_CYCLES,
+	.config		= PERF_COUNT_HW_REF_CPU_CYCLES,
 	.size		= sizeof(struct perf_event_attr),
 	.pinned		= 1,
 	.disabled	= 1,
@@ -286,6 +286,12 @@ int __init hardlockup_detector_perf_init(void)
 {
 	int ret = hardlockup_detector_event_create();
 
+	if (ret) {
+		/* Failed to create "ref-cycles", try "cycles" instead */
+		wd_hw_attr.config = PERF_COUNT_HW_CPU_CYCLES;
+		ret = hardlockup_detector_event_create();
+	}
+
 	if (ret) {
 		pr_info("Perf NMI watchdog permanently disabled\n");
 	} else {