[v3] watchdog: Allow nmi watchdog to use "ref-cycles" event

Message ID 20230517175105.1614575-1-song@kernel.org
State New
Headers
Series [v3] watchdog: Allow nmi watchdog to use "ref-cycles" event |

Commit Message

Song Liu May 17, 2023, 5:51 p.m. UTC
The NMI watchdog permanently consumes one hardware counter per CPU on the
system. For systems that use many hardware counters, this causes more
aggressive time multiplexing of perf events.

On the other hand, some CPUs (mostly Intel) support the "ref-cycles" event,
which is rarely used. Add the kernel cmdline arg nmi_watchdog=ref-cycles to
configure the watchdog to use the "ref-cycles" event instead of "cycles".

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Song Liu <song@kernel.org>

---
Changes in v3:

Pivot the design to use kernel arg nmi_watchdog=ref-cycles (Peter)
---
 Documentation/admin-guide/kernel-parameters.txt | 5 +++--
 include/linux/nmi.h                             | 2 ++
 kernel/watchdog.c                               | 2 ++
 kernel/watchdog_hld.c                           | 9 +++++++++
 4 files changed, 16 insertions(+), 2 deletions(-)
  

Comments

kernel test robot May 17, 2023, 8:48 p.m. UTC | #1
Hi Song,

kernel test robot noticed the following build errors:

[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on linus/master v6.4-rc2 next-20230517]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Song-Liu/watchdog-Allow-nmi-watchdog-to-use-ref-cycles-event/20230518-015450
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20230517175105.1614575-1-song%40kernel.org
patch subject: [PATCH v3] watchdog: Allow nmi watchdog to use "ref-cycles" event
config: powerpc-allnoconfig
compiler: powerpc-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/c119f335899443fe6ba4c91b295b469e2793712f
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Song-Liu/watchdog-Allow-nmi-watchdog-to-use-ref-cycles-event/20230518-015450
        git checkout c119f335899443fe6ba4c91b295b469e2793712f
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=powerpc olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=powerpc SHELL=/bin/bash arch/powerpc/kernel/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202305180440.PKKQNyLU-lkp@intel.com/

All errors (new ones prefixed by >>):

   In file included from arch/powerpc/kernel/stacktrace.c:15:
>> include/linux/nmi.h:112:13: error: no previous prototype for 'hardlockup_config_perf_event' [-Werror=missing-prototypes]
     112 | extern void hardlockup_config_perf_event(const char *str) { }
         |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
   cc1: all warnings being treated as errors


vim +/hardlockup_config_perf_event +112 include/linux/nmi.h

    96	
    97	#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
    98	extern void arch_touch_nmi_watchdog(void);
    99	extern void hardlockup_detector_perf_stop(void);
   100	extern void hardlockup_detector_perf_restart(void);
   101	extern void hardlockup_detector_perf_disable(void);
   102	extern void hardlockup_detector_perf_enable(void);
   103	extern void hardlockup_detector_perf_cleanup(void);
   104	extern int hardlockup_detector_perf_init(void);
   105	extern void hardlockup_config_perf_event(const char *str);
   106	#else
   107	static inline void hardlockup_detector_perf_stop(void) { }
   108	static inline void hardlockup_detector_perf_restart(void) { }
   109	static inline void hardlockup_detector_perf_disable(void) { }
   110	static inline void hardlockup_detector_perf_enable(void) { }
   111	static inline void hardlockup_detector_perf_cleanup(void) { }
 > 112	extern void hardlockup_config_perf_event(const char *str) { }
   113	# if !defined(CONFIG_HAVE_NMI_WATCHDOG)
   114	static inline int hardlockup_detector_perf_init(void) { return -ENODEV; }
   115	static inline void arch_touch_nmi_watchdog(void) {}
   116	# else
   117	static inline int hardlockup_detector_perf_init(void) { return 0; }
   118	# endif
   119	#endif
   120
  
kernel test robot May 17, 2023, 8:48 p.m. UTC | #2
Hi Song,

kernel test robot noticed the following build errors:

[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on linus/master v6.4-rc2 next-20230517]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Song-Liu/watchdog-Allow-nmi-watchdog-to-use-ref-cycles-event/20230518-015450
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20230517175105.1614575-1-song%40kernel.org
patch subject: [PATCH v3] watchdog: Allow nmi watchdog to use "ref-cycles" event
config: um-i386_defconfig
compiler: gcc-11 (Debian 11.3.0-12) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/c119f335899443fe6ba4c91b295b469e2793712f
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Song-Liu/watchdog-Allow-nmi-watchdog-to-use-ref-cycles-event/20230518-015450
        git checkout c119f335899443fe6ba4c91b295b469e2793712f
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=um SUBARCH=i386 olddefconfig
        make W=1 O=build_dir ARCH=um SUBARCH=i386 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202305180435.gh1msXbz-lkp@intel.com/

All errors (new ones prefixed by >>):

   ld: kernel/panic.o: in function `hardlockup_config_perf_event':
>> include/linux/nmi.h:112: multiple definition of `hardlockup_config_perf_event'; init/main.o:include/linux/nmi.h:112: first defined here
   ld: kernel/cpu.o: in function `hardlockup_config_perf_event':
>> include/linux/nmi.h:112: multiple definition of `hardlockup_config_perf_event'; init/main.o:include/linux/nmi.h:112: first defined here
   ld: kernel/workqueue.o: in function `hardlockup_config_perf_event':
>> include/linux/nmi.h:112: multiple definition of `hardlockup_config_perf_event'; init/main.o:include/linux/nmi.h:112: first defined here
   ld: kernel/sched/core.o: in function `hardlockup_config_perf_event':
   kernel/sched/core.c:2148: multiple definition of `hardlockup_config_perf_event'; init/main.o:include/linux/nmi.h:112: first defined here
   ld: kernel/sched/build_utility.o: in function `hardlockup_config_perf_event':
>> include/linux/nmi.h:112: multiple definition of `hardlockup_config_perf_event'; init/main.o:include/linux/nmi.h:112: first defined here
   ld: kernel/printk/printk.o: in function `hardlockup_config_perf_event':
>> include/linux/nmi.h:112: multiple definition of `hardlockup_config_perf_event'; init/main.o:include/linux/nmi.h:112: first defined here
   ld: kernel/time/timekeeping.o: in function `hardlockup_config_perf_event':
>> include/linux/nmi.h:112: multiple definition of `hardlockup_config_perf_event'; init/main.o:include/linux/nmi.h:112: first defined here
   ld: kernel/time/timer_list.o: in function `hardlockup_config_perf_event':
>> include/linux/nmi.h:112: multiple definition of `hardlockup_config_perf_event'; init/main.o:include/linux/nmi.h:112: first defined here
   ld: kernel/time/tick-common.o: in function `hardlockup_config_perf_event':
>> include/linux/nmi.h:112: multiple definition of `hardlockup_config_perf_event'; init/main.o:include/linux/nmi.h:112: first defined here
   ld: kernel/time/tick-sched.o: in function `hardlockup_config_perf_event':
>> include/linux/nmi.h:112: multiple definition of `hardlockup_config_perf_event'; init/main.o:include/linux/nmi.h:112: first defined here
   ld: mm/mm_init.o: in function `hardlockup_config_perf_event':
>> include/linux/nmi.h:112: multiple definition of `hardlockup_config_perf_event'; init/main.o:include/linux/nmi.h:112: first defined here
   ld: mm/page_alloc.o: in function `hardlockup_config_perf_event':
>> include/linux/nmi.h:112: multiple definition of `hardlockup_config_perf_event'; init/main.o:include/linux/nmi.h:112: first defined here
   ld: drivers/tty/sysrq.o: in function `hardlockup_config_perf_event':
>> include/linux/nmi.h:112: multiple definition of `hardlockup_config_perf_event'; init/main.o:include/linux/nmi.h:112: first defined here
   ld: lib/nmi_backtrace.o: in function `hardlockup_config_perf_event':
   lib/nmi_backtrace.c:112: multiple definition of `hardlockup_config_perf_event'; init/main.o:include/linux/nmi.h:112: first defined here


vim +112 include/linux/nmi.h

    96	
    97	#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
    98	extern void arch_touch_nmi_watchdog(void);
    99	extern void hardlockup_detector_perf_stop(void);
   100	extern void hardlockup_detector_perf_restart(void);
   101	extern void hardlockup_detector_perf_disable(void);
   102	extern void hardlockup_detector_perf_enable(void);
   103	extern void hardlockup_detector_perf_cleanup(void);
   104	extern int hardlockup_detector_perf_init(void);
   105	extern void hardlockup_config_perf_event(const char *str);
   106	#else
   107	static inline void hardlockup_detector_perf_stop(void) { }
   108	static inline void hardlockup_detector_perf_restart(void) { }
   109	static inline void hardlockup_detector_perf_disable(void) { }
   110	static inline void hardlockup_detector_perf_enable(void) { }
   111	static inline void hardlockup_detector_perf_cleanup(void) { }
 > 112	extern void hardlockup_config_perf_event(const char *str) { }
   113	# if !defined(CONFIG_HAVE_NMI_WATCHDOG)
   114	static inline int hardlockup_detector_perf_init(void) { return -ENODEV; }
   115	static inline void arch_touch_nmi_watchdog(void) {}
   116	# else
   117	static inline int hardlockup_detector_perf_init(void) { return 0; }
   118	# endif
   119	#endif
   120
  

Patch

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 9e5bab29685f..d378e23dad7c 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3593,10 +3593,12 @@ 
 			Format: [state][,regs][,debounce][,die]
 
 	nmi_watchdog=	[KNL,BUGS=X86] Debugging features for SMP kernels
-			Format: [panic,][nopanic,][num]
+			Format: [panic,][nopanic,][ref-cycles][num]
 			Valid num: 0 or 1
 			0 - turn hardlockup detector in nmi_watchdog off
 			1 - turn hardlockup detector in nmi_watchdog on
+			ref-cycles - configure the watchdog with perf event
+			             "ref-cycles" instead of "cycles"
 			When panic is specified, panic when an NMI watchdog
 			timeout occurs (or 'nopanic' to not panic on an NMI
 			watchdog, if CONFIG_BOOTPARAM_HARDLOCKUP_PANIC is set)
@@ -7097,4 +7099,3 @@ 
 				memory, and other data can't be written using
 				xmon commands.
 			off	xmon is disabled.
-
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 048c0b9aa623..9fe1c1831287 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -102,12 +102,14 @@  extern void hardlockup_detector_perf_disable(void);
 extern void hardlockup_detector_perf_enable(void);
 extern void hardlockup_detector_perf_cleanup(void);
 extern int hardlockup_detector_perf_init(void);
+extern void hardlockup_config_perf_event(const char *str);
 #else
 static inline void hardlockup_detector_perf_stop(void) { }
 static inline void hardlockup_detector_perf_restart(void) { }
 static inline void hardlockup_detector_perf_disable(void) { }
 static inline void hardlockup_detector_perf_enable(void) { }
 static inline void hardlockup_detector_perf_cleanup(void) { }
+static inline void hardlockup_config_perf_event(const char *str) { }
 # if !defined(CONFIG_HAVE_NMI_WATCHDOG)
 static inline int hardlockup_detector_perf_init(void) { return -ENODEV; }
 static inline void arch_touch_nmi_watchdog(void) {}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 8e61f21e7e33..fed4f0be8e1a 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -81,6 +81,8 @@  static int __init hardlockup_panic_setup(char *str)
 		nmi_watchdog_user_enabled = 0;
 	else if (!strncmp(str, "1", 1))
 		nmi_watchdog_user_enabled = 1;
+	else if (!strncmp(str, "ref-cycles", 10))
+		hardlockup_config_perf_event(str);
 	return 1;
 }
 __setup("nmi_watchdog=", hardlockup_panic_setup);
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 247bf0b1582c..4deca58ba6ed 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -294,3 +294,12 @@  int __init hardlockup_detector_perf_init(void)
 	}
 	return ret;
 }
+
+/**
+ * hardlockup_config_perf_event - Overwrite config of wd_hw_attr
+ */
+void __init hardlockup_config_perf_event(const char *str)
+{
+	if (!strncmp(str, "ref-cycles", 10))
+		wd_hw_attr.config = PERF_COUNT_HW_REF_CPU_CYCLES;
+}