[v2] rcu: Keeping rcu-related kthreads running on housekeeping CPUS

Message ID 20230209102730.974465-1-qiang1.zhang@intel.com
State New
Headers
Series [v2] rcu: Keeping rcu-related kthreads running on housekeeping CPUS |

Commit Message

Zqiang Feb. 9, 2023, 10:27 a.m. UTC
For kernels built with CONFIG_NO_HZ_FULL=y and CONFIG_RCU_NOCB_CPU=y,
when a cpulist is passed to the "isolcpus=", "nohz_full=" and "rcu_nocbs="
boot parameters, the rcu-related kthreads (including the rcu_gp, rcuog*
and rcuop* kthreads, etc.) run on housekeeping CPUs once the system has
started. However, if the cpulist contains CPU0, the result is different:
these rcu-related kthreads are restricted to running on CPU0.

Although kthread_create() is invoked to spawn the rcu-related kthreads,
and set_cpus_allowed_ptr() is invoked when they start in order to set the
allowed cpumask to housekeeping_cpumask(HK_TYPE_KTHREAD), these rcu-related
kthreads are created before the other CPUs are started. That is to say, at
this time only CPU0 is online. When these rcu-related kthreads run and try
to set their allowed cpumask to the housekeeping cpumask, if only CPU0 is
online and CPU0 is listed in the "isolcpus=", "nohz_full=" and "rcu_nocbs="
boot parameters, set_cpus_allowed_ptr() returns an error.

set_cpus_allowed_ptr()
 ->__set_cpus_allowed_ptr()
   ->__set_cpus_allowed_ptr_locked
     {
                bool kthread = p->flags & PF_KTHREAD;
                ....
                if (kthread || is_migration_disabled(p))
                        cpu_valid_mask = cpu_online_mask;
                ....
                dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, ctx->new_mask);
                if (dest_cpu >= nr_cpu_ids) {
                        ret = -EINVAL;
                        goto out;
                }
                ....
     }

At this time, only CPU0 is set in cpu_online_mask, and ctx->new_mask is
the housekeeping cpumask, which does not contain CPU0. As a result,
dest_cpu is an invalid CPU number, so set_cpus_allowed_ptr() returns
-EINVAL and fails to set the housekeeping cpumask.

This commit therefore adds additional set_cpus_allowed_ptr() calls in the
CPU hotplug path, and resets the CPU affinity of the rcuboost, rcuog and
rcuop kthreads after all other CPUs are online.

Signed-off-by: Zqiang <qiang1.zhang@intel.com>
---

 runqemu kvm slirp nographic qemuparams="-m 1024 -smp 4"
 bootparams="console=ttyS0 isolcpus=0,1 nohz_full=0,1 rcu_nocbs=0,1" -d

 root@qemux86-64:~# ps -eo pid,psr,comm | grep rcu
    3   0 rcu_gp
    6   0 kworker/0:0-rcu_gp
   12   0 rcu_tasks_kthread
   13   0 rcu_tasks_rude_kthread
   14   0 rcu_tasks_trace_kthread
   16   0 rcu_preempt
   17   0 rcuog/0
   18   0 rcuop/0
   19   2 rcub/0
   20   0 rcu_exp_gp_kthread_worker
   21   0 rcu_exp_par_gp_kthread_worker
   29   0 rcuop/1
   35   0 rcuog/2
   36   0 rcuop/2
   42   2 rcuop/3
 root@qemux86-64:~#
 root@qemux86-64:~#
 root@qemux86-64:~#
 root@qemux86-64:~# cat /proc/16/status | grep Cpus
 Cpus_allowed:   1
 Cpus_allowed_list:      0

 After applying this patch:
 
 root@qemux86-64:/# ps -eo pid,psr,comm | grep rcu
    3   0 rcu_gp
    6   0 kworker/0:0-rcu_gp
   12   3 rcu_tasks_kthread
   13   2 rcu_tasks_rude_kthread
   14   3 rcu_tasks_trace_kthread
   16   3 rcu_preempt
   17   3 rcuog/0
   18   2 rcuop/0
   19   2 rcub/0
   20   3 rcu_exp_gp_kthread_worker
   21   0 rcu_exp_par_gp_kthread_worker
   29   3 rcuop/1
   35   0 rcuog/2
   36   0 rcuop/2
   42   2 rcuop/3
 root@qemux86-64:/# cat /proc/16/status | grep Cpus
 Cpus_allowed:   c
 Cpus_allowed_list:      2-3

 kernel/rcu/rcu.h         |  4 +++
 kernel/rcu/tasks.h       | 26 +++++++++++++----
 kernel/rcu/tree.c        | 61 +++++++++++++++++++++++++++++++++++++---
 kernel/rcu/tree.h        |  1 -
 kernel/rcu/tree_nocb.h   | 18 +++++++++++-
 kernel/rcu/tree_plugin.h |  9 ------
 6 files changed, 99 insertions(+), 20 deletions(-)
  

Comments

kernel test robot Feb. 9, 2023, 12:59 p.m. UTC | #1
Hi Zqiang,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on paulmck-rcu/dev]
[also build test ERROR on next-20230209]
[cannot apply to linus/master v6.2-rc7]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Zqiang/rcu-Keeping-rcu-related-kthreads-running-on-housekeeping-CPUS/20230209-182311
base:   https://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git dev
patch link:    https://lore.kernel.org/r/20230209102730.974465-1-qiang1.zhang%40intel.com
patch subject: [PATCH v2] rcu: Keeping rcu-related kthreads running on housekeeping CPUS
config: arc-randconfig-r024-20230209 (https://download.01.org/0day-ci/archive/20230209/202302092018.kI8rty7k-lkp@intel.com/config)
compiler: arc-elf-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/af15a3ade363b21d823918088623f8564cbd9d08
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Zqiang/rcu-Keeping-rcu-related-kthreads-running-on-housekeeping-CPUS/20230209-182311
        git checkout af15a3ade363b21d823918088623f8564cbd9d08
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=arc olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=arc SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202302092018.kI8rty7k-lkp@intel.com

All errors (new ones prefixed by >>):

   arc-elf-ld: kernel/rcu/srcutiny.o: in function `rcu_kthread_setaffinity':
>> srcutiny.c:(.text+0x35c): multiple definition of `rcu_kthread_setaffinity'; kernel/rcu/update.o:update.c:(.text+0x204): first defined here
   arc-elf-ld: kernel/rcu/tiny.o: in function `rcu_kthread_setaffinity':
   tiny.c:(.text+0x164): multiple definition of `rcu_kthread_setaffinity'; kernel/rcu/update.o:update.c:(.text+0x204): first defined here
  
kernel test robot Feb. 9, 2023, 2:51 p.m. UTC | #2
Hi Zqiang,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on paulmck-rcu/dev]
[also build test WARNING on next-20230209]
[cannot apply to linus/master v6.2-rc7]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Zqiang/rcu-Keeping-rcu-related-kthreads-running-on-housekeeping-CPUS/20230209-182311
base:   https://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git dev
patch link:    https://lore.kernel.org/r/20230209102730.974465-1-qiang1.zhang%40intel.com
patch subject: [PATCH v2] rcu: Keeping rcu-related kthreads running on housekeeping CPUS
config: riscv-randconfig-r016-20230209 (https://download.01.org/0day-ci/archive/20230209/202302092209.AYjmQ4Id-lkp@intel.com/config)
compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project db0e6591612b53910a1b366863348bdb9d7d2fb1)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install riscv cross compiling tool for clang build
        # apt-get install binutils-riscv64-linux-gnu
        # https://github.com/intel-lab-lkp/linux/commit/af15a3ade363b21d823918088623f8564cbd9d08
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Zqiang/rcu-Keeping-rcu-related-kthreads-running-on-housekeeping-CPUS/20230209-182311
        git checkout af15a3ade363b21d823918088623f8564cbd9d08
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=riscv olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=riscv SHELL=/bin/bash kernel/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202302092209.AYjmQ4Id-lkp@intel.com/

All warnings (new ones prefixed by >>):

   In file included from kernel/torture.c:40:
>> kernel/rcu/rcu.h:509:6: warning: no previous prototype for function 'rcu_tasks_generic_setaffinity' [-Wmissing-prototypes]
   void rcu_tasks_generic_setaffinity(int cpu) {}
        ^
   kernel/rcu/rcu.h:509:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void rcu_tasks_generic_setaffinity(int cpu) {}
   ^
   static 
   1 warning generated.


vim +/rcu_tasks_generic_setaffinity +509 kernel/rcu/rcu.h

   484	
   485	#ifdef CONFIG_TINY_RCU
   486	/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
   487	static inline bool rcu_gp_is_normal(void) { return true; }
   488	static inline bool rcu_gp_is_expedited(void) { return false; }
   489	static inline bool rcu_async_should_hurry(void) { return false; }
   490	static inline void rcu_expedite_gp(void) { }
   491	static inline void rcu_unexpedite_gp(void) { }
   492	static inline void rcu_async_hurry(void) { }
   493	static inline void rcu_async_relax(void) { }
   494	static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
   495	#else /* #ifdef CONFIG_TINY_RCU */
   496	bool rcu_gp_is_normal(void);     /* Internal RCU use. */
   497	bool rcu_gp_is_expedited(void);  /* Internal RCU use. */
   498	bool rcu_async_should_hurry(void);  /* Internal RCU use. */
   499	void rcu_expedite_gp(void);
   500	void rcu_unexpedite_gp(void);
   501	void rcu_async_hurry(void);
   502	void rcu_async_relax(void);
   503	void rcupdate_announce_bootup_oddness(void);
   504	#ifdef CONFIG_TASKS_RCU_GENERIC
   505	void show_rcu_tasks_gp_kthreads(void);
   506	void rcu_tasks_generic_setaffinity(int cpu);
   507	#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
   508	static inline void show_rcu_tasks_gp_kthreads(void) {}
 > 509	void rcu_tasks_generic_setaffinity(int cpu) {}
   510	#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
   511	void rcu_request_urgent_qs_task(struct task_struct *t);
   512	#endif /* #else #ifdef CONFIG_TINY_RCU */
   513
  
kernel test robot Feb. 9, 2023, 3:21 p.m. UTC | #3
Hi Zqiang,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on paulmck-rcu/dev]
[also build test WARNING on next-20230209]
[cannot apply to linus/master v6.2-rc7]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Zqiang/rcu-Keeping-rcu-related-kthreads-running-on-housekeeping-CPUS/20230209-182311
base:   https://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git dev
patch link:    https://lore.kernel.org/r/20230209102730.974465-1-qiang1.zhang%40intel.com
patch subject: [PATCH v2] rcu: Keeping rcu-related kthreads running on housekeeping CPUS
config: m68k-allyesconfig (https://download.01.org/0day-ci/archive/20230209/202302092354.9PFOG9CE-lkp@intel.com/config)
compiler: m68k-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/af15a3ade363b21d823918088623f8564cbd9d08
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Zqiang/rcu-Keeping-rcu-related-kthreads-running-on-housekeeping-CPUS/20230209-182311
        git checkout af15a3ade363b21d823918088623f8564cbd9d08
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=m68k olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash kernel/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202302092354.9PFOG9CE-lkp@intel.com/

All warnings (new ones prefixed by >>):

   In file included from kernel/torture.c:40:
>> kernel/rcu/rcu.h:602:6: warning: no previous prototype for 'rcu_kthread_setaffinity' [-Wmissing-prototypes]
     602 | void rcu_kthread_setaffinity(struct task_struct *tsk, int outgoing) { }
         |      ^~~~~~~~~~~~~~~~~~~~~~~
--
   In file included from kernel/rcu/update.c:49:
>> kernel/rcu/rcu.h:602:6: warning: no previous prototype for 'rcu_kthread_setaffinity' [-Wmissing-prototypes]
     602 | void rcu_kthread_setaffinity(struct task_struct *tsk, int outgoing) { }
         |      ^~~~~~~~~~~~~~~~~~~~~~~
   In file included from kernel/rcu/update.c:649:
>> kernel/rcu/tasks.h:1970:6: warning: no previous prototype for 'rcu_tasks_generic_setaffinity' [-Wmissing-prototypes]
    1970 | void rcu_tasks_generic_setaffinity(int cpu)
         |      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~


vim +/rcu_kthread_setaffinity +602 kernel/rcu/rcu.h

   588	
   589	#ifdef CONFIG_TINY_RCU
   590	static inline bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) { return false; }
   591	static inline unsigned long rcu_get_gp_seq(void) { return 0; }
   592	static inline unsigned long rcu_exp_batches_completed(void) { return 0; }
   593	static inline unsigned long
   594	srcu_batches_completed(struct srcu_struct *sp) { return 0; }
   595	static inline void rcu_force_quiescent_state(void) { }
   596	static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { return true; }
   597	static inline void show_rcu_gp_kthreads(void) { }
   598	static inline int rcu_get_gp_kthreads_prio(void) { return 0; }
   599	static inline void rcu_fwd_progress_check(unsigned long j) { }
   600	static inline void rcu_gp_slow_register(atomic_t *rgssp) { }
   601	static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { }
 > 602	void rcu_kthread_setaffinity(struct task_struct *tsk, int outgoing) { }
   603	#else /* #ifdef CONFIG_TINY_RCU */
   604	bool rcu_dynticks_zero_in_eqs(int cpu, int *vp);
   605	unsigned long rcu_get_gp_seq(void);
   606	unsigned long rcu_exp_batches_completed(void);
   607	unsigned long srcu_batches_completed(struct srcu_struct *sp);
   608	bool rcu_check_boost_fail(unsigned long gp_state, int *cpup);
   609	void show_rcu_gp_kthreads(void);
   610	int rcu_get_gp_kthreads_prio(void);
   611	void rcu_fwd_progress_check(unsigned long j);
   612	void rcu_force_quiescent_state(void);
   613	void rcu_kthread_setaffinity(struct task_struct *tsk, int outgoing);
   614	extern struct workqueue_struct *rcu_gp_wq;
   615	#ifdef CONFIG_RCU_EXP_KTHREAD
   616	extern struct kthread_worker *rcu_exp_gp_kworker;
   617	extern struct kthread_worker *rcu_exp_par_gp_kworker;
   618	#else /* !CONFIG_RCU_EXP_KTHREAD */
   619	extern struct workqueue_struct *rcu_par_gp_wq;
   620	#endif /* CONFIG_RCU_EXP_KTHREAD */
   621	void rcu_gp_slow_register(atomic_t *rgssp);
   622	void rcu_gp_slow_unregister(atomic_t *rgssp);
   623	#endif /* #else #ifdef CONFIG_TINY_RCU */
   624
  
Frederic Weisbecker Feb. 10, 2023, 12:32 a.m. UTC | #4
On Thu, Feb 09, 2023 at 06:27:30PM +0800, Zqiang wrote:
> For kernels built with CONFIG_NO_HZ_FULL=y and CONFIG_RCU_NOCB_CPU=y,
> when passing cpulist to "isolcpus=", "nohz_full=" and "rcu_nocbs="
> bootparams, after system starting, the rcu-related kthreads(include
> rcu_gp, rcuog*, rcuop* kthreads etc) will running on housekeeping
> CPUs, but for cpulist contains CPU0, the result will deferent, these
> rcu-related kthreads will be restricted to running on CPU0.
> 
> Although invoke kthread_create() to spwan rcu-related kthreads and
> when it's starting, invoke set_cpus_allowed_ptr() to allowed cpumaks
> is housekeeping_cpumask(HK_TYPE_KTHREAD), but due to these rcu-related
> kthreads are created before starting other CPUS, that is to say, at
> this time, only CPU0 is online, when these rcu-related kthreads running
> and set allowed cpumaks is housekeeping cpumask, if find that only CPU0
> is online and CPU0 exists in "isolcpus=", "nohz_full=" and "rcu_nocbs="
> bootparams, invoke set_cpus_allowed_ptr() will return error.
> 
> set_cpus_allowed_ptr()
>  ->__set_cpus_allowed_ptr()
>    ->__set_cpus_allowed_ptr_locked
>      {
>                 bool kthread = p->flags & PF_KTHREAD;
>                 ....
>                 if (kthread || is_migration_disabled(p))
>                         cpu_valid_mask = cpu_online_mask;
>                 ....
>                 dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, ctx->new_mask);
>                 if (dest_cpu >= nr_cpu_ids) {
>                         ret = -EINVAL;
>                         goto out;
>                 }
>                 ....
>      }
> 
> At this time, only CPU0 is set in the cpu_online_mask, the ctx->new_mask
> is housekeeping cpumask and not contains CPU0, this will result dest_cpu
> is illegal cpu value, the set_cpus_allowed_ptr() will return -EINVAL and
> failed to set housekeeping cpumask.
> 
> This commit therefore add additional cpus_allowed_ptr() call in CPU hotplug
> path. and reset the CPU affinity of rcuboost, rcuog, rcuop kthreads after
> all other CPUs are online.
> 
> Signed-off-by: Zqiang <qiang1.zhang@intel.com>

Good catch! But based on that and your other fix, I suspect that
nohz_full=0... has never been seriously used.

A few points:

* This is a problem for kthreads in general. And since HK_TYPE_KTHREAD =
  HK_TYPE_RCU and both are going to be merged in the future, I think we should
  stop handling the RCU kthreads housekeeping affinity from RCU but let the
  kthread code handle that and also fix the issue from the kthread code.
  RCU boost may be an exception since we try to enforce some node locality
  within the housekeeping range.  

* If CPU 0 is isolated and it is the boot CPU, we should wait for a secondary
  CPU to boot before activating nohz_full at all. Unfortunately the nohz_full
  code is not yet ready for runtime housekeeping cpumask change but work is
  in progress (I'm saying that for 10 years...)

* I'm tempted to revert 08ae95f4fd3b (nohz_full: Allow the boot CPU to be
  nohz_full) since it doesn't work and nobody ever complained?

Thanks.
  
Zqiang Feb. 10, 2023, 5:26 a.m. UTC | #5
> For kernels built with CONFIG_NO_HZ_FULL=y and CONFIG_RCU_NOCB_CPU=y,
> when passing cpulist to "isolcpus=", "nohz_full=" and "rcu_nocbs="
> bootparams, after system starting, the rcu-related kthreads(include
> rcu_gp, rcuog*, rcuop* kthreads etc) will running on housekeeping
> CPUs, but for cpulist contains CPU0, the result will deferent, these
> rcu-related kthreads will be restricted to running on CPU0.
> 
> Although invoke kthread_create() to spwan rcu-related kthreads and
> when it's starting, invoke set_cpus_allowed_ptr() to allowed cpumaks
> is housekeeping_cpumask(HK_TYPE_KTHREAD), but due to these rcu-related
> kthreads are created before starting other CPUS, that is to say, at
> this time, only CPU0 is online, when these rcu-related kthreads running
> and set allowed cpumaks is housekeeping cpumask, if find that only CPU0
> is online and CPU0 exists in "isolcpus=", "nohz_full=" and "rcu_nocbs="
> bootparams, invoke set_cpus_allowed_ptr() will return error.
> 
> set_cpus_allowed_ptr()
>  ->__set_cpus_allowed_ptr()
>    ->__set_cpus_allowed_ptr_locked
>      {
>                 bool kthread = p->flags & PF_KTHREAD;
>                 ....
>                 if (kthread || is_migration_disabled(p))
>                         cpu_valid_mask = cpu_online_mask;
>                 ....
>                 dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, ctx->new_mask);
>                 if (dest_cpu >= nr_cpu_ids) {
>                         ret = -EINVAL;
>                         goto out;
>                 }
>                 ....
>      }
> 
> At this time, only CPU0 is set in the cpu_online_mask, the ctx->new_mask
> is housekeeping cpumask and not contains CPU0, this will result dest_cpu
> is illegal cpu value, the set_cpus_allowed_ptr() will return -EINVAL and
> failed to set housekeeping cpumask.
> 
> This commit therefore add additional cpus_allowed_ptr() call in CPU hotplug
> path. and reset the CPU affinity of rcuboost, rcuog, rcuop kthreads after
> all other CPUs are online.
> 
> Signed-off-by: Zqiang <qiang1.zhang@intel.com>
>
>Good catch! But based on that and your other fix, I suspect that
>nohz_full=0... has never been seriously used.
>
>A few points:
>
>* This is a problem for kthreads in general. And since HK_TYPE_KTHREAD =
>  HK_TYPE_RCU and both are going to be merged in the future, I think we should
>  stop handling the RCU kthreads housekeeping affinity from RCU but let the
>  kthread code handle that and also fix the issue from the kthread code.
>  RCU boost may be an exception since we try to enforce some node locality
>  within the housekeeping range.  

Agreed. Indeed, the work of setting housekeeping CPU affinity should not be handled by RCU.
Moreover, it is not only the RCU-related kthreads that are affected: other kthreads created
early in boot have the same problem.

>
>* If CPU 0 is isolated and it is the boot CPU, we should wait for a secondary
>  CPU to boot before activating nohz_full at all. Unfortunately the nohz_full
>  code is not yet ready for runtime housekeeping cpumask change but work is
>  in progress (I'm saying that for 10 years...)
>
>* I'm tempted to revert 08ae95f4fd3b (nohz_full: Allow the boot CPU to be
>  nohz_full) since it doesn't work and nobody ever complained?

Yes, if 08ae95f4fd3b is reverted, this problem will disappear.

Thanks
Zqiang

>
>Thanks.
  
Joel Fernandes Feb. 15, 2023, 2:51 p.m. UTC | #6
On Fri, Feb 10, 2023 at 12:26 AM Zhang, Qiang1 <qiang1.zhang@intel.com> wrote:
>
> > For kernels built with CONFIG_NO_HZ_FULL=y and CONFIG_RCU_NOCB_CPU=y,
> > when passing cpulist to "isolcpus=", "nohz_full=" and "rcu_nocbs="
> > bootparams, after system starting, the rcu-related kthreads(include
> > rcu_gp, rcuog*, rcuop* kthreads etc) will running on housekeeping
> > CPUs, but for cpulist contains CPU0, the result will deferent, these
> > rcu-related kthreads will be restricted to running on CPU0.
> >
> > Although invoke kthread_create() to spwan rcu-related kthreads and
> > when it's starting, invoke set_cpus_allowed_ptr() to allowed cpumaks
> > is housekeeping_cpumask(HK_TYPE_KTHREAD), but due to these rcu-related
> > kthreads are created before starting other CPUS, that is to say, at
> > this time, only CPU0 is online, when these rcu-related kthreads running
> > and set allowed cpumaks is housekeeping cpumask, if find that only CPU0
> > is online and CPU0 exists in "isolcpus=", "nohz_full=" and "rcu_nocbs="
> > bootparams, invoke set_cpus_allowed_ptr() will return error.
> >
> > set_cpus_allowed_ptr()
> >  ->__set_cpus_allowed_ptr()
> >    ->__set_cpus_allowed_ptr_locked
> >      {
> >                 bool kthread = p->flags & PF_KTHREAD;
> >                 ....
> >                 if (kthread || is_migration_disabled(p))
> >                         cpu_valid_mask = cpu_online_mask;
> >                 ....
> >                 dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, ctx->new_mask);
> >                 if (dest_cpu >= nr_cpu_ids) {
> >                         ret = -EINVAL;
> >                         goto out;
> >                 }
> >                 ....
> >      }
> >
> > At this time, only CPU0 is set in the cpu_online_mask, the ctx->new_mask
> > is housekeeping cpumask and not contains CPU0, this will result dest_cpu
> > is illegal cpu value, the set_cpus_allowed_ptr() will return -EINVAL and
> > failed to set housekeeping cpumask.
> >
> > This commit therefore add additional cpus_allowed_ptr() call in CPU hotplug
> > path. and reset the CPU affinity of rcuboost, rcuog, rcuop kthreads after
> > all other CPUs are online.
> >
> > Signed-off-by: Zqiang <qiang1.zhang@intel.com>
> >
> >Good catch! But based on that and your other fix, I suspect that
> >nohz_full=0... has never been seriously used.
> >
> >A few points:
> >
> >* This is a problem for kthreads in general. And since HK_TYPE_KTHREAD =
> >  HK_TYPE_RCU and both are going to be merged in the future, I think we should
> >  stop handling the RCU kthreads housekeeping affinity from RCU but let the
> >  kthread code handle that and also fix the issue from the kthread code.
> >  RCU boost may be an exception since we try to enforce some node locality
> >  within the housekeeping range.
>
> Agree.  indeed, these works that set housekeeping CPU affinity should not be handled by RCU,
> and not only RCU-related kthreads are affected, other kthreads created earlier also have the
> same problem.

Agreed as well.

> >* If CPU 0 is isolated and it is the boot CPU, we should wait for a secondary
> >  CPU to boot before activating nohz_full at all. Unfortunately the nohz_full
> >  code is not yet ready for runtime housekeeping cpumask change but work is
> >  in progress (I'm saying that for 10 years...)
> >
> >* I'm tempted to revert 08ae95f4fd3b (nohz_full: Allow the boot CPU to be
> >  nohz_full) since it doesn't work and nobody ever complained?
>
> Yes if remove 08ae95f4fd3b, this problem will disappear.

Just checking. So what's the next step, revert this?

Thanks.
  

Patch

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index a3adcf9a9919..1cad82e93304 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -503,8 +503,10 @@  void rcu_async_relax(void);
 void rcupdate_announce_bootup_oddness(void);
 #ifdef CONFIG_TASKS_RCU_GENERIC
 void show_rcu_tasks_gp_kthreads(void);
+void rcu_tasks_generic_setaffinity(int cpu);
 #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
 static inline void show_rcu_tasks_gp_kthreads(void) {}
+void rcu_tasks_generic_setaffinity(int cpu) {}
 #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
 void rcu_request_urgent_qs_task(struct task_struct *t);
 #endif /* #else #ifdef CONFIG_TINY_RCU */
@@ -597,6 +599,7 @@  static inline int rcu_get_gp_kthreads_prio(void) { return 0; }
 static inline void rcu_fwd_progress_check(unsigned long j) { }
 static inline void rcu_gp_slow_register(atomic_t *rgssp) { }
 static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { }
+void rcu_kthread_setaffinity(struct task_struct *tsk, int outgoing) { }
 #else /* #ifdef CONFIG_TINY_RCU */
 bool rcu_dynticks_zero_in_eqs(int cpu, int *vp);
 unsigned long rcu_get_gp_seq(void);
@@ -607,6 +610,7 @@  void show_rcu_gp_kthreads(void);
 int rcu_get_gp_kthreads_prio(void);
 void rcu_fwd_progress_check(unsigned long j);
 void rcu_force_quiescent_state(void);
+void rcu_kthread_setaffinity(struct task_struct *tsk, int outgoing);
 extern struct workqueue_struct *rcu_gp_wq;
 #ifdef CONFIG_RCU_EXP_KTHREAD
 extern struct kthread_worker *rcu_exp_gp_kworker;
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index baf7ec178155..cebc02198ef7 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -545,10 +545,6 @@  static int __noreturn rcu_tasks_kthread(void *arg)
 {
 	struct rcu_tasks *rtp = arg;
 
-	/* Run on housekeeping CPUs by default.  Sysadm can move if desired. */
-	housekeeping_affine(current, HK_TYPE_RCU);
-	WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start!
-
 	/*
 	 * Each pass through the following loop makes one check for
 	 * newly arrived callbacks, and, if there are some, waits for
@@ -586,9 +582,11 @@  static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp)
 {
 	struct task_struct *t;
 
-	t = kthread_run(rcu_tasks_kthread, rtp, "%s_kthread", rtp->kname);
+	t = kthread_create(rcu_tasks_kthread, rtp, "%s_kthread", rtp->kname);
 	if (WARN_ONCE(IS_ERR(t), "%s: Could not start %s grace-period kthread, OOM is now expected behavior\n", __func__, rtp->name))
 		return;
+	WRITE_ONCE(rtp->kthread_ptr, t);
+	wake_up_process(t);
 	smp_mb(); /* Ensure others see full kthread. */
 }
 
@@ -1969,6 +1967,24 @@  void __init rcu_init_tasks_generic(void)
 	rcu_tasks_initiate_self_tests();
 }
 
+void rcu_tasks_generic_setaffinity(int cpu)
+{
+#ifdef CONFIG_TASKS_RCU
+	if (rcu_tasks.kthread_ptr)
+		rcu_kthread_setaffinity(rcu_tasks.kthread_ptr, cpu);
+#endif
+
+#ifdef CONFIG_TASKS_RUDE_RCU
+	if (rcu_tasks_rude.kthread_ptr)
+		rcu_kthread_setaffinity(rcu_tasks_rude.kthread_ptr, cpu);
+#endif
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+	if (rcu_tasks_trace.kthread_ptr)
+		rcu_kthread_setaffinity(rcu_tasks_trace.kthread_ptr, cpu);
+#endif
+}
+
 #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
 static inline void rcu_tasks_bootup_oddness(void) {}
 #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index ee27a03d7576..d1575d74346e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -154,6 +154,8 @@  static bool rcu_rdp_cpu_online(struct rcu_data *rdp);
 static bool rcu_init_invoked(void);
 static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
 static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
+static void rcu_exp_gp_kworkers_setaffinity(int cpu);
+static void rcu_nocb_kthread_setaffinity(int cpu, int outgoing);
 
 /*
  * rcuc/rcub/rcuop kthread realtime priority. The "rcuop"
@@ -1781,7 +1783,6 @@  static noinline void rcu_gp_cleanup(void)
  */
 static int __noreturn rcu_gp_kthread(void *unused)
 {
-	rcu_bind_gp_kthread();
 	for (;;) {
 
 		/* Handle grace-period start. */
@@ -4297,6 +4298,30 @@  static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
 	rcu_boost_kthread_setaffinity(rdp->mynode, outgoing);
 }
 
+void rcu_kthread_setaffinity(struct task_struct *tsk, int outgoing)
+{
+	cpumask_var_t mask;
+
+	if (!tsk)
+		return;
+	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+		return;
+	cpumask_copy(mask, housekeeping_cpumask(HK_TYPE_RCU));
+	if (outgoing >= 0 && cpumask_test_cpu(outgoing, mask))
+		cpumask_clear_cpu(outgoing, mask);
+	cpumask_and(mask, cpu_online_mask, mask);
+	if (cpumask_empty(mask))
+		cpumask_copy(mask, cpu_possible_mask);
+	set_cpus_allowed_ptr(tsk, mask);
+	free_cpumask_var(mask);
+}
+
+static void rcu_gp_kthread_setaffinity(int cpu)
+{
+	if (rcu_state.gp_kthread)
+		rcu_kthread_setaffinity(rcu_state.gp_kthread, cpu);
+}
+
 /*
  * Near the end of the CPU-online process.  Pretty much all services
  * enabled, and the CPU is now very much alive.
@@ -4316,7 +4341,10 @@  int rcutree_online_cpu(unsigned int cpu)
 		return 0; /* Too early in boot for scheduler work. */
 	sync_sched_exp_online_cleanup(cpu);
 	rcutree_affinity_setting(cpu, -1);
-
+	rcu_gp_kthread_setaffinity(-1);
+	rcu_nocb_kthread_setaffinity(cpu, -1);
+	rcu_tasks_generic_setaffinity(-1);
+	rcu_exp_gp_kworkers_setaffinity(-1);
 	// Stop-machine done, so allow nohz_full to disable tick.
 	tick_dep_clear(TICK_DEP_BIT_RCU);
 	return 0;
@@ -4339,7 +4367,10 @@  int rcutree_offline_cpu(unsigned int cpu)
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 
 	rcutree_affinity_setting(cpu, cpu);
-
+	rcu_gp_kthread_setaffinity(cpu);
+	rcu_nocb_kthread_setaffinity(cpu, cpu);
+	rcu_tasks_generic_setaffinity(cpu);
+	rcu_exp_gp_kworkers_setaffinity(cpu);
 	// nohz_full CPUs need the tick for stop-machine to work quickly
 	tick_dep_set(TICK_DEP_BIT_RCU);
 	return 0;
@@ -4550,6 +4581,14 @@  static void __init rcu_start_exp_gp_kworkers(void)
 				   &param);
 }
 
+static void rcu_exp_gp_kworkers_setaffinity(int cpu)
+{
+	if (rcu_exp_gp_kworker)
+		rcu_kthread_setaffinity(rcu_exp_gp_kworker->task, cpu);
+	if (rcu_exp_par_gp_kworker)
+		rcu_kthread_setaffinity(rcu_exp_par_gp_kworker->task, cpu);
+}
+
 static inline void rcu_alloc_par_gp_wq(void)
 {
 }
@@ -4559,7 +4598,9 @@  struct workqueue_struct *rcu_par_gp_wq;
 static void __init rcu_start_exp_gp_kworkers(void)
 {
 }
-
+static void rcu_exp_gp_kworkers_setaffinity(int cpu)
+{
+}
 static inline void rcu_alloc_par_gp_wq(void)
 {
 	rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
@@ -4609,6 +4650,18 @@  static int __init rcu_spawn_gp_kthread(void)
 }
 early_initcall(rcu_spawn_gp_kthread);
 
+static int __init rcu_boost_resetaffinity(void)
+{
+	struct rcu_node *rnp;
+	int cpu;
+
+	rcu_for_each_leaf_node(rnp)
+		rcu_boost_kthread_setaffinity(rnp, -1);
+	for_each_possible_cpu(cpu)
+		rcu_nocb_kthread_setaffinity(cpu, -1);
+	return 0;
+}
+core_initcall(rcu_boost_resetaffinity);
 /*
  * This function is invoked towards the end of the scheduler's
  * initialization process.  Before this is called, the idle task might
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 192536916f9a..391e3fae4ff5 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -495,7 +495,6 @@  do {								\
 #define rcu_nocb_lock_irqsave(rdp, flags) local_irq_save(flags)
 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
 
-static void rcu_bind_gp_kthread(void);
 static bool rcu_nohz_full_cpu(void);
 
 /* Forward declarations for tree_stall.h */
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index f2280616f9d5..a9cd07ccf959 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1494,6 +1494,20 @@  static void rcu_spawn_cpu_nocb_kthread(int cpu)
 	mutex_unlock(&rcu_state.barrier_mutex);
 }
 
+static void rcu_nocb_kthread_setaffinity(int cpu, int outgoing)
+{
+	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+	struct rcu_data *rdp_gp;
+
+	if (rdp->nocb_cb_kthread)
+		rcu_kthread_setaffinity(rdp->nocb_cb_kthread, outgoing);
+	rdp_gp = rdp->nocb_gp_rdp;
+	mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
+	if (rdp_gp->nocb_gp_kthread)
+		rcu_kthread_setaffinity(rdp_gp->nocb_gp_kthread, outgoing);
+	mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
+}
+
 /* How many CB CPU IDs per GP kthread?  Default of -1 for sqrt(nr_cpu_ids). */
 static int rcu_nocb_gp_stride = -1;
 module_param(rcu_nocb_gp_stride, int, 0444);
@@ -1754,7 +1768,9 @@  static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
 static void rcu_spawn_cpu_nocb_kthread(int cpu)
 {
 }
-
+static void rcu_nocb_kthread_setaffinity(int cpu, int outgoing)
+{
+}
 static void show_rcu_nocb_state(struct rcu_data *rdp)
 {
 }
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 7b0fe741a088..fdde71ebb83e 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1294,12 +1294,3 @@  static bool rcu_nohz_full_cpu(void)
 	return false;
 }
 
-/*
- * Bind the RCU grace-period kthreads to the housekeeping CPU.
- */
-static void rcu_bind_gp_kthread(void)
-{
-	if (!tick_nohz_full_enabled())
-		return;
-	housekeeping_affine(current, HK_TYPE_RCU);
-}