[linux-next,RFC] torture: avoid offline tick_do_timer_cpu

Message ID 20221121035140.118651-1-zhouzhouyi@gmail.com
State New
Series [linux-next,RFC] torture: avoid offline tick_do_timer_cpu

Commit Message

Zhouyi Zhou Nov. 21, 2022, 3:51 a.m. UTC
  During CPU-hotplug torture (CONFIG_NO_HZ_FULL=y), if we try to
offline tick_do_timer_cpu, the operation will fail because in
function tick_nohz_cpu_down:
```
if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
      return -EBUSY;
```
The above bug was first discovered in torture tests performed in a PPC VM
of the Open Source Lab of Oregon State University, and is reproducible in
RISC-V and X86-64 (with the additional kernel command line cpu0_hotplug).

In this patch, we avoid offlining tick_do_timer_cpu by distributing
the choice of CPU to offline evenly among the remaining CPUs.

Signed-off-by: Zhouyi Zhou <zhouzhouyi@gmail.com>
---
 include/linux/tick.h        |  1 +
 kernel/time/tick-common.c   |  1 +
 kernel/time/tick-internal.h |  1 -
 kernel/torture.c            | 10 ++++++++++
 4 files changed, 12 insertions(+), 1 deletion(-)
  

Comments

Paul E. McKenney Nov. 22, 2022, 1:37 a.m. UTC | #1
On Mon, Nov 21, 2022 at 11:51:40AM +0800, Zhouyi Zhou wrote:
> During CPU-hotplug torture (CONFIG_NO_HZ_FULL=y), if we try to
> offline tick_do_timer_cpu, the operation will fail because in
> function tick_nohz_cpu_down:
> ```
> if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
>       return -EBUSY;
> ```
> The above bug was first discovered in torture tests performed in a PPC VM
> of the Open Source Lab of Oregon State University, and is reproducible in
> RISC-V and X86-64 (with the additional kernel command line cpu0_hotplug).
> 
> In this patch, we avoid offlining tick_do_timer_cpu by distributing
> the choice of CPU to offline evenly among the remaining CPUs.
> 
> Signed-off-by: Zhouyi Zhou <zhouzhouyi@gmail.com>

Good show chasing this down!

A couple of questions below.

> ---
>  include/linux/tick.h        |  1 +
>  kernel/time/tick-common.c   |  1 +
>  kernel/time/tick-internal.h |  1 -
>  kernel/torture.c            | 10 ++++++++++
>  4 files changed, 12 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/tick.h b/include/linux/tick.h
> index bfd571f18cfd..23cc0b205853 100644
> --- a/include/linux/tick.h
> +++ b/include/linux/tick.h
> @@ -14,6 +14,7 @@
>  #include <linux/rcupdate.h>
>  
>  #ifdef CONFIG_GENERIC_CLOCKEVENTS
> +extern int tick_do_timer_cpu __read_mostly;
>  extern void __init tick_init(void);
>  /* Should be core only, but ARM BL switcher requires it */
>  extern void tick_suspend_local(void);
> diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
> index 46789356f856..87b9b9afa320 100644
> --- a/kernel/time/tick-common.c
> +++ b/kernel/time/tick-common.c
> @@ -48,6 +48,7 @@ ktime_t tick_next_period;
>   *    procedure also covers cpu hotplug.
>   */
>  int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
> +EXPORT_SYMBOL_GPL(tick_do_timer_cpu);
>  #ifdef CONFIG_NO_HZ_FULL
>  /*
>   * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns
> diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
> index 649f2b48e8f0..8953dca10fdd 100644
> --- a/kernel/time/tick-internal.h
> +++ b/kernel/time/tick-internal.h
> @@ -15,7 +15,6 @@
>  
>  DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
>  extern ktime_t tick_next_period;
> -extern int tick_do_timer_cpu __read_mostly;
>  
>  extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
>  extern void tick_handle_periodic(struct clock_event_device *dev);
> diff --git a/kernel/torture.c b/kernel/torture.c
> index 789aeb0e1159..bccbdd33dda2 100644
> --- a/kernel/torture.c
> +++ b/kernel/torture.c
> @@ -33,6 +33,7 @@
>  #include <linux/delay.h>
>  #include <linux/stat.h>
>  #include <linux/slab.h>
> +#include <linux/tick.h>
>  #include <linux/trace_clock.h>
>  #include <linux/ktime.h>
>  #include <asm/byteorder.h>
> @@ -358,7 +359,16 @@ torture_onoff(void *arg)
>  			schedule_timeout_interruptible(HZ / 10);
>  			continue;
>  		}
> +#ifdef CONFIG_NO_HZ_FULL
> +		/* do not offline tick do timer cpu */
> +		if (tick_nohz_full_running) {
> +			cpu = (torture_random(&rand) >> 4) % maxcpu;
> +			if (cpu >= tick_do_timer_cpu)

Why is this ">=" instead of "=="?

> +				cpu = (cpu + 1) % (maxcpu + 1);
> +		} else
> +#else
>  		cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
> +#endif

What happens if the value of tick_do_timer_cpu changes between the time of
the check above and the call to torture_offline() below?  Alternatively,
how is such a change in value prevented?

							Thanx, Paul

>  		if (!torture_offline(cpu,
>  				     &n_offline_attempts, &n_offline_successes,
>  				     &sum_offline, &min_offline, &max_offline))
> -- 
> 2.34.1
>
  
Zhouyi Zhou Nov. 23, 2022, 2:23 a.m. UTC | #2
On Tue, Nov 22, 2022 at 9:37 AM Paul E. McKenney <paulmck@kernel.org> wrote:
>
> On Mon, Nov 21, 2022 at 11:51:40AM +0800, Zhouyi Zhou wrote:
> > During CPU-hotplug torture (CONFIG_NO_HZ_FULL=y), if we try to
> > offline tick_do_timer_cpu, the operation will fail because in
> > function tick_nohz_cpu_down:
> > ```
> > if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
> >       return -EBUSY;
> > ```
> > The above bug was first discovered in torture tests performed in a PPC VM
> > of the Open Source Lab of Oregon State University, and is reproducible in
> > RISC-V and X86-64 (with the additional kernel command line cpu0_hotplug).
> >
> > In this patch, we avoid offlining tick_do_timer_cpu by distributing
> > the choice of CPU to offline evenly among the remaining CPUs.
> >
> > Signed-off-by: Zhouyi Zhou <zhouzhouyi@gmail.com>
>
> Good show chasing this down!
Thank you, Paul, for your guidance and encouragement!
>
> A couple of questions below.
The answers below.
>
> > ---
> >  include/linux/tick.h        |  1 +
> >  kernel/time/tick-common.c   |  1 +
> >  kernel/time/tick-internal.h |  1 -
> >  kernel/torture.c            | 10 ++++++++++
> >  4 files changed, 12 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/linux/tick.h b/include/linux/tick.h
> > index bfd571f18cfd..23cc0b205853 100644
> > --- a/include/linux/tick.h
> > +++ b/include/linux/tick.h
> > @@ -14,6 +14,7 @@
> >  #include <linux/rcupdate.h>
> >
> >  #ifdef CONFIG_GENERIC_CLOCKEVENTS
> > +extern int tick_do_timer_cpu __read_mostly;
> >  extern void __init tick_init(void);
> >  /* Should be core only, but ARM BL switcher requires it */
> >  extern void tick_suspend_local(void);
> > diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
> > index 46789356f856..87b9b9afa320 100644
> > --- a/kernel/time/tick-common.c
> > +++ b/kernel/time/tick-common.c
> > @@ -48,6 +48,7 @@ ktime_t tick_next_period;
> >   *    procedure also covers cpu hotplug.
> >   */
> >  int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
> > +EXPORT_SYMBOL_GPL(tick_do_timer_cpu);
> >  #ifdef CONFIG_NO_HZ_FULL
> >  /*
> >   * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns
> > diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
> > index 649f2b48e8f0..8953dca10fdd 100644
> > --- a/kernel/time/tick-internal.h
> > +++ b/kernel/time/tick-internal.h
> > @@ -15,7 +15,6 @@
> >
> >  DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
> >  extern ktime_t tick_next_period;
> > -extern int tick_do_timer_cpu __read_mostly;
> >
> >  extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
> >  extern void tick_handle_periodic(struct clock_event_device *dev);
> > diff --git a/kernel/torture.c b/kernel/torture.c
> > index 789aeb0e1159..bccbdd33dda2 100644
> > --- a/kernel/torture.c
> > +++ b/kernel/torture.c
> > @@ -33,6 +33,7 @@
> >  #include <linux/delay.h>
> >  #include <linux/stat.h>
> >  #include <linux/slab.h>
> > +#include <linux/tick.h>
> >  #include <linux/trace_clock.h>
> >  #include <linux/ktime.h>
> >  #include <asm/byteorder.h>
> > @@ -358,7 +359,16 @@ torture_onoff(void *arg)
> >                       schedule_timeout_interruptible(HZ / 10);
> >                       continue;
> >               }
> > +#ifdef CONFIG_NO_HZ_FULL
> > +             /* do not offline tick do timer cpu */
> > +             if (tick_nohz_full_running) {
> > +                     cpu = (torture_random(&rand) >> 4) % maxcpu;
> > +                     if (cpu >= tick_do_timer_cpu)
>
> Why is this ">=" instead of "=="?
I use probability theory here so that the choice is distributed evenly among the remaining CPUs.
Example:
we have cpus: 0 1 2 3 4 5 6 7
maxcpu = 7
tick_do_timer_cpu = 2
remaining cpus are: 0 1 3 4 5 6 7
if the offline cpu candidate is 2, then the result cpu is 2+1
else if the offline cpu candidate is 3, then the result cpu is 3+1
...
else if the offline cpu candidate is 6, then the result cpu is 6+1
>
> > +                             cpu = (cpu + 1) % (maxcpu + 1);
we could just use cpu = cpu + 1 here
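For reference, the skip-one selection described above amounts to the
following sketch (the helper name is hypothetical, not code from the
posted patch):
```
/*
 * Sketch only: pick uniformly among the maxcpu CPUs in [0, maxcpu]
 * other than 'skip'.  Drawing from [0, maxcpu - 1] and shifting
 * candidates at or above 'skip' up by one hits each remaining CPU
 * with probability 1/maxcpu and never picks 'skip' itself.
 */
static int torture_pick_cpu_excluding(struct torture_random_state *trsp,
				      int maxcpu, int skip)
{
	int cpu = (torture_random(trsp) >> 4) % maxcpu;

	if (cpu >= skip)
		cpu++;	/* lands in [skip + 1, maxcpu], so no wraparound */
	return cpu;
}
```
Because the draw is over only maxcpu values, the increment can never
exceed maxcpu, which is why the % (maxcpu + 1) in the patch is
effectively a no-op.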
> > +             } else
> > +#else
> >               cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
> > +#endif
>
> What happens if the value of tick_do_timer_cpu changes between the time of
> the check above and the call to torture_offline() below?  Alternatively,
> how is such a change in value prevented?
I did some preliminary research on the above question; this is quite
complicated for me (because I think I must not bring locks into the
kernel just because our test framework needs them).
Please give me some days to perform intensive research.

Thanks again
Cheers
Zhouyi
>
>                                                         Thanx, Paul
>
> >               if (!torture_offline(cpu,
> >                                    &n_offline_attempts, &n_offline_successes,
> >                                    &sum_offline, &min_offline, &max_offline))
> > --
> > 2.34.1
> >
  
Paul E. McKenney Nov. 23, 2022, 6:49 p.m. UTC | #3
On Wed, Nov 23, 2022 at 10:23:11AM +0800, Zhouyi Zhou wrote:
> On Tue, Nov 22, 2022 at 9:37 AM Paul E. McKenney <paulmck@kernel.org> wrote:
> >
> > On Mon, Nov 21, 2022 at 11:51:40AM +0800, Zhouyi Zhou wrote:
> > > During CPU-hotplug torture (CONFIG_NO_HZ_FULL=y), if we try to
> > > offline tick_do_timer_cpu, the operation will fail because in
> > > function tick_nohz_cpu_down:
> > > ```
> > > if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
> > >       return -EBUSY;
> > > ```
> > > The above bug was first discovered in torture tests performed in a PPC VM
> > > of the Open Source Lab of Oregon State University, and is reproducible in
> > > RISC-V and X86-64 (with the additional kernel command line cpu0_hotplug).
> > >
> > > In this patch, we avoid offlining tick_do_timer_cpu by distributing
> > > the choice of CPU to offline evenly among the remaining CPUs.
> > >
> > > Signed-off-by: Zhouyi Zhou <zhouzhouyi@gmail.com>
> >
> > Good show chasing this down!
> Thank you, Paul, for your guidance and encouragement!
> >
> > A couple of questions below.
> The answers below.
> >
> > > ---
> > >  include/linux/tick.h        |  1 +
> > >  kernel/time/tick-common.c   |  1 +
> > >  kernel/time/tick-internal.h |  1 -
> > >  kernel/torture.c            | 10 ++++++++++
> > >  4 files changed, 12 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/include/linux/tick.h b/include/linux/tick.h
> > > index bfd571f18cfd..23cc0b205853 100644
> > > --- a/include/linux/tick.h
> > > +++ b/include/linux/tick.h
> > > @@ -14,6 +14,7 @@
> > >  #include <linux/rcupdate.h>
> > >
> > >  #ifdef CONFIG_GENERIC_CLOCKEVENTS
> > > +extern int tick_do_timer_cpu __read_mostly;
> > >  extern void __init tick_init(void);
> > >  /* Should be core only, but ARM BL switcher requires it */
> > >  extern void tick_suspend_local(void);
> > > diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
> > > index 46789356f856..87b9b9afa320 100644
> > > --- a/kernel/time/tick-common.c
> > > +++ b/kernel/time/tick-common.c
> > > @@ -48,6 +48,7 @@ ktime_t tick_next_period;
> > >   *    procedure also covers cpu hotplug.
> > >   */
> > >  int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
> > > +EXPORT_SYMBOL_GPL(tick_do_timer_cpu);
> > >  #ifdef CONFIG_NO_HZ_FULL
> > >  /*
> > >   * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns
> > > diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
> > > index 649f2b48e8f0..8953dca10fdd 100644
> > > --- a/kernel/time/tick-internal.h
> > > +++ b/kernel/time/tick-internal.h
> > > @@ -15,7 +15,6 @@
> > >
> > >  DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
> > >  extern ktime_t tick_next_period;
> > > -extern int tick_do_timer_cpu __read_mostly;
> > >
> > >  extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
> > >  extern void tick_handle_periodic(struct clock_event_device *dev);
> > > diff --git a/kernel/torture.c b/kernel/torture.c
> > > index 789aeb0e1159..bccbdd33dda2 100644
> > > --- a/kernel/torture.c
> > > +++ b/kernel/torture.c
> > > @@ -33,6 +33,7 @@
> > >  #include <linux/delay.h>
> > >  #include <linux/stat.h>
> > >  #include <linux/slab.h>
> > > +#include <linux/tick.h>
> > >  #include <linux/trace_clock.h>
> > >  #include <linux/ktime.h>
> > >  #include <asm/byteorder.h>
> > > @@ -358,7 +359,16 @@ torture_onoff(void *arg)
> > >                       schedule_timeout_interruptible(HZ / 10);
> > >                       continue;
> > >               }
> > > +#ifdef CONFIG_NO_HZ_FULL
> > > +             /* do not offline tick do timer cpu */
> > > +             if (tick_nohz_full_running) {
> > > +                     cpu = (torture_random(&rand) >> 4) % maxcpu;
> > > +                     if (cpu >= tick_do_timer_cpu)
> >
> > Why is this ">=" instead of "=="?
> I use probability theory here so that the choice is distributed evenly among the remaining CPUs.
> Example:
> we have cpus: 0 1 2 3 4 5 6 7
> maxcpu = 7
> tick_do_timer_cpu = 2
> remaining cpus are: 0 1 3 4 5 6 7
> if the offline cpu candidate is 2, then the result cpu is 2+1
> else if the offline cpu candidate is 3, then the result cpu is 3+1
> ...
> else if the offline cpu candidate is 6, then the result cpu is 6+1
> >
> > > +                             cpu = (cpu + 1) % (maxcpu + 1);
> we could just use cpu = cpu + 1 here

But won't this get you double the occurrences of CPU 0 compared to the
other non-tick_do_timer_cpu CPUs?  You might get CPU 0 directly from
torture_random(), or torture_random() might have given you CPU 7, which
then wraps to CPU 0.

What am I missing here?

> > > +             } else
> > > +#else
> > >               cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
> > > +#endif
> >
> > What happens if the value of tick_do_timer_cpu changes between the time of
> > the check above and the call to torture_offline() below?  Alternatively,
> > how is such a change in value prevented?
> I did some preliminary research on the above question; this is quite
> complicated for me (because I think I must not bring locks into the
> kernel just because our test framework needs them).

Agreed, it would be good to avoid added locks.

> Please give me some days to perform intensive research.

No problem, in fact, please do take the time you need for this.
As you say, it is not as simple as one might think.

							Thanx, Paul

> Thanks again
> Cheers
> Zhouyi
> >
> >                                                         Thanx, Paul
> >
> > >               if (!torture_offline(cpu,
> > >                                    &n_offline_attempts, &n_offline_successes,
> > >                                    &sum_offline, &min_offline, &max_offline))
> > > --
> > > 2.34.1
> > >
  
Frederic Weisbecker Nov. 23, 2022, 10:25 p.m. UTC | #4
On Mon, Nov 21, 2022 at 05:37:54PM -0800, Paul E. McKenney wrote:
> On Mon, Nov 21, 2022 at 11:51:40AM +0800, Zhouyi Zhou wrote:
> > @@ -358,7 +359,16 @@ torture_onoff(void *arg)
> >  			schedule_timeout_interruptible(HZ / 10);
> >  			continue;
> >  		}
> > +#ifdef CONFIG_NO_HZ_FULL
> > +		/* do not offline tick do timer cpu */
> > +		if (tick_nohz_full_running) {
> > +			cpu = (torture_random(&rand) >> 4) % maxcpu;
> > +			if (cpu >= tick_do_timer_cpu)
> 
> Why is this ">=" instead of "=="?
> 
> > +				cpu = (cpu + 1) % (maxcpu + 1);
> > +		} else
> > +#else
> >  		cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
> > +#endif
> 
> What happens if the value of tick_do_timer_cpu changes between the time of
> the check above and the call to torture_offline() below?  Alternatively,
> how is such a change in value prevented?

It can't; currently tick_do_timer_cpu is fixed when nohz_full is running.
It can however have special values at early boot such as TICK_DO_TIMER_NONE.
But if rcutorture is initialized after smp, it should be ok.

Thanks.

> 
> 							Thanx, Paul
> 
> >  		if (!torture_offline(cpu,
> >  				     &n_offline_attempts, &n_offline_successes,
> >  				     &sum_offline, &min_offline, &max_offline))
> > -- 
> > 2.34.1
> >
  
Frederic Weisbecker Nov. 23, 2022, 10:36 p.m. UTC | #5
On Mon, Nov 21, 2022 at 11:51:40AM +0800, Zhouyi Zhou wrote:
> During CPU-hotplug torture (CONFIG_NO_HZ_FULL=y), if we try to
> offline tick_do_timer_cpu, the operation will fail because in
> function tick_nohz_cpu_down:
> ```
> if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
>       return -EBUSY;
> ```
> The above bug was first discovered in torture tests performed in a PPC VM
> of the Open Source Lab of Oregon State University, and is reproducible in
> RISC-V and X86-64 (with the additional kernel command line cpu0_hotplug).
> 
> In this patch, we avoid offlining tick_do_timer_cpu by distributing
> the choice of CPU to offline evenly among the remaining CPUs.
> 
> Signed-off-by: Zhouyi Zhou <zhouzhouyi@gmail.com>
> ---
>  include/linux/tick.h        |  1 +
>  kernel/time/tick-common.c   |  1 +
>  kernel/time/tick-internal.h |  1 -
>  kernel/torture.c            | 10 ++++++++++
>  4 files changed, 12 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/tick.h b/include/linux/tick.h
> index bfd571f18cfd..23cc0b205853 100644
> --- a/include/linux/tick.h
> +++ b/include/linux/tick.h
> @@ -14,6 +14,7 @@
>  #include <linux/rcupdate.h>
>  
>  #ifdef CONFIG_GENERIC_CLOCKEVENTS
> +extern int tick_do_timer_cpu __read_mostly;
>  extern void __init tick_init(void);
>  /* Should be core only, but ARM BL switcher requires it */
>  extern void tick_suspend_local(void);
> diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
> index 46789356f856..87b9b9afa320 100644
> --- a/kernel/time/tick-common.c
> +++ b/kernel/time/tick-common.c
> @@ -48,6 +48,7 @@ ktime_t tick_next_period;
>   *    procedure also covers cpu hotplug.
>   */
>  int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
> +EXPORT_SYMBOL_GPL(tick_do_timer_cpu);

Please make a function for this instead. This is an internal value
that we don't want to expose to modules.

This can be:

     int tick_nohz_full_timekeeper(void)
     {
         if (tick_nohz_full_enabled() && tick_do_timer_cpu >= 0)
             return tick_do_timer_cpu;
         else
             return nr_cpu_ids;
     }

And then just check if the value is below nr_cpu_ids.
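A sketch of how torture_onoff() might then consume this, inside its
existing selection loop (tick_nohz_full_timekeeper() being the helper
proposed above, not an existing API):
```
cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
if (cpu == tick_nohz_full_timekeeper())
	continue;	/* never attempt to offline the timekeeping CPU */
```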

Thanks.
  
Paul E. McKenney Nov. 23, 2022, 11 p.m. UTC | #6
On Wed, Nov 23, 2022 at 11:25:43PM +0100, Frederic Weisbecker wrote:
> On Mon, Nov 21, 2022 at 05:37:54PM -0800, Paul E. McKenney wrote:
> > On Mon, Nov 21, 2022 at 11:51:40AM +0800, Zhouyi Zhou wrote:
> > > @@ -358,7 +359,16 @@ torture_onoff(void *arg)
> > >  			schedule_timeout_interruptible(HZ / 10);
> > >  			continue;
> > >  		}
> > > +#ifdef CONFIG_NO_HZ_FULL
> > > +		/* do not offline tick do timer cpu */
> > > +		if (tick_nohz_full_running) {
> > > +			cpu = (torture_random(&rand) >> 4) % maxcpu;
> > > +			if (cpu >= tick_do_timer_cpu)
> > 
> > Why is this ">=" instead of "=="?
> > 
> > > +				cpu = (cpu + 1) % (maxcpu + 1);
> > > +		} else
> > > +#else
> > >  		cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
> > > +#endif
> > 
> > What happens if the value of tick_do_timer_cpu changes between the time of
> > the check above and the call to torture_offline() below?  Alternatively,
> > how is such a change in value prevented?
> 
> It can't; currently tick_do_timer_cpu is fixed when nohz_full is running.
> It can however have special values at early boot such as TICK_DO_TIMER_NONE.
> But if rcutorture is initialized after smp, it should be ok.

Ah, getting ahead of myself, thank you for the info!

So the thing to do would be to generate only maxcpu-1 choices.

							Thanx, Paul

> Thanks.
> 
> > 
> > 							Thanx, Paul
> > 
> > >  		if (!torture_offline(cpu,
> > >  				     &n_offline_attempts, &n_offline_successes,
> > >  				     &sum_offline, &min_offline, &max_offline))
> > > -- 
> > > 2.34.1
> > >
  
Zhouyi Zhou Nov. 24, 2022, 2:18 a.m. UTC | #7
On Thu, Nov 24, 2022 at 6:37 AM Frederic Weisbecker <frederic@kernel.org> wrote:
>
> On Mon, Nov 21, 2022 at 11:51:40AM +0800, Zhouyi Zhou wrote:
> > During CPU-hotplug torture (CONFIG_NO_HZ_FULL=y), if we try to
> > offline tick_do_timer_cpu, the operation will fail because in
> > function tick_nohz_cpu_down:
> > ```
> > if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
> >       return -EBUSY;
> > ```
> > The above bug was first discovered in torture tests performed in a PPC VM
> > of the Open Source Lab of Oregon State University, and is reproducible in
> > RISC-V and X86-64 (with the additional kernel command line cpu0_hotplug).
> >
> > In this patch, we avoid offlining tick_do_timer_cpu by distributing
> > the choice of CPU to offline evenly among the remaining CPUs.
> >
> > Signed-off-by: Zhouyi Zhou <zhouzhouyi@gmail.com>
> > ---
> >  include/linux/tick.h        |  1 +
> >  kernel/time/tick-common.c   |  1 +
> >  kernel/time/tick-internal.h |  1 -
> >  kernel/torture.c            | 10 ++++++++++
> >  4 files changed, 12 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/linux/tick.h b/include/linux/tick.h
> > index bfd571f18cfd..23cc0b205853 100644
> > --- a/include/linux/tick.h
> > +++ b/include/linux/tick.h
> > @@ -14,6 +14,7 @@
> >  #include <linux/rcupdate.h>
> >
> >  #ifdef CONFIG_GENERIC_CLOCKEVENTS
> > +extern int tick_do_timer_cpu __read_mostly;
> >  extern void __init tick_init(void);
> >  /* Should be core only, but ARM BL switcher requires it */
> >  extern void tick_suspend_local(void);
> > diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
> > index 46789356f856..87b9b9afa320 100644
> > --- a/kernel/time/tick-common.c
> > +++ b/kernel/time/tick-common.c
> > @@ -48,6 +48,7 @@ ktime_t tick_next_period;
> >   *    procedure also covers cpu hotplug.
> >   */
> >  int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
> > +EXPORT_SYMBOL_GPL(tick_do_timer_cpu);
>
> Please make a function for this instead. This is an internal value
> that we don't want to expose to modules.
>
> This can be:
>
>      int tick_nohz_full_timekeeper(void)
>      {
>          if (tick_nohz_full_enabled() && tick_do_timer_cpu >= 0)
>              return tick_do_timer_cpu;
>          else
>              return nr_cpu_ids;
>      }
>
> And then just check if the value is below nr_cpu_ids.
Thank you, Paul and Frederic, for your guidance!

Things are much easier ;-) and I will do it.

Cheers
Zhouyi
>
> Thanks.
  
Zhouyi Zhou Nov. 24, 2022, 2:35 a.m. UTC | #8
On Thu, Nov 24, 2022 at 2:49 AM Paul E. McKenney <paulmck@kernel.org> wrote:
>
> On Wed, Nov 23, 2022 at 10:23:11AM +0800, Zhouyi Zhou wrote:
> > On Tue, Nov 22, 2022 at 9:37 AM Paul E. McKenney <paulmck@kernel.org> wrote:
> > >
> > > On Mon, Nov 21, 2022 at 11:51:40AM +0800, Zhouyi Zhou wrote:
> > > > During CPU-hotplug torture (CONFIG_NO_HZ_FULL=y), if we try to
> > > > offline tick_do_timer_cpu, the operation will fail because in
> > > > function tick_nohz_cpu_down:
> > > > ```
> > > > if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
> > > >       return -EBUSY;
> > > > ```
> > > > The above bug was first discovered in torture tests performed in a PPC VM
> > > > of the Open Source Lab of Oregon State University, and is reproducible in
> > > > RISC-V and X86-64 (with the additional kernel command line cpu0_hotplug).
> > > >
> > > > In this patch, we avoid offlining tick_do_timer_cpu by distributing
> > > > the choice of CPU to offline evenly among the remaining CPUs.
> > > >
> > > > Signed-off-by: Zhouyi Zhou <zhouzhouyi@gmail.com>
> > >
> > > Good show chasing this down!
> > Thank you, Paul, for your guidance and encouragement!
> > >
> > > A couple of questions below.
> > The answers below.
> > >
> > > > ---
> > > >  include/linux/tick.h        |  1 +
> > > >  kernel/time/tick-common.c   |  1 +
> > > >  kernel/time/tick-internal.h |  1 -
> > > >  kernel/torture.c            | 10 ++++++++++
> > > >  4 files changed, 12 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/include/linux/tick.h b/include/linux/tick.h
> > > > index bfd571f18cfd..23cc0b205853 100644
> > > > --- a/include/linux/tick.h
> > > > +++ b/include/linux/tick.h
> > > > @@ -14,6 +14,7 @@
> > > >  #include <linux/rcupdate.h>
> > > >
> > > >  #ifdef CONFIG_GENERIC_CLOCKEVENTS
> > > > +extern int tick_do_timer_cpu __read_mostly;
> > > >  extern void __init tick_init(void);
> > > >  /* Should be core only, but ARM BL switcher requires it */
> > > >  extern void tick_suspend_local(void);
> > > > diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
> > > > index 46789356f856..87b9b9afa320 100644
> > > > --- a/kernel/time/tick-common.c
> > > > +++ b/kernel/time/tick-common.c
> > > > @@ -48,6 +48,7 @@ ktime_t tick_next_period;
> > > >   *    procedure also covers cpu hotplug.
> > > >   */
> > > >  int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
> > > > +EXPORT_SYMBOL_GPL(tick_do_timer_cpu);
> > > >  #ifdef CONFIG_NO_HZ_FULL
> > > >  /*
> > > >   * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns
> > > > diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
> > > > index 649f2b48e8f0..8953dca10fdd 100644
> > > > --- a/kernel/time/tick-internal.h
> > > > +++ b/kernel/time/tick-internal.h
> > > > @@ -15,7 +15,6 @@
> > > >
> > > >  DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
> > > >  extern ktime_t tick_next_period;
> > > > -extern int tick_do_timer_cpu __read_mostly;
> > > >
> > > >  extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
> > > >  extern void tick_handle_periodic(struct clock_event_device *dev);
> > > > diff --git a/kernel/torture.c b/kernel/torture.c
> > > > index 789aeb0e1159..bccbdd33dda2 100644
> > > > --- a/kernel/torture.c
> > > > +++ b/kernel/torture.c
> > > > @@ -33,6 +33,7 @@
> > > >  #include <linux/delay.h>
> > > >  #include <linux/stat.h>
> > > >  #include <linux/slab.h>
> > > > +#include <linux/tick.h>
> > > >  #include <linux/trace_clock.h>
> > > >  #include <linux/ktime.h>
> > > >  #include <asm/byteorder.h>
> > > > @@ -358,7 +359,16 @@ torture_onoff(void *arg)
> > > >                       schedule_timeout_interruptible(HZ / 10);
> > > >                       continue;
> > > >               }
> > > > +#ifdef CONFIG_NO_HZ_FULL
> > > > +             /* do not offline tick do timer cpu */
> > > > +             if (tick_nohz_full_running) {
> > > > +                     cpu = (torture_random(&rand) >> 4) % maxcpu;
By examining the code at the beginning of torture_onoff, I see that if
we have 8 CPUs, then maxcpu = 7 (not 8) here,
so cpu is distributed evenly among 0, 1, 2, 3, 4, 5, 6;
if we happen to get 6, then cpu+1 below results in 7
> > > > +                     if (cpu >= tick_do_timer_cpu)
> > >
> > > Why is this ">=" instead of "=="?
> > I use probability theory here so that the choice is distributed evenly among the remaining CPUs.
> > Example:
> > we have cpus: 0 1 2 3 4 5 6 7
> > maxcpu = 7
> > tick_do_timer_cpu = 2
> > remaining cpus are: 0 1 3 4 5 6 7
> > if the offline cpu candidate is 2, then the result cpu is 2+1
> > else if the offline cpu candidate is 3, then the result cpu is 3+1
> > ...
> > else if the offline cpu candidate is 6, then the result cpu is 6+1
> > >
> > > > +                             cpu = (cpu + 1) % (maxcpu + 1);
> > we could just use cpu = cpu + 1 here
>
> But won't this get you double the occurrences of CPU 0 compared to the
> other non-tick_do_timer_cpu CPUs?  You might get CPU 0 directly from
> torture_random(), or torture_random() might have given you CPU 7, which
> then wraps to CPU 0.
I think torture_random won't give me CPU 7, as illustrated above;
my code is a little tricky, so please correct me if I am wrong.
>
> What am I missing here?
>
> > > > +             } else
> > > > +#else
> > > >               cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
> > > > +#endif
> > >
> > > What happens if the value of tick_do_timer_cpu changes between the time of
> > > the check above and the call to torture_offline() below?  Alternatively,
> > > how is such a change in value prevented?
> > I did some preliminary research on the above question; this is quite
> > complicated for me (because I think I must not bring locks into the
> > kernel just because our test framework needs them).
>
> Agreed, it would be good to avoid added locks.
>
> > Please give me some days to perform intensive research.
>
> No problem, in fact, please do take the time you need for this.
> As you say, it is not as simple as one might think.
Thank you, Paul, for your constant encouragement and guidance!
I have improved a lot during this learning process.

Cheers
Zhouyi
>
>                                                         Thanx, Paul
>
> > Thanks again
> > Cheers
> > Zhouyi
> > >
> > >                                                         Thanx, Paul
> > >
> > > >               if (!torture_offline(cpu,
> > > >                                    &n_offline_attempts, &n_offline_successes,
> > > >                                    &sum_offline, &min_offline, &max_offline))
> > > > --
> > > > 2.34.1
> > > >
  
Thomas Gleixner Nov. 26, 2022, 5:05 p.m. UTC | #9
On Mon, Nov 21 2022 at 11:51, Zhouyi Zhou wrote:
> During CPU-hotplug torture (CONFIG_NO_HZ_FULL=y), if we try to
> offline tick_do_timer_cpu, the operation will fail because in
> function tick_nohz_cpu_down:
> ```
> if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
>       return -EBUSY;
> ```
> The above bug was first discovered in torture tests performed in a PPC VM

How is this a bug?

> of the Open Source Lab of Oregon State University, and is reproducible in
> RISC-V and X86-64 (with the additional kernel command line cpu0_hotplug).
>
> In this patch, we avoid offlining tick_do_timer_cpu by distributing
> the choice of CPU to offline evenly among the remaining CPUs.

Please read Documentation/process. Search for 'this patch'...

>
> Signed-off-by: Zhouyi Zhou <zhouzhouyi@gmail.com>
> ---
>  include/linux/tick.h        |  1 +
>  kernel/time/tick-common.c   |  1 +
>  kernel/time/tick-internal.h |  1 -
>  kernel/torture.c            | 10 ++++++++++
>  4 files changed, 12 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/tick.h b/include/linux/tick.h
> index bfd571f18cfd..23cc0b205853 100644
> --- a/include/linux/tick.h
> +++ b/include/linux/tick.h
> @@ -14,6 +14,7 @@
>  #include <linux/rcupdate.h>
>  
>  #ifdef CONFIG_GENERIC_CLOCKEVENTS
> +extern int tick_do_timer_cpu __read_mostly;
>  extern void __init tick_init(void);
>  /* Should be core only, but ARM BL switcher requires it */
>  extern void tick_suspend_local(void);
> diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
> index 46789356f856..87b9b9afa320 100644
> --- a/kernel/time/tick-common.c
> +++ b/kernel/time/tick-common.c
> @@ -48,6 +48,7 @@ ktime_t tick_next_period;
>   *    procedure also covers cpu hotplug.
>   */
>  int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
> +EXPORT_SYMBOL_GPL(tick_do_timer_cpu);

No. We are not exporting this just to make a bogus test case happy.

Fix the torture code to handle -EBUSY correctly.
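A minimal sketch of that direction, loosely based on torture_offline()
in kernel/torture.c (statistics updates elided), treating a refused
offline as a logged, non-fatal outcome:
```
ret = remove_cpu(cpu);
if (ret) {
	/*
	 * A failed offline (e.g. -EBUSY from the nohz_full
	 * timekeeping CPU) is not fatal: log it, keep torturing, and
	 * verify that the system stays functional afterwards.
	 */
	if (verbose)
		pr_alert("%s" TORTURE_FLAG
			 "torture_onoff task: offline %d failed: errno %d\n",
			 torture_type, cpu, ret);
}
```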

Thanks,

        tglx
  
Zhouyi Zhou Nov. 27, 2022, 2:45 a.m. UTC | #10
Thank you, Thomas, for your guidance.

On Sun, Nov 27, 2022 at 1:05 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> On Mon, Nov 21 2022 at 11:51, Zhouyi Zhou wrote:
> > During CPU-hotplug torture (CONFIG_NO_HZ_FULL=y), if we try to
> > offline tick_do_timer_cpu, the operation will fail because in
> > function tick_nohz_cpu_down:
> > ```
> > if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
> >       return -EBUSY;
> > ```
> > The above bug was first discovered in torture tests performed in a PPC VM
>
> How is this a bug?
Yes, this is a false positive rather than a bug.
>
> > of the Open Source Lab of Oregon State University, and is reproducible in
> > RISC-V and X86-64 (with the additional kernel command line cpu0_hotplug).
> >
> > In this patch, we avoid offlining tick_do_timer_cpu by distributing
> > the choice of CPU to offline evenly among the remaining CPUs.
>
> Please read Documentation/process. Search for 'this patch'...
Documentation/process/submitting-patches.rst says:
"Describe your changes in imperative mood, e.g. "make xyzzy do frotz"
instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy
to do frotz", as if you are giving orders to the codebase to change
its behaviour."

So, I should construct my patch as:
We avoid ... by ...
>
> >
> > Signed-off-by: Zhouyi Zhou <zhouzhouyi@gmail.com>
> > ---
> >  include/linux/tick.h        |  1 +
> >  kernel/time/tick-common.c   |  1 +
> >  kernel/time/tick-internal.h |  1 -
> >  kernel/torture.c            | 10 ++++++++++
> >  4 files changed, 12 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/linux/tick.h b/include/linux/tick.h
> > index bfd571f18cfd..23cc0b205853 100644
> > --- a/include/linux/tick.h
> > +++ b/include/linux/tick.h
> > @@ -14,6 +14,7 @@
> >  #include <linux/rcupdate.h>
> >
> >  #ifdef CONFIG_GENERIC_CLOCKEVENTS
> > +extern int tick_do_timer_cpu __read_mostly;
> >  extern void __init tick_init(void);
> >  /* Should be core only, but ARM BL switcher requires it */
> >  extern void tick_suspend_local(void);
> > diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
> > index 46789356f856..87b9b9afa320 100644
> > --- a/kernel/time/tick-common.c
> > +++ b/kernel/time/tick-common.c
> > @@ -48,6 +48,7 @@ ktime_t tick_next_period;
> >   *    procedure also covers cpu hotplug.
> >   */
> >  int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
> > +EXPORT_SYMBOL_GPL(tick_do_timer_cpu);
>
> No. We are not exporting this just to make a bogus test case happy.
>
> Fix the torture code to handle -EBUSY correctly.
I am going to study this. For now, I ran a grep over the kernel tree:
find . -name "*.c"|xargs grep cpuhp_setup_state|wc -l
The result shows that there are 268 cpuhp_setup_state* call sites,
which may make our task more complicated.

After my study, should we also consider Frederic's proposal
(constructing a function for this) as a possible option?
https://lore.kernel.org/lkml/20221123223658.GC1395324@lothringen/

I learned a lot during this process
Many thanks
Zhouyi
>
> Thanks,
>
>         tglx
  
Thomas Gleixner Nov. 27, 2022, 12:40 p.m. UTC | #11
Zhouyi,

On Sun, Nov 27 2022 at 10:45, Zhouyi Zhou wrote:
> On Sun, Nov 27, 2022 at 1:05 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> So, I should construct my patch as:
> We avoid ... by ...

Not "We avoid".

Avoid this behaviour by ....

>> No. We are not exporting this just to make a bogus test case happy.
>>
>> Fix the torture code to handle -EBUSY correctly.
> I am going to study this. For now, I ran a grep over the kernel tree:
> find . -name "*.c"|xargs grep cpuhp_setup_state|wc -l
> The result shows that there are 268 cpuhp_setup_state* call sites,
> which may make our task more complicated.

Why? The whole point of this torture thing is to stress the
infrastructure.

There are quite some reasons why a CPU-hotplug or a hot-unplug operation
can fail, which is not a fatal problem, really.

So if a CPU hotplug operation fails, then why can't the torture test
just move on and validate that the system still behaves correctly?

That gives us more coverage than just testing the good case and giving
up when something unexpected happens.

I even argue that the torture test should inject random failures into
the hotplug state machine to achieve extended code coverage.

Thanks,

        tglx
  
Paul E. McKenney Nov. 27, 2022, 5:53 p.m. UTC | #12
On Sun, Nov 27, 2022 at 01:40:28PM +0100, Thomas Gleixner wrote:

[ . . . ]

> >> No. We are not exporting this just to make a bogus test case happy.
> >>
> >> Fix the torture code to handle -EBUSY correctly.
> > I am going to study this. For now, I ran a grep over the kernel tree:
> > find . -name "*.c"|xargs grep cpuhp_setup_state|wc -l
> > The result shows that there are 268 cpuhp_setup_state* call sites,
> > which may make our task more complicated.
> 
> Why? The whole point of this torture thing is to stress the
> infrastructure.

Indeed.

> There are quite some reasons why a CPU-hotplug or a hot-unplug operation
> can fail, which is not a fatal problem, really.
> 
> So if a CPU hotplug operation fails, then why can't the torture test
> just move on and validate that the system still behaves correctly?
> 
> That gives us more coverage than just testing the good case and giving
> up when something unexpected happens.

Agreed, with access to a function like the tick_nohz_full_timekeeper()
suggested earlier in this email thread, then yes, it would make sense to
try to offline the CPU anyway, then forgive the failure in cases where
the CPU matches that indicated by tick_nohz_full_timekeeper().
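In code, that forgiveness might look like this sketch (again assuming
the proposed tick_nohz_full_timekeeper() helper, which is not yet an
existing API):
```
ret = remove_cpu(cpu);
if (ret == -EBUSY && cpu == tick_nohz_full_timekeeper())
	ret = 0;	/* expected refusal by the timekeeping CPU */
```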

> I even argue that the torture test should inject random failures into
> the hotplug state machine to achieve extended code coverage.

I could imagine torture_onoff() telling various CPU-hotplug notifiers
to refuse the transition using some TBD interface.  That would better
test the CPU-hotplug common code's ability to deal with failures.

Or did you have something else/additional in mind?

							Thanx, Paul
  
Zhouyi Zhou Nov. 28, 2022, 3 a.m. UTC | #13
Thank you all for your guidance and encouragement!

I have learned how to construct a commit message properly, and how
important a role the torture test framework plays for the Linux kernel.
I hope my work can benefit the community.

I am going to continue studying this topic and the torture test
framework, and will wait for your further instructions.

Best Regards
Zhouyi
On Mon, Nov 28, 2022 at 1:53 AM Paul E. McKenney <paulmck@kernel.org> wrote:
>
> On Sun, Nov 27, 2022 at 01:40:28PM +0100, Thomas Gleixner wrote:
>
> [ . . . ]
>
> > >> No. We are not exporting this just to make a bogus test case happy.
> > >>
> > >> Fix the torture code to handle -EBUSY correctly.
> > > I am going to study this. For now, I ran a grep over the kernel tree:
> > > find . -name "*.c"|xargs grep cpuhp_setup_state|wc -l
> > > The result shows that there are 268 cpuhp_setup_state* call sites,
> > > which may make our task more complicated.
> >
> > Why? The whole point of this torture thing is to stress the
> > infrastructure.
>
> Indeed.
>
> > There are quite some reasons why a CPU-hotplug or a hot-unplug operation
> > can fail, which is not a fatal problem, really.
> >
> > So if a CPU hotplug operation fails, then why can't the torture test
> > just move on and validate that the system still behaves correctly?
> >
> > That gives us more coverage than just testing the good case and giving
> > up when something unexpected happens.
>
> Agreed, with access to a function like the tick_nohz_full_timekeeper()
> suggested earlier in this email thread, then yes, it would make sense to
> try to offline the CPU anyway, then forgive the failure in cases where
> the CPU matches that indicated by tick_nohz_full_timekeeper().
>
> > I even argue that the torture test should inject random failures into
> > the hotplug state machine to achieve extended code coverage.
>
> I could imagine torture_onoff() telling various CPU-hotplug notifiers
> to refuse the transition using some TBD interface.  That would better
> test the CPU-hotplug common code's ability to deal with failures.
>
> Or did you have something else/additional in mind?
>
>                                                         Thanx, Paul
  
Thomas Gleixner Nov. 28, 2022, 8:12 a.m. UTC | #14
On Sun, Nov 27 2022 at 09:53, Paul E. McKenney wrote:
> On Sun, Nov 27, 2022 at 01:40:28PM +0100, Thomas Gleixner wrote:
>> There are quite some reasons why a CPU-hotplug or a hot-unplug operation
>> can fail, which is not a fatal problem, really.
>> 
>> So if a CPU hotplug operation fails, then why can't the torture test
>> just move on and validate that the system still behaves correctly?
>> 
>> That gives us more coverage than just testing the good case and giving
>> up when something unexpected happens.
>
> Agreed, with access to a function like the tick_nohz_full_timekeeper()
> suggested earlier in this email thread, then yes, it would make sense to
> try to offline the CPU anyway, then forgive the failure in cases where
> the CPU matches that indicated by tick_nohz_full_timekeeper().

Why special-case this? There are other valid reasons why offlining can
fail. So we special-case the timekeeper today and then next week we
special-case something else, just because. That does not make sense. If it
fails, there is a reason and you can log it. The important part is that the
system is functional and stable after the failure and the rollback.

>> I even argue that the torture test should inject random failures into
>> the hotplug state machine to achieve extended code coverage.
>
> I could imagine torture_onoff() telling various CPU-hotplug notifiers
> to refuse the transition using some TBD interface.

There is already an interface which is exposed to sysfs which allows you
to enforce a "fail" at a defined hotplug state.

> That would better test the CPU-hotplug common code's ability to deal
> with failures.

Correct.

> Or did you have something else/additional in mind?

No.

Thanks,

        tglx
  
Paul E. McKenney Nov. 28, 2022, 3:16 p.m. UTC | #15
On Mon, Nov 28, 2022 at 09:12:28AM +0100, Thomas Gleixner wrote:
> On Sun, Nov 27 2022 at 09:53, Paul E. McKenney wrote:
> > On Sun, Nov 27, 2022 at 01:40:28PM +0100, Thomas Gleixner wrote:
> >> There are quite some reasons why a CPU-hotplug or a hot-unplug operation
> >> can fail, which is not a fatal problem, really.
> >> 
> >> So if a CPU hotplug operation fails, then why can't the torture test
> >> just move on and validate that the system still behaves correctly?
> >> 
> >> That gives us more coverage than just testing the good case and giving
> >> up when something unexpected happens.
> >
> > Agreed, with access to a function like the tick_nohz_full_timekeeper()
> > suggested earlier in this email thread, then yes, it would make sense to
> > try to offline the CPU anyway, then forgive the failure in cases where
> > the CPU matches that indicated by tick_nohz_full_timekeeper().
> 
> Why special-case this? There are other valid reasons why offlining can
> fail. So we special-case the timekeeper today and then next week we
> special-case something else, just because. That does not make sense. If it
> fails, there is a reason and you can log it. The important part is that the
> system is functional and stable after the failure and the rollback.

Perhaps there are other valid reasons, but they have not been showing up
in my torture-test runs for well over a decade.  Not saying that they
don't happen, of course.  But if they involved (say) cgroups, then my
test setup would not exercise them.

So are you looking to introduce spurious CPU-hotplug failures?  If so,
these will also affect things like suspend/resume.  Plus it will make
it much more difficult to detect real but intermittent CPU-hotplug bugs,
which is the motivation for special-casing the tick_nohz_full_timekeeper()
failures.

So we should discuss the introduction of any spurious failures that
might be under consideration.

Independently of that, the torture_onoff() functions can of course keep
some sort of histogram of the failure return codes.  Or are there other
failure indications that should be captured?

> >> I even argue that the torture test should inject random failures into
> >> the hotplug state machine to achieve extended code coverage.
> >
> > I could imagine torture_onoff() telling various CPU-hotplug notifiers
> > to refuse the transition using some TBD interface.
> 
> There is already an interface which is exposed to sysfs which allows you
> to enforce a "fail" at a defined hotplug state.

If you would like me to be testing this as part of my normal testing
regimen, I will need an in-kernel interface.  Such an interface is of
course not needed for modprobe-style testing, in which case the script
doing the modprobe and rmmod can of course manipulate the sysfs files.
But I don't do that sort of testing very often.  And when I do, it is
almost always with kernels configured for Meta's fleet, which almost
never do CPU-offline operations.

							Thanx, Paul

> > That would better test the CPU-hotplug common code's ability to deal
> > with failures.
> 
> Correct.
> 
> > Or did you have something else/additional in mind?
> 
> No.
> 
> Thanks,
> 
>         tglx
  
Christophe Leroy July 6, 2023, 7:09 a.m. UTC | #16
On 21/11/2022 at 04:51, Zhouyi Zhou wrote:
> During CPU-hotplug torture (CONFIG_NO_HZ_FULL=y), if we try to
> offline tick_do_timer_cpu, the operation will fail because in
> function tick_nohz_cpu_down:
> ```
> if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
>        return -EBUSY;
> ```
> The above bug was first discovered in torture tests performed in a PPC VM
> of the Open Source Lab of Oregon State University, and is reproducible in
> RISC-V and X86-64 (with the additional kernel command line cpu0_hotplug).
> 
> In this patch, we avoid offlining tick_do_timer_cpu by distributing
> the choice of CPU to offline evenly among the remaining CPUs.
> 
> Signed-off-by: Zhouyi Zhou <zhouzhouyi@gmail.com>
> ---
>   include/linux/tick.h        |  1 +
>   kernel/time/tick-common.c   |  1 +
>   kernel/time/tick-internal.h |  1 -
>   kernel/torture.c            | 10 ++++++++++
>   4 files changed, 12 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/tick.h b/include/linux/tick.h
> index bfd571f18cfd..23cc0b205853 100644
> --- a/include/linux/tick.h
> +++ b/include/linux/tick.h
> @@ -14,6 +14,7 @@
>   #include <linux/rcupdate.h>
>   
>   #ifdef CONFIG_GENERIC_CLOCKEVENTS
> +extern int tick_do_timer_cpu __read_mostly;
>   extern void __init tick_init(void);
>   /* Should be core only, but ARM BL switcher requires it */
>   extern void tick_suspend_local(void);
> diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
> index 46789356f856..87b9b9afa320 100644
> --- a/kernel/time/tick-common.c
> +++ b/kernel/time/tick-common.c
> @@ -48,6 +48,7 @@ ktime_t tick_next_period;
>    *    procedure also covers cpu hotplug.
>    */
>   int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
> +EXPORT_SYMBOL_GPL(tick_do_timer_cpu);
>   #ifdef CONFIG_NO_HZ_FULL
>   /*
>    * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns
> diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
> index 649f2b48e8f0..8953dca10fdd 100644
> --- a/kernel/time/tick-internal.h
> +++ b/kernel/time/tick-internal.h
> @@ -15,7 +15,6 @@
>   
>   DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
>   extern ktime_t tick_next_period;
> -extern int tick_do_timer_cpu __read_mostly;
>   
>   extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
>   extern void tick_handle_periodic(struct clock_event_device *dev);
> diff --git a/kernel/torture.c b/kernel/torture.c
> index 789aeb0e1159..bccbdd33dda2 100644
> --- a/kernel/torture.c
> +++ b/kernel/torture.c
> @@ -33,6 +33,7 @@
>   #include <linux/delay.h>
>   #include <linux/stat.h>
>   #include <linux/slab.h>
> +#include <linux/tick.h>
>   #include <linux/trace_clock.h>
>   #include <linux/ktime.h>
>   #include <asm/byteorder.h>
> @@ -358,7 +359,16 @@ torture_onoff(void *arg)
>   			schedule_timeout_interruptible(HZ / 10);
>   			continue;
>   		}
> +#ifdef CONFIG_NO_HZ_FULL
> +		/* do not offline tick do timer cpu */
> +		if (tick_nohz_full_running) {

Can you use function tick_nohz_full_enabled() instead and avoid the #ifdef?
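A sketch of the #ifdef-free variant (tick_nohz_full_enabled() compiles
to a constant false when CONFIG_NO_HZ_FULL is off, letting the compiler
drop the first branch; the tick_do_timer_cpu reference still assumes
this patch's export):
```
if (tick_nohz_full_enabled()) {
	/* Draw from maxcpu values, skipping past the timekeeper. */
	cpu = (torture_random(&rand) >> 4) % maxcpu;
	if (cpu >= tick_do_timer_cpu)
		cpu = (cpu + 1) % (maxcpu + 1);
} else {
	cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
}
```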

> +			cpu = (torture_random(&rand) >> 4) % maxcpu;
> +			if (cpu >= tick_do_timer_cpu)
> +				cpu = (cpu + 1) % (maxcpu + 1);
> +		} else
> +#else
>   		cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
> +#endif
>   		if (!torture_offline(cpu,
>   				     &n_offline_attempts, &n_offline_successes,
>   				     &sum_offline, &min_offline, &max_offline))
  
Zhouyi Zhou July 6, 2023, 8:13 a.m. UTC | #17
On Thu, Jul 6, 2023 at 3:09 PM Christophe Leroy
<christophe.leroy@csgroup.eu> wrote:
>
>
>
> On 21/11/2022 at 04:51, Zhouyi Zhou wrote:
> > During CPU-hotplug torture (CONFIG_NO_HZ_FULL=y), if we try to
> > offline tick_do_timer_cpu, the operation will fail because in
> > function tick_nohz_cpu_down:
> > ```
> > if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
> >        return -EBUSY;
> > ```
> > The above bug was first discovered in torture tests performed in a PPC VM
> > of the Open Source Lab of Oregon State University, and is reproducible in
> > RISC-V and X86-64 (with the additional kernel command line cpu0_hotplug).
> >
> > In this patch, we avoid offlining tick_do_timer_cpu by distributing
> > the choice of CPU to offline evenly among the remaining CPUs.
> >
> > Signed-off-by: Zhouyi Zhou <zhouzhouyi@gmail.com>
> > ---
> >   include/linux/tick.h        |  1 +
> >   kernel/time/tick-common.c   |  1 +
> >   kernel/time/tick-internal.h |  1 -
> >   kernel/torture.c            | 10 ++++++++++
> >   4 files changed, 12 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/linux/tick.h b/include/linux/tick.h
> > index bfd571f18cfd..23cc0b205853 100644
> > --- a/include/linux/tick.h
> > +++ b/include/linux/tick.h
> > @@ -14,6 +14,7 @@
> >   #include <linux/rcupdate.h>
> >
> >   #ifdef CONFIG_GENERIC_CLOCKEVENTS
> > +extern int tick_do_timer_cpu __read_mostly;
> >   extern void __init tick_init(void);
> >   /* Should be core only, but ARM BL switcher requires it */
> >   extern void tick_suspend_local(void);
> > diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
> > index 46789356f856..87b9b9afa320 100644
> > --- a/kernel/time/tick-common.c
> > +++ b/kernel/time/tick-common.c
> > @@ -48,6 +48,7 @@ ktime_t tick_next_period;
> >    *    procedure also covers cpu hotplug.
> >    */
> >   int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
> > +EXPORT_SYMBOL_GPL(tick_do_timer_cpu);
> >   #ifdef CONFIG_NO_HZ_FULL
> >   /*
> >    * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns
> > diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
> > index 649f2b48e8f0..8953dca10fdd 100644
> > --- a/kernel/time/tick-internal.h
> > +++ b/kernel/time/tick-internal.h
> > @@ -15,7 +15,6 @@
> >
> >   DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
> >   extern ktime_t tick_next_period;
> > -extern int tick_do_timer_cpu __read_mostly;
> >
> >   extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
> >   extern void tick_handle_periodic(struct clock_event_device *dev);
> > diff --git a/kernel/torture.c b/kernel/torture.c
> > index 789aeb0e1159..bccbdd33dda2 100644
> > --- a/kernel/torture.c
> > +++ b/kernel/torture.c
> > @@ -33,6 +33,7 @@
> >   #include <linux/delay.h>
> >   #include <linux/stat.h>
> >   #include <linux/slab.h>
> > +#include <linux/tick.h>
> >   #include <linux/trace_clock.h>
> >   #include <linux/ktime.h>
> >   #include <asm/byteorder.h>
> > @@ -358,7 +359,16 @@ torture_onoff(void *arg)
> >                       schedule_timeout_interruptible(HZ / 10);
> >                       continue;
> >               }
> > +#ifdef CONFIG_NO_HZ_FULL
> > +             /* do not offline tick do timer cpu */
> > +             if (tick_nohz_full_running) {
>
> Can you use function tick_nohz_full_enabled() instead and avoid the #ifdef?
Thank you, Christophe, for the wonderful advice; I will follow it
the next time I prepare a patch.
As for this false-positive report, commit 58d766824264 ("tick/nohz: Fix
cpu_is_hotpluggable() by checking with nohz subsystem") has beaten me
to it in mainline.

Thanks again
Zhouyi
>
> > +                     cpu = (torture_random(&rand) >> 4) % maxcpu;
> > +                     if (cpu >= tick_do_timer_cpu)
> > +                             cpu = (cpu + 1) % (maxcpu + 1);
> > +             } else
> > +#else
> >               cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
> > +#endif
> >               if (!torture_offline(cpu,
> >                                    &n_offline_attempts, &n_offline_successes,
> >                                    &sum_offline, &min_offline, &max_offline))
  

Patch

diff --git a/include/linux/tick.h b/include/linux/tick.h
index bfd571f18cfd..23cc0b205853 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -14,6 +14,7 @@ 
 #include <linux/rcupdate.h>
 
 #ifdef CONFIG_GENERIC_CLOCKEVENTS
+extern int tick_do_timer_cpu __read_mostly;
 extern void __init tick_init(void);
 /* Should be core only, but ARM BL switcher requires it */
 extern void tick_suspend_local(void);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 46789356f856..87b9b9afa320 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -48,6 +48,7 @@  ktime_t tick_next_period;
  *    procedure also covers cpu hotplug.
  */
 int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
+EXPORT_SYMBOL_GPL(tick_do_timer_cpu);
 #ifdef CONFIG_NO_HZ_FULL
 /*
  * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 649f2b48e8f0..8953dca10fdd 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -15,7 +15,6 @@ 
 
 DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
 extern ktime_t tick_next_period;
-extern int tick_do_timer_cpu __read_mostly;
 
 extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
 extern void tick_handle_periodic(struct clock_event_device *dev);
diff --git a/kernel/torture.c b/kernel/torture.c
index 789aeb0e1159..bccbdd33dda2 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -33,6 +33,7 @@ 
 #include <linux/delay.h>
 #include <linux/stat.h>
 #include <linux/slab.h>
+#include <linux/tick.h>
 #include <linux/trace_clock.h>
 #include <linux/ktime.h>
 #include <asm/byteorder.h>
@@ -358,7 +359,16 @@  torture_onoff(void *arg)
 			schedule_timeout_interruptible(HZ / 10);
 			continue;
 		}
+#ifdef CONFIG_NO_HZ_FULL
+		/* do not offline tick do timer cpu */
+		if (tick_nohz_full_running) {
+			cpu = (torture_random(&rand) >> 4) % maxcpu;
+			if (cpu >= tick_do_timer_cpu)
+				cpu = (cpu + 1) % (maxcpu + 1);
+		} else
+#else
 		cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
+#endif
 		if (!torture_offline(cpu,
 				     &n_offline_attempts, &n_offline_successes,
 				     &sum_offline, &min_offline, &max_offline))