[4/4] sched: Exclude CPU boot code from PF_IDLE area

Message ID 20231024214625.6483-5-frederic@kernel.org
State New
Headers
Series rcu: Fix PF_IDLE related issues v2 |

Commit Message

Frederic Weisbecker Oct. 24, 2023, 9:46 p.m. UTC
  The commit:

	cff9b2332ab7 ("kernel/sched: Modify initial boot task idle setup")

has changed the semantics of what is to be considered an idle task in
such a way that only the actual idle loop is accounted as PF_IDLE. The
intent is to exclude the CPU boot code from that coverage.

However, that commit doesn't clear the flag when the CPU goes down. Therefore,
when the CPU goes up again, its boot code is part of the PF_IDLE zone.

Make sure this flag behaves consistently and clear the flag when a CPU
exits from the idle loop. If anything, RCU-tasks relies on it to exclude
CPU boot code from its quiescent states.

Fixes: cff9b2332ab7 ("kernel/sched: Modify initial boot task idle setup")
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/sched.h | 2 +-
 kernel/cpu.c          | 4 ++++
 kernel/sched/idle.c   | 1 -
 3 files changed, 5 insertions(+), 2 deletions(-)
  

Comments

Peter Zijlstra Oct. 25, 2023, 8:48 a.m. UTC | #1
On Tue, Oct 24, 2023 at 11:46:25PM +0200, Frederic Weisbecker wrote:

> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 8885be2c143e..ad18962b921d 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1945,7 +1945,7 @@ extern struct task_struct *idle_task(int cpu);
>   */
>  static __always_inline bool is_idle_task(const struct task_struct *p)
>  {
> -	return !!(p->flags & PF_IDLE);
> +	return !!(READ_ONCE(p->flags) & PF_IDLE);
>  }
>  
>  extern struct task_struct *curr_task(int cpu);
> diff --git a/kernel/cpu.c b/kernel/cpu.c
> index 3b9d5c7eb4a2..3a1991010f4e 100644
> --- a/kernel/cpu.c
> +++ b/kernel/cpu.c
> @@ -1394,7 +1394,9 @@ void cpuhp_report_idle_dead(void)
>  {
>  	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
>  
> +	WRITE_ONCE(current->flags, current->flags & ~PF_IDLE);
>  	BUG_ON(st->state != CPUHP_AP_OFFLINE);
> +
>  	rcutree_report_cpu_dead();
>  	st->state = CPUHP_AP_IDLE_DEAD;
>  	/*
> @@ -1642,6 +1644,8 @@ void cpuhp_online_idle(enum cpuhp_state state)
>  {
>  	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
>  
> +	WRITE_ONCE(current->flags, current->flags | PF_IDLE);
> +
>  	/* Happens for the boot cpu */
>  	if (state != CPUHP_AP_ONLINE_IDLE)
>  		return;

Without changing *ALL* ->flags stores to WRITE_ONCE() I don't see the
point of this. Also, since we only care about a single bit, how does
store tearing affect things?

Not to mention if we're really paranoid, what are the SMP ordering
considerations :-)

[ also, PF_ is used for Protocol Family, Page Flag and Process Flag,
  grepping is a pain in the arse :-( ]
  
Frederic Weisbecker Oct. 25, 2023, 11:25 a.m. UTC | #2
Le Wed, Oct 25, 2023 at 10:48:33AM +0200, Peter Zijlstra a écrit :
> On Tue, Oct 24, 2023 at 11:46:25PM +0200, Frederic Weisbecker wrote:
> 
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index 8885be2c143e..ad18962b921d 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -1945,7 +1945,7 @@ extern struct task_struct *idle_task(int cpu);
> >   */
> >  static __always_inline bool is_idle_task(const struct task_struct *p)
> >  {
> > -	return !!(p->flags & PF_IDLE);
> > +	return !!(READ_ONCE(p->flags) & PF_IDLE);
> >  }
> >  
> >  extern struct task_struct *curr_task(int cpu);
> > diff --git a/kernel/cpu.c b/kernel/cpu.c
> > index 3b9d5c7eb4a2..3a1991010f4e 100644
> > --- a/kernel/cpu.c
> > +++ b/kernel/cpu.c
> > @@ -1394,7 +1394,9 @@ void cpuhp_report_idle_dead(void)
> >  {
> >  	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
> >  
> > +	WRITE_ONCE(current->flags, current->flags & ~PF_IDLE);
> >  	BUG_ON(st->state != CPUHP_AP_OFFLINE);
> > +
> >  	rcutree_report_cpu_dead();
> >  	st->state = CPUHP_AP_IDLE_DEAD;
> >  	/*
> > @@ -1642,6 +1644,8 @@ void cpuhp_online_idle(enum cpuhp_state state)
> >  {
> >  	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
> >  
> > +	WRITE_ONCE(current->flags, current->flags | PF_IDLE);
> > +
> >  	/* Happens for the boot cpu */
> >  	if (state != CPUHP_AP_ONLINE_IDLE)
> >  		return;
> 
> Without changing *ALL* ->flags stores to WRITE_ONCE() I don't see the
> point of this. Also, since we only care about a single bit, how does
> store tearing affect things?
> 
> Not to mention if we're really paranoid, what are the SMP ordering
> considerations :-)
> 
> [ also, PF_ is used for Protocol Family, Page Flag and Process Flag,
>   grepping is a pain in the arse :-( ]

Indeed. Also cpuhp_online_idle() is called with preemption disabled
and cpuhp_report_idle_dead() with interrupts disabled. As for idle
injection in play_idle_precise(), the flag is set and cleared with
preemption disabled.

This means that all writes are in an RCU read side critical section
that RCU-tasks pre-gp's synchronize_rcu() waits for. So I don't think
we need those WRITE_ONCE/READ_ONCE.

Paul, are you ok with that?

Thanks.
  

Patch

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8885be2c143e..ad18962b921d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1945,7 +1945,7 @@  extern struct task_struct *idle_task(int cpu);
  */
 static __always_inline bool is_idle_task(const struct task_struct *p)
 {
-	return !!(p->flags & PF_IDLE);
+	return !!(READ_ONCE(p->flags) & PF_IDLE);
 }
 
 extern struct task_struct *curr_task(int cpu);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3b9d5c7eb4a2..3a1991010f4e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1394,7 +1394,9 @@  void cpuhp_report_idle_dead(void)
 {
 	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
 
+	WRITE_ONCE(current->flags, current->flags & ~PF_IDLE);
 	BUG_ON(st->state != CPUHP_AP_OFFLINE);
+
 	rcutree_report_cpu_dead();
 	st->state = CPUHP_AP_IDLE_DEAD;
 	/*
@@ -1642,6 +1644,8 @@  void cpuhp_online_idle(enum cpuhp_state state)
 {
 	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
 
+	WRITE_ONCE(current->flags, current->flags | PF_IDLE);
+
 	/* Happens for the boot cpu */
 	if (state != CPUHP_AP_ONLINE_IDLE)
 		return;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 5007b25c5bc6..342f58a329f5 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -373,7 +373,6 @@  EXPORT_SYMBOL_GPL(play_idle_precise);
 
 void cpu_startup_entry(enum cpuhp_state state)
 {
-	current->flags |= PF_IDLE;
 	arch_cpu_idle_prepare();
 	cpuhp_online_idle(state);
 	while (1)