[v5] kernel/fork: beware of __put_task_struct calling context

Message ID 20230210161323.37400-1-wander@redhat.com
State New
Series [v5] kernel/fork: beware of __put_task_struct calling context

Commit Message

Wander Lairson Costa Feb. 10, 2023, 4:13 p.m. UTC
  Under PREEMPT_RT, __put_task_struct() indirectly acquires sleeping
locks. Therefore, it can't be called from a non-preemptible context.

One practical example is the splat inside inactive_task_timer(), which is
called in interrupt context:

CPU: 1 PID: 2848 Comm: life Kdump: loaded Tainted: G W ---------
 Hardware name: HP ProLiant DL388p Gen8, BIOS P70 07/15/2012
 Call Trace:
 dump_stack_lvl+0x57/0x7d
 mark_lock_irq.cold+0x33/0xba
 ? stack_trace_save+0x4b/0x70
 ? save_trace+0x55/0x150
 mark_lock+0x1e7/0x400
 mark_usage+0x11d/0x140
 __lock_acquire+0x30d/0x930
 lock_acquire.part.0+0x9c/0x210
 ? refill_obj_stock+0x3d/0x3a0
 ? rcu_read_lock_sched_held+0x3f/0x70
 ? trace_lock_acquire+0x38/0x140
 ? lock_acquire+0x30/0x80
 ? refill_obj_stock+0x3d/0x3a0
 rt_spin_lock+0x27/0xe0
 ? refill_obj_stock+0x3d/0x3a0
 refill_obj_stock+0x3d/0x3a0
 ? inactive_task_timer+0x1ad/0x340
 kmem_cache_free+0x357/0x560
 inactive_task_timer+0x1ad/0x340
 ? switched_from_dl+0x2d0/0x2d0
 __run_hrtimer+0x8a/0x1a0
 __hrtimer_run_queues+0x91/0x130
 hrtimer_interrupt+0x10f/0x220
 __sysvec_apic_timer_interrupt+0x7b/0xd0
 sysvec_apic_timer_interrupt+0x4f/0xd0
 ? asm_sysvec_apic_timer_interrupt+0xa/0x20
 asm_sysvec_apic_timer_interrupt+0x12/0x20
 RIP: 0033:0x7fff196bf6f5

Instead of calling __put_task_struct() directly, we defer it using
call_rcu(). A more natural approach would be to use a workqueue, but
since PREEMPT_RT cannot allocate dynamic memory from atomic context,
the code would become more complex: we would need to embed a
work_struct instance in task_struct and initialize it whenever a new
task_struct is allocated.
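
For illustration, a rough sketch of that rejected workqueue variant (the
->put_work field and the helper below are hypothetical, not part of this
patch):

```c
/*
 * Hypothetical sketch only: task_struct would need an embedded,
 * pre-initialized work_struct (say ->put_work, set up at fork time),
 * because PREEMPT_RT cannot allocate one from atomic context here.
 */
static void put_task_struct_work(struct work_struct *work)
{
	struct task_struct *tsk = container_of(work, struct task_struct, put_work);

	___put_task_struct(tsk);
}

void __put_task_struct(struct task_struct *tsk)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible())
		schedule_work(&tsk->put_work);	/* safe from hard IRQ context */
	else
		___put_task_struct(tsk);
}
```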

Changelog
=========

v1:
* Initial implementation fixing the splat.

v2:
* Isolate the logic in its own function.
* Fix two more cases caught in review.

v3:
* Change __put_task_struct() to handle the issue internally.

v4:
* Explain why call_rcu() is safe to call from interrupt context.

v5:
* Explain why __put_task_struct() doesn't conflict with
  put_task_struct_rcu_user().

Signed-off-by: Wander Lairson Costa <wander@redhat.com>
Reported-by: Hu Chunyu <chuhu@redhat.com>
Suggested-by: Oleg Nesterov <oleg@redhat.com>
Suggested-by: Valentin Schneider <vschneid@redhat.com>
Cc: Paul McKenney <paulmck@kernel.org>
---
 kernel/fork.c | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)
  

Comments

Sebastian Andrzej Siewior Feb. 10, 2023, 5:19 p.m. UTC | #1
On 2023-02-10 13:13:21 [-0300], Wander Lairson Costa wrote:
> Under PREEMPT_RT, __put_task_struct() indirectly acquires sleeping
> locks. Therefore, it can't be called from an non-preemptible context.
> 
> One practical example is splat inside inactive_task_timer(), which is
> called in a interrupt context:
> 
> CPU: 1 PID: 2848 Comm: life Kdump: loaded Tainted: G W ---------
>  Hardware name: HP ProLiant DL388p Gen8, BIOS P70 07/15/2012
>  Call Trace:
>  dump_stack_lvl+0x57/0x7d
>  mark_lock_irq.cold+0x33/0xba
>  ? stack_trace_save+0x4b/0x70
>  ? save_trace+0x55/0x150
>  mark_lock+0x1e7/0x400
>  mark_usage+0x11d/0x140
>  __lock_acquire+0x30d/0x930
>  lock_acquire.part.0+0x9c/0x210
>  ? refill_obj_stock+0x3d/0x3a0
>  ? rcu_read_lock_sched_held+0x3f/0x70
>  ? trace_lock_acquire+0x38/0x140
>  ? lock_acquire+0x30/0x80
>  ? refill_obj_stock+0x3d/0x3a0
>  rt_spin_lock+0x27/0xe0
>  ? refill_obj_stock+0x3d/0x3a0
>  refill_obj_stock+0x3d/0x3a0
>  ? inactive_task_timer+0x1ad/0x340
>  kmem_cache_free+0x357/0x560
>  inactive_task_timer+0x1ad/0x340
>  ? switched_from_dl+0x2d0/0x2d0
>  __run_hrtimer+0x8a/0x1a0
>  __hrtimer_run_queues+0x91/0x130
>  hrtimer_interrupt+0x10f/0x220
>  __sysvec_apic_timer_interrupt+0x7b/0xd0
>  sysvec_apic_timer_interrupt+0x4f/0xd0
>  ? asm_sysvec_apic_timer_interrupt+0xa/0x20
>  asm_sysvec_apic_timer_interrupt+0x12/0x20
>  RIP: 0033:0x7fff196bf6f5

Now that I looked around: There are other put_task_struct() calls while the rq
lock is held. I didn't look outside of dl.c.

> Instead of calling __put_task_struct() directly, we defer it using
> call_rcu(). A more natural approach would use a workqueue, but since
> in PREEMPT_RT, we can't allocate dynamic memory from atomic context,
> the code would become more complex because we would need to put the
> work_struct instance in the task_struct and initialize it when we
> allocate a new task_struct.
> 
> Changelog
> =========
> 
> v1:
> * Initial implementation fixing the splat.
> 
> v2:
> * Isolate the logic in its own function.
> * Fix two more cases caught in review.
> 
> v3:
> * Change __put_task_struct() to handle the issue internally.
> 
> v4:
> * Explain why call_rcu() is safe to call from interrupt context.
> 
> v5:
> * Explain why __put_task_struct() doesn't conflict with
>   put_task_sruct_rcu_user.
> 
> Signed-off-by: Wander Lairson Costa <wander@redhat.com>
> Reported-by: Hu Chunyu <chuhu@redhat.com>
> Suggested-by: Oleg Nesterov <oleg@redhat.com>
> Suggested-by: Valentin Schneider <vschneid@redhat.com>
> Cc: Paul McKenney <paulmck@kernel.org>
> ---
>  kernel/fork.c | 33 ++++++++++++++++++++++++++++++++-
>  1 file changed, 32 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 9f7fe3541897..9bf30c725ed8 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -840,7 +840,7 @@ static inline void put_signal_struct(struct signal_struct *sig)
>  		free_signal_struct(sig);
>  }
>  
> -void __put_task_struct(struct task_struct *tsk)
> +static void ___put_task_struct(struct task_struct *tsk)
>  {
>  	WARN_ON(!tsk->exit_state);
>  	WARN_ON(refcount_read(&tsk->usage));
> @@ -857,6 +857,37 @@ void __put_task_struct(struct task_struct *tsk)
>  	sched_core_free(tsk);
>  	free_task(tsk);
>  }
> +
> +static void __put_task_struct_rcu(struct rcu_head *rhp)
> +{
> +	struct task_struct *task = container_of(rhp, struct task_struct, rcu);
> +
> +	___put_task_struct(task);
> +}
> +
> +void __put_task_struct(struct task_struct *tsk)
> +{
> +	if (IS_ENABLED(CONFIG_PREEMPT_RT) && (!preemptible() || !in_task()))

No. If you do this on non-RT kernel with CONFIG_PROVE_RAW_LOCK_NESTING
then it will complain. And why do we have in_task() here?

If Oleg does not want the unconditional RCU then I would prefer an
explicit put task which delays it to RCU for the few users that need it.

A lockdep annotation _before_ ___put_task_struct() should spot users
which are not obviously visible from audit.
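
Roughly something like this minimal sketch (the helper name is made up,
only to illustrate the idea; it would live in kernel/fork.c next to
__put_task_struct_rcu()):

```c
/*
 * Sketch only: an explicit variant for the few call sites known to run
 * in atomic context; everyone else keeps using put_task_struct().
 */
void put_task_struct_atomic_safe(struct task_struct *tsk)
{
	if (refcount_dec_and_test(&tsk->usage))
		call_rcu(&tsk->rcu, __put_task_struct_rcu);
}
```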

> +		/*
> +		 * under PREEMPT_RT, we can't call put_task_struct
> +		 * in atomic context because it will indirectly
> +		 * acquire sleeping locks.
> +		 *
> +		 * call_rcu() will schedule delayed_put_task_struct_rcu()
> +		 * to be called in process context.
> +		 *
> +		 * __put_task_struct() is called called when
> +		 * refcount_dec_and_test(&t->usage) succeeds.
> +		 *
> +		 * This means that it can't "conflict" with
> +		 * put_task_struct_rcu_user() which abuses ->rcu the same
> +		 * way; rcu_users has a reference so task->usage can't be
> +		 * zero after rcu_users 1 -> 0 transition.
> +		 */
> +		call_rcu(&tsk->rcu, __put_task_struct_rcu);
> +	else
> +		___put_task_struct(tsk);
> +}
>  EXPORT_SYMBOL_GPL(__put_task_struct);
>  
>  void __init __weak arch_task_cache_init(void) { }
> -- 
> 2.39.1
>
  
Wander Lairson Costa Feb. 13, 2023, 12:13 p.m. UTC | #2
On Fri, Feb 10, 2023 at 06:19:54PM +0100, Sebastian Andrzej Siewior wrote:
> On 2023-02-10 13:13:21 [-0300], Wander Lairson Costa wrote:
> > Under PREEMPT_RT, __put_task_struct() indirectly acquires sleeping
> > locks. Therefore, it can't be called from an non-preemptible context.
> > 
> > One practical example is splat inside inactive_task_timer(), which is
> > called in a interrupt context:
> > 
> > CPU: 1 PID: 2848 Comm: life Kdump: loaded Tainted: G W ---------
> >  Hardware name: HP ProLiant DL388p Gen8, BIOS P70 07/15/2012
> >  Call Trace:
> >  dump_stack_lvl+0x57/0x7d
> >  mark_lock_irq.cold+0x33/0xba
> >  ? stack_trace_save+0x4b/0x70
> >  ? save_trace+0x55/0x150
> >  mark_lock+0x1e7/0x400
> >  mark_usage+0x11d/0x140
> >  __lock_acquire+0x30d/0x930
> >  lock_acquire.part.0+0x9c/0x210
> >  ? refill_obj_stock+0x3d/0x3a0
> >  ? rcu_read_lock_sched_held+0x3f/0x70
> >  ? trace_lock_acquire+0x38/0x140
> >  ? lock_acquire+0x30/0x80
> >  ? refill_obj_stock+0x3d/0x3a0
> >  rt_spin_lock+0x27/0xe0
> >  ? refill_obj_stock+0x3d/0x3a0
> >  refill_obj_stock+0x3d/0x3a0
> >  ? inactive_task_timer+0x1ad/0x340
> >  kmem_cache_free+0x357/0x560
> >  inactive_task_timer+0x1ad/0x340
> >  ? switched_from_dl+0x2d0/0x2d0
> >  __run_hrtimer+0x8a/0x1a0
> >  __hrtimer_run_queues+0x91/0x130
> >  hrtimer_interrupt+0x10f/0x220
> >  __sysvec_apic_timer_interrupt+0x7b/0xd0
> >  sysvec_apic_timer_interrupt+0x4f/0xd0
> >  ? asm_sysvec_apic_timer_interrupt+0xa/0x20
> >  asm_sysvec_apic_timer_interrupt+0x12/0x20
> >  RIP: 0033:0x7fff196bf6f5
> 
> Now that I looked around: There are other put_task_struct() while the rq
> lock is held. I didn't look outside o dl.c.
> 
> > Instead of calling __put_task_struct() directly, we defer it using
> > call_rcu(). A more natural approach would use a workqueue, but since
> > in PREEMPT_RT, we can't allocate dynamic memory from atomic context,
> > the code would become more complex because we would need to put the
> > work_struct instance in the task_struct and initialize it when we
> > allocate a new task_struct.
> > 
> > Changelog
> > =========
> > 
> > v1:
> > * Initial implementation fixing the splat.
> > 
> > v2:
> > * Isolate the logic in its own function.
> > * Fix two more cases caught in review.
> > 
> > v3:
> > * Change __put_task_struct() to handle the issue internally.
> > 
> > v4:
> > * Explain why call_rcu() is safe to call from interrupt context.
> > 
> > v5:
> > * Explain why __put_task_struct() doesn't conflict with
> >   put_task_sruct_rcu_user.
> > 
> > Signed-off-by: Wander Lairson Costa <wander@redhat.com>
> > Reported-by: Hu Chunyu <chuhu@redhat.com>
> > Suggested-by: Oleg Nesterov <oleg@redhat.com>
> > Suggested-by: Valentin Schneider <vschneid@redhat.com>
> > Cc: Paul McKenney <paulmck@kernel.org>
> > ---
> >  kernel/fork.c | 33 ++++++++++++++++++++++++++++++++-
> >  1 file changed, 32 insertions(+), 1 deletion(-)
> > 
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index 9f7fe3541897..9bf30c725ed8 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -840,7 +840,7 @@ static inline void put_signal_struct(struct signal_struct *sig)
> >  		free_signal_struct(sig);
> >  }
> >  
> > -void __put_task_struct(struct task_struct *tsk)
> > +static void ___put_task_struct(struct task_struct *tsk)
> >  {
> >  	WARN_ON(!tsk->exit_state);
> >  	WARN_ON(refcount_read(&tsk->usage));
> > @@ -857,6 +857,37 @@ void __put_task_struct(struct task_struct *tsk)
> >  	sched_core_free(tsk);
> >  	free_task(tsk);
> >  }
> > +
> > +static void __put_task_struct_rcu(struct rcu_head *rhp)
> > +{
> > +	struct task_struct *task = container_of(rhp, struct task_struct, rcu);
> > +
> > +	___put_task_struct(task);
> > +}
> > +
> > +void __put_task_struct(struct task_struct *tsk)
> > +{
> > +	if (IS_ENABLED(CONFIG_PREEMPT_RT) && (!preemptible() || !in_task()))
> 
> No. If you do this on non-RT kernel with CONFIG_PROVE_RAW_LOCK_NESTING
> then it will complain. And why do we have in_task() here?
> 

Initially I thought you were saying it would cause a build failure, but
I built the kernel successfully with CONFIG_PROVE_RAW_LOCK_NESTING.
On a non-RT kernel, I understand the optimizer will simply eliminate
the `if` clause. Would you mind further explaining the conflict with
CONFIG_PROVE_RAW_LOCK_NESTING?

The `!in_task()` call is to test if we are in interrupt context.

> If Oleg does not want the unconditional RCU then I would prefer an
> explicit put task which delays it to RCU for the few users that need it.
> 

Do you mean like the approach in v2[1]? I believe that, to spot all possible
problematic scenarios, we should add

```
if (IS_ENABLED(CONFIG_PREEMPT_RT))
    might_sleep();
```

to `put_task_struct()`.
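
That is, a minimal sketch of where the check would sit (based on the
current put_task_struct() in include/linux/sched/task.h):

```c
static inline void put_task_struct(struct task_struct *t)
{
	/*
	 * Debug aid only: on PREEMPT_RT, warn if the last reference could
	 * be dropped from a context that must not sleep.
	 */
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		might_sleep();

	if (refcount_dec_and_test(&t->usage))
		__put_task_struct(t);
}
```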

> A lockdep annotation _before_ ___put_task_struct() should spot users
> which are not obviously visible from audit.
> 
> > +		/*
> > +		 * under PREEMPT_RT, we can't call put_task_struct
> > +		 * in atomic context because it will indirectly
> > +		 * acquire sleeping locks.
> > +		 *
> > +		 * call_rcu() will schedule delayed_put_task_struct_rcu()
> > +		 * to be called in process context.
> > +		 *
> > +		 * __put_task_struct() is called called when
> > +		 * refcount_dec_and_test(&t->usage) succeeds.
> > +		 *
> > +		 * This means that it can't "conflict" with
> > +		 * put_task_struct_rcu_user() which abuses ->rcu the same
> > +		 * way; rcu_users has a reference so task->usage can't be
> > +		 * zero after rcu_users 1 -> 0 transition.
> > +		 */
> > +		call_rcu(&tsk->rcu, __put_task_struct_rcu);
> > +	else
> > +		___put_task_struct(tsk);
> > +}
> >  EXPORT_SYMBOL_GPL(__put_task_struct);
> >  
> >  void __init __weak arch_task_cache_init(void) { }
> > -- 
> > 2.39.1
> > 
> 

[1] https://lore.kernel.org/all/20230120150246.20797-1-wander@redhat.com/
  
Sebastian Andrzej Siewior Feb. 15, 2023, 11:42 a.m. UTC | #3
On 2023-02-13 09:13:55 [-0300], Wander Lairson Costa wrote:
…
> > > diff --git a/kernel/fork.c b/kernel/fork.c
> > > index 9f7fe3541897..9bf30c725ed8 100644
> > > --- a/kernel/fork.c
> > > +++ b/kernel/fork.c
> > > @@ -857,6 +857,37 @@ void __put_task_struct(struct task_struct *tsk)
> > >  	sched_core_free(tsk);
> > >  	free_task(tsk);
> > >  }
> > > +
> > > +static void __put_task_struct_rcu(struct rcu_head *rhp)
> > > +{
> > > +	struct task_struct *task = container_of(rhp, struct task_struct, rcu);
> > > +
> > > +	___put_task_struct(task);
> > > +}
> > > +
> > > +void __put_task_struct(struct task_struct *tsk)
> > > +{
> > > +	if (IS_ENABLED(CONFIG_PREEMPT_RT) && (!preemptible() || !in_task()))
> > 
> > No. If you do this on non-RT kernel with CONFIG_PROVE_RAW_LOCK_NESTING
> > then it will complain. And why do we have in_task() here?
> > 
> 
> Initially I thought you were saying it would cause a build failure, but
> I built the kernel successfully with CONFIG_PROVE_RAW_LOCK_NESTING.
> If it is a non-RT kernel, I understand the optimizer will vanish with
> the `if` clause. Would mind further explaining the conflict with
> CONFIG_PROVE_RAW_LOCK_NESTING?

Documentation/locking/locktypes.rst explains the individual lock types
we have in the kernel and how you should nest them. In short,

 mutex_t -> spinlock_t ->  raw_spinlock_t

You nest/acquire them left to right, i.e. first the mutex_t, last the
raw_spinlock_t. This always works. If you leave PREEMPT_RT out of the
picture then
	raw_spinlock_t -> spinlock_t
and
	spinlock_t -> raw_spinlock_t 

make no difference because the underlying lock structure is the same and
the behaviour is the same. It only causes a warning or a boom once
PREEMPT_RT is enabled.
CONFIG_PROVE_RAW_LOCK_NESTING performs exactly this kind of
verification so you can see on a !PREEMPT_RT kernel if there is a
locking chain (or nesting) that would not be okay on PREEMPT_RT.

In this case, at the time you do __put_task_struct() the sched-RQ lock
is held which is a raw_spinlock_t. Later in __put_task_struct() it will
free memory (or do something else) requiring a spinlock_t which would do
the nesting
	raw_spinlock_t -> spinlock_t

which is invalid and so lockdep should yell here.
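
A contrived sketch of the nesting that CONFIG_PROVE_RAW_LOCK_NESTING
flags (standalone names, just for illustration):

```c
static DEFINE_RAW_SPINLOCK(example_raw_lock);	/* e.g. the rq lock */
static DEFINE_SPINLOCK(example_lock);		/* e.g. a lock inside the allocator */

static void invalid_nesting(void)
{
	raw_spin_lock(&example_raw_lock);
	/*
	 * raw_spinlock_t -> spinlock_t: on PREEMPT_RT the inner lock is a
	 * sleeping lock, so this nesting is invalid; with
	 * CONFIG_PROVE_RAW_LOCK_NESTING lockdep reports it even on
	 * !PREEMPT_RT kernels.
	 */
	spin_lock(&example_lock);
	spin_unlock(&example_lock);
	raw_spin_unlock(&example_raw_lock);
}
```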

> The `!in_task()` call is to test if we are in interrupt context.

I am aware of this but here in terms of PREEMPT_RT it doesn't matter.
It excludes the hardirq context, which is the important one, but that is
already covered by preemptible(). It additionally excludes the "serving"
softirq context, which is fine because it is preemptible on PREEMPT_RT.
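
For reference, with CONFIG_PREEMPT_COUNT the test is roughly:

```c
/* include/linux/preempt.h (CONFIG_PREEMPT_COUNT=y), roughly: */
#define preemptible()	(preempt_count() == 0 && !irqs_disabled())
/*
 * In hardirq context preempt_count() is non-zero, so preemptible() is
 * already false there; !in_task() does not add anything needed for the
 * PREEMPT_RT case.
 */
```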

> > If Oleg does not want the unconditional RCU then I would prefer an
> > explicit put task which delays it to RCU for the few users that need it.
> > 
> 
> Do you mean like the approach in v2[1]? I believe to spot all possible
> problematic scenarios, would should add

Yes, an explicit function because you know the context in which put_.*()
is invoked. It wasn't audited by the time it was added; it is not the
"regular" case.

> ```
> if (IS_ENABLED(CONFIG_PREEMPT_RT))
>     might_sleep();
> ```
> 
> to `put_task_struct()`.

This only works on PREEMPT_RT and should be enough to spot some of the
offenders we have right now. It might also trigger if task::state is
changed (not TASK_RUNNING) even though that would be fine. Therefore I
would suggest using rtlock_might_resched() for testing, which is in
   kernel/locking/spinlock_rt.c
but you get the idea.

Long term, something like the diff at the bottom might compile and will
show raw_spinlock_t -> spinlock_t nesting with
CONFIG_PROVE_RAW_LOCK_NESTING. We won't catch explicit
preempt_disable() or local_irq_disable() users, but it _should_ be enough,
and it would have warned us in this case because:
- the scheduler acquires a raw_spinlock_t
- the hrtimer code has a check for this in lockdep_hrtimer_enter() to
  distinguish between timers which are "regular" and those which
  explicitly ask for hardirq context.

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 357e0068497c1..eedbd50eb5df3 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -113,14 +113,18 @@ static inline struct task_struct *get_task_struct(struct task_struct *t)
 
 extern void __put_task_struct(struct task_struct *t);
 
+extern spinlock_t task_put_lock;
+
 static inline void put_task_struct(struct task_struct *t)
 {
+	might_lock(&task_put_lock);
 	if (refcount_dec_and_test(&t->usage))
 		__put_task_struct(t);
 }
 
 static inline void put_task_struct_many(struct task_struct *t, int nr)
 {
+	might_lock(&task_put_lock);
 	if (refcount_sub_and_test(nr, &t->usage))
 		__put_task_struct(t);
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index 9f7fe35418978..2f9c09bc22bdb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -840,6 +840,8 @@ static inline void put_signal_struct(struct signal_struct *sig)
 		free_signal_struct(sig);
 }
 
+DEFINE_SPINLOCK(task_put_lock);
+
 void __put_task_struct(struct task_struct *tsk)
 {
 	WARN_ON(!tsk->exit_state);
  
Wander Lairson Costa Feb. 24, 2023, 5:04 p.m. UTC | #4
On Wed, Feb 15, 2023 at 12:42:46PM +0100, Sebastian Andrzej Siewior wrote:
> On 2023-02-13 09:13:55 [-0300], Wander Lairson Costa wrote:
> …
> > > > diff --git a/kernel/fork.c b/kernel/fork.c
> > > > index 9f7fe3541897..9bf30c725ed8 100644
> > > > --- a/kernel/fork.c
> > > > +++ b/kernel/fork.c
> > > > @@ -857,6 +857,37 @@ void __put_task_struct(struct task_struct *tsk)
> > > >  	sched_core_free(tsk);
> > > >  	free_task(tsk);
> > > >  }
> > > > +
> > > > +static void __put_task_struct_rcu(struct rcu_head *rhp)
> > > > +{
> > > > +	struct task_struct *task = container_of(rhp, struct task_struct, rcu);
> > > > +
> > > > +	___put_task_struct(task);
> > > > +}
> > > > +
> > > > +void __put_task_struct(struct task_struct *tsk)
> > > > +{
> > > > +	if (IS_ENABLED(CONFIG_PREEMPT_RT) && (!preemptible() || !in_task()))
> > > 
> > > No. If you do this on non-RT kernel with CONFIG_PROVE_RAW_LOCK_NESTING
> > > then it will complain. And why do we have in_task() here?
> > > 
> > 
> > Initially I thought you were saying it would cause a build failure, but
> > I built the kernel successfully with CONFIG_PROVE_RAW_LOCK_NESTING.
> > If it is a non-RT kernel, I understand the optimizer will vanish with
> > the `if` clause. Would mind further explaining the conflict with
> > CONFIG_PROVE_RAW_LOCK_NESTING?
> 
> Documentation/locking/locktypes.rst explains the individual lock types
> we have in the kernel and how you should nest them. In short,
> 
>  mutex_t -> spinlock_t ->  raw_spinlock_t
> 
> You nest/ acquire them left to right, i.e. first the mutex_t, last
> raw_spinlock_t. This works always. If you leave PREEMPT_RT out of the
> picture then
> 	raw_spinlock_t -> spinlock_t
> and
> 	spinlock_t -> raw_spinlock_t 
> 
> make no difference because the underlying lock structure is the same,
> the behaviour is the same. It only causes warning or a boom once
> PREEMPT_RT is enabled.
> CONFIG_PROVE_RAW_LOCK_NESTING performs exactly this kind of
> verification so you can see on a !PREEMPT_RT kernel if there is a
> locking chain (or nesting) that would not be okay on PREEMPT_RT.
> 
> In this case, at the time you do __put_task_struct() the sched-RQ lock
> is held which is a raw_spinlock_t. Later in __put_task_struct() it will
> free memory (or do something else) requiring a spinlock_t which would do
> the nesting
> 	raw_spinlock_t -> spinlock_t
> 
> which is invalid and so lockdep should yell here.

Thanks for the detailed explanation!

> 
> > The `!in_task()` call is to test if we are in interrupt context.
> 
> I am aware of this but here in terms of PREEMPT_RT it doesn't matter.
> It excluded the hardirq context which is the important one but this also
> happens with preemptible(). It additionally excludes the "serving"
> softirq context which is fine because it is preemtible on PREEMPT_RT.
> 

Indeed, you are right, the !in_task() check is unneeded.

> > > If Oleg does not want the unconditional RCU then I would prefer an
> > > explicit put task which delays it to RCU for the few users that need it.
> > > 
> > 
> > Do you mean like the approach in v2[1]? I believe to spot all possible
> > problematic scenarios, would should add
> 
> Yes, an explicit function because you know the context in which put_.*()
> is invoked. It wasn't audited by the time it was added, it is not
> "regular" case.
> 
> > ```
> > if (IS_ENABLED(CONFIG_PREEMPT_RT))
> >     might_sleep();
> > ```
> > 
> > to `put_task_struct()`.
> 
> This only works on PREEMPT_RT and should be enough to spot some of the
> offender we have right now. It might also trigger if task::state is
> changed (not TASK_RUNNING) and it should be fine. Therefore I would
> suggest to use rtlock_might_resched() for testing which is in
>    kernel/locking/spinlock_rt.c
> but you get the idea.
> 
> Longterm, something like the diff at the bottom might compile and will
> show raw_spinlock_t -> spinlock_t nesting with
> CONFIG_PROVE_RAW_LOCK_NESTING. We won't catch explicit
> preempt_disable(), local_irq_disable() users but _should_ be enough and
> it would have warned us in this case because:
> - the scheduler acquires a raw_spinlock_t
> - the hrtimer has an check for this in lockdep_hrtimer_enter() to
>   distinguish between timers which are "regular" and those which
>   explicitly ask for the hardirq context.
> 
> diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
> index 357e0068497c1..eedbd50eb5df3 100644
> --- a/include/linux/sched/task.h
> +++ b/include/linux/sched/task.h
> @@ -113,14 +113,18 @@ static inline struct task_struct *get_task_struct(struct task_struct *t)
>  
>  extern void __put_task_struct(struct task_struct *t);
>  
> +extern spinlock_t task_put_lock;
> +
>  static inline void put_task_struct(struct task_struct *t)
>  {
> +	might_lock(&task_put_lock);
>  	if (refcount_dec_and_test(&t->usage))
>  		__put_task_struct(t);
>  }
>  
>  static inline void put_task_struct_many(struct task_struct *t, int nr)
>  {
> +	might_lock(&task_put_lock);
>  	if (refcount_sub_and_test(nr, &t->usage))
>  		__put_task_struct(t);
>  }
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 9f7fe35418978..2f9c09bc22bdb 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -840,6 +840,8 @@ static inline void put_signal_struct(struct signal_struct *sig)
>  		free_signal_struct(sig);
>  }
>  
> +DEFINE_SPINLOCK(task_put_lock);
> +
>  void __put_task_struct(struct task_struct *tsk)
>  {
>  	WARN_ON(!tsk->exit_state);
> 

I tried this, but it doesn't give the splat on !PREEMPT_RT. But IIUC,
CONFIG_PROVE_RAW_LOCK_NESTING will only work if we hold a raw_spinlock_t
and try to acquire a spinlock_t. Does it check irq context as well?
  

Patch

diff --git a/kernel/fork.c b/kernel/fork.c
index 9f7fe3541897..9bf30c725ed8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -840,7 +840,7 @@  static inline void put_signal_struct(struct signal_struct *sig)
 		free_signal_struct(sig);
 }
 
-void __put_task_struct(struct task_struct *tsk)
+static void ___put_task_struct(struct task_struct *tsk)
 {
 	WARN_ON(!tsk->exit_state);
 	WARN_ON(refcount_read(&tsk->usage));
@@ -857,6 +857,37 @@  void __put_task_struct(struct task_struct *tsk)
 	sched_core_free(tsk);
 	free_task(tsk);
 }
+
+static void __put_task_struct_rcu(struct rcu_head *rhp)
+{
+	struct task_struct *task = container_of(rhp, struct task_struct, rcu);
+
+	___put_task_struct(task);
+}
+
+void __put_task_struct(struct task_struct *tsk)
+{
+	if (IS_ENABLED(CONFIG_PREEMPT_RT) && (!preemptible() || !in_task()))
+		/*
+		 * under PREEMPT_RT, we can't call put_task_struct
+		 * in atomic context because it will indirectly
+		 * acquire sleeping locks.
+		 *
+		 * call_rcu() will schedule __put_task_struct_rcu()
+		 * to be called in process context.
+		 *
+		 * __put_task_struct() is called when
+		 * refcount_dec_and_test(&t->usage) succeeds.
+		 *
+		 * This means that it can't "conflict" with
+		 * put_task_struct_rcu_user() which abuses ->rcu the same
+		 * way; rcu_users has a reference so task->usage can't be
+		 * zero after rcu_users 1 -> 0 transition.
+		 */
+		call_rcu(&tsk->rcu, __put_task_struct_rcu);
+	else
+		___put_task_struct(tsk);
+}
 EXPORT_SYMBOL_GPL(__put_task_struct);
 
 void __init __weak arch_task_cache_init(void) { }