sched/core: Avoid WARN_DOUBLE_CLOCK warning when CONFIG_SCHED_CORE

Message ID 20221206070550.31763-1-jiahao.os@bytedance.com
State New
Headers
Series sched/core: Avoid WARN_DOUBLE_CLOCK warning when CONFIG_SCHED_CORE |

Commit Message

Hao Jia Dec. 6, 2022, 7:05 a.m. UTC
  When we need to call update_rq_clock() to update the rq clock of
other CPUs on the same core, before that we need to clear RQCF_UPDATED
of rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning.
Because at this time the rq->clock_update_flags of other CPUs
may be RQCF_UPDATED.

Some call trace reports:
Call Trace 1:
<TASK>
__schedule+0x61c/0x11d0
schedule+0x5d/0xd0
worker_thread+0xb5/0x380
? preempt_count_add+0x56/0xa0
? rescuer_thread+0x310/0x310
kthread+0xe6/0x110
? kthread_complete_and_exit+0x20/0x20
ret_from_fork+0x1f/0x30
</TASK>

Call Trace 2:
<TASK>
__schedule+0x91d/0x11d0
schedule+0x5d/0xd0
exit_to_user_mode_prepare+0xe5/0x1e0
syscall_exit_to_user_mode+0x17/0x30
do_syscall_64+0x40/0x90
entry_SYSCALL_64_after_hwframe+0x63/0xcd

Call Trace 3:
<IRQ>
__sched_core_tick+0x27/0x40
scheduler_tick+0x1be/0x270
? tick_sched_handle.isra.18+0x60/0x60
update_process_times+0x6a/0x90
tick_sched_handle.isra.18+0x1f/0x60
tick_sched_timer+0x47/0x80
__hrtimer_run_queues+0x10a/0x280
hrtimer_interrupt+0x10b/0x240
__sysvec_apic_timer_interrupt+0x70/0x160
sysvec_apic_timer_interrupt+0x9a/0xd0
</IRQ>
<TASK>
asm_sysvec_apic_timer_interrupt+0x16/0x20

Steps to reproduce:
1. Enable CONFIG_SCHED_DEBUG and CONFIG_SCHED_CORE when compiling
   the kernel
2. echo 1 > /sys/kernel/debug/clear_warn_once
   echo "WARN_DOUBLE_CLOCK" > /sys/kernel/debug/sched/features
3. Run the linux/tools/testing/selftests/sched/cs_prctl_test test

Signed-off-by: Hao Jia <jiahao.os@bytedance.com>
---
 kernel/sched/core.c       |  5 ++++-
 kernel/sched/core_sched.c |  4 +++-
 kernel/sched/sched.h      | 10 +++++++++-
 3 files changed, 16 insertions(+), 3 deletions(-)
  

Comments

Hao Jia Jan. 3, 2023, 7:44 a.m. UTC | #1
Friendly ping...

On 2022/12/6 Hao Jia wrote:
> When we need to call update_rq_clock() to update the rq clock of
> other CPUs on the same core, before that we need to clear RQCF_UPDATED
> of rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning.
> Because at this time the rq->clock_update_flags of other CPUs
> may be RQCF_UPDATED.
> 
> Some call trace reports:
> Call Trace 1:
> <TASK>
> __schedule+0x61c/0x11d0
> schedule+0x5d/0xd0
> worker_thread+0xb5/0x380
> ? preempt_count_add+0x56/0xa0
> ? rescuer_thread+0x310/0x310
> kthread+0xe6/0x110
> ? kthread_complete_and_exit+0x20/0x20
> ret_from_fork+0x1f/0x30
> </TASK>
> 
> Call Trace 2:
> <TASK>
> __schedule+0x91d/0x11d0
> schedule+0x5d/0xd0
> exit_to_user_mode_prepare+0xe5/0x1e0
> syscall_exit_to_user_mode+0x17/0x30
> do_syscall_64+0x40/0x90
> entry_SYSCALL_64_after_hwframe+0x63/0xcd
> 
> Call Trace 3:
> <IRQ>
> __sched_core_tick+0x27/0x40
> scheduler_tick+0x1be/0x270
> ? tick_sched_handle.isra.18+0x60/0x60
> update_process_times+0x6a/0x90
> tick_sched_handle.isra.18+0x1f/0x60
> tick_sched_timer+0x47/0x80
> __hrtimer_run_queues+0x10a/0x280
> hrtimer_interrupt+0x10b/0x240
> __sysvec_apic_timer_interrupt+0x70/0x160
> sysvec_apic_timer_interrupt+0x9a/0xd0
> </IRQ>
> <TASK>
> asm_sysvec_apic_timer_interrupt+0x16/0x20
> 
> Steps to reproduce:
> 1. Enable CONFIG_SCHED_DEBUG and CONFIG_SCHED_CORE when compiling
>     the kernel
> 2. echo 1 > /sys/kernel/debug/clear_warn_once
>     echo "WARN_DOUBLE_CLOCK" > /sys/kernel/debug/sched/features
> 3. Run the linux/tools/testing/selftests/sched/cs_prctl_test test
> 
> Signed-off-by: Hao Jia <jiahao.os@bytedance.com>
> ---
>   kernel/sched/core.c       |  5 ++++-
>   kernel/sched/core_sched.c |  4 +++-
>   kernel/sched/sched.h      | 10 +++++++++-
>   3 files changed, 16 insertions(+), 3 deletions(-)
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index daff72f00385..fcf5e4faec34 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5951,6 +5951,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>   	rq->core->core_cookie = 0UL;
>   	if (rq->core->core_forceidle_count) {
>   		if (!core_clock_updated) {
> +			rq_clock_clear_update(rq->core);
>   			update_rq_clock(rq->core);
>   			core_clock_updated = true;
>   		}
> @@ -6007,8 +6008,10 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>   		 * pick_next_task(). If the current cpu is not the core,
>   		 * the core may also have been updated above.
>   		 */
> -		if (i != cpu && (rq_i != rq->core || !core_clock_updated))
> +		if (i != cpu && (rq_i != rq->core || !core_clock_updated)) {
> +			rq_clock_clear_update(rq_i);
>   			update_rq_clock(rq_i);
> +		}
>   
>   		p = rq_i->core_pick = pick_task(rq_i);
>   		if (!max || prio_less(max, p, fi_before))
> diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
> index a57fd8f27498..70a6f36fd830 100644
> --- a/kernel/sched/core_sched.c
> +++ b/kernel/sched/core_sched.c
> @@ -291,8 +291,10 @@ void __sched_core_tick(struct rq *rq)
>   	if (!rq->core->core_forceidle_count)
>   		return;
>   
> -	if (rq != rq->core)
> +	if (rq != rq->core) {
> +		rq_clock_clear_update(rq->core);
>   		update_rq_clock(rq->core);
> +	}
>   
>   	__sched_core_account_forceidle(rq);
>   }
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index a4a20046e586..1a2c40c413c2 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2544,8 +2544,16 @@ static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
>   	rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
>   #endif
>   }
> -#else
> +
> +#ifdef CONFIG_SCHED_CORE
> +static inline void rq_clock_clear_update(struct rq *rq)
> +{
> +	rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
> +}
> +#endif
> +#else /* CONFIG_SCHED_DEBUG */
>   static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {}
> +static inline void rq_clock_clear_update(struct rq *rq) {}
>   #endif
>   
>   #ifdef CONFIG_SMP
  
Peter Zijlstra Jan. 16, 2023, 11:19 a.m. UTC | #2
On Tue, Dec 06, 2022 at 03:05:50PM +0800, Hao Jia wrote:
> When we need to call update_rq_clock() to update the rq clock of
> other CPUs on the same core, before that we need to clear RQCF_UPDATED
> of rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning.
> Because at this time the rq->clock_update_flags of other CPUs
> may be RQCF_UPDATED.

So you've found that the WARN_DOUBLE_CLOCK machinery doesn't work for
core-sched -- but then instead of fixing that machinery, you put
band-aids on it :/
  
Hao Jia Jan. 31, 2023, 2:35 a.m. UTC | #3
On 2023/1/16 Peter Zijlstra wrote:
> On Tue, Dec 06, 2022 at 03:05:50PM +0800, Hao Jia wrote:
>> When we need to call update_rq_clock() to update the rq clock of
>> other CPUs on the same core, before that we need to clear RQCF_UPDATED
>> of rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning.
>> Because at this time the rq->clock_update_flags of other CPUs
>> may be RQCF_UPDATED.
> 
> So you've found that the WARN_DOUBLE_CLOCK machinery doesn't work for
> core-sched -- but then instead of fixing that machinery, you put
> band-aids on it :/
> 
> 
Hi, Peter
Sorry for the late reply. I just finished my holiday.

I am trying to adapt WARN_DOUBLE_CLOCK machinery for core-sched.

If sched_core_enabled(), we will get a core wide rq->lock, so we can
safely clear RQCF_UPDATED of rq->clock_update_flags of all CPUs on this
core.
This avoids a WARN_DOUBLE_CLOCK warning when we call update_rq_clock()
to update the rq clock of other cpus on the same core.

We cannot clear rq->clock_update_flags of other cpus on the same core in
rq_pin_lock(). Because in some functions, we will temporarily give up
core wide rq->lock, and then use raw_spin_rq_lock() to obtain core wide
rq->lock, such as newidle_balance() and _double_lock_balance().


Thanks,
Hao

---

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e838feb6adc5..f279912e30b3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -435,6 +435,21 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }

  #endif /* CONFIG_SCHED_CORE */

+static inline void sched_core_rq_clock_clear_update(struct rq *rq)
+{
+#ifdef CONFIG_SCHED_DEBUG
+       const struct cpumask *smt_mask;
+       int i;
+       if (rq->core_enabled) {
+               smt_mask = cpu_smt_mask(rq->cpu);
+               for_each_cpu(i, smt_mask) {
+                       if (rq->cpu != i)
+                               cpu_rq(i)->clock_update_flags &= 
(RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+               }
+       }
+#endif
+}
+
  /*
   * Serialization rules:
   *
@@ -546,6 +561,7 @@ void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
                 if (likely(lock == __rq_lockp(rq))) {
                         /* preempt_count *MUST* be > 1 */
                         preempt_enable_no_resched();
+                       sched_core_rq_clock_clear_update(rq);
                         return;
                 }
                 raw_spin_unlock(lock);
  

Patch

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index daff72f00385..fcf5e4faec34 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5951,6 +5951,7 @@  pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	rq->core->core_cookie = 0UL;
 	if (rq->core->core_forceidle_count) {
 		if (!core_clock_updated) {
+			rq_clock_clear_update(rq->core);
 			update_rq_clock(rq->core);
 			core_clock_updated = true;
 		}
@@ -6007,8 +6008,10 @@  pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 		 * pick_next_task(). If the current cpu is not the core,
 		 * the core may also have been updated above.
 		 */
-		if (i != cpu && (rq_i != rq->core || !core_clock_updated))
+		if (i != cpu && (rq_i != rq->core || !core_clock_updated)) {
+			rq_clock_clear_update(rq_i);
 			update_rq_clock(rq_i);
+		}
 
 		p = rq_i->core_pick = pick_task(rq_i);
 		if (!max || prio_less(max, p, fi_before))
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index a57fd8f27498..70a6f36fd830 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -291,8 +291,10 @@  void __sched_core_tick(struct rq *rq)
 	if (!rq->core->core_forceidle_count)
 		return;
 
-	if (rq != rq->core)
+	if (rq != rq->core) {
+		rq_clock_clear_update(rq->core);
 		update_rq_clock(rq->core);
+	}
 
 	__sched_core_account_forceidle(rq);
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a4a20046e586..1a2c40c413c2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2544,8 +2544,16 @@  static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
 	rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
 #endif
 }
-#else
+
+#ifdef CONFIG_SCHED_CORE
+static inline void rq_clock_clear_update(struct rq *rq)
+{
+	rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+}
+#endif
+#else /* CONFIG_SCHED_DEBUG */
 static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {}
+static inline void rq_clock_clear_update(struct rq *rq) {}
 #endif
 
 #ifdef CONFIG_SMP