[RFC,42/86] sched: force preemption on tick expiration

Message ID 20231107215742.363031-43-ankur.a.arora@oracle.com
State New
Series Make the kernel preemptible

Commit Message

Ankur Arora Nov. 7, 2023, 9:57 p.m. UTC
  The kernel can have long-running tasks which don't pass through
preemption points for prolonged periods and so will never see the
scheduler's polite TIF_NEED_RESCHED_LAZY.

Force a reschedule at the next tick by upgrading to TIF_NEED_RESCHED,
which gets folded into the preempt_count and forces a reschedule at the
next safe preemption point.
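
For reference, a condensed sketch of the tick-time check this patch adds
in update_deadline() (RESCHED_eager and __resched_curr() come from
earlier patches in this series):

	/* At the tick: if the last lazy request was ignored, escalate. */
	if (tick && test_tsk_thread_flag(rq->curr, TIF_NEED_RESCHED_LAZY))
		__resched_curr(rq, RESCHED_eager);	/* sets TIF_NEED_RESCHED */
	else
		resched_curr(rq);			/* regular reschedule request */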

TODO: deadline scheduler.

Originally-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
---
 kernel/sched/fair.c  | 32 +++++++++++++++++++++++---------
 kernel/sched/rt.c    |  7 ++++++-
 kernel/sched/sched.h |  1 +
 3 files changed, 30 insertions(+), 10 deletions(-)
  

Comments

Peter Zijlstra Nov. 8, 2023, 9:56 a.m. UTC | #1
On Tue, Nov 07, 2023 at 01:57:28PM -0800, Ankur Arora wrote:

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 4d86c618ffa2..fe7e5e9b2207 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1016,8 +1016,11 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
>   * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
>   * this is probably good enough.
>   */
> -static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +static void update_deadline(struct cfs_rq *cfs_rq,
> +			    struct sched_entity *se, bool tick)
>  {
> +	struct rq *rq = rq_of(cfs_rq);
> +
>  	if ((s64)(se->vruntime - se->deadline) < 0)
>  		return;
>  
> @@ -1033,13 +1036,19 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  	 */
>  	se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
>  
> +	if (cfs_rq->nr_running < 2)
> +		return;
> +
>  	/*
> -	 * The task has consumed its request, reschedule.
> +	 * The task has consumed its request, reschedule; eagerly
> +	 * if it ignored our last lazy reschedule.
>  	 */
> -	if (cfs_rq->nr_running > 1) {
> -		resched_curr(rq_of(cfs_rq));
> -		clear_buddies(cfs_rq, se);
> -	}
> +	if (tick && test_tsk_thread_flag(rq->curr, TIF_NEED_RESCHED_LAZY))
> +		__resched_curr(rq, RESCHED_eager);
> +	else
> +		resched_curr(rq);
> +
> +	clear_buddies(cfs_rq, se);
>  }
>  
>  #include "pelt.h"
> @@ -1147,7 +1156,7 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq)
>  /*
>   * Update the current task's runtime statistics.
>   */
> -static void update_curr(struct cfs_rq *cfs_rq)
> +static void __update_curr(struct cfs_rq *cfs_rq, bool tick)
>  {
>  	struct sched_entity *curr = cfs_rq->curr;
>  	u64 now = rq_clock_task(rq_of(cfs_rq));
> @@ -1174,7 +1183,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
>  	schedstat_add(cfs_rq->exec_clock, delta_exec);
>  
>  	curr->vruntime += calc_delta_fair(delta_exec, curr);
> -	update_deadline(cfs_rq, curr);
> +	update_deadline(cfs_rq, curr, tick);
>  	update_min_vruntime(cfs_rq);
>  
>  	if (entity_is_task(curr)) {
> @@ -1188,6 +1197,11 @@ static void update_curr(struct cfs_rq *cfs_rq)
>  	account_cfs_rq_runtime(cfs_rq, delta_exec);
>  }
>  
> +static void update_curr(struct cfs_rq *cfs_rq)
> +{
> +	__update_curr(cfs_rq, false);
> +}
> +
>  static void update_curr_fair(struct rq *rq)
>  {
>  	update_curr(cfs_rq_of(&rq->curr->se));
> @@ -5309,7 +5323,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
>  	/*
>  	 * Update run-time statistics of the 'current'.
>  	 */
> -	update_curr(cfs_rq);
> +	__update_curr(cfs_rq, true);
>  
>  	/*
>  	 * Ensure that runnable average is periodically updated.

I'm thinking this will be less of a mess if you flip it around some.

(ignore the hrtick mess, I'll try and get that cleaned up)

This way you have two distinct sites to handle the preemption: the
update_curr() path would be 'FULL ? force : lazy', while the tick one
gets the special magic bits.
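
A rough sketch of what that split could look like with this series'
primitives (treat it as an assumption: the preempt_model_full() test and
a RESCHED_lazy value are not taken from the diff below):

static void update_curr(struct cfs_rq *cfs_rq)
{
	if (!__update_curr(cfs_rq))
		return;

	/* Non-tick site: force under full preemption, otherwise stay lazy. */
	if (preempt_model_full())
		__resched_curr(rq_of(cfs_rq), RESCHED_eager);
	else
		__resched_curr(rq_of(cfs_rq), RESCHED_lazy);
}

static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
	...
	/* Tick site: upgrade if the previous lazy request was ignored. */
	if (resched) {
		if (test_tsk_thread_flag(rq->curr, TIF_NEED_RESCHED_LAZY))
			__resched_curr(rq, RESCHED_eager);
		else
			resched_curr(rq);
	}
	...
}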

---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df348aa55d3c..5399696de9e0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1016,10 +1016,10 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
  * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
  * this is probably good enough.
  */
-static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	if ((s64)(se->vruntime - se->deadline) < 0)
-		return;
+		return false;
 
 	/*
 	 * For EEVDF the virtual time slope is determined by w_i (iow.
@@ -1037,9 +1037,11 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	 * The task has consumed its request, reschedule.
 	 */
 	if (cfs_rq->nr_running > 1) {
-		resched_curr(rq_of(cfs_rq));
 		clear_buddies(cfs_rq, se);
+		return true;
 	}
+
+	return false;
 }
 
 #include "pelt.h"
@@ -1147,18 +1149,19 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq)
 /*
  * Update the current task's runtime statistics.
  */
-static void update_curr(struct cfs_rq *cfs_rq)
+static bool __update_curr(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr = cfs_rq->curr;
 	u64 now = rq_clock_task(rq_of(cfs_rq));
 	u64 delta_exec;
+	bool ret;
 
 	if (unlikely(!curr))
-		return;
+		return false;
 
 	delta_exec = now - curr->exec_start;
 	if (unlikely((s64)delta_exec <= 0))
-		return;
+		return false;
 
 	curr->exec_start = now;
 
@@ -1174,7 +1177,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 	schedstat_add(cfs_rq->exec_clock, delta_exec);
 
 	curr->vruntime += calc_delta_fair(delta_exec, curr);
-	update_deadline(cfs_rq, curr);
+	ret = update_deadline(cfs_rq, curr);
 	update_min_vruntime(cfs_rq);
 
 	if (entity_is_task(curr)) {
@@ -1186,6 +1189,14 @@ static void update_curr(struct cfs_rq *cfs_rq)
 	}
 
 	account_cfs_rq_runtime(cfs_rq, delta_exec);
+
+	return ret;
+}
+
+static void update_curr(struct cfs_rq *cfs_rq)
+{
+	if (__update_curr(cfs_rq))
+		resched_curr(rq_of(cfs_rq));
 }
 
 static void update_curr_fair(struct rq *rq)
@@ -5309,7 +5320,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	/*
 	 * Update run-time statistics of the 'current'.
 	 */
-	update_curr(cfs_rq);
+	bool resched = __update_curr(cfs_rq);
 
 	/*
 	 * Ensure that runnable average is periodically updated.
@@ -5317,22 +5328,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	update_load_avg(cfs_rq, curr, UPDATE_TG);
 	update_cfs_group(curr);
 
-#ifdef CONFIG_SCHED_HRTICK
-	/*
-	 * queued ticks are scheduled to match the slice, so don't bother
-	 * validating it and just reschedule.
-	 */
-	if (queued) {
-		resched_curr(rq_of(cfs_rq));
-		return;
-	}
-	/*
-	 * don't let the period tick interfere with the hrtick preemption
-	 */
-	if (!sched_feat(DOUBLE_TICK) &&
-			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
-		return;
-#endif
+	return resched;
 }
 
 
@@ -12387,12 +12383,16 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &curr->se;
+	bool resched = false;
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
-		entity_tick(cfs_rq, se, queued);
+		resched |= entity_tick(cfs_rq, se, queued);
 	}
 
+	if (resched)
+		resched_curr(rq);
+
 	if (static_branch_unlikely(&sched_numa_balancing))
 		task_tick_numa(rq, curr);
  
Ankur Arora Nov. 21, 2023, 6:44 a.m. UTC | #2
Peter Zijlstra <peterz@infradead.org> writes:

> On Tue, Nov 07, 2023 at 01:57:28PM -0800, Ankur Arora wrote:
>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 4d86c618ffa2..fe7e5e9b2207 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -1016,8 +1016,11 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
>>   * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
>>   * this is probably good enough.
>>   */
>> -static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
>> +static void update_deadline(struct cfs_rq *cfs_rq,
>> +			    struct sched_entity *se, bool tick)
>>  {
>> +	struct rq *rq = rq_of(cfs_rq);
>> +
>>  	if ((s64)(se->vruntime - se->deadline) < 0)
>>  		return;
>>
>> @@ -1033,13 +1036,19 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
>>  	 */
>>  	se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
>>
>> +	if (cfs_rq->nr_running < 2)
>> +		return;
>> +
>>  	/*
>> -	 * The task has consumed its request, reschedule.
>> +	 * The task has consumed its request, reschedule; eagerly
>> +	 * if it ignored our last lazy reschedule.
>>  	 */
>> -	if (cfs_rq->nr_running > 1) {
>> -		resched_curr(rq_of(cfs_rq));
>> -		clear_buddies(cfs_rq, se);
>> -	}
>> +	if (tick && test_tsk_thread_flag(rq->curr, TIF_NEED_RESCHED_LAZY))
>> +		__resched_curr(rq, RESCHED_eager);
>> +	else
>> +		resched_curr(rq);
>> +
>> +	clear_buddies(cfs_rq, se);
>>  }
>>
>>  #include "pelt.h"
>> @@ -1147,7 +1156,7 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq)
>>  /*
>>   * Update the current task's runtime statistics.
>>   */
>> -static void update_curr(struct cfs_rq *cfs_rq)
>> +static void __update_curr(struct cfs_rq *cfs_rq, bool tick)
>>  {
>>  	struct sched_entity *curr = cfs_rq->curr;
>>  	u64 now = rq_clock_task(rq_of(cfs_rq));
>> @@ -1174,7 +1183,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
>>  	schedstat_add(cfs_rq->exec_clock, delta_exec);
>>
>>  	curr->vruntime += calc_delta_fair(delta_exec, curr);
>> -	update_deadline(cfs_rq, curr);
>> +	update_deadline(cfs_rq, curr, tick);
>>  	update_min_vruntime(cfs_rq);
>>
>>  	if (entity_is_task(curr)) {
>> @@ -1188,6 +1197,11 @@ static void update_curr(struct cfs_rq *cfs_rq)
>>  	account_cfs_rq_runtime(cfs_rq, delta_exec);
>>  }
>>
>> +static void update_curr(struct cfs_rq *cfs_rq)
>> +{
>> +	__update_curr(cfs_rq, false);
>> +}
>> +
>>  static void update_curr_fair(struct rq *rq)
>>  {
>>  	update_curr(cfs_rq_of(&rq->curr->se));
>> @@ -5309,7 +5323,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
>>  	/*
>>  	 * Update run-time statistics of the 'current'.
>>  	 */
>> -	update_curr(cfs_rq);
>> +	__update_curr(cfs_rq, true);
>>
>>  	/*
>>  	 * Ensure that runnable average is periodically updated.
>
> I'm thinking this will be less of a mess if you flip it around some.
>
> (ignore the hrtick mess, I'll try and get that cleaned up)
>
> This way you have two distinct sites to handle the preemption: the
> update_curr() path would be 'FULL ? force : lazy', while the tick one
> gets the special magic bits.

Thanks, that simplified the changes here quite nicely.

--
ankur
  

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4d86c618ffa2..fe7e5e9b2207 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1016,8 +1016,11 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
  * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
  * this is probably good enough.
  */
-static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void update_deadline(struct cfs_rq *cfs_rq,
+			    struct sched_entity *se, bool tick)
 {
+	struct rq *rq = rq_of(cfs_rq);
+
 	if ((s64)(se->vruntime - se->deadline) < 0)
 		return;
 
@@ -1033,13 +1036,19 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	 */
 	se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
 
+	if (cfs_rq->nr_running < 2)
+		return;
+
 	/*
-	 * The task has consumed its request, reschedule.
+	 * The task has consumed its request, reschedule; eagerly
+	 * if it ignored our last lazy reschedule.
 	 */
-	if (cfs_rq->nr_running > 1) {
-		resched_curr(rq_of(cfs_rq));
-		clear_buddies(cfs_rq, se);
-	}
+	if (tick && test_tsk_thread_flag(rq->curr, TIF_NEED_RESCHED_LAZY))
+		__resched_curr(rq, RESCHED_eager);
+	else
+		resched_curr(rq);
+
+	clear_buddies(cfs_rq, se);
 }
 
 #include "pelt.h"
@@ -1147,7 +1156,7 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq)
 /*
  * Update the current task's runtime statistics.
  */
-static void update_curr(struct cfs_rq *cfs_rq)
+static void __update_curr(struct cfs_rq *cfs_rq, bool tick)
 {
 	struct sched_entity *curr = cfs_rq->curr;
 	u64 now = rq_clock_task(rq_of(cfs_rq));
@@ -1174,7 +1183,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 	schedstat_add(cfs_rq->exec_clock, delta_exec);
 
 	curr->vruntime += calc_delta_fair(delta_exec, curr);
-	update_deadline(cfs_rq, curr);
+	update_deadline(cfs_rq, curr, tick);
 	update_min_vruntime(cfs_rq);
 
 	if (entity_is_task(curr)) {
@@ -1188,6 +1197,11 @@ static void update_curr(struct cfs_rq *cfs_rq)
 	account_cfs_rq_runtime(cfs_rq, delta_exec);
 }
 
+static void update_curr(struct cfs_rq *cfs_rq)
+{
+	__update_curr(cfs_rq, false);
+}
+
 static void update_curr_fair(struct rq *rq)
 {
 	update_curr(cfs_rq_of(&rq->curr->se));
@@ -5309,7 +5323,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	/*
 	 * Update run-time statistics of the 'current'.
 	 */
-	update_curr(cfs_rq);
+	__update_curr(cfs_rq, true);
 
 	/*
 	 * Ensure that runnable average is periodically updated.
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a79ce6746dd0..5fdb93f1b87e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2664,7 +2664,12 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 	for_each_sched_rt_entity(rt_se) {
 		if (rt_se->run_list.prev != rt_se->run_list.next) {
 			requeue_task_rt(rq, p, 0);
-			resched_curr(rq);
+
+			if (test_tsk_thread_flag(rq->curr, TIF_NEED_RESCHED_LAZY))
+				__resched_curr(rq, RESCHED_eager);
+			else
+				resched_curr(rq);
+
 			return;
 		}
 	}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9e1329a4e890..e29a8897f573 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2434,6 +2434,7 @@ extern void init_sched_fair_class(void);
 
 extern void reweight_task(struct task_struct *p, int prio);
 
+extern void __resched_curr(struct rq *rq, resched_t rs);
 extern void resched_curr(struct rq *rq);
 extern void resched_cpu(int cpu);