[RFC,V3,1/6] sched: Unify runtime accounting across classes

Message ID: 51ad657375206dac0f2609224babafa1c1486d4b.1686239016.git.bristot@kernel.org
State: New
Series: SCHED_DEADLINE server infrastructure

Commit Message

Daniel Bristot de Oliveira June 8, 2023, 3:58 p.m. UTC
  From: Peter Zijlstra <peterz@infradead.org>

All classes use sched_entity::exec_start to track runtime and have
copies of the exact same code around to compute runtime.

Collapse all that.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
---
 include/linux/sched.h    |  2 +-
 kernel/sched/deadline.c  | 15 +++--------
 kernel/sched/fair.c      | 57 ++++++++++++++++++++++++++++++----------
 kernel/sched/rt.c        | 15 +++--------
 kernel/sched/sched.h     | 12 ++-------
 kernel/sched/stop_task.c | 13 +--------
 6 files changed, 53 insertions(+), 61 deletions(-)
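For orientation, the shape of the change is: each class's private copy of "read rq_clock_task(), compute the delta, bail on a non-positive delta, then bump exec_start/sum_exec_runtime and the stats" collapses into update_curr_se(), with a new update_curr_common() entry point for the non-fair classes. Below is a minimal user-space C model of that pattern; the types, the clock, and the print statements are stand-ins for the kernel structures, so treat it as an illustrative sketch, not as part of the patch.

```c
/*
 * User-space model of the refactor: one helper owns the
 * "clock delta + exec_start/sum_exec_runtime" bookkeeping, and a
 * per-class update path only keeps its class-specific logic.
 * All names below are stand-ins, not the kernel structures.
 */
#include <stdio.h>
#include <stdint.h>
#include <time.h>

struct sched_entity_model {
	uint64_t exec_start;		/* last accounting timestamp (ns) */
	uint64_t sum_exec_runtime;	/* total accounted runtime (ns) */
	int64_t  exec_max;		/* largest single delta seen (ns) */
};

static uint64_t clock_ns(void)		/* stand-in for rq_clock_task() */
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* Mirrors update_curr_se(): compute and account the delta once, for every class. */
static int64_t update_curr_se_model(struct sched_entity_model *se)
{
	uint64_t now = clock_ns();
	int64_t delta = (int64_t)(now - se->exec_start);

	if (delta <= 0)
		return delta;

	se->exec_start = now;
	se->sum_exec_runtime += delta;
	if (delta > se->exec_max)
		se->exec_max = delta;
	return delta;
}

/* Mirrors update_curr_common(): the entry point the non-fair classes use. */
static int64_t update_curr_common_model(struct sched_entity_model *se)
{
	int64_t delta = update_curr_se_model(se);

	if (delta <= 0)
		return delta;
	/* group/cgroup runtime accounting would happen here in the kernel */
	return delta;
}

/* A "class" now only adds its own policy on top of the common delta. */
static void update_curr_rt_model(struct sched_entity_model *se)
{
	int64_t delta = update_curr_common_model(se);

	if (delta <= 0)
		return;
	printf("rt-like class charged %lld ns\n", (long long)delta);
}

int main(void)
{
	struct sched_entity_model se = { .exec_start = clock_ns() };

	for (int i = 0; i < 3; i++)
		update_curr_rt_model(&se);

	printf("sum_exec_runtime=%llu exec_max=%lld\n",
	       (unsigned long long)se.sum_exec_runtime,
	       (long long)se.exec_max);
	return 0;
}
```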
  

Comments

Phil Auld June 13, 2023, 1:24 p.m. UTC | #1
On Thu, Jun 08, 2023 at 05:58:13PM +0200 Daniel Bristot de Oliveira wrote:
> From: Peter Zijlstra <peterz@infradead.org>
> 
> All classes use sched_entity::exec_start to track runtime and have
> copies of the exact same code around to compute runtime.
> 
> Collapse all that.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>

Reviewed-by: Phil Auld <pauld@redhat.com>

Valentin Schneider June 16, 2023, 2:30 p.m. UTC | #2
On 08/06/23 17:58, Daniel Bristot de Oliveira wrote:
> From: Peter Zijlstra <peterz@infradead.org>
>
> All classes use sched_entity::exec_start to track runtime and have
> copies of the exact same code around to compute runtime.
>
> Collapse all that.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>

This one's been around for a while; John also carries it for PE (Proxy Execution) [1] because it
makes things simpler. We should just get it in :-)

The three-layered if (unlikely(delta_exec <= 0)) is unfortunate, but I think we
can live with it. Tiny factorization appended below, but regardless:

Reviewed-by: Valentin Schneider <vschneid@redhat.com>

[1]: http://lore.kernel.org/r/20230601055846.2349566-2-jstultz@google.com

---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e7fcf558dc4bc..e52e609724482 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -914,6 +914,14 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
 	return delta_exec;
 }
 
+static inline void
+account_curr_runtime(struct task_struct *curr, s64 runtime, u64 vruntime)
+{
+	trace_sched_stat_runtime(curr, runtime, vruntime);
+	account_group_exec_runtime(curr, runtime);
+	cgroup_account_cputime(curr, runtime);
+}
+
 /*
  * Used by other classes to account runtime.
  */
@@ -926,10 +934,7 @@ s64 update_curr_common(struct rq *rq)
 	if (unlikely(delta_exec <= 0))
 		return delta_exec;
 
-	trace_sched_stat_runtime(curr, delta_exec, 0);
-
-	account_group_exec_runtime(curr, delta_exec);
-	cgroup_account_cputime(curr, delta_exec);
+	account_curr_runtime(curr, delta_exec, 0);
 
 	return delta_exec;
 }
@@ -955,9 +960,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 	if (entity_is_task(curr)) {
 		struct task_struct *curtask = task_of(curr);
 
-		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
-		cgroup_account_cputime(curtask, delta_exec);
-		account_group_exec_runtime(curtask, delta_exec);
+		account_curr_runtime(curtask, delta_exec, curr->vruntime);
 	}
 
 	account_cfs_rq_runtime(cfs_rq, delta_exec);
  

Patch

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1292d38d66cc..26b1925a702a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -521,7 +521,7 @@  struct sched_statistics {
 	u64				block_max;
 	s64				sum_block_runtime;
 
-	u64				exec_max;
+	s64				exec_max;
 	u64				slice_max;
 
 	u64				nr_migrations_cold;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index f827067ad03b..030e7c11607f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1301,9 +1301,8 @@  static void update_curr_dl(struct rq *rq)
 {
 	struct task_struct *curr = rq->curr;
 	struct sched_dl_entity *dl_se = &curr->dl;
-	u64 delta_exec, scaled_delta_exec;
+	s64 delta_exec, scaled_delta_exec;
 	int cpu = cpu_of(rq);
-	u64 now;
 
 	if (!dl_task(curr) || !on_dl_rq(dl_se))
 		return;
@@ -1316,21 +1315,13 @@  static void update_curr_dl(struct rq *rq)
 	 * natural solution, but the full ramifications of this
 	 * approach need further study.
 	 */
-	now = rq_clock_task(rq);
-	delta_exec = now - curr->se.exec_start;
-	if (unlikely((s64)delta_exec <= 0)) {
+	delta_exec = update_curr_common(rq);
+	if (unlikely(delta_exec <= 0)) {
 		if (unlikely(dl_se->dl_yielded))
 			goto throttle;
 		return;
 	}
 
-	schedstat_set(curr->stats.exec_max,
-		      max(curr->stats.exec_max, delta_exec));
-
-	trace_sched_stat_runtime(curr, delta_exec, 0);
-
-	update_current_exec_runtime(curr, now, delta_exec);
-
 	if (dl_entity_is_special(dl_se))
 		return;
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6189d1a45635..fda67f05190d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -891,23 +891,17 @@  static void update_tg_load_avg(struct cfs_rq *cfs_rq)
 }
 #endif /* CONFIG_SMP */
 
-/*
- * Update the current task's runtime statistics.
- */
-static void update_curr(struct cfs_rq *cfs_rq)
+static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
 {
-	struct sched_entity *curr = cfs_rq->curr;
-	u64 now = rq_clock_task(rq_of(cfs_rq));
-	u64 delta_exec;
-
-	if (unlikely(!curr))
-		return;
+	u64 now = rq_clock_task(rq);
+	s64 delta_exec;
 
 	delta_exec = now - curr->exec_start;
-	if (unlikely((s64)delta_exec <= 0))
-		return;
+	if (unlikely(delta_exec <= 0))
+		return delta_exec;
 
 	curr->exec_start = now;
+	curr->sum_exec_runtime += delta_exec;
 
 	if (schedstat_enabled()) {
 		struct sched_statistics *stats;
@@ -917,8 +911,43 @@  static void update_curr(struct cfs_rq *cfs_rq)
 				max(delta_exec, stats->exec_max));
 	}
 
-	curr->sum_exec_runtime += delta_exec;
-	schedstat_add(cfs_rq->exec_clock, delta_exec);
+	return delta_exec;
+}
+
+/*
+ * Used by other classes to account runtime.
+ */
+s64 update_curr_common(struct rq *rq)
+{
+	struct task_struct *curr = rq->curr;
+	s64 delta_exec;
+
+	delta_exec = update_curr_se(rq, &curr->se);
+	if (unlikely(delta_exec <= 0))
+		return delta_exec;
+
+	trace_sched_stat_runtime(curr, delta_exec, 0);
+
+	account_group_exec_runtime(curr, delta_exec);
+	cgroup_account_cputime(curr, delta_exec);
+
+	return delta_exec;
+}
+
+/*
+ * Update the current task's runtime statistics.
+ */
+static void update_curr(struct cfs_rq *cfs_rq)
+{
+	struct sched_entity *curr = cfs_rq->curr;
+	s64 delta_exec;
+
+	if (unlikely(!curr))
+		return;
+
+	delta_exec = update_curr_se(rq_of(cfs_rq), curr);
+	if (unlikely(delta_exec <= 0))
+		return;
 
 	curr->vruntime += calc_delta_fair(delta_exec, curr);
 	update_min_vruntime(cfs_rq);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 00e0e5074115..efec4f3fef83 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1046,24 +1046,15 @@  static void update_curr_rt(struct rq *rq)
 {
 	struct task_struct *curr = rq->curr;
 	struct sched_rt_entity *rt_se = &curr->rt;
-	u64 delta_exec;
-	u64 now;
+	s64 delta_exec;
 
 	if (curr->sched_class != &rt_sched_class)
 		return;
 
-	now = rq_clock_task(rq);
-	delta_exec = now - curr->se.exec_start;
-	if (unlikely((s64)delta_exec <= 0))
+	delta_exec = update_curr_common(rq);
+	if (unlikely(delta_exec <= 0))
 		return;
 
-	schedstat_set(curr->stats.exec_max,
-		      max(curr->stats.exec_max, delta_exec));
-
-	trace_sched_stat_runtime(curr, delta_exec, 0);
-
-	update_current_exec_runtime(curr, now, delta_exec);
-
 	if (!rt_bandwidth_enabled())
 		return;
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 556496c77dc2..da0cec2fc63a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2176,6 +2176,8 @@  struct affinity_context {
 	unsigned int flags;
 };
 
+extern s64 update_curr_common(struct rq *rq);
+
 struct sched_class {
 
 #ifdef CONFIG_UCLAMP_TASK
@@ -3207,16 +3209,6 @@  extern int sched_dynamic_mode(const char *str);
 extern void sched_dynamic_update(int mode);
 #endif
 
-static inline void update_current_exec_runtime(struct task_struct *curr,
-						u64 now, u64 delta_exec)
-{
-	curr->se.sum_exec_runtime += delta_exec;
-	account_group_exec_runtime(curr, delta_exec);
-
-	curr->se.exec_start = now;
-	cgroup_account_cputime(curr, delta_exec);
-}
-
 #ifdef CONFIG_SCHED_MM_CID
 
 #define SCHED_MM_CID_PERIOD_NS	(100ULL * 1000000)	/* 100ms */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 85590599b4d6..7595494ceb6d 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -70,18 +70,7 @@  static void yield_task_stop(struct rq *rq)
 
 static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
 {
-	struct task_struct *curr = rq->curr;
-	u64 now, delta_exec;
-
-	now = rq_clock_task(rq);
-	delta_exec = now - curr->se.exec_start;
-	if (unlikely((s64)delta_exec < 0))
-		delta_exec = 0;
-
-	schedstat_set(curr->stats.exec_max,
-		      max(curr->stats.exec_max, delta_exec));
-
-	update_current_exec_runtime(curr, now, delta_exec);
+	update_curr_common(rq);
 }
 
 /*