[RFC,V3,1/6] sched: Unify runtime accounting across classes
Commit Message
From: Peter Zijlstra <peterz@infradead.org>
All classes use sched_entity::exec_start to track runtime and have
copies of the exact same code around to compute runtime.
Collapse all that.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
---
include/linux/sched.h | 2 +-
kernel/sched/deadline.c | 15 +++--------
kernel/sched/fair.c | 57 ++++++++++++++++++++++++++++++----------
kernel/sched/rt.c | 15 +++--------
kernel/sched/sched.h | 12 ++-------
kernel/sched/stop_task.c | 13 +--------
6 files changed, 53 insertions(+), 61 deletions(-)
Comments
On Thu, Jun 08, 2023 at 05:58:13PM +0200 Daniel Bristot de Oliveira wrote:
> From: Peter Zijlstra <peterz@infradead.org>
>
> All classes use sched_entity::exec_start to track runtime and have
> copies of the exact same code around to compute runtime.
>
> Collapse all that.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Reviewed-by: Phil Auld <pauld@redhat.com>
> ---
> include/linux/sched.h | 2 +-
> kernel/sched/deadline.c | 15 +++--------
> kernel/sched/fair.c | 57 ++++++++++++++++++++++++++++++----------
> kernel/sched/rt.c | 15 +++--------
> kernel/sched/sched.h | 12 ++-------
> kernel/sched/stop_task.c | 13 +--------
> 6 files changed, 53 insertions(+), 61 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 1292d38d66cc..26b1925a702a 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -521,7 +521,7 @@ struct sched_statistics {
> u64 block_max;
> s64 sum_block_runtime;
>
> - u64 exec_max;
> + s64 exec_max;
> u64 slice_max;
>
> u64 nr_migrations_cold;
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index f827067ad03b..030e7c11607f 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -1301,9 +1301,8 @@ static void update_curr_dl(struct rq *rq)
> {
> struct task_struct *curr = rq->curr;
> struct sched_dl_entity *dl_se = &curr->dl;
> - u64 delta_exec, scaled_delta_exec;
> + s64 delta_exec, scaled_delta_exec;
> int cpu = cpu_of(rq);
> - u64 now;
>
> if (!dl_task(curr) || !on_dl_rq(dl_se))
> return;
> @@ -1316,21 +1315,13 @@ static void update_curr_dl(struct rq *rq)
> * natural solution, but the full ramifications of this
> * approach need further study.
> */
> - now = rq_clock_task(rq);
> - delta_exec = now - curr->se.exec_start;
> - if (unlikely((s64)delta_exec <= 0)) {
> + delta_exec = update_curr_common(rq);
> + if (unlikely(delta_exec <= 0)) {
> if (unlikely(dl_se->dl_yielded))
> goto throttle;
> return;
> }
>
> - schedstat_set(curr->stats.exec_max,
> - max(curr->stats.exec_max, delta_exec));
> -
> - trace_sched_stat_runtime(curr, delta_exec, 0);
> -
> - update_current_exec_runtime(curr, now, delta_exec);
> -
> if (dl_entity_is_special(dl_se))
> return;
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6189d1a45635..fda67f05190d 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -891,23 +891,17 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq)
> }
> #endif /* CONFIG_SMP */
>
> -/*
> - * Update the current task's runtime statistics.
> - */
> -static void update_curr(struct cfs_rq *cfs_rq)
> +static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
> {
> - struct sched_entity *curr = cfs_rq->curr;
> - u64 now = rq_clock_task(rq_of(cfs_rq));
> - u64 delta_exec;
> -
> - if (unlikely(!curr))
> - return;
> + u64 now = rq_clock_task(rq);
> + s64 delta_exec;
>
> delta_exec = now - curr->exec_start;
> - if (unlikely((s64)delta_exec <= 0))
> - return;
> + if (unlikely(delta_exec <= 0))
> + return delta_exec;
>
> curr->exec_start = now;
> + curr->sum_exec_runtime += delta_exec;
>
> if (schedstat_enabled()) {
> struct sched_statistics *stats;
> @@ -917,8 +911,43 @@ static void update_curr(struct cfs_rq *cfs_rq)
> max(delta_exec, stats->exec_max));
> }
>
> - curr->sum_exec_runtime += delta_exec;
> - schedstat_add(cfs_rq->exec_clock, delta_exec);
> + return delta_exec;
> +}
> +
> +/*
> + * Used by other classes to account runtime.
> + */
> +s64 update_curr_common(struct rq *rq)
> +{
> + struct task_struct *curr = rq->curr;
> + s64 delta_exec;
> +
> + delta_exec = update_curr_se(rq, &curr->se);
> + if (unlikely(delta_exec <= 0))
> + return delta_exec;
> +
> + trace_sched_stat_runtime(curr, delta_exec, 0);
> +
> + account_group_exec_runtime(curr, delta_exec);
> + cgroup_account_cputime(curr, delta_exec);
> +
> + return delta_exec;
> +}
> +
> +/*
> + * Update the current task's runtime statistics.
> + */
> +static void update_curr(struct cfs_rq *cfs_rq)
> +{
> + struct sched_entity *curr = cfs_rq->curr;
> + s64 delta_exec;
> +
> + if (unlikely(!curr))
> + return;
> +
> + delta_exec = update_curr_se(rq_of(cfs_rq), curr);
> + if (unlikely(delta_exec <= 0))
> + return;
>
> curr->vruntime += calc_delta_fair(delta_exec, curr);
> update_min_vruntime(cfs_rq);
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index 00e0e5074115..efec4f3fef83 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -1046,24 +1046,15 @@ static void update_curr_rt(struct rq *rq)
> {
> struct task_struct *curr = rq->curr;
> struct sched_rt_entity *rt_se = &curr->rt;
> - u64 delta_exec;
> - u64 now;
> + s64 delta_exec;
>
> if (curr->sched_class != &rt_sched_class)
> return;
>
> - now = rq_clock_task(rq);
> - delta_exec = now - curr->se.exec_start;
> - if (unlikely((s64)delta_exec <= 0))
> + delta_exec = update_curr_common(rq);
> + if (unlikely(delta_exec <= 0))
> return;
>
> - schedstat_set(curr->stats.exec_max,
> - max(curr->stats.exec_max, delta_exec));
> -
> - trace_sched_stat_runtime(curr, delta_exec, 0);
> -
> - update_current_exec_runtime(curr, now, delta_exec);
> -
> if (!rt_bandwidth_enabled())
> return;
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 556496c77dc2..da0cec2fc63a 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2176,6 +2176,8 @@ struct affinity_context {
> unsigned int flags;
> };
>
> +extern s64 update_curr_common(struct rq *rq);
> +
> struct sched_class {
>
> #ifdef CONFIG_UCLAMP_TASK
> @@ -3207,16 +3209,6 @@ extern int sched_dynamic_mode(const char *str);
> extern void sched_dynamic_update(int mode);
> #endif
>
> -static inline void update_current_exec_runtime(struct task_struct *curr,
> - u64 now, u64 delta_exec)
> -{
> - curr->se.sum_exec_runtime += delta_exec;
> - account_group_exec_runtime(curr, delta_exec);
> -
> - curr->se.exec_start = now;
> - cgroup_account_cputime(curr, delta_exec);
> -}
> -
> #ifdef CONFIG_SCHED_MM_CID
>
> #define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */
> diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
> index 85590599b4d6..7595494ceb6d 100644
> --- a/kernel/sched/stop_task.c
> +++ b/kernel/sched/stop_task.c
> @@ -70,18 +70,7 @@ static void yield_task_stop(struct rq *rq)
>
> static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
> {
> - struct task_struct *curr = rq->curr;
> - u64 now, delta_exec;
> -
> - now = rq_clock_task(rq);
> - delta_exec = now - curr->se.exec_start;
> - if (unlikely((s64)delta_exec < 0))
> - delta_exec = 0;
> -
> - schedstat_set(curr->stats.exec_max,
> - max(curr->stats.exec_max, delta_exec));
> -
> - update_current_exec_runtime(curr, now, delta_exec);
> + update_curr_common(rq);
> }
>
> /*
> --
> 2.40.1
>
--
On 08/06/23 17:58, Daniel Bristot de Oliveira wrote:
> From: Peter Zijlstra <peterz@infradead.org>
>
> All classes use sched_entity::exec_start to track runtime and have
> copies of the exact same code around to compute runtime.
>
> Collapse all that.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
This one's been around for a while; John also carries it for PE [1] because it
makes things simpler. We should just get it in :-)
The three-layered if (unlikely(delta_exec <= 0)) is unfortunate, but I think we
can live with it. Tiny factorization appended below, but regardless:
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
[1]: http://lore.kernel.org/r/20230601055846.2349566-2-jstultz@google.com
---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e7fcf558dc4bc..e52e609724482 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -914,6 +914,14 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
return delta_exec;
}
+static inline void
+account_curr_runtime(struct task_struct *curr, s64 runtime, u64 vruntime)
+{
+ trace_sched_stat_runtime(curr, runtime, vruntime);
+ account_group_exec_runtime(curr, runtime);
+ cgroup_account_cputime(curr, runtime);
+}
+
/*
* Used by other classes to account runtime.
*/
@@ -926,10 +934,7 @@ s64 update_curr_common(struct rq *rq)
if (unlikely(delta_exec <= 0))
return delta_exec;
- trace_sched_stat_runtime(curr, delta_exec, 0);
-
- account_group_exec_runtime(curr, delta_exec);
- cgroup_account_cputime(curr, delta_exec);
+ account_curr_runtime(curr, delta_exec, 0);
return delta_exec;
}
@@ -955,9 +960,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
if (entity_is_task(curr)) {
struct task_struct *curtask = task_of(curr);
- trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
- cgroup_account_cputime(curtask, delta_exec);
- account_group_exec_runtime(curtask, delta_exec);
+ account_curr_runtime(curtask, delta_exec, curr->vruntime);
}
account_cfs_rq_runtime(cfs_rq, delta_exec);
@@ -521,7 +521,7 @@ struct sched_statistics {
u64 block_max;
s64 sum_block_runtime;
- u64 exec_max;
+ s64 exec_max;
u64 slice_max;
u64 nr_migrations_cold;
@@ -1301,9 +1301,8 @@ static void update_curr_dl(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_dl_entity *dl_se = &curr->dl;
- u64 delta_exec, scaled_delta_exec;
+ s64 delta_exec, scaled_delta_exec;
int cpu = cpu_of(rq);
- u64 now;
if (!dl_task(curr) || !on_dl_rq(dl_se))
return;
@@ -1316,21 +1315,13 @@ static void update_curr_dl(struct rq *rq)
* natural solution, but the full ramifications of this
* approach need further study.
*/
- now = rq_clock_task(rq);
- delta_exec = now - curr->se.exec_start;
- if (unlikely((s64)delta_exec <= 0)) {
+ delta_exec = update_curr_common(rq);
+ if (unlikely(delta_exec <= 0)) {
if (unlikely(dl_se->dl_yielded))
goto throttle;
return;
}
- schedstat_set(curr->stats.exec_max,
- max(curr->stats.exec_max, delta_exec));
-
- trace_sched_stat_runtime(curr, delta_exec, 0);
-
- update_current_exec_runtime(curr, now, delta_exec);
-
if (dl_entity_is_special(dl_se))
return;
@@ -891,23 +891,17 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq)
}
#endif /* CONFIG_SMP */
-/*
- * Update the current task's runtime statistics.
- */
-static void update_curr(struct cfs_rq *cfs_rq)
+static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
{
- struct sched_entity *curr = cfs_rq->curr;
- u64 now = rq_clock_task(rq_of(cfs_rq));
- u64 delta_exec;
-
- if (unlikely(!curr))
- return;
+ u64 now = rq_clock_task(rq);
+ s64 delta_exec;
delta_exec = now - curr->exec_start;
- if (unlikely((s64)delta_exec <= 0))
- return;
+ if (unlikely(delta_exec <= 0))
+ return delta_exec;
curr->exec_start = now;
+ curr->sum_exec_runtime += delta_exec;
if (schedstat_enabled()) {
struct sched_statistics *stats;
@@ -917,8 +911,43 @@ static void update_curr(struct cfs_rq *cfs_rq)
max(delta_exec, stats->exec_max));
}
- curr->sum_exec_runtime += delta_exec;
- schedstat_add(cfs_rq->exec_clock, delta_exec);
+ return delta_exec;
+}
+
+/*
+ * Used by other classes to account runtime.
+ */
+s64 update_curr_common(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ s64 delta_exec;
+
+ delta_exec = update_curr_se(rq, &curr->se);
+ if (unlikely(delta_exec <= 0))
+ return delta_exec;
+
+ trace_sched_stat_runtime(curr, delta_exec, 0);
+
+ account_group_exec_runtime(curr, delta_exec);
+ cgroup_account_cputime(curr, delta_exec);
+
+ return delta_exec;
+}
+
+/*
+ * Update the current task's runtime statistics.
+ */
+static void update_curr(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ s64 delta_exec;
+
+ if (unlikely(!curr))
+ return;
+
+ delta_exec = update_curr_se(rq_of(cfs_rq), curr);
+ if (unlikely(delta_exec <= 0))
+ return;
curr->vruntime += calc_delta_fair(delta_exec, curr);
update_min_vruntime(cfs_rq);
@@ -1046,24 +1046,15 @@ static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_rt_entity *rt_se = &curr->rt;
- u64 delta_exec;
- u64 now;
+ s64 delta_exec;
if (curr->sched_class != &rt_sched_class)
return;
- now = rq_clock_task(rq);
- delta_exec = now - curr->se.exec_start;
- if (unlikely((s64)delta_exec <= 0))
+ delta_exec = update_curr_common(rq);
+ if (unlikely(delta_exec <= 0))
return;
- schedstat_set(curr->stats.exec_max,
- max(curr->stats.exec_max, delta_exec));
-
- trace_sched_stat_runtime(curr, delta_exec, 0);
-
- update_current_exec_runtime(curr, now, delta_exec);
-
if (!rt_bandwidth_enabled())
return;
@@ -2176,6 +2176,8 @@ struct affinity_context {
unsigned int flags;
};
+extern s64 update_curr_common(struct rq *rq);
+
struct sched_class {
#ifdef CONFIG_UCLAMP_TASK
@@ -3207,16 +3209,6 @@ extern int sched_dynamic_mode(const char *str);
extern void sched_dynamic_update(int mode);
#endif
-static inline void update_current_exec_runtime(struct task_struct *curr,
- u64 now, u64 delta_exec)
-{
- curr->se.sum_exec_runtime += delta_exec;
- account_group_exec_runtime(curr, delta_exec);
-
- curr->se.exec_start = now;
- cgroup_account_cputime(curr, delta_exec);
-}
-
#ifdef CONFIG_SCHED_MM_CID
#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */
@@ -70,18 +70,7 @@ static void yield_task_stop(struct rq *rq)
static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
{
- struct task_struct *curr = rq->curr;
- u64 now, delta_exec;
-
- now = rq_clock_task(rq);
- delta_exec = now - curr->se.exec_start;
- if (unlikely((s64)delta_exec < 0))
- delta_exec = 0;
-
- schedstat_set(curr->stats.exec_max,
- max(curr->stats.exec_max, delta_exec));
-
- update_current_exec_runtime(curr, now, delta_exec);
+ update_curr_common(rq);
}
/*