v2 sched: Fix tg->load when offlining a CPU

Message ID 20231223111545.62135-1-vincent.guittot@linaro.org
State New
Series v2 sched: Fix tg->load when offlining a CPU

Commit Message

Vincent Guittot Dec. 23, 2023, 11:15 a.m. UTC
When a CPU is taken offline, the contribution of its cfs_rqs to task_groups'
load may remain and impact the calculation of the share of the online
CPUs.
Clear the contribution of an offlining CPU to task groups' load and skip
its contribution while it is inactive.

Reported-by: Imran Khan <imran.f.khan@oracle.com>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-and-tested-by: Imran Khan <imran.f.khan@oracle.com>
---

- Fix !CONFIG_FAIR_GROUP_SCHED case

 kernel/sched/fair.c | 52 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
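
For illustration (not part of the posted patch), a rough userspace model of the
effect, assuming the simplified proportional split used for group shares
(roughly tg->shares * cfs_rq load / tg load, as done with clamping by
calc_group_shares()); the helper share_of() and the numbers are made up for the
example. A stale contribution left in tg->load_avg by an offlined CPU inflates
the denominator and shrinks the share computed for every online CPU:

#include <stdio.h>

#define TG_SHARES	1024	/* hypothetical group weight */

static long share_of(long cpu_load, long tg_load)
{
	/* simplified proportional split, ignoring the clamping the kernel does */
	return TG_SHARES * cpu_load / tg_load;
}

int main(void)
{
	long cpu_load = 512;			/* one online CPU's cfs_rq load */
	long stale = 512;			/* contribution left by the offlined CPU */
	long tg_load_online = 3 * cpu_load;	/* three CPUs stay online */

	/* stale contribution still counted in the group load */
	printf("share with stale contribution: %ld\n",
	       share_of(cpu_load, tg_load_online + stale));	/* 256 */

	/* after the offlining CPU's contribution has been cleared */
	printf("share after clearing:          %ld\n",
	       share_of(cpu_load, tg_load_online));		/* 341 */

	return 0;
}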
  

Comments

Dietmar Eggemann Dec. 30, 2023, 5:56 p.m. UTC | #1
On 23/12/2023 12:15, Vincent Guittot wrote:
> When a CPU is taken offline, the contribution of its cfs_rqs to task_groups'
> load may remain and impact the calculation of the share of the online
> CPUs.
> Clear the contribution of an offlining CPU to task groups' load and skip
> its contribution while it is inactive.

The patch got applied to tip/sched/core a couple of hours ago so this
might be too late ...

Doesn't this fix in rq_offline_fair() also affect all the other CPUs
via cpuset_hotplug_workfn() -> rebuild_sched_domains_locked() ->
partition_sched_domains_locked() -> cpu_attach_domain() ->
rq_attach_root() -> set_rq_offline() ?
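
(For context, a sketch of the dispatch step at the end of that chain,
paraphrased from memory of kernel/sched/core.c so details may differ between
versions: set_rq_offline() invokes every sched class's ->rq_offline() callback,
which is how rq_offline_fair(), and with this patch clear_tg_offline_cfs_rqs(),
ends up running whenever a runqueue is marked offline, including during a
sched domain rebuild.)

void set_rq_offline(struct rq *rq)
{
	if (rq->online) {
		const struct sched_class *class;

		/* rq clock is refreshed once here, before the class callbacks */
		update_rq_clock(rq);
		for_each_class(class) {
			if (class->rq_offline)
				class->rq_offline(rq);
		}

		cpumask_clear_cpu(rq->cpu, rq->rd->online);
		rq->online = 0;
	}
}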

> 
> Reported-by: Imran Khan <imran.f.khan@oracle.com>
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> Reviewed-and-tested-by: Imran Khan <imran.f.khan@oracle.com>
> ---
> 
> - Fix !CONFIG_FAIR_GROUP_SCHED case
> 
>  kernel/sched/fair.c | 52 +++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 52 insertions(+)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index bcea3d55d95d..4b09237d24b9 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4100,6 +4100,10 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
>  	if (cfs_rq->tg == &root_task_group)
>  		return;
>  
> +	/* rq has been offline and doesn't contribute anymore to the share */
> +	if (!cpu_active(cpu_of(rq_of(cfs_rq))))
> +		return;
> +
>  	/*
>  	 * For migration heavy workloads, access to tg->load_avg can be
>  	 * unbound. Limit the update rate to at most once per ms.
> @@ -4116,6 +4120,49 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
>  	}
>  }
>  
> +static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
> +{
> +	long delta;
> +	u64 now;
> +
> +	/*
> +	 * No need to update load_avg for root_task_group as it is not used.
> +	 */
> +	if (cfs_rq->tg == &root_task_group)
> +		return;
> +
> +	now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
> +	delta = 0 - cfs_rq->tg_load_avg_contrib;
> +	atomic_long_add(delta, &cfs_rq->tg->load_avg);

Why not:

atomic_long_sub(cfs_rq->tg_load_avg_contrib, &cfs_rq->tg->load_avg) w/o
the local `long delta`?
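
For reference, that simplification would leave the body of clear_tg_load_avg()
looking roughly like this (a sketch only, rest of the function unchanged):

	now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
	/* drop this cfs_rq's remaining contribution in a single step */
	atomic_long_sub(cfs_rq->tg_load_avg_contrib, &cfs_rq->tg->load_avg);
	cfs_rq->tg_load_avg_contrib = 0;
	cfs_rq->last_update_tg_load_avg = now;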

> +	cfs_rq->tg_load_avg_contrib = 0;
> +	cfs_rq->last_update_tg_load_avg = now;
> +}
> +
> +/* cpu offline callback */
> +static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
> +{
> +	struct task_group *tg;
> +
> +	lockdep_assert_rq_held(rq);
> +
> +	/*
> +	 * The rq clock has already been updated in the
> +	 * set_rq_offline(), so we should skip updating
> +	 * the rq clock again in unthrottle_cfs_rq().

This comment seems misleading here since it appears to be tailored to
unthrottle_offline_cfs_rqs(), the other CPU offline callback.

> +	 */
> +	rq_clock_start_loop_update(rq);
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(tg, &task_groups, list) {
> +		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> +
> +		clear_tg_load_avg(cfs_rq);
> +	}
> +	rcu_read_unlock();
> +
> +	rq_clock_stop_loop_update(rq);
> +}
> +
>  /*
>   * Called within set_task_rq() right before setting a task's CPU. The
>   * caller only guarantees p->pi_lock is held; no other assumptions,
> @@ -4412,6 +4459,8 @@ static inline bool skip_blocked_update(struct sched_entity *se)
>  
>  static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
>  
> +static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {}
> +
>  static inline int propagate_entity_load_avg(struct sched_entity *se)
>  {
>  	return 0;
> @@ -12446,6 +12495,9 @@ static void rq_offline_fair(struct rq *rq)
>  
>  	/* Ensure any throttled groups are reachable by pick_next_task */
>  	unthrottle_offline_cfs_rqs(rq);
> +
> +	/* Ensure that we remove rq contribution to group share */
> +	clear_tg_offline_cfs_rqs(rq);
>  }
>  
>  #endif /* CONFIG_SMP */
  

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bcea3d55d95d..4b09237d24b9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4100,6 +4100,10 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 	if (cfs_rq->tg == &root_task_group)
 		return;
 
+	/* rq has been offline and doesn't contribute anymore to the share */
+	if (!cpu_active(cpu_of(rq_of(cfs_rq))))
+		return;
+
 	/*
 	 * For migration heavy workloads, access to tg->load_avg can be
 	 * unbound. Limit the update rate to at most once per ms.
@@ -4116,6 +4120,49 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 	}
 }
 
+static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
+{
+	long delta;
+	u64 now;
+
+	/*
+	 * No need to update load_avg for root_task_group as it is not used.
+	 */
+	if (cfs_rq->tg == &root_task_group)
+		return;
+
+	now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+	delta = 0 - cfs_rq->tg_load_avg_contrib;
+	atomic_long_add(delta, &cfs_rq->tg->load_avg);
+	cfs_rq->tg_load_avg_contrib = 0;
+	cfs_rq->last_update_tg_load_avg = now;
+}
+
+/* cpu offline callback */
+static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
+{
+	struct task_group *tg;
+
+	lockdep_assert_rq_held(rq);
+
+	/*
+	 * The rq clock has already been updated in the
+	 * set_rq_offline(), so we should skip updating
+	 * the rq clock again in unthrottle_cfs_rq().
+	 */
+	rq_clock_start_loop_update(rq);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+		clear_tg_load_avg(cfs_rq);
+	}
+	rcu_read_unlock();
+
+	rq_clock_stop_loop_update(rq);
+}
+
 /*
  * Called within set_task_rq() right before setting a task's CPU. The
  * caller only guarantees p->pi_lock is held; no other assumptions,
@@ -4412,6 +4459,8 @@ static inline bool skip_blocked_update(struct sched_entity *se)
 
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
 
+static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {}
+
 static inline int propagate_entity_load_avg(struct sched_entity *se)
 {
 	return 0;
@@ -12446,6 +12495,9 @@ static void rq_offline_fair(struct rq *rq)
 
 	/* Ensure any throttled groups are reachable by pick_next_task */
 	unthrottle_offline_cfs_rqs(rq);
+
+	/* Ensure that we remove rq contribution to group share */
+	clear_tg_offline_cfs_rqs(rq);
 }
 
 #endif /* CONFIG_SMP */