[v4,1/2] sched/fair: Add EAS checks before updating overutilized

Message ID 20240301151725.874604-2-sshegde@linux.ibm.com
State New
Series sched/fair: Limit access to overutilized

Commit Message

Shrikanth Hegde March 1, 2024, 3:17 p.m. UTC
The overutilized field of the root domain is only used by EAS (energy aware
scheduler) to decide whether to do load balancing or not. It is not used if
EAS is not possible.

Currently enqueue_task_fair and task_tick_fair access, and sometimes update,
this field. In update_sd_lb_stats it is updated often. This causes cache
contention due to true sharing and burns a lot of cycles. overload and
overutilized are part of the same cacheline, so updating overutilized often
invalidates the cacheline, which in turn slows down accesses to overload due
to false sharing. Hence add an EAS check before accessing/updating this field.
The EAS check is either optimized out at compile time or implemented as a
static branch, so it shouldn't cost much.
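
For reference, sched_energy_enabled() is roughly the following helper from
kernel/sched/sched.h: a static branch when energy model support is built in,
and a compile-time constant false otherwise:

	#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
	DECLARE_STATIC_KEY_FALSE(sched_energy_present);

	static inline bool sched_energy_enabled(void)
	{
		return static_branch_unlikely(&sched_energy_present);
	}
	#else
	static inline bool sched_energy_enabled(void)
	{
		return false;
	}
	#endif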

With the patch, neither enqueue_task_fair nor newidle_balance shows up as a
hot routine in the perf profile.

6.8-rc4:
7.18%  swapper          [kernel.vmlinux]              [k] enqueue_task_fair
6.78%  s                [kernel.vmlinux]              [k] newidle_balance
+patch:
0.14%  swapper          [kernel.vmlinux]              [k] enqueue_task_fair
0.00%  swapper          [kernel.vmlinux]              [k] newidle_balance

Minor change: trace_sched_overutilized_tp expects its second argument to be
bool, so do an int to bool conversion for it.

Fixes: 2802bf3cd936 ("sched/fair: Add over-utilization/tipping point indicator")
Reviewed-by: Srikar Dronamraju <srikar@linux.ibm.com>
Signed-off-by: Shrikanth Hegde <sshegde@linux.ibm.com>
---
 kernel/sched/fair.c | 49 +++++++++++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 19 deletions(-)

--
2.39.3
  

Comments

Qais Yousef March 3, 2024, 6:50 p.m. UTC | #1
On 03/01/24 20:47, Shrikanth Hegde wrote:
> Overutilized field of root domain is only used for EAS(energy aware scheduler)
> to decide whether to do load balance or not. It is not used if EAS
> not possible.
> 
> Currently enqueue_task_fair and task_tick_fair accesses, sometime updates
> this field. In update_sd_lb_stats it is updated often. This causes cache
> contention due to true sharing and burns a lot of cycles. overload and
> overutilized are part of the same cacheline. Updating it often invalidates
> the cacheline. That causes access  to overload to slow down due to
> false sharing. Hence add EAS check before accessing/updating this field.
> EAS check is optimized at compile time or it is a static branch.
> Hence it shouldn't cost much.
> 
> With the patch, both enqueue_task_fair and newidle_balance don't show
> up as hot routines in perf profile.
> 
> 6.8-rc4:
> 7.18%  swapper          [kernel.vmlinux]              [k] enqueue_task_fair
> 6.78%  s                [kernel.vmlinux]              [k] newidle_balance
> +patch:
> 0.14%  swapper          [kernel.vmlinux]              [k] enqueue_task_fair
> 0.00%  swapper          [kernel.vmlinux]              [k] newidle_balance
> 
> Minor change: trace_sched_overutilized_tp expect that second argument to
> be bool. So do a int to bool conversion for that.
> 
> Fixes: 2802bf3cd936 ("sched/fair: Add over-utilization/tipping point indicator")
> Reviewed-by: Srikar Dronamraju <srikar@linux.ibm.com>
> Signed-off-by: Shrikanth Hegde <sshegde@linux.ibm.com>
> ---
>  kernel/sched/fair.c | 49 +++++++++++++++++++++++++++------------------
>  1 file changed, 30 insertions(+), 19 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6a16129f9a5c..a71f8a1506e4 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6670,15 +6670,29 @@ static inline bool cpu_overutilized(int cpu)
>  	return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
>  }
> 
> -static inline void update_overutilized_status(struct rq *rq)
> +static inline void set_rd_overutilized_status(struct root_domain *rd,
> +					      unsigned int status)
>  {
> -	if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
> -		WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
> -		trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
> -	}

Can we add

	if (!sched_energy_enabled())
		return;

here and avoid sprinkling the condition in other various places instead?
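
i.e. roughly something like this (just a sketch of the suggested placement,
untested):

	static inline void set_rd_overutilized_status(struct root_domain *rd,
						      unsigned int status)
	{
		if (!sched_energy_enabled())
			return;

		WRITE_ONCE(rd->overutilized, status);
		trace_sched_overutilized_tp(rd, !!status);
	}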

> +	WRITE_ONCE(rd->overutilized, status);
> +	trace_sched_overutilized_tp(rd, !!status);
> +}
> +
> +static inline void check_update_overutilized_status(struct rq *rq)
> +{
> +	/*
> +	 * overutilized field is used for load balancing decisions only
> +	 * if energy aware scheduler is being used
> +	 */

nit: I think this comment is unnecessary but I don't mind keeping it

> +	if (!sched_energy_enabled())
> +		return;
> +
> +	if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu))
> +		set_rd_overutilized_status(rq->rd, SG_OVERUTILIZED);
>  }
>  #else
> -static inline void update_overutilized_status(struct rq *rq) { }
> +static inline void check_update_overutilized_status(struct rq *rq) { }
> +static inline void set_rd_overutilized_status(struct root_domain *rd,
> +					      unsigned int status) { }
>  #endif
> 
>  /* Runqueue only has SCHED_IDLE tasks enqueued */
> @@ -6779,7 +6793,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  	 * and the following generally works well enough in practice.
>  	 */
>  	if (!task_new)
> -		update_overutilized_status(rq);
> +		check_update_overutilized_status(rq);
> 
>  enqueue_throttle:
>  	assert_list_leaf_cfs_rq(rq);
> @@ -9902,7 +9916,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
>  		if (nr_running > 1)
>  			*sg_status |= SG_OVERLOAD;
> 
> -		if (cpu_overutilized(i))
> +		if (sched_energy_enabled() && cpu_overutilized(i))

I think we can drop sched_energy_enable() here if we add it to
set_rd_overutilized_status()

>  			*sg_status |= SG_OVERUTILIZED;
> 
>  #ifdef CONFIG_NUMA_BALANCING
> @@ -10596,19 +10610,16 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>  		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
> 
>  	if (!env->sd->parent) {
> -		struct root_domain *rd = env->dst_rq->rd;
> -
>  		/* update overload indicator if we are at root domain */
> -		WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
> +		WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD);
> 
>  		/* Update over-utilization (tipping point, U >= 0) indicator */
> -		WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
> -		trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
> -	} else if (sg_status & SG_OVERUTILIZED) {
> -		struct root_domain *rd = env->dst_rq->rd;
> -
> -		WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
> -		trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
> +		if (sched_energy_enabled()) {

ditto

> +			set_rd_overutilized_status(env->dst_rq->rd,
> +						   sg_status & SG_OVERUTILIZED);
> +		}
> +	} else if (sched_energy_enabled() && (sg_status & SG_OVERUTILIZED)) {

ditto

> +		set_rd_overutilized_status(env->dst_rq->rd, SG_OVERUTILIZED);
>  	}
> 
>  	update_idle_cpu_scan(env, sum_util);
> @@ -12609,7 +12620,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
>  		task_tick_numa(rq, curr);
> 
>  	update_misfit_status(curr, rq);
> -	update_overutilized_status(task_rq(curr));
> +	check_update_overutilized_status(task_rq(curr));
> 
>  	task_tick_core(rq, curr);
>  }
> --
> 2.39.3
>
  
Shrikanth Hegde March 4, 2024, 8:24 a.m. UTC | #2
On 3/4/24 12:20 AM, Qais Yousef wrote:
> On 03/01/24 20:47, Shrikanth Hegde wrote:
>> Overutilized field of root domain is only used for EAS(energy aware scheduler)

[...]


Hi Qais, Thanks for taking a look. 

>> ---
>>  kernel/sched/fair.c | 49 +++++++++++++++++++++++++++------------------
>>  1 file changed, 30 insertions(+), 19 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 6a16129f9a5c..a71f8a1506e4 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -6670,15 +6670,29 @@ static inline bool cpu_overutilized(int cpu)
>>  	return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
>>  }
>>
>> -static inline void update_overutilized_status(struct rq *rq)
>> +static inline void set_rd_overutilized_status(struct root_domain *rd,
>> +					      unsigned int status)
>>  {
>> -	if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
>> -		WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
>> -		trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
>> -	}
> 
> Can we add
> 
> 	if (!sched_energy_enabled())
> 		return;

This is very close to what I had till v2. But it was pointed out that it
would end up calling sched_energy_enabled twice in check_update_overutilized_status.
In check_update_overutilized_status it is better to avoid accessing
overutilized and computing cpu_overutilized if EAS is not enabled.

I am okay with either version. Keeping sched_energy_enabled in set_rd_overutilized_status
would be less code and more readable, but it would call sched_energy_enabled twice.
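
To illustrate, this is roughly the call path that would end up checking
sched_energy_enabled() twice if the check also lived in
set_rd_overutilized_status() (only a sketch, not what this patch does):

	static inline void set_rd_overutilized_status(struct root_domain *rd,
						      unsigned int status)
	{
		if (!sched_energy_enabled())	/* second check on this path */
			return;

		WRITE_ONCE(rd->overutilized, status);
		trace_sched_overutilized_tp(rd, !!status);
	}

	static inline void check_update_overutilized_status(struct rq *rq)
	{
		if (!sched_energy_enabled())	/* first check */
			return;

		if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu))
			set_rd_overutilized_status(rq->rd, SG_OVERUTILIZED);
	}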

Dietmar, Pierre, 
Could you please provide your inputs here? 


> 
> here and avoid sprinkling the condition in other various places instead?
> 
>> +	WRITE_ONCE(rd->overutilized, status);
>> +	trace_sched_overutilized_tp(rd, !!status);
>> +}
>> +
>> +static inline void check_update_overutilized_status(struct rq *rq)
>> +{
>> +	/*
>> +	 * overutilized field is used for load balancing decisions only
>> +	 * if energy aware scheduler is being used
>> +	 */
> 
> nit: I think this comment is unnecessary but I don't mind keeping it
> 
>> +	if (!sched_energy_enabled())
>> +		return;
>> +
>> +	if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu))
>> +		set_rd_overutilized_status(rq->rd, SG_OVERUTILIZED);
>>  }
>>  #else
>> -static inline void update_overutilized_status(struct rq *rq) { }
>> +static inline void check_update_overutilized_status(struct rq *rq) { }
>> +static inline void set_rd_overutilized_status(struct root_domain *rd,
>> +					      unsigned int status) { }
>>  #endif
>>
>>  /* Runqueue only has SCHED_IDLE tasks enqueued */
>> @@ -6779,7 +6793,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>  	 * and the following generally works well enough in practice.
>>  	 */
>>  	if (!task_new)
>> -		update_overutilized_status(rq);
>> +		check_update_overutilized_status(rq);
>>
>>  enqueue_throttle:
>>  	assert_list_leaf_cfs_rq(rq);
>> @@ -9902,7 +9916,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
>>  		if (nr_running > 1)
>>  			*sg_status |= SG_OVERLOAD;
>>
>> -		if (cpu_overutilized(i))
>> +		if (sched_energy_enabled() && cpu_overutilized(i))
> 
> I think we can drop sched_energy_enable() here if we add it to
> set_rd_overutilized_status()

We can avoid an additional call to cpu_overutilized this way, so we should keep it.
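
i.e. with the check kept here, cpu_overutilized(i) is not evaluated at all
when EAS is disabled, thanks to short-circuit evaluation:

	if (sched_energy_enabled() && cpu_overutilized(i))
		*sg_status |= SG_OVERUTILIZED;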

> 
>>  			*sg_status |= SG_OVERUTILIZED;
>>
>>  #ifdef CONFIG_NUMA_BALANCING
>> @@ -10596,19 +10610,16 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>>  		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
>>
>>  	if (!env->sd->parent) {
>> -		struct root_domain *rd = env->dst_rq->rd;
>> -
>>  		/* update overload indicator if we are at root domain */
>> -		WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
>> +		WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD);
>>
>>  		/* Update over-utilization (tipping point, U >= 0) indicator */
>> -		WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
>> -		trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
>> -	} else if (sg_status & SG_OVERUTILIZED) {
>> -		struct root_domain *rd = env->dst_rq->rd;
>> -
>> -		WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
>> -		trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
>> +		if (sched_energy_enabled()) {
> 
> ditto

First comment would apply for these two.

>> +			set_rd_overutilized_status(env->dst_rq->rd,
>> +						   sg_status & SG_OVERUTILIZED);
>> +		}
>> +	} else if (sched_energy_enabled() && (sg_status & SG_OVERUTILIZED)) {
> 
> ditto
> 
>> +		set_rd_overutilized_status(env->dst_rq->rd, SG_OVERUTILIZED);
>>  	}
>>
>>  	update_idle_cpu_scan(env, sum_util);
>> @@ -12609,7 +12620,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
>>  		task_tick_numa(rq, curr);
>>
>>  	update_misfit_status(curr, rq);
>> -	update_overutilized_status(task_rq(curr));
>> +	check_update_overutilized_status(task_rq(curr));
>>
>>  	task_tick_core(rq, curr);
>>  }
>> --
>> 2.39.3
>>
  

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6a16129f9a5c..a71f8a1506e4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6670,15 +6670,29 @@  static inline bool cpu_overutilized(int cpu)
 	return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
 }

-static inline void update_overutilized_status(struct rq *rq)
+static inline void set_rd_overutilized_status(struct root_domain *rd,
+					      unsigned int status)
 {
-	if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
-		WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
-		trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
-	}
+	WRITE_ONCE(rd->overutilized, status);
+	trace_sched_overutilized_tp(rd, !!status);
+}
+
+static inline void check_update_overutilized_status(struct rq *rq)
+{
+	/*
+	 * overutilized field is used for load balancing decisions only
+	 * if energy aware scheduler is being used
+	 */
+	if (!sched_energy_enabled())
+		return;
+
+	if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu))
+		set_rd_overutilized_status(rq->rd, SG_OVERUTILIZED);
 }
 #else
-static inline void update_overutilized_status(struct rq *rq) { }
+static inline void check_update_overutilized_status(struct rq *rq) { }
+static inline void set_rd_overutilized_status(struct root_domain *rd,
+					      unsigned int status) { }
 #endif

 /* Runqueue only has SCHED_IDLE tasks enqueued */
@@ -6779,7 +6793,7 @@  enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	 * and the following generally works well enough in practice.
 	 */
 	if (!task_new)
-		update_overutilized_status(rq);
+		check_update_overutilized_status(rq);

 enqueue_throttle:
 	assert_list_leaf_cfs_rq(rq);
@@ -9902,7 +9916,7 @@  static inline void update_sg_lb_stats(struct lb_env *env,
 		if (nr_running > 1)
 			*sg_status |= SG_OVERLOAD;

-		if (cpu_overutilized(i))
+		if (sched_energy_enabled() && cpu_overutilized(i))
 			*sg_status |= SG_OVERUTILIZED;

 #ifdef CONFIG_NUMA_BALANCING
@@ -10596,19 +10610,16 @@  static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);

 	if (!env->sd->parent) {
-		struct root_domain *rd = env->dst_rq->rd;
-
 		/* update overload indicator if we are at root domain */
-		WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
+		WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD);

 		/* Update over-utilization (tipping point, U >= 0) indicator */
-		WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
-		trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
-	} else if (sg_status & SG_OVERUTILIZED) {
-		struct root_domain *rd = env->dst_rq->rd;
-
-		WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
-		trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
+		if (sched_energy_enabled()) {
+			set_rd_overutilized_status(env->dst_rq->rd,
+						   sg_status & SG_OVERUTILIZED);
+		}
+	} else if (sched_energy_enabled() && (sg_status & SG_OVERUTILIZED)) {
+		set_rd_overutilized_status(env->dst_rq->rd, SG_OVERUTILIZED);
 	}

 	update_idle_cpu_scan(env, sum_util);
@@ -12609,7 +12620,7 @@  static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		task_tick_numa(rq, curr);

 	update_misfit_status(curr, rq);
-	update_overutilized_status(task_rq(curr));
+	check_update_overutilized_status(task_rq(curr));

 	task_tick_core(rq, curr);
 }