[v3,1/6] sched/fair: Determine active load balance for SMT sched groups

Message ID e24f35d142308790f69be65930b82794ef6658a2.1688770494.git.tim.c.chen@linux.intel.com
State New
Series Enable Cluster Scheduling for x86 Hybrid CPUs

Commit Message

Tim Chen July 7, 2023, 10:57 p.m. UTC
  From: Tim C Chen <tim.c.chen@linux.intel.com>

On hybrid CPUs with scheduling cluster enabled, we will need to
consider balancing between the SMT CPU cluster and the Atom core cluster.

Below shows such a hybrid x86 CPU with 4 big cores and 8 Atom cores.
Each scheduling cluster spans an L2 cache.

          --L2-- --L2-- --L2-- --L2-- ----L2----- -----L2------
          [0, 1] [2, 3] [4, 5] [6, 7] [8 9 10 11] [12 13 14 15]
          Big    Big    Big    Big    Atom        Atom
          core   core   core   core   Module      Module

If the busiest group is a big core with both SMT CPUs busy, we should
do active load balancing if the destination group has idle CPU cores.  This
condition is considered by asym_active_balance() during load balancing, but
not when looking for the busiest group and computing the load imbalance.
Add this consideration to find_busiest_group() and calculate_imbalance().

In addition, update the logic that determines the busier group when one
group is SMT and the other is non-SMT, and both groups are partially busy
with idle CPUs.  The busier group should be the one with idle cores rather
than the one with a busy SMT CPU.  We do not want to mark the SMT group as
the busiest, pull its only task off the SMT CPU, and cause the whole core
to go empty.

Otherwise, suppose that while searching for the busiest group we first
encounter an SMT group with 1 task and set it as the busiest.  The
destination group is an Atom cluster with 1 task.  When we next encounter
an Atom cluster group with 3 tasks, we will not pick it over the SMT
group, even though we should.  As a result, we do not load balance the
busier Atom cluster (with 3 tasks) towards the local Atom cluster (with
1 task).  It also doesn't make sense to pick the 1-task SMT group as the
busier group: we should not pull the task off the SMT core towards the
1-task Atom cluster and leave the SMT core completely empty.
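
Below is a minimal standalone sketch of the intended pick decision for this
example (simplified stand-ins for the scheduler structures, not kernel code;
the real change lives in the group_has_spare case of
update_sd_pick_busiest() in the patch):

#include <stdbool.h>
#include <stdio.h>

struct grp {
	bool smt;			/* group has SD_SHARE_CPUCAPACITY */
	unsigned int nr_running;	/* tasks running in the group */
};

/* Should @candidate replace @busiest when both have spare capacity? */
static bool pick_candidate(const struct grp *busiest,
			   const struct grp *candidate)
{
	/*
	 * SMT vs non-SMT: pick the candidate unless it is an SMT group
	 * with at most one task (pulling that task would empty a core).
	 */
	if (busiest->smt != candidate->smt)
		return !(candidate->smt && candidate->nr_running <= 1);

	/* Same SMT-ness: fall back to comparing how loaded the groups are. */
	return candidate->nr_running > busiest->nr_running;
}

int main(void)
{
	struct grp smt1  = { .smt = true,  .nr_running = 1 };	/* big core */
	struct grp atom3 = { .smt = false, .nr_running = 3 };	/* Atom cluster */

	/* The 3-task Atom cluster should win over the 1-task SMT group. */
	printf("%d\n", pick_candidate(&smt1, &atom3));	/* prints 1 */
	return 0;
}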

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
 kernel/sched/fair.c | 80 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 77 insertions(+), 3 deletions(-)
  

Comments

Shrikanth Hegde July 14, 2023, 1:06 p.m. UTC | #1
On 7/8/23 4:27 AM, Tim Chen wrote:
> From: Tim C Chen <tim.c.chen@linux.intel.com>
> 

Hi Tim.  Sorry for the delayed response.

> On hybrid CPUs with scheduling cluster enabled, we will need to
> consider balancing between SMT CPU cluster, and Atom core cluster.
> 
> Below shows such a hybrid x86 CPU with 4 big cores and 8 atom cores.
> Each scheduling cluster span a L2 cache.
> 
>           --L2-- --L2-- --L2-- --L2-- ----L2---- -----L2------
>           [0, 1] [2, 3] [4, 5] [5, 6] [7 8 9 10] [11 12 13 14]
>           Big    Big    Big    Big    Atom       Atom
>           core   core   core   core   Module     Module
> 
> If the busiest group is a big core with both SMT CPUs busy, we should
> active load balance if destination group has idle CPU cores.  Such
> condition is considered by asym_active_balance() in load balancing but not
> considered when looking for busiest group and computing load imbalance.
> Add this consideration in find_busiest_group() and calculate_imbalance().
> 
> In addition, update the logic determining the busier group when one group
> is SMT and the other group is non SMT but both groups are partially busy
> with idle CPU. The busier group should be the group with idle cores rather
> than the group with one busy SMT CPU.  We do not want to make the SMT group
> the busiest one to pull the only task off SMT CPU and causing the whole core to
> go empty.
> 
> Otherwise suppose in the search for the busiest group, we first encounter
> an SMT group with 1 task and set it as the busiest.  The destination
> group is an atom cluster with 1 task and we next encounter an atom
> cluster group with 3 tasks, we will not pick this atom cluster over the
> SMT group, even though we should.  As a result, we do not load balance
> the busier Atom cluster (with 3 tasks) towards the local atom cluster
> (with 1 task).  And it doesn't make sense to pick the 1 task SMT group
> as the busier group as we also should not pull task off the SMT towards
> the 1 task atom cluster and make the SMT core completely empty.
> 
> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
> ---
>  kernel/sched/fair.c | 80 +++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 77 insertions(+), 3 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 87317634fab2..f636d6c09dc6 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8279,6 +8279,11 @@ enum group_type {
>  	 * more powerful CPU.
>  	 */
>  	group_misfit_task,
> +	/*
> +	 * Balance SMT group that's fully busy. Can benefit from migration
> +	 * a task on SMT with busy sibling to another CPU on idle core.
> +	 */
> +	group_smt_balance,

Could you please explain what group_smt_balance does differently? AFAIU it is doing the same
thing as group_fully_busy, but for the domain one level above the SMT domain, right?


>  	/*
>  	 * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
>  	 * and the task should be migrated to it instead of running on the
> @@ -8987,6 +8992,7 @@ struct sg_lb_stats {
>  	unsigned int group_weight;
>  	enum group_type group_type;
>  	unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
> +	unsigned int group_smt_balance;  /* Task on busy SMT be moved */
>  	unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
>  #ifdef CONFIG_NUMA_BALANCING
>  	unsigned int nr_numa_running;
> @@ -9260,6 +9266,9 @@ group_type group_classify(unsigned int imbalance_pct,
>  	if (sgs->group_asym_packing)
>  		return group_asym_packing;
> 
> +	if (sgs->group_smt_balance)
> +		return group_smt_balance;
> +
>  	if (sgs->group_misfit_task_load)
>  		return group_misfit_task;
> 
> @@ -9333,6 +9342,36 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds,  struct sg_lb_stats *sgs
>  	return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
>  }
> 
> +/* One group has more than one SMT CPU while the other group does not */
> +static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
> +				    struct sched_group *sg2)
> +{
> +	if (!sg1 || !sg2)
> +		return false;
> +
> +	return (sg1->flags & SD_SHARE_CPUCAPACITY) !=
> +		(sg2->flags & SD_SHARE_CPUCAPACITY);
> +}
> +
> +static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
> +			       struct sched_group *group)
> +{
> +	if (env->idle == CPU_NOT_IDLE)
> +		return false;
> +
> +	/*
> +	 * For SMT source group, it is better to move a task
> +	 * to a CPU that doesn't have multiple tasks sharing its CPU capacity.
> +	 * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY
> +	 * will not be on.
> +	 */
> +	if (group->flags & SD_SHARE_CPUCAPACITY &&
> +	    sgs->sum_h_nr_running > 1)
> +		return true;
> +

Consider symmetric platforms which have SMT4, such as power10.
We have a topology like below; multiple such MCs will form a DIE (PKG).


[0 2 4 6][1 3 5 7][8 10 12 14][9 11 13 15]
[--SMT--][--SMT--][----SMT---][---SMT----]
[--sg1--][--sg1--][---sg1----][---sg1----]
[--------------MC------------------------]

In the SMT4 case, any group which has 2 or more tasks will be marked as
group_smt_balance.  Previously, if that group had 2 or 3 tasks, it would have
been marked as group_has_spare.  Since all the groups have SMT, that means the
behavior would be the same as fully busy, right? That can cause some
corner cases. No?

One example: let's say sg1 has 4 tasks, and sg2 has 0 tasks and is trying to do
load balance.  Previously the imbalance would have been 2; now it would be 1.
It would still get balanced in a subsequent load balance, though.
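
To make the arithmetic concrete, here is a standalone sketch (simplified
field names, not the actual kernel helpers) of how the two classifications
arrive at different imbalances for this example:

#include <stdio.h>

struct sg_stats {
	unsigned int idle_cpus;
	unsigned int sum_nr_running;
};

/* group_has_spare path (roughly): move half of the idle-CPU difference */
static unsigned int imbalance_has_spare(const struct sg_stats *local,
					const struct sg_stats *busiest)
{
	return (local->idle_cpus - busiest->idle_cpus) >> 1;
}

/* group_smt_balance path in this patch: always move a single task */
static unsigned int imbalance_smt_balance(void)
{
	return 1;
}

int main(void)
{
	/* sg2 (local, idle) vs sg1 (busiest, 4 tasks on an SMT4 core) */
	struct sg_stats local   = { .idle_cpus = 4, .sum_nr_running = 0 };
	struct sg_stats busiest = { .idle_cpus = 0, .sum_nr_running = 4 };

	printf("group_has_spare imbalance:   %u\n",
	       imbalance_has_spare(&local, &busiest));	/* 2 */
	printf("group_smt_balance imbalance: %u\n",
	       imbalance_smt_balance());		/* 1 */
	return 0;
}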



> +	return false;
> +}
> +
>  static inline bool
>  sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
>  {
> @@ -9425,6 +9464,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
>  		sgs->group_asym_packing = 1;
>  	}
> 
> +	/* Check for loaded SMT group to be balanced to dst CPU */
> +	if (!local_group && smt_balance(env, sgs, group))
> +		sgs->group_smt_balance = 1;
> +
>  	sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
> 
>  	/* Computing avg_load makes sense only when group is overloaded */
> @@ -9509,6 +9552,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>  			return false;
>  		break;
> 
> +	case group_smt_balance:
>  	case group_fully_busy:
>  		/*
>  		 * Select the fully busy group with highest avg_load. In
> @@ -9537,6 +9581,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>  		break;
> 
>  	case group_has_spare:
> +		/*
> +		 * Do not pick sg with SMT CPUs over sg with pure CPUs,
> +		 * as we do not want to pull task off SMT core with one task
> +		 * and make the core idle.
> +		 */
> +		if (smt_vs_nonsmt_groups(sds->busiest, sg)) {
> +			if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1)
> +				return false;
> +			else
> +				return true;
> +		}
> +
>  		/*
>  		 * Select not overloaded group with lowest number of idle cpus
>  		 * and highest number of running tasks. We could also compare
> @@ -9733,6 +9789,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
> 
>  	case group_imbalanced:
>  	case group_asym_packing:
> +	case group_smt_balance:
>  		/* Those types are not used in the slow wakeup path */
>  		return false;
> 
> @@ -9864,6 +9921,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
> 
>  	case group_imbalanced:
>  	case group_asym_packing:
> +	case group_smt_balance:
>  		/* Those type are not used in the slow wakeup path */
>  		return NULL;
> 
> @@ -10118,6 +10176,13 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
>  		return;
>  	}
> 
> +	if (busiest->group_type == group_smt_balance) {
> +		/* Reduce number of tasks sharing CPU capacity */
> +		env->migration_type = migrate_task;
> +		env->imbalance = 1;
> +		return;
> +	}
> +
>  	if (busiest->group_type == group_imbalanced) {
>  		/*
>  		 * In the group_imb case we cannot rely on group-wide averages
> @@ -10363,16 +10428,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
>  		goto force_balance;
> 
>  	if (busiest->group_type != group_overloaded) {
> -		if (env->idle == CPU_NOT_IDLE)
> +		if (env->idle == CPU_NOT_IDLE) {
>  			/*
>  			 * If the busiest group is not overloaded (and as a
>  			 * result the local one too) but this CPU is already
>  			 * busy, let another idle CPU try to pull task.
>  			 */
>  			goto out_balanced;
> +		}
> +
> +		if (busiest->group_type == group_smt_balance &&
> +		    smt_vs_nonsmt_groups(sds.local, sds.busiest)) {
> +			/* Let non SMT CPU pull from SMT CPU sharing with sibling */
> +			goto force_balance;
> +		}
> 
>  		if (busiest->group_weight > 1 &&
> -		    local->idle_cpus <= (busiest->idle_cpus + 1))
> +		    local->idle_cpus <= (busiest->idle_cpus + 1)) {
>  			/*
>  			 * If the busiest group is not overloaded
>  			 * and there is no imbalance between this and busiest
> @@ -10383,12 +10455,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
>  			 * there is more than 1 CPU per group.
>  			 */
>  			goto out_balanced;
> +		}
> 
> -		if (busiest->sum_h_nr_running == 1)
> +		if (busiest->sum_h_nr_running == 1) {
>  			/*
>  			 * busiest doesn't have any tasks waiting to run
>  			 */
>  			goto out_balanced;
> +		}
>  	}
> 
>  force_balance:
  
Tobias Huschle July 14, 2023, 2:53 p.m. UTC | #2
On 2023-07-08 00:57, Tim Chen wrote:
> From: Tim C Chen <tim.c.chen@linux.intel.com>
> 
> On hybrid CPUs with scheduling cluster enabled, we will need to
> consider balancing between SMT CPU cluster, and Atom core cluster.
> 
> Below shows such a hybrid x86 CPU with 4 big cores and 8 atom cores.
> Each scheduling cluster span a L2 cache.
> 
>           --L2-- --L2-- --L2-- --L2-- ----L2---- -----L2------
>           [0, 1] [2, 3] [4, 5] [5, 6] [7 8 9 10] [11 12 13 14]
>           Big    Big    Big    Big    Atom       Atom
>           core   core   core   core   Module     Module
> 
> If the busiest group is a big core with both SMT CPUs busy, we should
> active load balance if destination group has idle CPU cores.  Such
> condition is considered by asym_active_balance() in load balancing but 
> not
> considered when looking for busiest group and computing load imbalance.
> Add this consideration in find_busiest_group() and 
> calculate_imbalance().
> 
> In addition, update the logic determining the busier group when one 
> group
> is SMT and the other group is non SMT but both groups are partially 
> busy
> with idle CPU. The busier group should be the group with idle cores 
> rather
> than the group with one busy SMT CPU.  We do not want to make the SMT 
> group
> the busiest one to pull the only task off SMT CPU and causing the whole 
> core to
> go empty.
> 
> Otherwise suppose in the search for the busiest group, we first 
> encounter
> an SMT group with 1 task and set it as the busiest.  The destination
> group is an atom cluster with 1 task and we next encounter an atom
> cluster group with 3 tasks, we will not pick this atom cluster over the
> SMT group, even though we should.  As a result, we do not load balance
> the busier Atom cluster (with 3 tasks) towards the local atom cluster
> (with 1 task).  And it doesn't make sense to pick the 1 task SMT group
> as the busier group as we also should not pull task off the SMT towards
> the 1 task atom cluster and make the SMT core completely empty.
> 
> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
> ---
>  kernel/sched/fair.c | 80 +++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 77 insertions(+), 3 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 87317634fab2..f636d6c09dc6 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8279,6 +8279,11 @@ enum group_type {
>  	 * more powerful CPU.
>  	 */
>  	group_misfit_task,
> +	/*
> +	 * Balance SMT group that's fully busy. Can benefit from migration
> +	 * a task on SMT with busy sibling to another CPU on idle core.
> +	 */
> +	group_smt_balance,

Would it make sense to move smt_balance, s.t. we get:

group_has_spare < group_smt_balance < group_fully_busy

Conceptually this would be more intuitive to me, as the smt_balance
groups are busier than has_spare ones, but less busy than fully_busy
ones.

From a functional perspective I could also see some impact when
update_sd_pick_busiest() compares the group types. In that case we
would remove tasks from fully busy groups before moving them
from smt_balance groups. Not sure which way would be preferable
for increasing overall throughput.

Since smt_balance is only selected if the group has SMT, this
should still not pull the last task off of a non-SMT CPU.
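
For illustration, a tiny sketch (only three of the group types, assuming
the usual "higher group_type wins" comparison in update_sd_pick_busiest())
of how the enum position changes which group is considered busier:

#include <stdio.h>

/* Ordering as in the patch: smt_balance ranks above fully_busy. */
enum group_type_patch     { P_has_spare = 0, P_fully_busy,  P_smt_balance };

/* Ordering suggested above: smt_balance ranks below fully_busy. */
enum group_type_suggested { S_has_spare = 0, S_smt_balance, S_fully_busy };

int main(void)
{
	printf("patch ordering:     smt_balance %s fully_busy\n",
	       P_smt_balance > P_fully_busy ? "beats" : "loses to");
	printf("suggested ordering: smt_balance %s fully_busy\n",
	       S_smt_balance > S_fully_busy ? "beats" : "loses to");
	return 0;
}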

>  	/*
>  	 * SD_ASYM_PACKING only: One local CPU with higher capacity is 
> available,
>  	 * and the task should be migrated to it instead of running on the
> @@ -8987,6 +8992,7 @@ struct sg_lb_stats {
>  	unsigned int group_weight;
>  	enum group_type group_type;
>  	unsigned int group_asym_packing; /* Tasks should be moved to 
> preferred CPU */
> +	unsigned int group_smt_balance;  /* Task on busy SMT be moved */
>  	unsigned long group_misfit_task_load; /* A CPU has a task too big
> for its capacity */
>  #ifdef CONFIG_NUMA_BALANCING
>  	unsigned int nr_numa_running;
> @@ -9260,6 +9266,9 @@ group_type group_classify(unsigned int 
> imbalance_pct,
>  	if (sgs->group_asym_packing)
>  		return group_asym_packing;
> 
> +	if (sgs->group_smt_balance)
> +		return group_smt_balance;
> +
>  	if (sgs->group_misfit_task_load)
>  		return group_misfit_task;
> 
> @@ -9333,6 +9342,36 @@ sched_asym(struct lb_env *env, struct
> sd_lb_stats *sds,  struct sg_lb_stats *sgs
>  	return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
>  }
> 
> +/* One group has more than one SMT CPU while the other group does not 
> */
> +static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
> +				    struct sched_group *sg2)
> +{
> +	if (!sg1 || !sg2)
> +		return false;
> +
> +	return (sg1->flags & SD_SHARE_CPUCAPACITY) !=
> +		(sg2->flags & SD_SHARE_CPUCAPACITY);
> +}
> +
> +static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats 
> *sgs,
> +			       struct sched_group *group)
> +{
> +	if (env->idle == CPU_NOT_IDLE)
> +		return false;
> +
> +	/*
> +	 * For SMT source group, it is better to move a task
> +	 * to a CPU that doesn't have multiple tasks sharing its CPU 
> capacity.
> +	 * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY
> +	 * will not be on.
> +	 */
> +	if (group->flags & SD_SHARE_CPUCAPACITY &&
> +	    sgs->sum_h_nr_running > 1)
> +		return true;
> +
> +	return false;
> +}
> +
>  static inline bool
>  sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
>  {
> @@ -9425,6 +9464,10 @@ static inline void update_sg_lb_stats(struct 
> lb_env *env,
>  		sgs->group_asym_packing = 1;
>  	}
> 
> +	/* Check for loaded SMT group to be balanced to dst CPU */
> +	if (!local_group && smt_balance(env, sgs, group))
> +		sgs->group_smt_balance = 1;
> +
>  	sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
> 
>  	/* Computing avg_load makes sense only when group is overloaded */
> @@ -9509,6 +9552,7 @@ static bool update_sd_pick_busiest(struct lb_env 
> *env,
>  			return false;
>  		break;
> 
> +	case group_smt_balance:
>  	case group_fully_busy:
>  		/*
>  		 * Select the fully busy group with highest avg_load. In
> @@ -9537,6 +9581,18 @@ static bool update_sd_pick_busiest(struct lb_env 
> *env,
>  		break;
> 
>  	case group_has_spare:
> +		/*
> +		 * Do not pick sg with SMT CPUs over sg with pure CPUs,
> +		 * as we do not want to pull task off SMT core with one task
> +		 * and make the core idle.
> +		 */
> +		if (smt_vs_nonsmt_groups(sds->busiest, sg)) {
> +			if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1)
> +				return false;
> +			else
> +				return true;
> +		}
> +
>  		/*
>  		 * Select not overloaded group with lowest number of idle cpus
>  		 * and highest number of running tasks. We could also compare
> @@ -9733,6 +9789,7 @@ static bool update_pick_idlest(struct sched_group 
> *idlest,
> 
>  	case group_imbalanced:
>  	case group_asym_packing:
> +	case group_smt_balance:
>  		/* Those types are not used in the slow wakeup path */
>  		return false;
> 
> @@ -9864,6 +9921,7 @@ find_idlest_group(struct sched_domain *sd,
> struct task_struct *p, int this_cpu)
> 
>  	case group_imbalanced:
>  	case group_asym_packing:
> +	case group_smt_balance:
>  		/* Those type are not used in the slow wakeup path */
>  		return NULL;
> 
> @@ -10118,6 +10176,13 @@ static inline void calculate_imbalance(struct
> lb_env *env, struct sd_lb_stats *s
>  		return;
>  	}
> 
> +	if (busiest->group_type == group_smt_balance) {
> +		/* Reduce number of tasks sharing CPU capacity */
> +		env->migration_type = migrate_task;
> +		env->imbalance = 1;
> +		return;
> +	}
> +
>  	if (busiest->group_type == group_imbalanced) {
>  		/*
>  		 * In the group_imb case we cannot rely on group-wide averages
> @@ -10363,16 +10428,23 @@ static struct sched_group
> *find_busiest_group(struct lb_env *env)
>  		goto force_balance;
> 
>  	if (busiest->group_type != group_overloaded) {
> -		if (env->idle == CPU_NOT_IDLE)
> +		if (env->idle == CPU_NOT_IDLE) {
>  			/*
>  			 * If the busiest group is not overloaded (and as a
>  			 * result the local one too) but this CPU is already
>  			 * busy, let another idle CPU try to pull task.
>  			 */
>  			goto out_balanced;
> +		}
> +
> +		if (busiest->group_type == group_smt_balance &&
> +		    smt_vs_nonsmt_groups(sds.local, sds.busiest)) {
> +			/* Let non SMT CPU pull from SMT CPU sharing with sibling */
> +			goto force_balance;
> +		}
> 
>  		if (busiest->group_weight > 1 &&
> -		    local->idle_cpus <= (busiest->idle_cpus + 1))
> +		    local->idle_cpus <= (busiest->idle_cpus + 1)) {
>  			/*
>  			 * If the busiest group is not overloaded
>  			 * and there is no imbalance between this and busiest
> @@ -10383,12 +10455,14 @@ static struct sched_group
> *find_busiest_group(struct lb_env *env)
>  			 * there is more than 1 CPU per group.
>  			 */
>  			goto out_balanced;
> +		}
> 
> -		if (busiest->sum_h_nr_running == 1)
> +		if (busiest->sum_h_nr_running == 1) {
>  			/*
>  			 * busiest doesn't have any tasks waiting to run
>  			 */
>  			goto out_balanced;
> +		}
>  	}
> 
>  force_balance:
  
Tim Chen July 14, 2023, 11:05 p.m. UTC | #3
On Fri, 2023-07-14 at 18:36 +0530, Shrikanth Hegde wrote:

> 
> 
> If we consider symmetric platforms which have SMT4 such as power10. 
> we have a topology like below. multiple such MC will form DIE(PKG)
> 
> 
> [0 2 4 6][1 3 5 7][8 10 12 14][9 11 13 15]
> [--SMT--][--SMT--][----SMT---][---SMT----]
> [--sg1--][--sg1--][---sg1----][---sg1----]
> [--------------MC------------------------]
> 
> In case of SMT4, if there is any group which has 2 or more tasks, that 
> group will be marked as group_smt_balance. previously, if that group had 2
> or 3 tasks, it would have been marked as group_has_spare. Since all the groups have 
> SMT that means behavior would be same fully busy right? That can cause some 
> corner cases. No?

You raised a good point. I was looking at it from the SMT2
perspective, where group_smt_balance implies group_fully_busy.
That is no longer true for SMT4.

I am thinking of the following fix on top of the current patch
to take care of SMT4. Do you think this addresses the
concerns from you and Tobias?

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 294a662c9410..3fc8d3a3bd22 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9588,6 +9588,17 @@ static bool update_sd_pick_busiest(struct lb_env *env,
                break;
 
        case group_smt_balance:
+               /* no idle cpus on both groups handled by group_fully_busy below */
+               if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0) {
+                       if (sgs->idle_cpus > busiest->idle_cpus)
+                               return false;
+                       if (sgs->idle_cpus < busiest->idle_cpus)
+                               return true;
+                       if (sgs->sum_nr_running <= busiest_sum_nr_running)
+                               return false;
+                       else
+                               return true;
+               }


I will be on vacation for the next three weeks, so my response will be slow.

Tim

> 
> One example is Lets say sg1 has 4 tasks. and sg2 has 0 tasks and is trying to do 
> load balance. Previously imbalance would have been 2, instead now imbalance would be 1.
> But in subsequent lb it would be balanced.
> 
> 
> 
> > +	return false;
> > +}
> > +
> >  static inline bool
> >  sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
> >  {
> > @@ -9425,6 +9464,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> >  		sgs->group_asym_packing = 1;
> >  	}
> > 
> > +	/* Check for loaded SMT group to be balanced to dst CPU */
> > +	if (!local_group && smt_balance(env, sgs, group))
> > +		sgs->group_smt_balance = 1;
> > +
> >  	sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
> > 
> >  	/* Computing avg_load makes sense only when group is overloaded */
> > @@ -9509,6 +9552,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
> >  			return false;
> >  		break;
> > 
> > +	case group_smt_balance:
> >  	case group_fully_busy:
> >  		/*
> >  		 * Select the fully busy group with highest avg_load. In
> > @@ -9537,6 +9581,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,
> >  		break;
> > 
> >  	case group_has_spare:
> > +		/*
> > +		 * Do not pick sg with SMT CPUs over sg with pure CPUs,
> > +		 * as we do not want to pull task off SMT core with one task
> > +		 * and make the core idle.
> > +		 */
> > +		if (smt_vs_nonsmt_groups(sds->busiest, sg)) {
> > +			if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1)
> > +				return false;
> > +			else
> > +				return true;
> > +		}
> > +
> >  		/*
> >  		 * Select not overloaded group with lowest number of idle cpus
> >  		 * and highest number of running tasks. We could also compare
> > @@ -9733,6 +9789,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
> > 
> >  	case group_imbalanced:
> >  	case group_asym_packing:
> > +	case group_smt_balance:
> >  		/* Those types are not used in the slow wakeup path */
> >  		return false;
> > 
> > @@ -9864,6 +9921,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
> > 
> >  	case group_imbalanced:
> >  	case group_asym_packing:
> > +	case group_smt_balance:
> >  		/* Those type are not used in the slow wakeup path */
> >  		return NULL;
> > 
> > @@ -10118,6 +10176,13 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
> >  		return;
> >  	}
> > 
> > +	if (busiest->group_type == group_smt_balance) {
> > +		/* Reduce number of tasks sharing CPU capacity */
> > +		env->migration_type = migrate_task;
> > +		env->imbalance = 1;
> > +		return;
> > +	}
> > +
> >  	if (busiest->group_type == group_imbalanced) {
> >  		/*
> >  		 * In the group_imb case we cannot rely on group-wide averages
> > @@ -10363,16 +10428,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> >  		goto force_balance;
> > 
> >  	if (busiest->group_type != group_overloaded) {
> > -		if (env->idle == CPU_NOT_IDLE)
> > +		if (env->idle == CPU_NOT_IDLE) {
> >  			/*
> >  			 * If the busiest group is not overloaded (and as a
> >  			 * result the local one too) but this CPU is already
> >  			 * busy, let another idle CPU try to pull task.
> >  			 */
> >  			goto out_balanced;
> > +		}
> > +
> > +		if (busiest->group_type == group_smt_balance &&
> > +		    smt_vs_nonsmt_groups(sds.local, sds.busiest)) {
> > +			/* Let non SMT CPU pull from SMT CPU sharing with sibling */
> > +			goto force_balance;
> > +		}
> > 
> >  		if (busiest->group_weight > 1 &&
> > -		    local->idle_cpus <= (busiest->idle_cpus + 1))
> > +		    local->idle_cpus <= (busiest->idle_cpus + 1)) {
> >  			/*
> >  			 * If the busiest group is not overloaded
> >  			 * and there is no imbalance between this and busiest
> > @@ -10383,12 +10455,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> >  			 * there is more than 1 CPU per group.
> >  			 */
> >  			goto out_balanced;
> > +		}
> > 
> > -		if (busiest->sum_h_nr_running == 1)
> > +		if (busiest->sum_h_nr_running == 1) {
> >  			/*
> >  			 * busiest doesn't have any tasks waiting to run
> >  			 */
> >  			goto out_balanced;
> > +		}
> >  	}
> > 
> >  force_balance:
  
Tim Chen July 14, 2023, 11:29 p.m. UTC | #4
> > 
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 87317634fab2..f636d6c09dc6 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -8279,6 +8279,11 @@ enum group_type {
> >  	 * more powerful CPU.
> >  	 */
> >  	group_misfit_task,
> > +	/*
> > +	 * Balance SMT group that's fully busy. Can benefit from migration
> > +	 * a task on SMT with busy sibling to another CPU on idle core.
> > +	 */
> > +	group_smt_balance,
> 
> Would it make sense to move smt_balance?, s.t. we get:
> 
> group_has_spare < group_smt_balance < group_fully_busy
> 
> Conceptually I would be more intuitive to me like this as the
> smt_balance groups are more busy than has_spare ones, but less busy
> then fully_busy ones.
> 
>  From a functional perspective I could also see some impact when
> update_sd_pick_busiest compares the group types. In that case we
> would remove tasks from fully busy groups before moving them
> from smt_balance groups. Not sure which way would be to prefer
> to increase overall throughput.
> 
> Since smt_balance is only selected if the group has SMT, this
> should still not pull the last task off of a non-SMT CPU.
> 
> 

I think you have similar concerns to Shrikanth's on this patch.
Can you see if my fix to update_sd_pick_busiest() in my reply
to Shrikanth addresses what you have in mind?

Tim
  
Tim Chen July 15, 2023, 6:25 p.m. UTC | #5
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 294a662c9410..3fc8d3a3bd22 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9588,6 +9588,17 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>                 break;
>  
>         case group_smt_balance:
> +               /* no idle cpus on both groups handled by group_fully_busy below */
> +               if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0) {
> +                       if (sgs->idle_cpus > busiest->idle_cpus)
> +                               return false;
> +                       if (sgs->idle_cpus < busiest->idle_cpus)
> +                               return true;
> +                       if (sgs->sum_nr_running <= busiest_sum_nr_running)

typo: should be busiest->sum_nr_running

> +                               return false;
> +                       else
> +                               return true;
> +               }
> 

Tim
  
Shrikanth Hegde July 16, 2023, 7:36 p.m. UTC | #6
On 7/15/23 4:35 AM, Tim Chen wrote:
> On Fri, 2023-07-14 at 18:36 +0530, Shrikanth Hegde wrote:
> 
>>
>>
>> If we consider symmetric platforms which have SMT4 such as power10. 
>> we have a topology like below. multiple such MC will form DIE(PKG)
>>
>>
>> [0 2 4 6][1 3 5 7][8 10 12 14][9 11 13 15]
>> [--SMT--][--SMT--][----SMT---][---SMT----]
>> [--sg1--][--sg1--][---sg1----][---sg1----]
>> [--------------MC------------------------]
>>
>> In case of SMT4, if there is any group which has 2 or more tasks, that 
>> group will be marked as group_smt_balance. previously, if that group had 2
>> or 3 tasks, it would have been marked as group_has_spare. Since all the groups have 
>> SMT that means behavior would be same fully busy right? That can cause some 
>> corner cases. No?
> 
> You raised a good point. I was looking from SMT2
> perspective so group_smt_balance implies group_fully_busy.
> That is no longer true for SMT4.
> 
> I am thinking of the following fix on the current patch
> to take care of SMT4. Do you think this addresses

Thanks, Tim, for taking a look at it again.

Yes, I think this would address some of the corner cases.
Any SMT4 group having 2, 3 or 4 tasks will have smt_balance as the group type, and the busiest
one is the one with the fewest idle CPUs (same conditions as group_has_spare).




> concerns from you and Tobias?
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 294a662c9410..3fc8d3a3bd22 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9588,6 +9588,17 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>                 break;
>  
>         case group_smt_balance:
> +               /* no idle cpus on both groups handled by group_fully_busy below */
> +               if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0) {
> +                       if (sgs->idle_cpus > busiest->idle_cpus)
> +                               return false;
> +                       if (sgs->idle_cpus < busiest->idle_cpus)
> +                               return true;
> +                       if (sgs->sum_nr_running <= busiest_sum_nr_running)
> +                               return false;
> +                       else
> +                               return true;
> +               }
> 
> 
> I will be on vacation next three weeks so my response will be slow.
> 
> Tim
> 
>>

A small suggestion for the above code: add a break to avoid the compiler warning about the switch
case falling through, and the else case can be removed, since update_sd_pick_busiest() returns true by default.

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e5a75c76bcaa..ae364ac6f22e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9728,9 +9728,9 @@ static bool update_sd_pick_busiest(struct lb_env *env,
                                return true;
                        if (sgs->sum_nr_running <= busiest->sum_nr_running)
                                return false;
-                       else
-                               return true;
                }
+               break;
+
        case group_fully_busy:
                /*
                 * Select the fully busy group with highest avg_load. In



>> One example is Lets say sg1 has 4 tasks. and sg2 has 0 tasks and is trying to do 
>> load balance. Previously imbalance would have been 2, instead now imbalance would be 1.
>> But in subsequent lb it would be balanced.
>>
>>
>>
>>> +	return false;
>>> +}
>>> +
>>>  static inline bool
>>>  sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
>>>  {
>>> @@ -9425,6 +9464,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
>>>  		sgs->group_asym_packing = 1;
>>>  	}
>>>
>>> +	/* Check for loaded SMT group to be balanced to dst CPU */
>>> +	if (!local_group && smt_balance(env, sgs, group))
>>> +		sgs->group_smt_balance = 1;
>>> +
>>>  	sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
>>>
>>>  	/* Computing avg_load makes sense only when group is overloaded */
>>> @@ -9509,6 +9552,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>>>  			return false;
>>>  		break;
>>>
>>> +	case group_smt_balance:
>>>  	case group_fully_busy:
>>>  		/*
>>>  		 * Select the fully busy group with highest avg_load. In
>>> @@ -9537,6 +9581,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>>>  		break;
>>>
>>>  	case group_has_spare:
>>> +		/*
>>> +		 * Do not pick sg with SMT CPUs over sg with pure CPUs,
>>> +		 * as we do not want to pull task off SMT core with one task
>>> +		 * and make the core idle.
>>> +		 */
>>> +		if (smt_vs_nonsmt_groups(sds->busiest, sg)) {
>>> +			if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1)
>>> +				return false;
>>> +			else
>>> +				return true;
>>> +		}
>>> +
>>>  		/*
>>>  		 * Select not overloaded group with lowest number of idle cpus
>>>  		 * and highest number of running tasks. We could also compare
>>> @@ -9733,6 +9789,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
>>>
>>>  	case group_imbalanced:
>>>  	case group_asym_packing:
>>> +	case group_smt_balance:
>>>  		/* Those types are not used in the slow wakeup path */
>>>  		return false;
>>>
>>> @@ -9864,6 +9921,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
>>>
>>>  	case group_imbalanced:
>>>  	case group_asym_packing:
>>> +	case group_smt_balance:
>>>  		/* Those type are not used in the slow wakeup path */
>>>  		return NULL;
>>>
>>> @@ -10118,6 +10176,13 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
>>>  		return;
>>>  	}
>>>
>>> +	if (busiest->group_type == group_smt_balance) {
>>> +		/* Reduce number of tasks sharing CPU capacity */
>>> +		env->migration_type = migrate_task;
>>> +		env->imbalance = 1;
>>> +		return;
>>> +	}
>>> +
>>>  	if (busiest->group_type == group_imbalanced) {
>>>  		/*
>>>  		 * In the group_imb case we cannot rely on group-wide averages
>>> @@ -10363,16 +10428,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
>>>  		goto force_balance;
>>>
>>>  	if (busiest->group_type != group_overloaded) {
>>> -		if (env->idle == CPU_NOT_IDLE)
>>> +		if (env->idle == CPU_NOT_IDLE) {
>>>  			/*
>>>  			 * If the busiest group is not overloaded (and as a
>>>  			 * result the local one too) but this CPU is already
>>>  			 * busy, let another idle CPU try to pull task.
>>>  			 */
>>>  			goto out_balanced;
>>> +		}
>>> +
>>> +		if (busiest->group_type == group_smt_balance &&
>>> +		    smt_vs_nonsmt_groups(sds.local, sds.busiest)) {
>>> +			/* Let non SMT CPU pull from SMT CPU sharing with sibling */
>>> +			goto force_balance;
>>> +		}
>>>
>>>  		if (busiest->group_weight > 1 &&
>>> -		    local->idle_cpus <= (busiest->idle_cpus + 1))
>>> +		    local->idle_cpus <= (busiest->idle_cpus + 1)) {
>>>  			/*
>>>  			 * If the busiest group is not overloaded
>>>  			 * and there is no imbalance between this and busiest
>>> @@ -10383,12 +10455,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
>>>  			 * there is more than 1 CPU per group.
>>>  			 */
>>>  			goto out_balanced;
>>> +		}
>>>
>>> -		if (busiest->sum_h_nr_running == 1)
>>> +		if (busiest->sum_h_nr_running == 1) {
>>>  			/*
>>>  			 * busiest doesn't have any tasks waiting to run
>>>  			 */
>>>  			goto out_balanced;
>>> +		}
>>>  	}
>>>
>>>  force_balance:
>
  
Peter Zijlstra July 17, 2023, 11:10 a.m. UTC | #7
On Mon, Jul 17, 2023 at 01:06:59AM +0530, Shrikanth Hegde wrote:
> 
> 
> On 7/15/23 4:35 AM, Tim Chen wrote:
> > On Fri, 2023-07-14 at 18:36 +0530, Shrikanth Hegde wrote:
> > 
> >>
> >>
> >> If we consider symmetric platforms which have SMT4 such as power10. 
> >> we have a topology like below. multiple such MC will form DIE(PKG)
> >>
> >>
> >> [0 2 4 6][1 3 5 7][8 10 12 14][9 11 13 15]
> >> [--SMT--][--SMT--][----SMT---][---SMT----]
> >> [--sg1--][--sg1--][---sg1----][---sg1----]
> >> [--------------MC------------------------]
> >>
> >> In case of SMT4, if there is any group which has 2 or more tasks, that 
> >> group will be marked as group_smt_balance. previously, if that group had 2
> >> or 3 tasks, it would have been marked as group_has_spare. Since all the groups have 
> >> SMT that means behavior would be same fully busy right? That can cause some 
> >> corner cases. No?
> > 
> > You raised a good point. I was looking from SMT2
> > perspective so group_smt_balance implies group_fully_busy.
> > That is no longer true for SMT4.
> > 
> > I am thinking of the following fix on the current patch
> > to take care of SMT4. Do you think this addresses
> 
> Thanks Tim for taking a look at it again. 
> 
> Yes. I think this would address some of the corner cases. 
> Any SMT4 group having 2,3,4 will have smt_balance as the group type, and busiest one 
> is the one which has least number of idle cpu's. (same conditions as group_has_spare)
> 
> 
> 
> 
> > concerns from you and Tobias?
> > 
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 294a662c9410..3fc8d3a3bd22 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -9588,6 +9588,17 @@ static bool update_sd_pick_busiest(struct lb_env *env,
> >                 break;
> >  
> >         case group_smt_balance:
> > +               /* no idle cpus on both groups handled by group_fully_busy below */
> > +               if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0) {
> > +                       if (sgs->idle_cpus > busiest->idle_cpus)
> > +                               return false;
> > +                       if (sgs->idle_cpus < busiest->idle_cpus)
> > +                               return true;
> > +                       if (sgs->sum_nr_running <= busiest_sum_nr_running)
> > +                               return false;
> > +                       else
> > +                               return true;
> > +               }
> > 
> > 
> > I will be on vacation next three weeks so my response will be slow.
> > 
> > Tim
> > 
> >>
> 
> Small suggestion to above code to avoid compiler warning of switch case falling
> through and else case can be removed, since update_sd_pick_busiest by default returns true.
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index e5a75c76bcaa..ae364ac6f22e 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9728,9 +9728,9 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>                                 return true;
>                         if (sgs->sum_nr_running <= busiest->sum_nr_running)
>                                 return false;
> -                       else
> -                               return true;
>                 }
> +               break;
> +
>         case group_fully_busy:
>                 /*
>                  * Select the fully busy group with highest avg_load. In
> 
> 

Can someone please send a full patch for this? I've already queued Tim's
patches in tip/sched/core (tip-bot seems to have died somewhere last
week, it's being worked on).
  
Shrikanth Hegde July 17, 2023, 12:18 p.m. UTC | #8
On 7/17/23 4:40 PM, Peter Zijlstra wrote:
> On Mon, Jul 17, 2023 at 01:06:59AM +0530, Shrikanth Hegde wrote:
>>
>>
>> On 7/15/23 4:35 AM, Tim Chen wrote:
>>> On Fri, 2023-07-14 at 18:36 +0530, Shrikanth Hegde wrote:
>>>
>>>>
>>>>
>>>> If we consider symmetric platforms which have SMT4 such as power10. 
>>>> we have a topology like below. multiple such MC will form DIE(PKG)
>>>>
>>>>
>>>> [0 2 4 6][1 3 5 7][8 10 12 14][9 11 13 15]
>>>> [--SMT--][--SMT--][----SMT---][---SMT----]
>>>> [--sg1--][--sg1--][---sg1----][---sg1----]
>>>> [--------------MC------------------------]
>>>>
>>>> In case of SMT4, if there is any group which has 2 or more tasks, that 
>>>> group will be marked as group_smt_balance. previously, if that group had 2
>>>> or 3 tasks, it would have been marked as group_has_spare. Since all the groups have 
>>>> SMT that means behavior would be same fully busy right? That can cause some 
>>>> corner cases. No?
>>>
>>> You raised a good point. I was looking from SMT2
>>> perspective so group_smt_balance implies group_fully_busy.
>>> That is no longer true for SMT4.
>>>
>>> I am thinking of the following fix on the current patch
>>> to take care of SMT4. Do you think this addresses
>>
>> Thanks Tim for taking a look at it again. 
>>
>> Yes. I think this would address some of the corner cases. 
>> Any SMT4 group having 2,3,4 will have smt_balance as the group type, and busiest one 
>> is the one which has least number of idle cpu's. (same conditions as group_has_spare)
>>
>>
>>
>>
>>> concerns from you and Tobias?
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 294a662c9410..3fc8d3a3bd22 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -9588,6 +9588,17 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>>>                 break;
>>>  
>>>         case group_smt_balance:
>>> +               /* no idle cpus on both groups handled by group_fully_busy below */
>>> +               if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0) {
>>> +                       if (sgs->idle_cpus > busiest->idle_cpus)
>>> +                               return false;
>>> +                       if (sgs->idle_cpus < busiest->idle_cpus)
>>> +                               return true;
>>> +                       if (sgs->sum_nr_running <= busiest_sum_nr_running)
>>> +                               return false;
>>> +                       else
>>> +                               return true;
>>> +               }
>>>
>>>
>>> I will be on vacation next three weeks so my response will be slow.
>>>
>>> Tim
>>>
>>>>
>>
>> Small suggestion to above code to avoid compiler warning of switch case falling
>> through and else case can be removed, since update_sd_pick_busiest by default returns true.
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index e5a75c76bcaa..ae364ac6f22e 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -9728,9 +9728,9 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>>                                 return true;
>>                         if (sgs->sum_nr_running <= busiest->sum_nr_running)
>>                                 return false;
>> -                       else
>> -                               return true;
>>                 }
>> +               break;
>> +
>>         case group_fully_busy:
>>                 /*
>>                  * Select the fully busy group with highest avg_load. In
>>
>>
> 
> Can someone please send a full patch for this? I've already queued Tim's
> patches in tip/sched/core (tip-bot seems to have died somewhere last
> week, it's being worked on).

Hi Peter,

Sending on behalf of Tim. I have included my suggestion as well; hope that's ok.
Please find below the patch as of now. It includes the couple of changes that were discussed (in 1/6 and in 3/6).


---
 kernel/sched/fair.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 932e7b78894a..9502013abe33 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9532,7 +9532,7 @@ static inline long sibling_imbalance(struct lb_env *env,
 	imbalance /= ncores_local + ncores_busiest;
 
 	/* Take advantage of resource in an empty sched group */
-	if (imbalance == 0 && local->sum_nr_running == 0 &&
+	if (imbalance <= 1 && local->sum_nr_running == 0 &&
 	    busiest->sum_nr_running > 1)
 		imbalance = 2;
 
@@ -9720,6 +9720,17 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 		break;
 
 	case group_smt_balance:
+		/* no idle cpus on both groups handled by group_fully_busy below */
+		if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0) {
+			if (sgs->idle_cpus > busiest->idle_cpus)
+				return false;
+			if (sgs->idle_cpus < busiest->idle_cpus)
+				return true;
+			if (sgs->sum_nr_running <= busiest->sum_nr_running)
+				return false;
+		}
+		break;
+
 	case group_fully_busy:
 		/*
 		 * Select the fully busy group with highest avg_load. In
  
Peter Zijlstra July 17, 2023, 1:37 p.m. UTC | #9
On Mon, Jul 17, 2023 at 05:48:02PM +0530, Shrikanth Hegde wrote:

> Hi Peter. 
> 
> Sending on behalf of tim. I have included my suggestion as well. Hope
> that's ok.  Please find below the patch as of now. it includes the
> couple of changes that are discussed. (in 1/6 and in 3/6)

Could you please add a Changelog and SoB thingies such that I can apply
the thing?

Given Tim is on holidays, perhaps do something like:

Originally-by: Tim Chen <...>

After all, you did some changes and verified it actually works etc..


> ---
>  kernel/sched/fair.c | 13 ++++++++++++-
>  1 file changed, 12 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 932e7b78894a..9502013abe33 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9532,7 +9532,7 @@ static inline long sibling_imbalance(struct lb_env *env,
>  	imbalance /= ncores_local + ncores_busiest;
>  
>  	/* Take advantage of resource in an empty sched group */
> -	if (imbalance == 0 && local->sum_nr_running == 0 &&
> +	if (imbalance <= 1 && local->sum_nr_running == 0 &&
>  	    busiest->sum_nr_running > 1)
>  		imbalance = 2;
>  
> @@ -9720,6 +9720,17 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>  		break;
>  
>  	case group_smt_balance:
> +		/* no idle cpus on both groups handled by group_fully_busy below */
> +		if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0) {
> +			if (sgs->idle_cpus > busiest->idle_cpus)
> +				return false;
> +			if (sgs->idle_cpus < busiest->idle_cpus)
> +				return true;
> +			if (sgs->sum_nr_running <= busiest->sum_nr_running)
> +				return false;
> +		}
> +		break;
> +
>  	case group_fully_busy:
>  		/*
>  		 * Select the fully busy group with highest avg_load. In
> -- 
> 2.31.1
  
Tobias Huschle July 18, 2023, 6:07 a.m. UTC | #10
On 2023-07-15 01:05, Tim Chen wrote:
> On Fri, 2023-07-14 at 18:36 +0530, Shrikanth Hegde wrote:
> 
>> 
>> 
>> If we consider symmetric platforms which have SMT4 such as power10.
>> we have a topology like below. multiple such MC will form DIE(PKG)
>> 
>> 
>> [0 2 4 6][1 3 5 7][8 10 12 14][9 11 13 15]
>> [--SMT--][--SMT--][----SMT---][---SMT----]
>> [--sg1--][--sg1--][---sg1----][---sg1----]
>> [--------------MC------------------------]
>> 
>> In case of SMT4, if there is any group which has 2 or more tasks, that
>> group will be marked as group_smt_balance. previously, if that group 
>> had 2
>> or 3 tasks, it would have been marked as group_has_spare. Since all 
>> the groups have
>> SMT that means behavior would be same fully busy right? That can cause 
>> some
>> corner cases. No?
> 
> You raised a good point. I was looking from SMT2
> perspective so group_smt_balance implies group_fully_busy.
> That is no longer true for SMT4.
> 
> I am thinking of the following fix on the current patch
> to take care of SMT4. Do you think this addresses
> concerns from you and Tobias?
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 294a662c9410..3fc8d3a3bd22 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9588,6 +9588,17 @@ static bool update_sd_pick_busiest(struct lb_env 
> *env,
>                 break;
> 
>         case group_smt_balance:
> +               /* no idle cpus on both groups handled by
> group_fully_busy below */
> +               if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0) {
> +                       if (sgs->idle_cpus > busiest->idle_cpus)
> +                               return false;
> +                       if (sgs->idle_cpus < busiest->idle_cpus)
> +                               return true;
> +                       if (sgs->sum_nr_running <= 
> busiest_sum_nr_running)
> +                               return false;
> +                       else
> +                               return true;
> +               }
> 
> 
> I will be on vacation next three weeks so my response will be slow.
> 
> Tim
> 

What if the setup is asymmetric, where SMT2 and SMT4 would mix, e.g.

[0 1][2 3 4 5]
[SMT][--SMT--]

If CPUs 0, 2 and 3 now each have a running task, both groups would be
classified as smt_balance. But when it comes to the selection of the
busiest group, the smaller group would be selected, as it has fewer
idle CPUs, right? Which could lead to the smaller group being left
with no tasks.
Using the absolute number of tasks is what made the prefer_sibling
path problematic, and I would assume that the same holds true here.
Therefore, I would prefer avg_load, or, similar to prefer_sibling,
a ratio over the number of cores.
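
To make the idea concrete, a rough standalone sketch (simplified fields,
not the actual kernel code) of such a normalised comparison, using the
mixed SMT2/SMT4 example above:

#include <stdbool.h>
#include <stdio.h>

struct grp {
	unsigned int sum_nr_running;
	unsigned int weight;		/* CPUs (or cores) in the group */
};

/* True if @a is busier than @b when normalised by group weight. */
static bool busier_normalised(const struct grp *a, const struct grp *b)
{
	/* compare a->sum_nr_running/a->weight vs b's without division */
	return (unsigned long)a->sum_nr_running * b->weight >
	       (unsigned long)b->sum_nr_running * a->weight;
}

int main(void)
{
	/* CPUs 0, 2, 3 busy: 1 of 2 CPUs vs 2 of 4 CPUs */
	struct grp smt2 = { .sum_nr_running = 1, .weight = 2 };
	struct grp smt4 = { .sum_nr_running = 2, .weight = 4 };

	/* Both are 50% busy, so neither should be picked over the other. */
	printf("smt2 busier: %d, smt4 busier: %d\n",
	       busier_normalised(&smt2, &smt4),
	       busier_normalised(&smt4, &smt2));	/* 0, 0 */
	return 0;
}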

I can't really test that on s390 as we always have SMT2. But we can
have these asymmetries at higher levels, e.g.

[0 1][2 3][4 5][6 7][8 9]
[SMT][SMT][SMT][SMT][SMT]
[-----core----][--core--]

For large configurations this can be true for even higher levels.
Therefore, the idea was to move the smt_balance state around and adapt
its conditions to something like this (which would require reordering
the commits):

@@ -8330,6 +8330,11 @@ enum fbq_type { regular, remote, all };
  enum group_type {
         /* The group has spare capacity that can be used to run more tasks.  */
         group_has_spare = 0,
+       /*
+        * Balance SMT group that's fully busy. Can benefit from migration
+        * a task on SMT with busy sibling to another CPU on idle core.
+        */
+       group_smt_balance,
         /*
          * The group is fully used and the tasks don't compete for more CPU
          * cycles. Nevertheless, some tasks might wait before running.
@@ -8340,11 +8345,6 @@ enum group_type {
          * more powerful CPU.
          */
         group_misfit_task,
-       /*
-        * Balance SMT group that's fully busy. Can benefit from migration
-        * a task on SMT with busy sibling to another CPU on idle core.
-        */
-       group_smt_balance,
         /*
          * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
          * and the task should be migrated to it instead of running on the
@@ -9327,15 +9327,15 @@ group_type group_classify(unsigned int imbalance_pct,
         if (sgs->group_asym_packing)
                 return group_asym_packing;

-       if (sgs->group_smt_balance)
-               return group_smt_balance;
-
         if (sgs->group_misfit_task_load)
                 return group_misfit_task;

         if (!group_has_capacity(imbalance_pct, sgs))
                 return group_fully_busy;

+       if (sgs->group_smt_balance)
+               return group_smt_balance;
+
         return group_has_spare;
  }

@@ -9457,8 +9457,7 @@ static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
          * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY
          * will not be on.
          */
-       if (group->flags & SD_SHARE_CPUCAPACITY &&
-           sgs->sum_h_nr_running > 1)
+       if (sgs->sum_h_nr_running > group->cores)
                 return true;

         return false;

The s390 problem is currently solved by changing the prefer_sibling
path. When disabling that flag, we might have an issue; I will have to
verify that though.

>> 
>> One example is Lets say sg1 has 4 tasks. and sg2 has 0 tasks and is 
>> trying to do
>> load balance. Previously imbalance would have been 2, instead now 
>> imbalance would be 1.
>> But in subsequent lb it would be balanced.
>> 
>> 
>> 
>> > +	return false;
>> > +}
>> > +
>> >  static inline bool
>> >  sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
>> >  {
>> > @@ -9425,6 +9464,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
>> >  		sgs->group_asym_packing = 1;
>> >  	}
>> >
>> > +	/* Check for loaded SMT group to be balanced to dst CPU */
>> > +	if (!local_group && smt_balance(env, sgs, group))
>> > +		sgs->group_smt_balance = 1;
>> > +
>> >  	sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
>> >
>> >  	/* Computing avg_load makes sense only when group is overloaded */
>> > @@ -9509,6 +9552,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>> >  			return false;
>> >  		break;
>> >
>> > +	case group_smt_balance:
>> >  	case group_fully_busy:
>> >  		/*
>> >  		 * Select the fully busy group with highest avg_load. In
>> > @@ -9537,6 +9581,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>> >  		break;
>> >
>> >  	case group_has_spare:
>> > +		/*
>> > +		 * Do not pick sg with SMT CPUs over sg with pure CPUs,
>> > +		 * as we do not want to pull task off SMT core with one task
>> > +		 * and make the core idle.
>> > +		 */
>> > +		if (smt_vs_nonsmt_groups(sds->busiest, sg)) {
>> > +			if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1)
>> > +				return false;
>> > +			else
>> > +				return true;
>> > +		}
>> > +
>> >  		/*
>> >  		 * Select not overloaded group with lowest number of idle cpus
>> >  		 * and highest number of running tasks. We could also compare
>> > @@ -9733,6 +9789,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
>> >
>> >  	case group_imbalanced:
>> >  	case group_asym_packing:
>> > +	case group_smt_balance:
>> >  		/* Those types are not used in the slow wakeup path */
>> >  		return false;
>> >
>> > @@ -9864,6 +9921,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
>> >
>> >  	case group_imbalanced:
>> >  	case group_asym_packing:
>> > +	case group_smt_balance:
>> >  		/* Those type are not used in the slow wakeup path */
>> >  		return NULL;
>> >
>> > @@ -10118,6 +10176,13 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
>> >  		return;
>> >  	}
>> >
>> > +	if (busiest->group_type == group_smt_balance) {
>> > +		/* Reduce number of tasks sharing CPU capacity */
>> > +		env->migration_type = migrate_task;
>> > +		env->imbalance = 1;
>> > +		return;
>> > +	}
>> > +
>> >  	if (busiest->group_type == group_imbalanced) {
>> >  		/*
>> >  		 * In the group_imb case we cannot rely on group-wide averages
>> > @@ -10363,16 +10428,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
>> >  		goto force_balance;
>> >
>> >  	if (busiest->group_type != group_overloaded) {
>> > -		if (env->idle == CPU_NOT_IDLE)
>> > +		if (env->idle == CPU_NOT_IDLE) {
>> >  			/*
>> >  			 * If the busiest group is not overloaded (and as a
>> >  			 * result the local one too) but this CPU is already
>> >  			 * busy, let another idle CPU try to pull task.
>> >  			 */
>> >  			goto out_balanced;
>> > +		}
>> > +
>> > +		if (busiest->group_type == group_smt_balance &&
>> > +		    smt_vs_nonsmt_groups(sds.local, sds.busiest)) {
>> > +			/* Let non SMT CPU pull from SMT CPU sharing with sibling */
>> > +			goto force_balance;
>> > +		}
>> >
>> >  		if (busiest->group_weight > 1 &&
>> > -		    local->idle_cpus <= (busiest->idle_cpus + 1))
>> > +		    local->idle_cpus <= (busiest->idle_cpus + 1)) {
>> >  			/*
>> >  			 * If the busiest group is not overloaded
>> >  			 * and there is no imbalance between this and busiest
>> > @@ -10383,12 +10455,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
>> >  			 * there is more than 1 CPU per group.
>> >  			 */
>> >  			goto out_balanced;
>> > +		}
>> >
>> > -		if (busiest->sum_h_nr_running == 1)
>> > +		if (busiest->sum_h_nr_running == 1) {
>> >  			/*
>> >  			 * busiest doesn't have any tasks waiting to run
>> >  			 */
>> >  			goto out_balanced;
>> > +		}
>> >  	}
>> >
>> >  force_balance:
  
Shrikanth Hegde July 18, 2023, 2:52 p.m. UTC | #11
On 7/18/23 11:37 AM, Tobias Huschle wrote:
> On 2023-07-15 01:05, Tim Chen wrote:
>> On Fri, 2023-07-14 at 18:36 +0530, Shrikanth Hegde wrote:
>>
>>>
>>>
>>> If we consider symmetric platforms which have SMT4 such as power10.
>>> we have a topology like below. multiple such MC will form DIE(PKG)
>>>
>>>
>>> [0 2 4 6][1 3 5 7][8 10 12 14][9 11 13 15]
>>> [--SMT--][--SMT--][----SMT---][---SMT----]
>>> [--sg1--][--sg1--][---sg1----][---sg1----]
>>> [--------------MC------------------------]
>>>
>>> In case of SMT4, if there is any group which has 2 or more tasks, that
>>> group will be marked as group_smt_balance. previously, if that group
>>> had 2
>>> or 3 tasks, it would have been marked as group_has_spare. Since all
>>> the groups have
>>> SMT that means behavior would be same fully busy right? That can
>>> cause some
>>> corner cases. No?
>>
>> You raised a good point. I was looking from SMT2
>> perspective so group_smt_balance implies group_fully_busy.
>> That is no longer true for SMT4.
>>
>> I am thinking of the following fix on the current patch
>> to take care of SMT4. Do you think this addresses
>> concerns from you and Tobias?
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 294a662c9410..3fc8d3a3bd22 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -9588,6 +9588,17 @@ static bool update_sd_pick_busiest(struct
>> lb_env *env,
>>                 break;
>>
>>         case group_smt_balance:
>> +               /* no idle cpus on both groups handled by
>> group_fully_busy below */
>> +               if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0) {
>> +                       if (sgs->idle_cpus > busiest->idle_cpus)
>> +                               return false;
>> +                       if (sgs->idle_cpus < busiest->idle_cpus)
>> +                               return true;
>> +                       if (sgs->sum_nr_running <= busiest->sum_nr_running)
>> +                               return false;
>> +                       else
>> +                               return true;
>> +               }
>>
>>
>> I will be on vacation next three weeks so my response will be slow.
>>
>> Tim
>>
> 
> What if the setup is asymmetric, where SMT2 and SMT4 would mix, e.g.
> 
> [0 1][2 3 4 5]
> [SMT][--SMT--]
> 
> If now CPUs 0,2,3 have a running task, both groups would be classified as
> smt_balance. But if it comes to the selection of the busiest group, the
> smaller
> group would be selected, as it has less idle CPUs, right? Which could lead
> to the smaller group being left with no tasks.
> Using the absolute numbers of task is what made the prefer_sibling path
> problematic,


Yes. But I am not sure how realistic that configuration is. On Power10, we
typically have all cores in either SMT1, SMT2 or SMT4, but not mixed configs.
One can offline CPUs to get into that case with SMT4.

> I would assume that the same holds true here. Therefore, I would prefer
> avg_load,
> or, similar to prefer_siblings, a ratio over the number of cores.
> 
> I can't really test that on s390 as we always have SMT2. But, we can
> have these
> asymmetries on higher levels, e.g


IIUC, at higher levels the group will not have SD_SHARE_CPUCAPACITY, so it
shouldn't run into group_smt_balance.

> 
> [0 1][2 3][4 5][6 7][8 9]
> [SMT][SMT][SMT][SMT][SMT]
> [-----core----][--core--]
> 
> For large configurations this can be true for even higher levels.
> Therefore, the idea was to move the smt_balance state around and adapt its
> conditions to something like this (which would require to reorder the
> commits):
> 
> @@ -8330,6 +8330,11 @@ enum fbq_type { regular, remote, all };
>  enum group_type {
>         /* The group has spare capacity that can be used to run more
> tasks.  */
>         group_has_spare = 0,
> +       /*
> +        * Balance SMT group that's fully busy. Can benefit from migration
> +        * a task on SMT with busy sibling to another CPU on idle core.
> +        */
> +       group_smt_balance,
>         /*
>          * The group is fully used and the tasks don't compete for more CPU
>          * cycles. Nevertheless, some tasks might wait before running.
> @@ -8340,11 +8345,6 @@ enum group_type {
>          * more powerful CPU.
>          */
>         group_misfit_task,
> -       /*
> -        * Balance SMT group that's fully busy. Can benefit from migration
> -        * a task on SMT with busy sibling to another CPU on idle core.
> -        */
> -       group_smt_balance,
>         /*
>          * SD_ASYM_PACKING only: One local CPU with higher capacity is
> available,


IIUC, for the cluster topology of this patch, the busiest group should be the
SMT group when it has 2 threads running, even compared to an Atom cluster with
4 threads running. The Atom cluster will be group_fully_busy, whereas the SMT
group will be group_smt_balance. For that to happen, group_smt_balance should
have a higher group_type.
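
To make that ordering argument concrete, here is a rough, untested sketch
(plain C, not the actual kernel code, all names made up) of how the busiest
group selection compares the enum values before any per-type tie-breaking:

#include <stdio.h>

/* Simplified model: the numerically larger group_type wins outright,
 * so group_smt_balance must sit above group_fully_busy for a busy SMT
 * pair to be picked over a fully busy Atom cluster.
 */
enum group_type_sketch {		/* low value = least busy */
	sketch_has_spare = 0,
	sketch_fully_busy,
	sketch_smt_balance		/* placed above fully_busy on purpose */
};

/* returns 1 if the candidate group replaces the current busiest,
 * 0 if it does not, -1 if per-type tie-breaking would have to decide
 */
static int sketch_pick_busiest(enum group_type_sketch candidate,
			       enum group_type_sketch busiest)
{
	if (candidate > busiest)
		return 1;
	if (candidate < busiest)
		return 0;
	return -1;
}

int main(void)
{
	/* SMT group with both siblings busy vs. a fully busy Atom cluster */
	printf("smt_balance vs fully_busy: %d\n",
	       sketch_pick_busiest(sketch_smt_balance, sketch_fully_busy)); /* 1 */
	return 0;
}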

>          * and the task should be migrated to it instead of running on the
> @@ -9327,15 +9327,15 @@ group_type group_classify(unsigned int
> imbalance_pct,
>         if (sgs->group_asym_packing)
>                 return group_asym_packing;
> 
> -       if (sgs->group_smt_balance)
> -               return group_smt_balance;
> -
>         if (sgs->group_misfit_task_load)
>                 return group_misfit_task;
> 
>         if (!group_has_capacity(imbalance_pct, sgs))
>                 return group_fully_busy;
> 
> +       if (sgs->group_smt_balance)
> +               return group_smt_balance;
> +
>         return group_has_spare;
>  }
> 
> @@ -9457,8 +9457,7 @@ static inline bool smt_balance(struct lb_env *env,
> struct sg_lb_stats *sgs,
>          * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY
>          * will not be on.
>          */
> -       if (group->flags & SD_SHARE_CPUCAPACITY &&
> -           sgs->sum_h_nr_running > 1)
> +       if (sgs->sum_h_nr_running > group->cores)

In the case of Power10, where we have SMT4, group->cores will be 1. I don't
see a difference here.
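
For illustration, a small untested sketch (plain C, made-up helper names and
a placeholder flag bit) of why the two checks coincide when the SMT group
spans a single core, as in the Power10 SMT4 case:

#include <stdio.h>

#define FAKE_SHARE_CPUCAPACITY	0x1	/* placeholder for SD_SHARE_CPUCAPACITY */

/* condition in the current patch: SMT flag set and more than one task */
static int check_flag_based(unsigned int flags, unsigned int nr_running)
{
	return (flags & FAKE_SHARE_CPUCAPACITY) && nr_running > 1;
}

/* proposed condition: more runnable tasks than cores in the group */
static int check_core_based(unsigned int cores, unsigned int nr_running)
{
	return nr_running > cores;
}

int main(void)
{
	unsigned int nr;

	/* SMT4 group spanning one core: flag set, cores == 1 */
	for (nr = 0; nr <= 4; nr++)
		printf("nr_running=%u flag_based=%d core_based=%d\n", nr,
		       check_flag_based(FAKE_SHARE_CPUCAPACITY, nr),
		       check_core_based(1, nr));
	/* both are 0 for nr <= 1 and 1 for nr >= 2, so no difference here */
	return 0;
}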

>                 return true;
> 
>         return false;
> 
> The s390 problem is currently solved by changing the prefer_sibling
> path. When
> disabling that flag, we might have an issue, will have to verify that
> though.
> 
>>>
>>> One example is Lets say sg1 has 4 tasks. and sg2 has 0 tasks and is
>>> trying to do
>>> load balance. Previously imbalance would have been 2, instead now
>>> imbalance would be 1.
>>> But in subsequent lb it would be balanced.
>>>
>>>
>>>
>>> > +    return false;
>>> > +}
>>> > +
>>> >  static inline bool
>>> >  sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
>>> >  {
>>> > @@ -9425,6 +9464,10 @@ static inline void update_sg_lb_stats(struct
>>> lb_env *env,
>>> >          sgs->group_asym_packing = 1;
>>> >      }
>>> >
>>> > +    /* Check for loaded SMT group to be balanced to dst CPU */
>>> > +    if (!local_group && smt_balance(env, sgs, group))
>>> > +        sgs->group_smt_balance = 1;
>>> > +
>>> >      sgs->group_type = group_classify(env->sd->imbalance_pct,
>>> group, sgs);
>>> >
>>> >      /* Computing avg_load makes sense only when group is
>>> overloaded */
>>> > @@ -9509,6 +9552,7 @@ static bool update_sd_pick_busiest(struct
>>> lb_env *env,
>>> >              return false;
>>> >          break;
>>> >
>>> > +    case group_smt_balance:
>>> >      case group_fully_busy:
>>> >          /*
>>> >           * Select the fully busy group with highest avg_load. In
>>> > @@ -9537,6 +9581,18 @@ static bool update_sd_pick_busiest(struct
>>> lb_env *env,
>>> >          break;
>>> >
>>> >      case group_has_spare:
>>> > +        /*
>>> > +         * Do not pick sg with SMT CPUs over sg with pure CPUs,
>>> > +         * as we do not want to pull task off SMT core with one task
>>> > +         * and make the core idle.
>>> > +         */
>>> > +        if (smt_vs_nonsmt_groups(sds->busiest, sg)) {
>>> > +            if (sg->flags & SD_SHARE_CPUCAPACITY &&
>>> sgs->sum_h_nr_running <= 1)
>>> > +                return false;
>>> > +            else
>>> > +                return true;
>>> > +        }
>>> > +
>>> >          /*
>>> >           * Select not overloaded group with lowest number of idle
>>> cpus
>>> >           * and highest number of running tasks. We could also compare
>>> > @@ -9733,6 +9789,7 @@ static bool update_pick_idlest(struct
>>> sched_group *idlest,
>>> >
>>> >      case group_imbalanced:
>>> >      case group_asym_packing:
>>> > +    case group_smt_balance:
>>> >          /* Those types are not used in the slow wakeup path */
>>> >          return false;
>>> >
>>> > @@ -9864,6 +9921,7 @@ find_idlest_group(struct sched_domain *sd,
>>> struct task_struct *p, int this_cpu)
>>> >
>>> >      case group_imbalanced:
>>> >      case group_asym_packing:
>>> > +    case group_smt_balance:
>>> >          /* Those type are not used in the slow wakeup path */
>>> >          return NULL;
>>> >
>>> > @@ -10118,6 +10176,13 @@ static inline void
>>> calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
>>> >          return;
>>> >      }
>>> >
>>> > +    if (busiest->group_type == group_smt_balance) {
>>> > +        /* Reduce number of tasks sharing CPU capacity */
>>> > +        env->migration_type = migrate_task;
>>> > +        env->imbalance = 1;
>>> > +        return;
>>> > +    }
>>> > +
>>> >      if (busiest->group_type == group_imbalanced) {
>>> >          /*
>>> >           * In the group_imb case we cannot rely on group-wide
>>> averages
>>> > @@ -10363,16 +10428,23 @@ static struct sched_group
>>> *find_busiest_group(struct lb_env *env)
>>> >          goto force_balance;
>>> >
>>> >      if (busiest->group_type != group_overloaded) {
>>> > -        if (env->idle == CPU_NOT_IDLE)
>>> > +        if (env->idle == CPU_NOT_IDLE) {
>>> >              /*
>>> >               * If the busiest group is not overloaded (and as a
>>> >               * result the local one too) but this CPU is already
>>> >               * busy, let another idle CPU try to pull task.
>>> >               */
>>> >              goto out_balanced;
>>> > +        }
>>> > +
>>> > +        if (busiest->group_type == group_smt_balance &&
>>> > +            smt_vs_nonsmt_groups(sds.local, sds.busiest)) {
>>> > +            /* Let non SMT CPU pull from SMT CPU sharing with
>>> sibling */
>>> > +            goto force_balance;
>>> > +        }
>>> >
>>> >          if (busiest->group_weight > 1 &&
>>> > -            local->idle_cpus <= (busiest->idle_cpus + 1))
>>> > +            local->idle_cpus <= (busiest->idle_cpus + 1)) {
>>> >              /*
>>> >               * If the busiest group is not overloaded
>>> >               * and there is no imbalance between this and busiest
>>> > @@ -10383,12 +10455,14 @@ static struct sched_group
>>> *find_busiest_group(struct lb_env *env)
>>> >               * there is more than 1 CPU per group.
>>> >               */
>>> >              goto out_balanced;
>>> > +        }
>>> >
>>> > -        if (busiest->sum_h_nr_running == 1)
>>> > +        if (busiest->sum_h_nr_running == 1) {
>>> >              /*
>>> >               * busiest doesn't have any tasks waiting to run
>>> >               */
>>> >              goto out_balanced;
>>> > +        }
>>> >      }
>>> >
>>> >  force_balance:
  
Tobias Huschle July 19, 2023, 8:14 a.m. UTC | #12
On 2023-07-18 16:52, Shrikanth Hegde wrote:
> On 7/18/23 11:37 AM, Tobias Huschle wrote:
>> On 2023-07-15 01:05, Tim Chen wrote:
>>> On Fri, 2023-07-14 at 18:36 +0530, Shrikanth Hegde wrote:
>>> 
>>>> 
>>>> 
>>>> If we consider symmetric platforms which have SMT4 such as power10.
>>>> we have a topology like below. multiple such MC will form DIE(PKG)
>>>> 
>>>> 
>>>> [0 2 4 6][1 3 5 7][8 10 12 14][9 11 13 15]
>>>> [--SMT--][--SMT--][----SMT---][---SMT----]
>>>> [--sg1--][--sg1--][---sg1----][---sg1----]
>>>> [--------------MC------------------------]
>>>> 
>>>> In case of SMT4, if there is any group which has 2 or more tasks, 
>>>> that
>>>> group will be marked as group_smt_balance. previously, if that group
>>>> had 2
>>>> or 3 tasks, it would have been marked as group_has_spare. Since all
>>>> the groups have
>>>> SMT that means behavior would be same fully busy right? That can
>>>> cause some
>>>> corner cases. No?
>>> 
>>> You raised a good point. I was looking from SMT2
>>> perspective so group_smt_balance implies group_fully_busy.
>>> That is no longer true for SMT4.
>>> 
>>> I am thinking of the following fix on the current patch
>>> to take care of SMT4. Do you think this addresses
>>> concerns from you and Tobias?
>>> 
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 294a662c9410..3fc8d3a3bd22 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -9588,6 +9588,17 @@ static bool update_sd_pick_busiest(struct
>>> lb_env *env,
>>>                 break;
>>> 
>>>         case group_smt_balance:
>>> +               /* no idle cpus on both groups handled by
>>> group_fully_busy below */
>>> +               if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0) {
>>> +                       if (sgs->idle_cpus > busiest->idle_cpus)
>>> +                               return false;
>>> +                       if (sgs->idle_cpus < busiest->idle_cpus)
>>> +                               return true;
>>> +                       if (sgs->sum_nr_running <= busiest->sum_nr_running)
>>> +                               return false;
>>> +                       else
>>> +                               return true;
>>> +               }
>>> 
>>> 
>>> I will be on vacation next three weeks so my response will be slow.
>>> 
>>> Tim
>>> 
>> 
>> What if the setup is asymmetric, where SMT2 and SMT4 would mix, e.g.
>> 
>> [0 1][2 3 4 5]
>> [SMT][--SMT--]
>> 
>> If now CPUs 0,2,3 have a running task, both groups would be classified 
>> as
>> smt_balance. But if it comes to the selection of the busiest group, 
>> the
>> smaller
>> group would be selected, as it has less idle CPUs, right? Which could 
>> lead
>> to the smaller group being left with no tasks.
>> Using the absolute numbers of task is what made the prefer_sibling 
>> path
>> problematic,
> 
> 
> Yes. But Not sure how realistic is that configuration. on power10, we 
> typically
> have all cores in either SMT1, SMT2 or SMT4. But not mixed configs.
> One can offline a CPUs to get into that cases in SMT4.

I'm also not sure if there is a real case for that. The assumption that two
groups are always of equal size was the reason why the prefer_sibling path
did not work as expected. I just wanted to point out that we might introduce
a similar assumption here again. It may be a valid assumption if there are no
use cases for having two cores with a different number of SMT threads.

> 
>> I would assume that the same holds true here. Therefore, I would 
>> prefer
>> avg_load,
>> or, similar to prefer_siblings, a ratio over the number of cores.
>> 
>> I can't really test that on s390 as we always have SMT2. But, we can
>> have these
>> asymmetries on higher levels, e.g
> 
> 
> IIUC, on higher levels, group will not have SD_SHARE_CPUCAPACITY, so
> it shouldn't
> run into group_smt_balance.
> 
>> 
>> [0 1][2 3][4 5][6 7][8 9]
>> [SMT][SMT][SMT][SMT][SMT]
>> [-----core----][--core--]
>> 
>> For large configurations this can be true for even higher levels.
>> Therefore, the idea was to move the smt_balance state around and adapt 
>> its
>> conditions to something like this (which would require to reorder the
>> commits):
>> 
>> @@ -8330,6 +8330,11 @@ enum fbq_type { regular, remote, all };
>>  enum group_type {
>>         /* The group has spare capacity that can be used to run more
>> tasks.  */
>>         group_has_spare = 0,
>> +       /*
>> +        * Balance SMT group that's fully busy. Can benefit from 
>> migration
>> +        * a task on SMT with busy sibling to another CPU on idle 
>> core.
>> +        */
>> +       group_smt_balance,
>>         /*
>>          * The group is fully used and the tasks don't compete for 
>> more CPU
>>          * cycles. Nevertheless, some tasks might wait before running.
>> @@ -8340,11 +8345,6 @@ enum group_type {
>>          * more powerful CPU.
>>          */
>>         group_misfit_task,
>> -       /*
>> -        * Balance SMT group that's fully busy. Can benefit from 
>> migration
>> -        * a task on SMT with busy sibling to another CPU on idle 
>> core.
>> -        */
>> -       group_smt_balance,
>>         /*
>>          * SD_ASYM_PACKING only: One local CPU with higher capacity is
>> available,
> 
> 
> IIUC, for cluster topology of this patch, busiest group should be a
> SMT if it has 2
> threads compared to an Atom cluster having 4 threads. Atom cluster
> will be group_fully_busy,
> whereas SMT group will be group_smt_balance. For that to happen
> group_smt_balance should have
> higher group_type.

Makes sense.

> 
>>          * and the task should be migrated to it instead of running on 
>> the
>> @@ -9327,15 +9327,15 @@ group_type group_classify(unsigned int
>> imbalance_pct,
>>         if (sgs->group_asym_packing)
>>                 return group_asym_packing;
>> 
>> -       if (sgs->group_smt_balance)
>> -               return group_smt_balance;
>> -
>>         if (sgs->group_misfit_task_load)
>>                 return group_misfit_task;
>> 
>>         if (!group_has_capacity(imbalance_pct, sgs))
>>                 return group_fully_busy;
>> 
>> +       if (sgs->group_smt_balance)
>> +               return group_smt_balance;
>> +
>>         return group_has_spare;
>>  }
>> 
>> @@ -9457,8 +9457,7 @@ static inline bool smt_balance(struct lb_env 
>> *env,
>> struct sg_lb_stats *sgs,
>>          * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY
>>          * will not be on.
>>          */
>> -       if (group->flags & SD_SHARE_CPUCAPACITY &&
>> -           sgs->sum_h_nr_running > 1)
>> +       if (sgs->sum_h_nr_running > group->cores)
> 
> In case of Power10, where we have SMT4, group->cores will be 1. I dont 
> see
> a difference here.

The aim of this change was to also make use of it further up in the
hierarchy, where SD_SHARE_CPUCAPACITY is not set. Up there, a group can span
more than one core, potentially with different core counts per group (at
least on s390).
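
As a rough, untested illustration (plain C, made-up CPU layout and names) of
that higher-level case, where groups span different numbers of cores and
SD_SHARE_CPUCAPACITY is off:

#include <stdio.h>

/* hypothetical groups at a domain above SMT on s390-like hardware:
 * groupA spans 3 cores / 6 CPUs, groupB spans 2 cores / 4 CPUs
 */
struct grp_sketch {
	const char *name;
	unsigned int cores;
	unsigned int sum_h_nr_running;
};

/* proposed check: more runnable tasks than cores means at least one
 * core must be running two SMT siblings worth of work
 */
static int wants_smt_balance(const struct grp_sketch *g)
{
	return g->sum_h_nr_running > g->cores;
}

int main(void)
{
	struct grp_sketch a = { "groupA", 3, 4 };
	struct grp_sketch b = { "groupB", 2, 2 };

	printf("%s smt_balance=%d\n", a.name, wants_smt_balance(&a)); /* 1 */
	printf("%s smt_balance=%d\n", b.name, wants_smt_balance(&b)); /* 0 */
	return 0;
}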

It appears to work fine without these changes though, so I think there 
is
nothing to do for now.

> 
>>                 return true;
>> 
>>         return false;
>> 
>> The s390 problem is currently solved by changing the prefer_sibling
>> path. When
>> disabling that flag, we might have an issue, will have to verify that
>> though.
>> 
>>>> 
>>>> One example is Lets say sg1 has 4 tasks. and sg2 has 0 tasks and is
>>>> trying to do
>>>> load balance. Previously imbalance would have been 2, instead now
>>>> imbalance would be 1.
>>>> But in subsequent lb it would be balanced.
>>>> 
>>>> 
>>>> 
>>>> > +    return false;
>>>> > +}
>>>> > +
>>>> >  static inline bool
>>>> >  sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
>>>> >  {
>>>> > @@ -9425,6 +9464,10 @@ static inline void update_sg_lb_stats(struct
>>>> lb_env *env,
>>>> >          sgs->group_asym_packing = 1;
>>>> >      }
>>>> >
>>>> > +    /* Check for loaded SMT group to be balanced to dst CPU */
>>>> > +    if (!local_group && smt_balance(env, sgs, group))
>>>> > +        sgs->group_smt_balance = 1;
>>>> > +
>>>> >      sgs->group_type = group_classify(env->sd->imbalance_pct,
>>>> group, sgs);
>>>> >
>>>> >      /* Computing avg_load makes sense only when group is
>>>> overloaded */
>>>> > @@ -9509,6 +9552,7 @@ static bool update_sd_pick_busiest(struct
>>>> lb_env *env,
>>>> >              return false;
>>>> >          break;
>>>> >
>>>> > +    case group_smt_balance:
>>>> >      case group_fully_busy:
>>>> >          /*
>>>> >           * Select the fully busy group with highest avg_load. In
>>>> > @@ -9537,6 +9581,18 @@ static bool update_sd_pick_busiest(struct
>>>> lb_env *env,
>>>> >          break;
>>>> >
>>>> >      case group_has_spare:
>>>> > +        /*
>>>> > +         * Do not pick sg with SMT CPUs over sg with pure CPUs,
>>>> > +         * as we do not want to pull task off SMT core with one task
>>>> > +         * and make the core idle.
>>>> > +         */
>>>> > +        if (smt_vs_nonsmt_groups(sds->busiest, sg)) {
>>>> > +            if (sg->flags & SD_SHARE_CPUCAPACITY &&
>>>> sgs->sum_h_nr_running <= 1)
>>>> > +                return false;
>>>> > +            else
>>>> > +                return true;
>>>> > +        }
>>>> > +
>>>> >          /*
>>>> >           * Select not overloaded group with lowest number of idle
>>>> cpus
>>>> >           * and highest number of running tasks. We could also compare
>>>> > @@ -9733,6 +9789,7 @@ static bool update_pick_idlest(struct
>>>> sched_group *idlest,
>>>> >
>>>> >      case group_imbalanced:
>>>> >      case group_asym_packing:
>>>> > +    case group_smt_balance:
>>>> >          /* Those types are not used in the slow wakeup path */
>>>> >          return false;
>>>> >
>>>> > @@ -9864,6 +9921,7 @@ find_idlest_group(struct sched_domain *sd,
>>>> struct task_struct *p, int this_cpu)
>>>> >
>>>> >      case group_imbalanced:
>>>> >      case group_asym_packing:
>>>> > +    case group_smt_balance:
>>>> >          /* Those type are not used in the slow wakeup path */
>>>> >          return NULL;
>>>> >
>>>> > @@ -10118,6 +10176,13 @@ static inline void
>>>> calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
>>>> >          return;
>>>> >      }
>>>> >
>>>> > +    if (busiest->group_type == group_smt_balance) {
>>>> > +        /* Reduce number of tasks sharing CPU capacity */
>>>> > +        env->migration_type = migrate_task;
>>>> > +        env->imbalance = 1;
>>>> > +        return;
>>>> > +    }
>>>> > +
>>>> >      if (busiest->group_type == group_imbalanced) {
>>>> >          /*
>>>> >           * In the group_imb case we cannot rely on group-wide
>>>> averages
>>>> > @@ -10363,16 +10428,23 @@ static struct sched_group
>>>> *find_busiest_group(struct lb_env *env)
>>>> >          goto force_balance;
>>>> >
>>>> >      if (busiest->group_type != group_overloaded) {
>>>> > -        if (env->idle == CPU_NOT_IDLE)
>>>> > +        if (env->idle == CPU_NOT_IDLE) {
>>>> >              /*
>>>> >               * If the busiest group is not overloaded (and as a
>>>> >               * result the local one too) but this CPU is already
>>>> >               * busy, let another idle CPU try to pull task.
>>>> >               */
>>>> >              goto out_balanced;
>>>> > +        }
>>>> > +
>>>> > +        if (busiest->group_type == group_smt_balance &&
>>>> > +            smt_vs_nonsmt_groups(sds.local, sds.busiest)) {
>>>> > +            /* Let non SMT CPU pull from SMT CPU sharing with
>>>> sibling */
>>>> > +            goto force_balance;
>>>> > +        }
>>>> >
>>>> >          if (busiest->group_weight > 1 &&
>>>> > -            local->idle_cpus <= (busiest->idle_cpus + 1))
>>>> > +            local->idle_cpus <= (busiest->idle_cpus + 1)) {
>>>> >              /*
>>>> >               * If the busiest group is not overloaded
>>>> >               * and there is no imbalance between this and busiest
>>>> > @@ -10383,12 +10455,14 @@ static struct sched_group
>>>> *find_busiest_group(struct lb_env *env)
>>>> >               * there is more than 1 CPU per group.
>>>> >               */
>>>> >              goto out_balanced;
>>>> > +        }
>>>> >
>>>> > -        if (busiest->sum_h_nr_running == 1)
>>>> > +        if (busiest->sum_h_nr_running == 1) {
>>>> >              /*
>>>> >               * busiest doesn't have any tasks waiting to run
>>>> >               */
>>>> >              goto out_balanced;
>>>> > +        }
>>>> >      }
>>>> >
>>>> >  force_balance:
  

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 87317634fab2..f636d6c09dc6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8279,6 +8279,11 @@  enum group_type {
 	 * more powerful CPU.
 	 */
 	group_misfit_task,
+	/*
+	 * Balance SMT group that's fully busy. Can benefit from migration
+	 * a task on SMT with busy sibling to another CPU on idle core.
+	 */
+	group_smt_balance,
 	/*
 	 * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
 	 * and the task should be migrated to it instead of running on the
@@ -8987,6 +8992,7 @@  struct sg_lb_stats {
 	unsigned int group_weight;
 	enum group_type group_type;
 	unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
+	unsigned int group_smt_balance;  /* Task on busy SMT be moved */
 	unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int nr_numa_running;
@@ -9260,6 +9266,9 @@  group_type group_classify(unsigned int imbalance_pct,
 	if (sgs->group_asym_packing)
 		return group_asym_packing;
 
+	if (sgs->group_smt_balance)
+		return group_smt_balance;
+
 	if (sgs->group_misfit_task_load)
 		return group_misfit_task;
 
@@ -9333,6 +9342,36 @@  sched_asym(struct lb_env *env, struct sd_lb_stats *sds,  struct sg_lb_stats *sgs
 	return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
 }
 
+/* One group has more than one SMT CPU while the other group does not */
+static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
+				    struct sched_group *sg2)
+{
+	if (!sg1 || !sg2)
+		return false;
+
+	return (sg1->flags & SD_SHARE_CPUCAPACITY) !=
+		(sg2->flags & SD_SHARE_CPUCAPACITY);
+}
+
+static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
+			       struct sched_group *group)
+{
+	if (env->idle == CPU_NOT_IDLE)
+		return false;
+
+	/*
+	 * For SMT source group, it is better to move a task
+	 * to a CPU that doesn't have multiple tasks sharing its CPU capacity.
+	 * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY
+	 * will not be on.
+	 */
+	if (group->flags & SD_SHARE_CPUCAPACITY &&
+	    sgs->sum_h_nr_running > 1)
+		return true;
+
+	return false;
+}
+
 static inline bool
 sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
 {
@@ -9425,6 +9464,10 @@  static inline void update_sg_lb_stats(struct lb_env *env,
 		sgs->group_asym_packing = 1;
 	}
 
+	/* Check for loaded SMT group to be balanced to dst CPU */
+	if (!local_group && smt_balance(env, sgs, group))
+		sgs->group_smt_balance = 1;
+
 	sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
 
 	/* Computing avg_load makes sense only when group is overloaded */
@@ -9509,6 +9552,7 @@  static bool update_sd_pick_busiest(struct lb_env *env,
 			return false;
 		break;
 
+	case group_smt_balance:
 	case group_fully_busy:
 		/*
 		 * Select the fully busy group with highest avg_load. In
@@ -9537,6 +9581,18 @@  static bool update_sd_pick_busiest(struct lb_env *env,
 		break;
 
 	case group_has_spare:
+		/*
+		 * Do not pick sg with SMT CPUs over sg with pure CPUs,
+		 * as we do not want to pull task off SMT core with one task
+		 * and make the core idle.
+		 */
+		if (smt_vs_nonsmt_groups(sds->busiest, sg)) {
+			if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1)
+				return false;
+			else
+				return true;
+		}
+
 		/*
 		 * Select not overloaded group with lowest number of idle cpus
 		 * and highest number of running tasks. We could also compare
@@ -9733,6 +9789,7 @@  static bool update_pick_idlest(struct sched_group *idlest,
 
 	case group_imbalanced:
 	case group_asym_packing:
+	case group_smt_balance:
 		/* Those types are not used in the slow wakeup path */
 		return false;
 
@@ -9864,6 +9921,7 @@  find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 
 	case group_imbalanced:
 	case group_asym_packing:
+	case group_smt_balance:
 		/* Those type are not used in the slow wakeup path */
 		return NULL;
 
@@ -10118,6 +10176,13 @@  static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 		return;
 	}
 
+	if (busiest->group_type == group_smt_balance) {
+		/* Reduce number of tasks sharing CPU capacity */
+		env->migration_type = migrate_task;
+		env->imbalance = 1;
+		return;
+	}
+
 	if (busiest->group_type == group_imbalanced) {
 		/*
 		 * In the group_imb case we cannot rely on group-wide averages
@@ -10363,16 +10428,23 @@  static struct sched_group *find_busiest_group(struct lb_env *env)
 		goto force_balance;
 
 	if (busiest->group_type != group_overloaded) {
-		if (env->idle == CPU_NOT_IDLE)
+		if (env->idle == CPU_NOT_IDLE) {
 			/*
 			 * If the busiest group is not overloaded (and as a
 			 * result the local one too) but this CPU is already
 			 * busy, let another idle CPU try to pull task.
 			 */
 			goto out_balanced;
+		}
+
+		if (busiest->group_type == group_smt_balance &&
+		    smt_vs_nonsmt_groups(sds.local, sds.busiest)) {
+			/* Let non SMT CPU pull from SMT CPU sharing with sibling */
+			goto force_balance;
+		}
 
 		if (busiest->group_weight > 1 &&
-		    local->idle_cpus <= (busiest->idle_cpus + 1))
+		    local->idle_cpus <= (busiest->idle_cpus + 1)) {
 			/*
 			 * If the busiest group is not overloaded
 			 * and there is no imbalance between this and busiest
@@ -10383,12 +10455,14 @@  static struct sched_group *find_busiest_group(struct lb_env *env)
 			 * there is more than 1 CPU per group.
 			 */
 			goto out_balanced;
+		}
 
-		if (busiest->sum_h_nr_running == 1)
+		if (busiest->sum_h_nr_running == 1) {
 			/*
 			 * busiest doesn't have any tasks waiting to run
 			 */
 			goto out_balanced;
+		}
 	}
 
 force_balance: