[RFC,1/6] sched/uclamp: Track uclamped util_avg in sched_avg

Message ID 5564fc23d5e6425d069c36b4cef48edbe77fe64d.1696345700.git.Hongyan.Xia2@arm.com
State New
Series: sched: uclamp sum aggregation

Commit Message

Hongyan Xia Oct. 4, 2023, 9:04 a.m. UTC
  From: Hongyan Xia <hongyan.xia2@arm.com>

Track a uclamped version of util_avg in sched_avg, which clamps util_avg
within [uclamp[UCLAMP_MIN], uclamp[UCLAMP_MAX]] every time util_avg is
updated. At the CFS rq level, cfs_rq->avg.util_avg_uclamp must always be
the sum of all util_avg_uclamp of entities on this cfs_rq. So, each
time the util_avg_uclamp of an entity gets updated, we also track the
delta and update the cfs_rq.
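
For illustration, the per-entity rule and the cfs_rq invariant work out
as follows (numbers invented here, not taken from the patch):

  task util_avg = 300, effective uclamp = [512, 1024]
    -> se util_avg_uclamp      = clamp(300, 512, 1024) = 512
    -> cfs_rq util_avg_uclamp += 512 at enqueue

  later PELT update: util_avg 300 -> 600
    -> se util_avg_uclamp      = clamp(600, 512, 1024) = 600
    -> cfs_rq util_avg_uclamp += (600 - 512) = +88

so cfs_rq->avg.util_avg_uclamp stays the sum of the util_avg_uclamp of
all entities enqueued on it.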

We can't put the update of cfs_rq util_avg_uclamp separately in
propagate_entity_load_avg(), because util_avg_uclamp of se and cfs_rq
are not tracked separately, unlike util_avg. As a result,
util_avg_uclamp of the se and the cfs_rq the se is on must be updated
at the same time.

Signed-off-by: Hongyan Xia <hongyan.xia2@arm.com>
---
 include/linux/sched.h |  9 ++++++++-
 kernel/sched/fair.c   | 39 +++++++++++++++++++++++++++++++++++++++
 kernel/sched/pelt.c   | 43 +++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h  | 37 +++++++++++++++++++++++++++++++++++++
 4 files changed, 127 insertions(+), 1 deletion(-)
  

Comments

Dietmar Eggemann Oct. 31, 2023, 3:52 p.m. UTC | #1
On 04/10/2023 11:04, Hongyan Xia wrote:
> From: Hongyan Xia <hongyan.xia2@arm.com>

[...]

> @@ -6445,6 +6450,21 @@ static int sched_idle_cpu(int cpu)
>  }
>  #endif
>  
> +void ___update_util_avg_uclamp(struct sched_avg *avg, struct sched_entity *se);

IMHO, `struct sched_avg *avg` can only be the one of a cfs_rq. So
passing a cfs_rq would eliminate the question whether this can be from
another se.
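
Something like this (just a sketch of the narrower interface, untested):

void ___update_util_avg_uclamp(struct cfs_rq *cfs_rq, struct sched_entity *se);

static void update_se_chain(struct task_struct *p)
{
#ifdef CONFIG_UCLAMP_TASK
	struct sched_entity *se = &p->se;

	for_each_sched_entity(se)
		___update_util_avg_uclamp(cfs_rq_of(se), se);
#endif
}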

> +static void update_se_chain(struct task_struct *p)
> +{
> +#ifdef CONFIG_UCLAMP_TASK
> +	struct sched_entity *se = &p->se;
> +	struct rq *rq = task_rq(p);
> +
> +	for_each_sched_entity(se) {
> +		struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +
> +		___update_util_avg_uclamp(&cfs_rq->avg, se);
> +	}
> +#endif
> +}
>  /*
>   * The enqueue_task method is called before nr_running is
>   * increased. Here we update the fair scheduling stats and
> @@ -6511,6 +6531,16 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  			goto enqueue_throttle;
>  	}
>  
> +	/*
> +	 * Re-evaluate the se hierarchy now that on_rq is true. This is
> +	 * important to enforce uclamp the moment a task with uclamp is
> +	 * enqueued, rather than waiting a timer tick for uclamp to kick in.
> +	 *
> +	 * XXX: This duplicates some of the work already done in the above for
> +	 * loops.
> +	 */
> +	update_se_chain(p);

I try to figure out why this is necessary here:

enqueue_task_fair()
  for_each_sched_entity()
    enqueue_entity()
      update_load_avg()
        __update_load_avg_se()
          ___update_util_avg_uclamp()        <-- if se->on_rq,
                                                 update p (se) if we
                                                 cross PELT period (1)
                                                 boundaries
          ___decay_util_avg_uclamp_towards() <-- decay p (se)      (2)

      enqueue_util_avg_uclamp()          <-- enqueue p into cfs_rq (3)

      se->on_rq = 1                          <-- set p (se) on_rq  (4)

  for_each_sched_entity()
    update_load_avg()                        <-- update all on_rq se's
                                                 in the hierarchy  (5)

  update_se_chain()                          <-- update p (se) and its
                                                 se hierarchy      (6)

(1) Skip p since it is !se->on_rq.

(2) Decay p->se->avg.util_avg_uclamp to 0 since it was sleeping.

(3) Attach p to its cfs_rq

...

(6) Update all se's and cfs_rq's involved in p's task_group
    hierarchy (including propagation from se (level=x+1) to cfs_rq
    (level=x))

Question for me is why can't you integrate the util_avg_uclamp signals
for se's and cfs_rq's/rq's much closer into existing PELT functions?

[...]
  
Hongyan Xia Nov. 9, 2023, 4:05 p.m. UTC | #2
On 31/10/2023 15:52, Dietmar Eggemann wrote:
> On 04/10/2023 11:04, Hongyan Xia wrote:
>> From: Hongyan Xia <hongyan.xia2@arm.com>
> 
> [...]
> 
>> @@ -6445,6 +6450,21 @@ static int sched_idle_cpu(int cpu)
>>   }
>>   #endif
>>   
>> +void ___update_util_avg_uclamp(struct sched_avg *avg, struct sched_entity *se);
> 
> IMHO, `struct sched_avg *avg` can only be the one of a cfs_rq. So
> passing a cfs_rq would eliminate the question whether this can be from
> another se.

At the moment, yes, the avg can only come from cfs_rq. The reason why I 
kept sched_avg is that once we have sum aggregation for RT tasks, it's 
very likely we will end up calling this function on rt_rq->avg, so 
having just sched_avg here will work for RT in the future.

>> +static void update_se_chain(struct task_struct *p)
>> +{
>> +#ifdef CONFIG_UCLAMP_TASK
>> +	struct sched_entity *se = &p->se;
>> +	struct rq *rq = task_rq(p);
>> +
>> +	for_each_sched_entity(se) {
>> +		struct cfs_rq *cfs_rq = cfs_rq_of(se);
>> +
>> +		___update_util_avg_uclamp(&cfs_rq->avg, se);
>> +	}
>> +#endif
>> +}
>>   /*
>>    * The enqueue_task method is called before nr_running is
>>    * increased. Here we update the fair scheduling stats and
>> @@ -6511,6 +6531,16 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>   			goto enqueue_throttle;
>>   	}
>>   
>> +	/*
>> +	 * Re-evaluate the se hierarchy now that on_rq is true. This is
>> +	 * important to enforce uclamp the moment a task with uclamp is
>> +	 * enqueued, rather than waiting a timer tick for uclamp to kick in.
>> +	 *
>> +	 * XXX: This duplicates some of the work already done in the above for
>> +	 * loops.
>> +	 */
>> +	update_se_chain(p);
> 
> I try to figure out why this is necessary here:
> 
> enqueue_task_fair()
>    for_each_sched_entity()
>      enqueue_entity()
>        update_load_avg()
>          __update_load_avg_se()
>            ___update_util_avg_uclamp()        <-- if se->on_rq,
>                                                   update p (se) if we
>                                                   cross PELT period (1)
>                                                   boundaries
>            ___decay_util_avg_uclamp_towards() <-- decay p (se)      (2)
> 
>        enqueue_util_avg_uclamp()          <-- enqueue p into cfs_rq (3)
> 
>        se->on_rq = 1                          <-- set p (se) on_rq  (4)
> 
>    for_each_sched_entity()
>      update_load_avg()                        <-- update all on_rq se's
>                                                   in the hierarchy  (5)
> 
>    update_se_chain()                          <-- update p (se) and its
>                                                   se hierarchy      (6)
> 
> (1) Skip p since it is !se->on_rq.
> 
> (2) Decay p->se->avg.util_avg_uclamp to 0 since it was sleeping.
> 
> (3) Attach p to its cfs_rq
> 
> ...
> 
> (6) Update all se's and cfs_rq's involved in p's task_group
>      hierarchy (including propagation from se (level=x+1) to cfs_rq
>      (level=x))
> 
> Question for me is why can't you integrate the util_avg_uclamp signals
> for se's and cfs_rq's/rq's much closer into existing PELT functions?

So the problem is that when we enqueue the task (say UCLAMP_MIN of 200), 
at that exact moment se->on_rq is false, so we only decay and not 
enforce UCLAMP_MIN. Further up in the hierarchy we do update_load_avg(), 
but the se of the task has already been processed so UCLAMP_MIN has not 
taken any effect. To make sure UCLAMP_MIN is immediately effective, I 
just re-evaluate the whole hierarchy from bottom to top.

I probably didn't quite catch what you said here. Could you elaborate a 
bit on what 'much closer' means?

Hongyan
  
Dietmar Eggemann Nov. 14, 2023, 12:59 p.m. UTC | #3
On 09/11/2023 17:05, Hongyan Xia wrote:
> On 31/10/2023 15:52, Dietmar Eggemann wrote:
>> On 04/10/2023 11:04, Hongyan Xia wrote:
>>> From: Hongyan Xia <hongyan.xia2@arm.com>
>>
>> [...]
>>
>>> @@ -6445,6 +6450,21 @@ static int sched_idle_cpu(int cpu)
>>>   }
>>>   #endif
>>>   +void ___update_util_avg_uclamp(struct sched_avg *avg, struct
>>> sched_entity *se);
>>
>> IMHO, `struct sched_avg *avg` can only be the one of a cfs_rq. So
>> passing a cfs_rq would eliminate the question whether this can be from
>> another se.
> 
> At the moment, yes, the avg can only come from cfs_rq. The reason why I
> kept sched_avg is that once we have sum aggregation for RT tasks, it's
> very likely we will end up calling this function on rt_rq->avg, so
> having just sched_avg here will work for RT in the future.

Ah, OK. IMHO it would be better to use cfs_rq for now and widen this
interface once RT is covered.

[...]

>> Question for me is why can't you integrate the util_avg_uclamp signals
>> for se's and cfs_rq's/rq's much closer into existing PELT functions?
> 
> So the problem is that when we enqueue the task (say UCLAMP_MIN of 200),
> at that exact moment se->on_rq is false, so we only decay and not
> enforce UCLAMP_MIN. Further up in the hierarchy we do update_load_avg(),
> but the se of the task has already been processed so UCLAMP_MIN has not
> taken any effect. To make sure UCLAMP_MIN is immediately effective, I
> just re-evaluate the whole hierarchy from bottom to top.
> 
> I probably didn't quite catch what you said here. Could you elaborate a
> bit on what 'much closer' means?

I see. But can we not use DO_ATTACH (and DO_DETACH) for this?

(flags & DO_ATTACH) characterizes the enqueuing of the task. So with
DO_ATTACH (and so !se->on_rq (and cfs_rq->curr != se)) we could (a)
decay the task and (b) add it to the cfs_rq in enqueue_entity() ->
update_load_avg(..., | DO_ATTACH).

Like we do for PELT and a wakeup migration enqueuing (a)

update_load_avg()

  __update_load_avg_se()

    if (___update_load_sum(..., cfs_rq->curr == se)
                                ^^^^^^^^^^^^^^^^^^
                                'int running' for utilization

      ___update_load_avg()

  if (!se->avg.last_update_time && (flags & DO_ATTACH))
      ^^^^^^^^^^^^^^^^^^^^^^^^^
      wakeup migration

    attach_entity_load_avg()

Notice, for PELT we attach/detach to/from the cfs_rq which gives us
blocked contribution. For util_avg_uclamp we do enqueue/dequeue, so
only runnable contribution.
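
I.e. roughly something like this on the enqueue side (a sketch of the
idea only, reusing the helpers from this series; the clamping of the
task would have to happen at this point as well):

enqueue_entity()

  update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH)

    __update_load_avg_se()                <-- decay p's util_avg_uclamp
                                              (!se->on_rq, cfs_rq->curr != se)

    if (flags & DO_ATTACH)
      enqueue_util_avg_uclamp(cfs_rq, se) <-- clamp p and add its contribution
                                              to the cfs_rq

which would make the extra update_se_chain() walk in enqueue_task_fair()
unnecessary.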
  
Vincent Guittot Dec. 4, 2023, 4:07 p.m. UTC | #4
On Wed, 4 Oct 2023 at 11:05, Hongyan Xia <Hongyan.Xia2@arm.com> wrote:
>
> From: Hongyan Xia <hongyan.xia2@arm.com>
>
> Track a uclamped version of util_avg in sched_avg, which clamps util_avg
> within [uclamp[UCLAMP_MIN], uclamp[UCLAMP_MAX]] every time util_avg is
> updated. At the CFS rq level, cfs_rq->avg.util_avg_uclamp must always be
> the sum of all util_avg_uclamp of entities on this cfs_rq. So, each
> time the util_avg_uclamp of an entity gets updated, we also track the
> delta and update the cfs_rq.

No please don't do that. Don't start to mix several different signals
in one. Typically uclamp_min doesn't say you want to use at least this
amount of cpu capacity.

With tasks with a util_avg of 10 but a uclamp min of 1024 what does it
mean when you start to sum this value?




>
> We can't put the update of cfs_rq util_avg_uclamp separately in
> propagate_entity_load_avg(), because util_avg_uclamp of se and cfs_rq
> are not tracked separately, unlike util_avg. As a result,
> util_avg_uclamp of the se and the cfs_rq the se is on must be updated
> at the same time.
>
> Signed-off-by: Hongyan Xia <hongyan.xia2@arm.com>
> ---
>  include/linux/sched.h |  9 ++++++++-
>  kernel/sched/fair.c   | 39 +++++++++++++++++++++++++++++++++++++++
>  kernel/sched/pelt.c   | 43 +++++++++++++++++++++++++++++++++++++++++++
>  kernel/sched/sched.h  | 37 +++++++++++++++++++++++++++++++++++++
>  4 files changed, 127 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 177b3f3676ef..825d7b86b006 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -499,7 +499,14 @@ struct sched_avg {
>         u32                             period_contrib;
>         unsigned long                   load_avg;
>         unsigned long                   runnable_avg;
> -       unsigned long                   util_avg;
> +       unsigned int                    util_avg;
> +#ifdef CONFIG_UCLAMP_TASK
> +       /*
> +        * XXX: util_avg shrunk to accommodate util_avg_uclamp.
> +        * What are the consequences?
> +        */
> +       unsigned int                    util_avg_uclamp;
> +#endif
>         struct util_est                 util_est;
>  } ____cacheline_aligned;
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 0b7445cd5af9..33e5a6e751c0 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1077,6 +1077,9 @@ void post_init_entity_util_avg(struct task_struct *p)
>         }
>
>         sa->runnable_avg = sa->util_avg;
> +#ifdef CONFIG_UCLAMP_TASK
> +       sa->util_avg_uclamp = sa->util_avg;
> +#endif
>  }
>
>  #else /* !CONFIG_SMP */
> @@ -5068,6 +5071,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>         update_stats_enqueue_fair(cfs_rq, se, flags);
>         if (!curr)
>                 __enqueue_entity(cfs_rq, se);
> +       enqueue_util_avg_uclamp(cfs_rq, se);
>         se->on_rq = 1;
>
>         if (cfs_rq->nr_running == 1) {
> @@ -5138,6 +5142,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>         update_entity_lag(cfs_rq, se);
>         if (se != cfs_rq->curr)
>                 __dequeue_entity(cfs_rq, se);
> +       dequeue_util_avg_uclamp(cfs_rq, se);
>         se->on_rq = 0;
>         account_entity_dequeue(cfs_rq, se);
>
> @@ -6445,6 +6450,21 @@ static int sched_idle_cpu(int cpu)
>  }
>  #endif
>
> +void ___update_util_avg_uclamp(struct sched_avg *avg, struct sched_entity *se);
> +
> +static void update_se_chain(struct task_struct *p)
> +{
> +#ifdef CONFIG_UCLAMP_TASK
> +       struct sched_entity *se = &p->se;
> +       struct rq *rq = task_rq(p);
> +
> +       for_each_sched_entity(se) {
> +               struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +
> +               ___update_util_avg_uclamp(&cfs_rq->avg, se);
> +       }
> +#endif
> +}
>  /*
>   * The enqueue_task method is called before nr_running is
>   * increased. Here we update the fair scheduling stats and
> @@ -6511,6 +6531,16 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>                         goto enqueue_throttle;
>         }
>
> +       /*
> +        * Re-evaluate the se hierarchy now that on_rq is true. This is
> +        * important to enforce uclamp the moment a task with uclamp is
> +        * enqueued, rather than waiting a timer tick for uclamp to kick in.
> +        *
> +        * XXX: This duplicates some of the work already done in the above for
> +        * loops.
> +        */
> +       update_se_chain(p);
> +
>         /* At this point se is NULL and we are at root level*/
>         add_nr_running(rq, 1);
>
> @@ -6612,6 +6642,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  dequeue_throttle:
>         util_est_update(&rq->cfs, p, task_sleep);
>         hrtick_update(rq);
> +
> +#ifdef CONFIG_UCLAMP_TASK
> +       if (rq->cfs.h_nr_running == 0) {
> +               WARN_ONCE(rq->cfs.avg.util_avg_uclamp,
> +                       "0 tasks on CFS of CPU %d, but util_avg_uclamp is %u\n",
> +                       rq->cpu, rq->cfs.avg.util_avg_uclamp);
> +               WRITE_ONCE(rq->cfs.avg.util_avg_uclamp, 0);
> +       }
> +#endif
>  }
>
>  #ifdef CONFIG_SMP
> diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> index 0f310768260c..c656e4dcb1d1 100644
> --- a/kernel/sched/pelt.c
> +++ b/kernel/sched/pelt.c
> @@ -266,6 +266,48 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
>         WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
>  }
>
> +#ifdef CONFIG_UCLAMP_TASK
> +/* avg must belong to the queue this se is on. */
> +void ___update_util_avg_uclamp(struct sched_avg *avg, struct sched_entity *se)
> +{
> +       unsigned int util;
> +       int delta;
> +
> +       if (entity_is_task(se)) {
> +               unsigned int uclamp_min, uclamp_max;
> +
> +               if (!se->on_rq)
> +                       return;
> +
> +               util = READ_ONCE(se->avg.util_avg);
> +               uclamp_min = uclamp_eff_value(task_of(se), UCLAMP_MIN);
> +               uclamp_max = uclamp_eff_value(task_of(se), UCLAMP_MAX);
> +               util = clamp(util, uclamp_min, uclamp_max);
> +       } else {
> +               util = READ_ONCE(group_cfs_rq(se)->avg.util_avg_uclamp);
> +
> +               if (!se->on_rq) {
> +                       WRITE_ONCE(se->avg.util_avg_uclamp, util);
> +                       return;
> +               }
> +       }
> +
> +       delta = util - READ_ONCE(se->avg.util_avg_uclamp);
> +       if (delta == 0)
> +               return;
> +
> +       WRITE_ONCE(se->avg.util_avg_uclamp, util);
> +       util = READ_ONCE(avg->util_avg_uclamp);
> +       util += delta;
> +       WRITE_ONCE(avg->util_avg_uclamp, util);
> +}
> +#else /* !CONFIG_UCLAMP_TASK */
> +static void
> +___update_util_avg_uclamp(struct sched_avg *avg, struct sched_entity *se)
> +{
> +}
> +#endif
> +
>  /*
>   * sched_entity:
>   *
> @@ -309,6 +351,7 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
>                                 cfs_rq->curr == se)) {
>
>                 ___update_load_avg(&se->avg, se_weight(se));
> +               ___update_util_avg_uclamp(&cfs_rq->avg, se);
>                 cfs_se_util_change(&se->avg);
>                 trace_pelt_se_tp(se);
>                 return 1;
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 3a01b7a2bf66..2eefcdb0c3b0 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3133,6 +3133,33 @@ static inline bool uclamp_is_used(void)
>  {
>         return static_branch_likely(&sched_uclamp_used);
>  }
> +
> +static inline void enqueue_util_avg_uclamp(struct cfs_rq *cfs_rq,
> +                                          struct sched_entity *se)
> +{
> +       unsigned int cfs_val = READ_ONCE(cfs_rq->avg.util_avg_uclamp);
> +       unsigned int se_val = READ_ONCE(se->avg.util_avg_uclamp);
> +
> +       WRITE_ONCE(cfs_rq->avg.util_avg_uclamp, cfs_val + se_val);
> +}
> +
> +static inline void dequeue_util_avg_uclamp(struct cfs_rq *cfs_rq,
> +                                          struct sched_entity *se)
> +{
> +       unsigned int cfs_val = READ_ONCE(cfs_rq->avg.util_avg_uclamp);
> +       unsigned int se_val = READ_ONCE(se->avg.util_avg_uclamp), new_val;
> +
> +       if (cfs_val > se_val)
> +               new_val = cfs_val - se_val;
> +       else {
> +               WARN_ONCE(cfs_val < se_val,
> +                       "CPU %d. cfs_rq %p, cfs_val %u is even less than se_val %u before subtraction\n",
> +                       rq_of(cfs_rq)->cpu, cfs_rq, cfs_val, se_val);
> +               new_val = 0;
> +       }
> +
> +       WRITE_ONCE(cfs_rq->avg.util_avg_uclamp, new_val);
> +}
>  #else /* CONFIG_UCLAMP_TASK */
>  static inline unsigned long uclamp_eff_value(struct task_struct *p,
>                                              enum uclamp_id clamp_id)
> @@ -3175,6 +3202,16 @@ static inline bool uclamp_rq_is_idle(struct rq *rq)
>  {
>         return false;
>  }
> +
> +static inline void enqueue_util_avg_uclamp(struct cfs_rq *cfs_rq,
> +                                          struct sched_entity *se)
> +{
> +}
> +
> +static inline void dequeue_util_avg_uclamp(struct cfs_rq *cfs_rq,
> +                                          struct sched_entity *se)
> +{
> +}
>  #endif /* CONFIG_UCLAMP_TASK */
>
>  #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
> --
> 2.34.1
>
  
Hongyan Xia Dec. 5, 2023, 2:24 p.m. UTC | #5
On 04/12/2023 16:07, Vincent Guittot wrote:
> On Wed, 4 Oct 2023 at 11:05, Hongyan Xia <Hongyan.Xia2@arm.com> wrote:
>>
>> From: Hongyan Xia <hongyan.xia2@arm.com>
>>
>> Track a uclamped version of util_avg in sched_avg, which clamps util_avg
>> within [uclamp[UCLAMP_MIN], uclamp[UCLAMP_MAX]] every time util_avg is
>> updated. At the CFS rq level, cfs_rq->avg.util_avg_uclamp must always be
>> the sum of all util_avg_uclamp of entities on this cfs_rq. So, each
>> time the util_avg_uclamp of an entity gets updated, we also track the
>> delta and update the cfs_rq.
> 
> No please don't do that. Don't start to mix several different signals
> in one. Typically uclamp_min doen't say you want to use at least this
> amount of cpu capacity.

But I'd say with uclamp, PELT is already broken and is a mixed signal 
anyway. When uclamp is used, a util_avg value X doesn't mean X, it means 
X under some rq uclamp, and the worst part is that the rq uclamp may or 
may not have anything to do with this task's uclamp.

Pretending X is a true PELT value now (and not a mixed value) is why we 
have so many problems today. For example in the frequency spike problem, 
if a task A has no idle time under uclamp_max, its PELT does not reflect 
reality. The moment another task B comes in and uncaps the rq uclamp_max, 
the current scheduler code thinks the 1024 of A means a real 1024, which 
is wrong and is exactly why we see a spike when B joins. It's also why 
we need to special case 0 spare capacity with recent patches, because rq 
util_avg has lost its original PELT meaning under uclamp.

Because X is not the true PELT, we always have to do something to bring 
it back into reality. What the current max aggregation code does is to 
introduce corner cases, like treating 0 spare capacity as potential 
opportunity to queue more tasks (which creates further problems in my 
tests), and maybe introducing uclamp load balancing special cases in the 
future, or introducing uclamp filtering to delay the effect of wrong 
PELT values.

What this series does is not much different. We keep the somewhat wrong 
value X, but we also remember under what uclamp values we got that X 
to bring things back into reality, which means now we have [X, 
uclamp_min when X happens, uclamp_max when X happens]. To save space, 
this becomes [X, clamped X], which is what this series does. The 
original PELT value X is kept, but we use the clamped X in several 
places to improve our decisions.
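
To make that concrete (invented numbers, just to illustrate the pair):

  util_avg = 10,  uclamp_min = 1024  ->  [X, clamped X] = [10, 1024]
  util_avg = 700, uclamp_max = 512   ->  [X, clamped X] = [700, 512]
  util_avg = 300, no clamps          ->  [X, clamped X] = [300, 300]

The original X is still there for anything that wants pure PELT; the
clamped X is what gets summed at the cfs_rq level.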

> 
> With tasks with a util_avg of 10 but a uclamp min of 1024 what does it
> mean when you start  to sum this value ?

Like I replied in another comment, assuming a uclamp_min of 1024 is a 
hint to run the task on the big CPUs, I don't think it's right to 
directly use uclamp as a CPU placement indicator. A uclamp value may 
come from ADPF from userspace. An ADPF uclamp_min value of little CPU 
capacity + 1 certainly doesn't mean a game on purpose wants to avoid the 
little core. It simply means it wants at least this much performance, 
and whether this results in placing the game thread on a big CPU is 
purely the job of EAS (or CAS, etc.). We want to use little CPU + 1 as 
uclamp_min because we know the SoC and the little CPU is bad, but uclamp 
should be generic and should not rely on knowing the SoC.

Basically, under sum aggregation one would not use a uclamp_min value of 
1024 to place a small task on a big core. A uclamp_min of 1024 under sum 
aggregation has the meaning in ADPF, which is a hint to try to run me as 
fast as possible.

> 
> 
>> [...]
  
Vincent Guittot Dec. 5, 2023, 4:22 p.m. UTC | #6
On Tue, 5 Dec 2023 at 15:24, Hongyan Xia <hongyan.xia2@arm.com> wrote:
>
> On 04/12/2023 16:07, Vincent Guittot wrote:
> > On Wed, 4 Oct 2023 at 11:05, Hongyan Xia <Hongyan.Xia2@arm.com> wrote:
> >>
> >> From: Hongyan Xia <hongyan.xia2@arm.com>
> >>
> >> Track a uclamped version of util_avg in sched_avg, which clamps util_avg
> >> within [uclamp[UCLAMP_MIN], uclamp[UCLAMP_MAX]] every time util_avg is
> >> updated. At the CFS rq level, cfs_rq->avg.util_avg_uclamp must always be
> >> the sum of all util_avg_uclamp of entities on this cfs_rq. So, each
> >> time the util_avg_uclamp of an entity gets updated, we also track the
> >> delta and update the cfs_rq.
> >
> > No please don't do that. Don't start to mix several different signals
> > in one. Typically uclamp_min doesn't say you want to use at least this
> > amount of cpu capacity.
>
> But I'd say with uclamp, PELT is already broken and is a mixed signal

PELT has nothing to do with uclamp. You could argue that EAS is making
a wrong use or mix of PELT signals and uclamp hints to select a CPU,
but PELT itself is not impacted by uclamp and should stay out of
uclamp policy.

> anyway. When uclamp is used, a util_avg value X doesn't mean X, it means
> X under some rq uclamp, and the worst part is that the rq uclamp may or
> may not have anything to do with this task's uclamp.

I think that you are mixing PELT and how it is(was) used by EAS.

Have you looked at the latest tip/sched/core and the changes in
effective_cpu_util(int cpu, unsigned long util_cfs, unsigned long
*min, unsigned long *max)?

We return 3 values:
- the actual utilization which is not impacted by uclamp
- a targeted min value which takes into account uclamp_min
- a targeted max value which takes into account uclamp_max

https://lore.kernel.org/lkml/20231122133904.446032-1-vincent.guittot@linaro.org/
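
So the consumer side looks roughly like this (a sketch based on the
signature above, not a verbatim copy of the tip code):

	unsigned long util, min, max;

	util = effective_cpu_util(cpu, cpu_util_cfs(cpu), &min, &max);

	/*
	 * util is the raw, unclamped utilization; min/max carry the
	 * uclamp (and other) constraints. Only the consumer of the
	 * value, e.g. frequency selection, applies them:
	 */
	util = clamp(util, min, max);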

>
> Pretending X is a true PELT value now (and not a mixed value) is why we
> have so many problems today. For example in the frequency spike problem,
> if a task A has no idle time under uclamp_max, its PELT does not reflect
> reality. The moment another task B comes in and uncaps the rq uclamp_max,

you are mixing 2 things. The PELT signal of the task is correct.

> the current scheduler code thinks the 1024 of A means a real 1024, which
> is wrong and is exactly why we see a spike when B joins. It's also why

This is the true actual utilization: the actual util_avg of A really
is 1024. But users want to give a hint to lower its needs with
uclamp_max.

> we need to special case 0 spare capacity with recent patches, because rq
> util_avg has lost its original PELT meaning under uclamp.
>
> Because X is not the true PELT, we always have to do something to bring
> it back into reality. What the current max aggregation code does is to
> introduce corner cases, like treating 0 spare capacity as potential
> opportunity to queue more tasks (which creates further problems in my
> tests), and maybe introducing uclamp load balancing special cases in the
> future, or introducing uclamp filtering to delay the effect of wrong
> PELT values.
>
> What this series does is not much different. We keep the somewhat wrong
> value X, but we also remember under what uclamp values we got that X
> to bring things back into reality, which means now we have [X,
> uclamp_min when X happens, uclamp_max when X happens]. To save space,
> this becomes [X, clamped X], which is what this series does. The
> original PELT value X is kept, but we use the clamped X in several
> places to improve our decisions.
>
> >
> > With tasks with a util_avg of 10 but a uclamp min of 1024 what does it
> > mean when you start  to sum this value ?
>
> Like I replied in another comment, assuming a uclamp_min of 1024 is a
> hint to run the task on the big CPUs, I don't think it's right to

not especially to run on a big CPU, but to say that the task needs more
performance than its actual utilization suggests

> directly use uclamp as a CPU placement indicator. A uclamp value may
> come from ADPF from userspace. An ADPF uclamp_min value of little CPU
> capacity + 1 certainly doesn't mean a game on purpose wants to avoid the
> little core. It simply means it wants at least this much performance,
> and whether this results in placing the game thread on a big CPU is
> purely the job of EAS (or CAS, etc.). We want to use little CPU + 1 as
> uclamp_min because we know the SoC and the little CPU is bad, but uclamp
> should be generic and should not rely on knowing the SoC.
>
> Basically, under sum aggregation one would not use a uclamp_min value of
> 1024 to place a small task on a big core. A uclamp_min of 1024 under sum
> aggregation has the meaning in ADPF, which is a hint to try to run me as
> fast as possible.
>
> >
> >
> >> [...]
  

Patch

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 177b3f3676ef..825d7b86b006 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -499,7 +499,14 @@  struct sched_avg {
 	u32				period_contrib;
 	unsigned long			load_avg;
 	unsigned long			runnable_avg;
-	unsigned long			util_avg;
+	unsigned int			util_avg;
+#ifdef CONFIG_UCLAMP_TASK
+	/*
+	 * XXX: util_avg shrunk to accommodate util_avg_uclamp.
+	 * What are the consequences?
+	 */
+	unsigned int			util_avg_uclamp;
+#endif
 	struct util_est			util_est;
 } ____cacheline_aligned;
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0b7445cd5af9..33e5a6e751c0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1077,6 +1077,9 @@  void post_init_entity_util_avg(struct task_struct *p)
 	}
 
 	sa->runnable_avg = sa->util_avg;
+#ifdef CONFIG_UCLAMP_TASK
+	sa->util_avg_uclamp = sa->util_avg;
+#endif
 }
 
 #else /* !CONFIG_SMP */
@@ -5068,6 +5071,7 @@  enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	update_stats_enqueue_fair(cfs_rq, se, flags);
 	if (!curr)
 		__enqueue_entity(cfs_rq, se);
+	enqueue_util_avg_uclamp(cfs_rq, se);
 	se->on_rq = 1;
 
 	if (cfs_rq->nr_running == 1) {
@@ -5138,6 +5142,7 @@  dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	update_entity_lag(cfs_rq, se);
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
+	dequeue_util_avg_uclamp(cfs_rq, se);
 	se->on_rq = 0;
 	account_entity_dequeue(cfs_rq, se);
 
@@ -6445,6 +6450,21 @@  static int sched_idle_cpu(int cpu)
 }
 #endif
 
+void ___update_util_avg_uclamp(struct sched_avg *avg, struct sched_entity *se);
+
+static void update_se_chain(struct task_struct *p)
+{
+#ifdef CONFIG_UCLAMP_TASK
+	struct sched_entity *se = &p->se;
+	struct rq *rq = task_rq(p);
+
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		___update_util_avg_uclamp(&cfs_rq->avg, se);
+	}
+#endif
+}
 /*
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
@@ -6511,6 +6531,16 @@  enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			goto enqueue_throttle;
 	}
 
+	/*
+	 * Re-evaluate the se hierarchy now that on_rq is true. This is
+	 * important to enforce uclamp the moment a task with uclamp is
+	 * enqueued, rather than waiting a timer tick for uclamp to kick in.
+	 *
+	 * XXX: This duplicates some of the work already done in the above for
+	 * loops.
+	 */
+	update_se_chain(p);
+
 	/* At this point se is NULL and we are at root level*/
 	add_nr_running(rq, 1);
 
@@ -6612,6 +6642,15 @@  static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 dequeue_throttle:
 	util_est_update(&rq->cfs, p, task_sleep);
 	hrtick_update(rq);
+
+#ifdef CONFIG_UCLAMP_TASK
+	if (rq->cfs.h_nr_running == 0) {
+		WARN_ONCE(rq->cfs.avg.util_avg_uclamp,
+			"0 tasks on CFS of CPU %d, but util_avg_uclamp is %u\n",
+			rq->cpu, rq->cfs.avg.util_avg_uclamp);
+		WRITE_ONCE(rq->cfs.avg.util_avg_uclamp, 0);
+	}
+#endif
 }
 
 #ifdef CONFIG_SMP
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 0f310768260c..c656e4dcb1d1 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -266,6 +266,48 @@  ___update_load_avg(struct sched_avg *sa, unsigned long load)
 	WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
 }
 
+#ifdef CONFIG_UCLAMP_TASK
+/* avg must belong to the queue this se is on. */
+void ___update_util_avg_uclamp(struct sched_avg *avg, struct sched_entity *se)
+{
+	unsigned int util;
+	int delta;
+
+	if (entity_is_task(se)) {
+		unsigned int uclamp_min, uclamp_max;
+
+		if (!se->on_rq)
+			return;
+
+		util = READ_ONCE(se->avg.util_avg);
+		uclamp_min = uclamp_eff_value(task_of(se), UCLAMP_MIN);
+		uclamp_max = uclamp_eff_value(task_of(se), UCLAMP_MAX);
+		util = clamp(util, uclamp_min, uclamp_max);
+	} else {
+		util = READ_ONCE(group_cfs_rq(se)->avg.util_avg_uclamp);
+
+		if (!se->on_rq) {
+			WRITE_ONCE(se->avg.util_avg_uclamp, util);
+			return;
+		}
+	}
+
+	delta = util - READ_ONCE(se->avg.util_avg_uclamp);
+	if (delta == 0)
+		return;
+
+	WRITE_ONCE(se->avg.util_avg_uclamp, util);
+	util = READ_ONCE(avg->util_avg_uclamp);
+	util += delta;
+	WRITE_ONCE(avg->util_avg_uclamp, util);
+}
+#else /* !CONFIG_UCLAMP_TASK */
+static void
+___update_util_avg_uclamp(struct sched_avg *avg, struct sched_entity *se)
+{
+}
+#endif
+
 /*
  * sched_entity:
  *
@@ -309,6 +351,7 @@  int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
 				cfs_rq->curr == se)) {
 
 		___update_load_avg(&se->avg, se_weight(se));
+		___update_util_avg_uclamp(&cfs_rq->avg, se);
 		cfs_se_util_change(&se->avg);
 		trace_pelt_se_tp(se);
 		return 1;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3a01b7a2bf66..2eefcdb0c3b0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3133,6 +3133,33 @@  static inline bool uclamp_is_used(void)
 {
 	return static_branch_likely(&sched_uclamp_used);
 }
+
+static inline void enqueue_util_avg_uclamp(struct cfs_rq *cfs_rq,
+					   struct sched_entity *se)
+{
+	unsigned int cfs_val = READ_ONCE(cfs_rq->avg.util_avg_uclamp);
+	unsigned int se_val = READ_ONCE(se->avg.util_avg_uclamp);
+
+	WRITE_ONCE(cfs_rq->avg.util_avg_uclamp, cfs_val + se_val);
+}
+
+static inline void dequeue_util_avg_uclamp(struct cfs_rq *cfs_rq,
+					   struct sched_entity *se)
+{
+	unsigned int cfs_val = READ_ONCE(cfs_rq->avg.util_avg_uclamp);
+	unsigned int se_val = READ_ONCE(se->avg.util_avg_uclamp), new_val;
+
+	if (cfs_val > se_val)
+		new_val = cfs_val - se_val;
+	else {
+		WARN_ONCE(cfs_val < se_val,
+			"CPU %d. cfs_rq %p, cfs_val %u is even less than se_val %u before subtraction\n",
+			rq_of(cfs_rq)->cpu, cfs_rq, cfs_val, se_val);
+		new_val = 0;
+	}
+
+	WRITE_ONCE(cfs_rq->avg.util_avg_uclamp, new_val);
+}
 #else /* CONFIG_UCLAMP_TASK */
 static inline unsigned long uclamp_eff_value(struct task_struct *p,
 					     enum uclamp_id clamp_id)
@@ -3175,6 +3202,16 @@  static inline bool uclamp_rq_is_idle(struct rq *rq)
 {
 	return false;
 }
+
+static inline void enqueue_util_avg_uclamp(struct cfs_rq *cfs_rq,
+					   struct sched_entity *se)
+{
+}
+
+static inline void dequeue_util_avg_uclamp(struct cfs_rq *cfs_rq,
+					   struct sched_entity *se)
+{
+}
 #endif /* CONFIG_UCLAMP_TASK */
 
 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ