[RFC,2/6] sched/uclamp: Simulate PELT decay in util_avg_uclamp

Message ID d73fc3e9a02f047902fdd5e4c07402452d6e0590.1696345700.git.Hongyan.Xia2@arm.com
State New
Series: sched: uclamp sum aggregation

Commit Message

Hongyan Xia Oct. 4, 2023, 9:04 a.m. UTC
  From: Hongyan Xia <hongyan.xia2@arm.com>

Because util_avg_uclamp is not directly managed by PELT, it lacks the
nice property of slowly decaying to a lower value, resulting in
performance degradation due to premature frequency drops.

Add functions to decay root cfs utilization and tasks that are not on
the rq. This way, we get the benefits of PELT while still maintaining
uclamp. The rules are simple:

1. When task is se->on_rq, enforce its util_avg_uclamp within uclamp
   range.
2. When task is !se->on_rq, PELT decay its util_avg_uclamp.
3. When the root CFS util drops, PELT decay to the target frequency
   instead of immediately dropping to a lower target frequency.

TODO: Can we somehow integrate this uclamp sum aggregation directly into
util_avg, so that we don't need to introduce a new util_avg_uclamp
signal and don't need to simulate PELT decay?

Signed-off-by: Hongyan Xia <hongyan.xia2@arm.com>
---
 kernel/sched/fair.c  |  20 +++++++++
 kernel/sched/pelt.c  | 103 ++++++++++++++++++++++++++++++++++++++++---
 kernel/sched/sched.h |   2 +
 3 files changed, 119 insertions(+), 6 deletions(-)
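
As a rough illustration of the three rules in the commit message, here is a
minimal userspace sketch. The struct, the helpers and the fixed 32-period
half-life are simplified stand-ins, not kernel code; the real logic lives in
___update_util_avg_uclamp() and ___decay_util_avg_uclamp_towards() in the
patch below.

#include <stdio.h>
#include <math.h>

/* Illustrative stand-ins only, not the kernel's data structures. */
struct task_sim {
	int on_rq;
	unsigned int util_avg;		/* plain PELT utilization */
	unsigned int util_avg_uclamp;	/* clamped companion signal */
	unsigned int uclamp_min, uclamp_max;
};

/* PELT halves a signal every 32 periods, so y^32 == 0.5. */
static unsigned int decay_sim(unsigned int val, unsigned int periods)
{
	return (unsigned int)(val * pow(0.5, periods / 32.0));
}

/* Rules 1 and 2: clamp while on the rq, decay while off the rq. */
static void task_tick_sim(struct task_sim *p, unsigned int periods)
{
	if (p->on_rq) {
		unsigned int v = p->util_avg;

		if (v < p->uclamp_min)
			v = p->uclamp_min;
		if (v > p->uclamp_max)
			v = p->uclamp_max;
		p->util_avg_uclamp = v;
	} else {
		p->util_avg_uclamp = decay_sim(p->util_avg_uclamp, periods);
	}
}

/* Rule 3: the root signal never drops instantly; it decays towards the sum. */
static void root_tick_sim(unsigned int *root, unsigned int sum_on_rq,
			  unsigned int periods)
{
	if (*root <= sum_on_rq)
		*root = sum_on_rq;
	else
		*root = sum_on_rq + decay_sim(*root - sum_on_rq, periods);
}

int main(void)
{
	struct task_sim p = {
		.on_rq = 0, .util_avg = 800, .util_avg_uclamp = 800,
		.uclamp_min = 0, .uclamp_max = 300,
	};
	unsigned int root = 1000;

	task_tick_sim(&p, 32);		/* rule 2: 800 decays to 400 */
	root_tick_sim(&root, 200, 32);	/* rule 3: 1000 decays towards 200 -> 600 */
	printf("task %u root %u\n", p.util_avg_uclamp, root);
	return 0;
}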
  

Comments

Dietmar Eggemann Nov. 1, 2023, 4:06 p.m. UTC | #1
On 04/10/2023 11:04, Hongyan Xia wrote:
> From: Hongyan Xia <hongyan.xia2@arm.com>
> 
> Because util_avg_uclamp is not directly managed by PELT, it lacks the
> nice property of slowly decaying to a lower value, resulting in
> performance degradation due to premature frequency drops.
> 
> Add functions to decay root cfs utilization and tasks that are not on
> the rq. This way, we get the benefits of PELT while still maintaining
> uclamp. The rules are simple:
> 
> 1. When task is se->on_rq, enforce its util_avg_uclamp within uclamp
>    range.
> 2. When task is !se->on_rq, PELT decay its util_avg_uclamp.
> 3. When the root CFS util drops, PELT decay to the target frequency
>    instead of immediately dropping to a lower target frequency.
> 
> TODO: Can we somehow integrate this uclamp sum aggregation directly into
> util_avg, so that we don't need to introduce a new util_avg_uclamp
> signal and don't need to simulate PELT decay?

That's a good question. I'm wondering why you were not able to integrate
the maintenance of the util_avg_uclamp values into the existing PELT
update functionality in fair.c (__update_load_avg_xxx(),
propagate_entity_load_avg() -> update_tg_cfs_util(), etc.).

Why do you need extra functions like ___decay_util_avg_uclamp_towards()
and ___update_util_avg_uclamp() for this?

> Signed-off-by: Hongyan Xia <hongyan.xia2@arm.com>
> ---
>  kernel/sched/fair.c  |  20 +++++++++
>  kernel/sched/pelt.c  | 103 ++++++++++++++++++++++++++++++++++++++++---
>  kernel/sched/sched.h |   2 +
>  3 files changed, 119 insertions(+), 6 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 33e5a6e751c0..420af57d01ee 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4311,17 +4311,22 @@ static inline int
>  update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
>  {
>  	unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
> +	unsigned int removed_root_util = 0;

 unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
-unsigned int removed_root_util = 0;
+unsigned int __maybe_unused removed_root_util = 0;

Otherwise you get `warning: unused variable ‘rq’` w/ !CONFIG_UCLAMP_TASK

>  	struct sched_avg *sa = &cfs_rq->avg;
>  	int decayed = 0;
>  
>  	if (cfs_rq->removed.nr) {
>  		unsigned long r;
> +		struct rq *rq = rq_of(cfs_rq);
>  		u32 divider = get_pelt_divider(&cfs_rq->avg);
>  
>  		raw_spin_lock(&cfs_rq->removed.lock);
>  		swap(cfs_rq->removed.util_avg, removed_util);
>  		swap(cfs_rq->removed.load_avg, removed_load);
>  		swap(cfs_rq->removed.runnable_avg, removed_runnable);
> +#ifdef CONFIG_UCLAMP_TASK
> +		swap(rq->root_cfs_util_uclamp_removed, removed_root_util);
> +#endif
>  		cfs_rq->removed.nr = 0;
>  		raw_spin_unlock(&cfs_rq->removed.lock);
>  

[...]


>  #ifdef CONFIG_UCLAMP_TASK
> +static void ___decay_util_avg_uclamp_towards(u64 now,
> +					     u64 last_update_time,
> +					     u32 period_contrib,
> +					     unsigned int *old,
> +					     unsigned int new_val)
> +{
> +	unsigned int old_val = READ_ONCE(*old);
> +	u64 delta, periods;
> +
> +	if (old_val <= new_val) {
> +		WRITE_ONCE(*old, new_val);
> +		return;
> +	}

Why is the function called `decay`? In case `new >= old` you set old =
new and bail out. So it's also more like an `update` function?

> +	if (!last_update_time)
> +		return;
> +	delta = now - last_update_time;
> +	if ((s64)delta < 0)
> +		return;
> +	delta >>= 10;
> +	if (!delta)
> +		return;
> +
> +	delta += period_contrib;
> +	periods = delta / 1024;
> +	if (periods) {
> +		u64 diff = old_val - new_val;
> +
> +		/*
> +		 * Let's assume 3 tasks, A, B and C. A is still on rq but B and
> +		 * C have just been dequeued. The cfs.avg.util_avg_uclamp has
> +		 * become A but root_cfs_util_uclamp just starts to decay and is
> +		 * now still A + B + C.
> +		 *
> +		 * After p periods with y being the decay factor, the new
> +		 * root_cfs_util_uclamp should become
> +		 *
> +		 * A + B * y^p + C * y^p == A + (A + B + C - A) * y^p
> +		 *     == cfs.avg.util_avg_uclamp +
> +		 *        (root_cfs_util_uclamp_at_the_start - cfs.avg.util_avg_uclamp) * y^p
> +		 *     == cfs.avg.util_avg_uclamp + diff * y^p
> +		 *
> +		 * So, instead of summing up each individual decayed values, we
> +		 * could just decay the diff and not bother with the summation
> +		 * at all. This is why we decay the diff here.
> +		 */
> +		diff = decay_load(diff, periods);
> +		WRITE_ONCE(*old, new_val + diff);
> +	}
> +}

Looks like ___decay_util_avg_uclamp_towards() is used for:

(1) tasks with !se->on_rq to decay before enqueue

(2) rq->root_cfs_util_uclamp to align with
    &rq_of(cfs_rq)->cfs->avg.util_avg_uclamp

All the cfs_rq's and the taskgroup se's seem to be updated only in
___update_util_avg_uclamp() (which also handles the propagation towards
the root taskgroup).

[...]
  
Hongyan Xia Nov. 9, 2023, 4:50 p.m. UTC | #2
On 01/11/2023 16:06, Dietmar Eggemann wrote:
> On 04/10/2023 11:04, Hongyan Xia wrote:
>> From: Hongyan Xia <hongyan.xia2@arm.com>
>>
>> Because util_avg_uclamp is not directly managed by PELT, it lacks the
>> nice property of slowly decaying to a lower value, resulting in
>> performance degradation due to premature frequency drops.
>>
>> Add functions to decay root cfs utilization and tasks that are not on
>> the rq. This way, we get the benefits of PELT while still maintaining
>> uclamp. The rules are simple:
>>
>> 1. When task is se->on_rq, enforce its util_avg_uclamp within uclamp
>>     range.
>> 2. When task is !se->on_rq, PELT decay its util_avg_uclamp.
>> 3. When the root CFS util drops, PELT decay to the target frequency
>>     instead of immediately dropping to a lower target frequency.
>>
>> TODO: Can we somehow integrate this uclamp sum aggregation directly into
>> util_avg, so that we don't need to introduce a new util_avg_uclamp
>> signal and don't need to simulate PELT decay?
> 
> That's a good question. I'm wondering why you were not able to integrate
> the maintenance of the util_avg_uclamp values into the existing PELT
> update functionality in fair.c (__update_load_avg_xxx(),
> propagate_entity_load_avg() -> update_tg_cfs_util(), etc.).
> 
> Why do you need extra functions like ___decay_util_avg_uclamp_towards()
> and ___update_util_avg_uclamp() for this?

These new functions are already part of __update_load_avg_xxx(); I just
split the new code into separate functions for readability.

I think we have talked offline about why we can't do this in
propagate_entity_load_avg() -> update_tg_cfs_util(). Currently, cfs_rq
and se utilization are tracked independently, but sum aggregation
cannot track them separately. If a cfs_rq has two tasks, each with a
utilization of 1024 and a UCLAMP_MAX of 100, the cfs_rq must sum up the
clamped contributions of those two tasks, giving 200. The logic in
propagate_entity_load_avg() -> update_tg_cfs_util() tracks the cfs_rq
independently and has no way of knowing that the utilization is only 200.
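
A tiny sketch of what sum aggregation means in that example (the numbers
follow the paragraph above; clamp_util() here is a hypothetical helper,
not a kernel function):

#include <stdio.h>

/* Hypothetical helper: clamp one task's utilization into its uclamp range. */
static unsigned int clamp_util(unsigned int util, unsigned int lo,
			       unsigned int hi)
{
	if (util < lo)
		return lo;
	if (util > hi)
		return hi;
	return util;
}

int main(void)
{
	/* Two tasks, each running flat out but with UCLAMP_MAX = 100. */
	unsigned int util[2] = { 1024, 1024 };
	unsigned int uclamp_max = 100;
	unsigned int sum = 0;
	int i;

	/* Sum aggregation: the cfs_rq signal is the sum of clamped tasks. */
	for (i = 0; i < 2; i++)
		sum += clamp_util(util[i], 0, uclamp_max);

	/* Prints 200; independent cfs_rq tracking would still see 1024. */
	printf("clamped sum = %u\n", sum);
	return 0;
}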

Again, I may have misunderstood what you meant. Do you have a concrete
example of how to do this without extra functions?

One idea I once had is to keep the existing util_avg and introduce a
'util_bias' variable. When there's no uclamp, the bias is 0 and
util_avg is used as-is. When there's uclamp, for example two tasks at a
utilization of 400 but a UCLAMP_MAX of 300, each task has a util_bias
of -100 and the cfs_rq sums up the biases to -200, so the cfs_rq runs
at 600 instead of 800. This way, no PELT simulation is needed. However,
this doesn't work: take two tasks with a utilization of 1024 but a
UCLAMP_MAX of 100. Each has a bias of -924, so the cfs_rq sums up a
bias of -1848, but the cfs_rq will be at 1024, not 2048, so adding this
bias gives a cfs_rq utilization of -824, which is clearly wrong.
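
The arithmetic of that failing case, spelled out (purely illustrative,
not proposed code):

#include <stdio.h>

int main(void)
{
	/* Two tasks, each at 1024 utilization, both with UCLAMP_MAX = 100. */
	int task_util = 1024, uclamp_max = 100, nr_tasks = 2;

	/* Per-task bias: clamped value minus raw value. */
	int bias = uclamp_max - task_util;		/* -924 */
	int sum_bias = nr_tasks * bias;			/* -1848 */

	/* The cfs_rq utilization is at 1024 here, not 2048. */
	int rq_util = 1024;

	/* Applying the summed bias gives a nonsensical negative value. */
	printf("rq util with bias = %d\n", rq_util + sum_bias);	/* -824 */

	/* Sum aggregation instead sums the clamped tasks directly. */
	printf("sum aggregation   = %d\n", nr_tasks * uclamp_max);	/* 200 */
	return 0;
}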

>> Signed-off-by: Hongyan Xia <hongyan.xia2@arm.com>
>> ---
>>   kernel/sched/fair.c  |  20 +++++++++
>>   kernel/sched/pelt.c  | 103 ++++++++++++++++++++++++++++++++++++++++---
>>   kernel/sched/sched.h |   2 +
>>   3 files changed, 119 insertions(+), 6 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 33e5a6e751c0..420af57d01ee 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -4311,17 +4311,22 @@ static inline int
>>   update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
>>   {
>>   	unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
>> +	unsigned int removed_root_util = 0;
> 
>   unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
> -unsigned int removed_root_util = 0;
> +unsigned int __maybe_unused removed_root_util = 0;
> 
> Otherwise you get `warning: unused variable ‘rq’` w/ !CONFIG_UCLAMP_TASK

Thanks. Ack.
> 
> [...]
> 
> 
>>   #ifdef CONFIG_UCLAMP_TASK
>> +static void ___decay_util_avg_uclamp_towards(u64 now,
>> +					     u64 last_update_time,
>> +					     u32 period_contrib,
>> +					     unsigned int *old,
>> +					     unsigned int new_val)
>> +{
>> +	unsigned int old_val = READ_ONCE(*old);
>> +	u64 delta, periods;
>> +
>> +	if (old_val <= new_val) {
>> +		WRITE_ONCE(*old, new_val);
>> +		return;
>> +	}
> 
> Why is the function called `decay`? In case `new >= old` you set old =
> new and bail out. So it's also more like an `update` function?

Bad naming indeed. I will rename it to ___update_util_avg_uclamp_towards().

>> +	if (!last_update_time)
>> +		return;
>> +	delta = now - last_update_time;
>> +	if ((s64)delta < 0)
>> +		return;
>> +	delta >>= 10;
>> +	if (!delta)
>> +		return;
>> +
>> +	delta += period_contrib;
>> +	periods = delta / 1024;
>> +	if (periods) {
>> +		u64 diff = old_val - new_val;
>> +
>> +		/*
>> +		 * Let's assume 3 tasks, A, B and C. A is still on rq but B and
>> +		 * C have just been dequeued. The cfs.avg.util_avg_uclamp has
>> +		 * become A but root_cfs_util_uclamp just starts to decay and is
>> +		 * now still A + B + C.
>> +		 *
>> +		 * After p periods with y being the decay factor, the new
>> +		 * root_cfs_util_uclamp should become
>> +		 *
>> +		 * A + B * y^p + C * y^p == A + (A + B + C - A) * y^p
>> +		 *     == cfs.avg.util_avg_uclamp +
>> +		 *        (root_cfs_util_uclamp_at_the_start - cfs.avg.util_avg_uclamp) * y^p
>> +		 *     == cfs.avg.util_avg_uclamp + diff * y^p
>> +		 *
>> +		 * So, instead of summing up each individual decayed values, we
>> +		 * could just decay the diff and not bother with the summation
>> +		 * at all. This is why we decay the diff here.
>> +		 */
>> +		diff = decay_load(diff, periods);
>> +		WRITE_ONCE(*old, new_val + diff);
>> +	}
>> +}
> 
> Looks like ___decay_util_avg_uclamp_towards() is used for:
> 
> (1) tasks with !se->on_rq to decay before enqueue
> 
> (2) rq->root_cfs_util_uclamp to align with
>      &rq_of(cfs_rq)->cfs->avg.util_avg_uclamp
> 
> All the cfs_rq's and the taskgroup se's seem to be updated only in
> ___update_util_avg_uclamp() (which also handles the propagation towards
> the root taskgroup).

Yes, I would say that's a nice summary.

When !se->on_rq, we never use ___update_util_avg_uclamp() but instead do 
___update_util_avg_uclamp_towards().

> 
> [...]
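
For reference, a tiny standalone check of the "decay the diff" identity
from the comment in the patch, i.e. A + B*y^p + C*y^p == A + (A + B + C - A)*y^p,
using the usual 32-period PELT half-life and made-up numbers:

#include <stdio.h>
#include <math.h>

/* y chosen so that y^32 == 0.5, as in PELT. */
static double decay(double val, unsigned int periods)
{
	return val * pow(0.5, periods / 32.0);
}

int main(void)
{
	double A = 300, B = 200, C = 100;	/* A still on rq, B and C dequeued */
	unsigned int p = 32;

	/* Decaying B and C individually ... */
	double per_task = A + decay(B, p) + decay(C, p);

	/* ... equals decaying the single diff (A + B + C) - A. */
	double via_diff = A + decay((A + B + C) - A, p);

	printf("%.1f == %.1f\n", per_task, via_diff);	/* 450.0 == 450.0 */
	return 0;
}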
  
Vincent Guittot Dec. 4, 2023, 4:07 p.m. UTC | #3
On Wed, 4 Oct 2023 at 11:05, Hongyan Xia <Hongyan.Xia2@arm.com> wrote:
>
> From: Hongyan Xia <hongyan.xia2@arm.com>
>
> Because util_avg_uclamp is not directly managed by PELT, it lacks the
> nice property of slowly decaying to a lower value, resulting in
> performance degradation due to premature frequency drops.

That's a very good reason for not doing this

>
> Add functions to decay root cfs utilization and tasks that are not on
> the rq. This way, we get the benefits of PELT while still maintaining
> uclamp. The rules are simple:

Nack. This just highlights that you are mixing different things and
then trying to make it work.

Please keep PELT out of uclamp_min/max

>
> 1. When task is se->on_rq, enforce its util_avg_uclamp within uclamp
>    range.
> 2. When task is !se->on_rq, PELT decay its util_avg_uclamp.
> 3. When the root CFS util drops, PELT decay to the target frequency
>    instead of immediately dropping to a lower target frequency.
>
> TODO: Can we somehow integrate this uclamp sum aggregation directly into
> util_avg, so that we don't need to introduce a new util_avg_uclamp
> signal and don't need to simulate PELT decay?
>
> Signed-off-by: Hongyan Xia <hongyan.xia2@arm.com>
> ---
>  kernel/sched/fair.c  |  20 +++++++++
>  kernel/sched/pelt.c  | 103 ++++++++++++++++++++++++++++++++++++++++---
>  kernel/sched/sched.h |   2 +
>  3 files changed, 119 insertions(+), 6 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 33e5a6e751c0..420af57d01ee 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4311,17 +4311,22 @@ static inline int
>  update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
>  {
>         unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
> +       unsigned int removed_root_util = 0;
>         struct sched_avg *sa = &cfs_rq->avg;
>         int decayed = 0;
>
>         if (cfs_rq->removed.nr) {
>                 unsigned long r;
> +               struct rq *rq = rq_of(cfs_rq);
>                 u32 divider = get_pelt_divider(&cfs_rq->avg);
>
>                 raw_spin_lock(&cfs_rq->removed.lock);
>                 swap(cfs_rq->removed.util_avg, removed_util);
>                 swap(cfs_rq->removed.load_avg, removed_load);
>                 swap(cfs_rq->removed.runnable_avg, removed_runnable);
> +#ifdef CONFIG_UCLAMP_TASK
> +               swap(rq->root_cfs_util_uclamp_removed, removed_root_util);
> +#endif
>                 cfs_rq->removed.nr = 0;
>                 raw_spin_unlock(&cfs_rq->removed.lock);
>
> @@ -4346,6 +4351,12 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
>                  *    util_avg * minimum possible divider
>                  */
>                 sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
> +#ifdef CONFIG_UCLAMP_TASK
> +               r = removed_root_util;
> +               sub_positive(&rq->root_cfs_util_uclamp, r);
> +               rq->root_cfs_util_uclamp =
> +                       max(rq->root_cfs_util_uclamp, rq->cfs.avg.util_avg_uclamp);
> +#endif
>
>                 r = removed_runnable;
>                 sub_positive(&sa->runnable_avg, r);
> @@ -4527,6 +4538,7 @@ static void sync_entity_load_avg(struct sched_entity *se)
>  static void remove_entity_load_avg(struct sched_entity *se)
>  {
>         struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +       struct rq *rq = rq_of(cfs_rq);
>         unsigned long flags;
>
>         /*
> @@ -4542,6 +4554,9 @@ static void remove_entity_load_avg(struct sched_entity *se)
>         cfs_rq->removed.util_avg        += se->avg.util_avg;
>         cfs_rq->removed.load_avg        += se->avg.load_avg;
>         cfs_rq->removed.runnable_avg    += se->avg.runnable_avg;
> +#ifdef CONFIG_UCLAMP_TASK
> +       rq->root_cfs_util_uclamp_removed += se->avg.util_avg_uclamp;
> +#endif
>         raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
>  }
>
> @@ -6462,6 +6477,11 @@ static void update_se_chain(struct task_struct *p)
>                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
>
>                 ___update_util_avg_uclamp(&cfs_rq->avg, se);
> +               if (&rq->cfs == cfs_rq) {
> +                       rq->root_cfs_util_uclamp = max(rq->root_cfs_util_uclamp,
> +                                                      cfs_rq->avg.util_avg_uclamp);
> +                       cfs_rq_util_change(cfs_rq, 0);
> +               }
>         }
>  #endif
>  }
> diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> index c656e4dcb1d1..83d5ac7e7ddb 100644
> --- a/kernel/sched/pelt.c
> +++ b/kernel/sched/pelt.c
> @@ -267,6 +267,57 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
>  }
>
>  #ifdef CONFIG_UCLAMP_TASK
> +static void ___decay_util_avg_uclamp_towards(u64 now,
> +                                            u64 last_update_time,
> +                                            u32 period_contrib,
> +                                            unsigned int *old,
> +                                            unsigned int new_val)
> +{
> +       unsigned int old_val = READ_ONCE(*old);
> +       u64 delta, periods;
> +
> +       if (old_val <= new_val) {
> +               WRITE_ONCE(*old, new_val);
> +               return;
> +       }
> +
> +       if (!last_update_time)
> +               return;
> +       delta = now - last_update_time;
> +       if ((s64)delta < 0)
> +               return;
> +       delta >>= 10;
> +       if (!delta)
> +               return;
> +
> +       delta += period_contrib;
> +       periods = delta / 1024;
> +       if (periods) {
> +               u64 diff = old_val - new_val;
> +
> +               /*
> +                * Let's assume 3 tasks, A, B and C. A is still on rq but B and
> +                * C have just been dequeued. The cfs.avg.util_avg_uclamp has
> +                * become A but root_cfs_util_uclamp just starts to decay and is
> +                * now still A + B + C.
> +                *
> +                * After p periods with y being the decay factor, the new
> +                * root_cfs_util_uclamp should become
> +                *
> +                * A + B * y^p + C * y^p == A + (A + B + C - A) * y^p
> +                *     == cfs.avg.util_avg_uclamp +
> +                *        (root_cfs_util_uclamp_at_the_start - cfs.avg.util_avg_uclamp) * y^p
> +                *     == cfs.avg.util_avg_uclamp + diff * y^p
> +                *
> +                * So, instead of summing up each individual decayed values, we
> +                * could just decay the diff and not bother with the summation
> +                * at all. This is why we decay the diff here.
> +                */
> +               diff = decay_load(diff, periods);
> +               WRITE_ONCE(*old, new_val + diff);
> +       }
> +}
> +
>  /* avg must belong to the queue this se is on. */
>  void ___update_util_avg_uclamp(struct sched_avg *avg, struct sched_entity *se)
>  {
> @@ -336,17 +387,33 @@ ___update_util_avg_uclamp(struct sched_avg *avg, struct sched_entity *se)
>
>  int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
>  {
> +       u64 last_update_time = se->avg.last_update_time;
> +       u32 period_contrib = se->avg.period_contrib;
> +       int ret = 0;
> +
>         if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
>                 ___update_load_avg(&se->avg, se_weight(se));
>                 trace_pelt_se_tp(se);
> -               return 1;
> +               ret = 1;
>         }
>
> -       return 0;
> +#ifdef CONFIG_UCLAMP_TASK
> +       if (entity_is_task(se))
> +               ___decay_util_avg_uclamp_towards(now,
> +                                                last_update_time,
> +                                                period_contrib,
> +                                                &se->avg.util_avg_uclamp,
> +                                                0);
> +#endif
> +       return ret;
>  }
>
>  int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
> +       u64 last_update_time = se->avg.last_update_time;
> +       u32 period_contrib = se->avg.period_contrib;
> +       int ret = 0;
> +
>         if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se),
>                                 cfs_rq->curr == se)) {
>
> @@ -354,14 +421,26 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
>                 ___update_util_avg_uclamp(&cfs_rq->avg, se);
>                 cfs_se_util_change(&se->avg);
>                 trace_pelt_se_tp(se);
> -               return 1;
> +               ret = 1;
>         }
>
> -       return 0;
> +#ifdef CONFIG_UCLAMP_TASK
> +       if (!se->on_rq && entity_is_task(se))
> +               ___decay_util_avg_uclamp_towards(now,
> +                                                last_update_time,
> +                                                period_contrib,
> +                                                &se->avg.util_avg_uclamp,
> +                                                0);
> +#endif
> +       return ret;
>  }
>
>  int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
>  {
> +       u64 last_update_time = cfs_rq->avg.last_update_time;
> +       u32 period_contrib = cfs_rq->avg.period_contrib;
> +       int ret = 0;
> +
>         if (___update_load_sum(now, &cfs_rq->avg,
>                                 scale_load_down(cfs_rq->load.weight),
>                                 cfs_rq->h_nr_running,
> @@ -369,10 +448,22 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
>
>                 ___update_load_avg(&cfs_rq->avg, 1);
>                 trace_pelt_cfs_tp(cfs_rq);
> -               return 1;
> +               ret = 1;
>         }
>
> -       return 0;
> +#ifdef CONFIG_UCLAMP_TASK
> +       if (&rq_of(cfs_rq)->cfs == cfs_rq) {
> +               unsigned int target = READ_ONCE(cfs_rq->avg.util_avg_uclamp);
> +
> +               ___decay_util_avg_uclamp_towards(now,
> +                               last_update_time,
> +                               period_contrib,
> +                               &rq_of(cfs_rq)->root_cfs_util_uclamp,
> +                               target);
> +       }
> +#endif
> +
> +       return ret;
>  }
>
>  /*
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 2eefcdb0c3b0..98fa5e79f4e9 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -992,6 +992,8 @@ struct rq {
>         /* Utilization clamp values based on CPU's RUNNABLE tasks */
>         struct uclamp_rq        uclamp[UCLAMP_CNT] ____cacheline_aligned;
>         unsigned int            uclamp_flags;
> +       unsigned int            root_cfs_util_uclamp;
> +       unsigned int            root_cfs_util_uclamp_removed;
>  #define UCLAMP_FLAG_IDLE 0x01
>  #endif
>
> --
> 2.34.1
>
  
Hongyan Xia Dec. 5, 2023, 2:47 p.m. UTC | #4
On 04/12/2023 16:07, Vincent Guittot wrote:
> On Wed, 4 Oct 2023 at 11:05, Hongyan Xia <Hongyan.Xia2@arm.com> wrote:
>>
>> From: Hongyan Xia <hongyan.xia2@arm.com>
>>
>> Because util_avg_uclamp is not directly managed by PELT, it lacks the
>> nice property of slowly decaying to a lower value, resulting in
>> performance degradation due to premature frequency drops.
> 
> That's a very good reason for not doing this

This is not much different from util_guest and the "additive uclamp"
proposal at LPC 2023. The major difference is that I introduce a new
signal right beside util_avg (which then needs to have PELT behavior),
whereas they introduce signals in a util_est-like way.

Now that I think about it, maybe my way is indeed too drastic, and maybe 
the util_est way is better.

>>
>> Add functions to decay root cfs utilization and tasks that are not on
>> the rq. This way, we get the benefits of PELT while still maintaining
>> uclamp. The rules are simple:
> 
> Nack. This just highlights that you are mixing different things and
> then trying to make it work.
> 
> Please keep PELT out of uclamp_min/max

Well, as in my previous comment, I think PELT is already a mixed signal
anyway. Treating it as if it were not mixed with uclamp has already
shown many problems that need corner-case code, like the 0 spare
capacity handling and uclamp filtering, to make things work properly,
and the code to fix uclamp is still growing.

I will see if I can rework this series in a util_est style, which will
probably converge with the other two proposals, but fundamentally the
idea is that we don't have a pure PELT signal anyway. A PELT value of X
doesn't mean much if we don't know under which uclamp values it was
achieved, and having a clamped(X) on the side can be a very good hint.

Hongyan
  

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 33e5a6e751c0..420af57d01ee 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4311,17 +4311,22 @@  static inline int
 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 {
 	unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
+	unsigned int removed_root_util = 0;
 	struct sched_avg *sa = &cfs_rq->avg;
 	int decayed = 0;
 
 	if (cfs_rq->removed.nr) {
 		unsigned long r;
+		struct rq *rq = rq_of(cfs_rq);
 		u32 divider = get_pelt_divider(&cfs_rq->avg);
 
 		raw_spin_lock(&cfs_rq->removed.lock);
 		swap(cfs_rq->removed.util_avg, removed_util);
 		swap(cfs_rq->removed.load_avg, removed_load);
 		swap(cfs_rq->removed.runnable_avg, removed_runnable);
+#ifdef CONFIG_UCLAMP_TASK
+		swap(rq->root_cfs_util_uclamp_removed, removed_root_util);
+#endif
 		cfs_rq->removed.nr = 0;
 		raw_spin_unlock(&cfs_rq->removed.lock);
 
@@ -4346,6 +4351,12 @@  update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 		 *    util_avg * minimum possible divider
 		 */
 		sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
+#ifdef CONFIG_UCLAMP_TASK
+		r = removed_root_util;
+		sub_positive(&rq->root_cfs_util_uclamp, r);
+		rq->root_cfs_util_uclamp =
+			max(rq->root_cfs_util_uclamp, rq->cfs.avg.util_avg_uclamp);
+#endif
 
 		r = removed_runnable;
 		sub_positive(&sa->runnable_avg, r);
@@ -4527,6 +4538,7 @@  static void sync_entity_load_avg(struct sched_entity *se)
 static void remove_entity_load_avg(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	struct rq *rq = rq_of(cfs_rq);
 	unsigned long flags;
 
 	/*
@@ -4542,6 +4554,9 @@  static void remove_entity_load_avg(struct sched_entity *se)
 	cfs_rq->removed.util_avg	+= se->avg.util_avg;
 	cfs_rq->removed.load_avg	+= se->avg.load_avg;
 	cfs_rq->removed.runnable_avg	+= se->avg.runnable_avg;
+#ifdef CONFIG_UCLAMP_TASK
+	rq->root_cfs_util_uclamp_removed += se->avg.util_avg_uclamp;
+#endif
 	raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
 }
 
@@ -6462,6 +6477,11 @@  static void update_se_chain(struct task_struct *p)
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 		___update_util_avg_uclamp(&cfs_rq->avg, se);
+		if (&rq->cfs == cfs_rq) {
+			rq->root_cfs_util_uclamp = max(rq->root_cfs_util_uclamp,
+						       cfs_rq->avg.util_avg_uclamp);
+			cfs_rq_util_change(cfs_rq, 0);
+		}
 	}
 #endif
 }
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index c656e4dcb1d1..83d5ac7e7ddb 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -267,6 +267,57 @@  ___update_load_avg(struct sched_avg *sa, unsigned long load)
 }
 
 #ifdef CONFIG_UCLAMP_TASK
+static void ___decay_util_avg_uclamp_towards(u64 now,
+					     u64 last_update_time,
+					     u32 period_contrib,
+					     unsigned int *old,
+					     unsigned int new_val)
+{
+	unsigned int old_val = READ_ONCE(*old);
+	u64 delta, periods;
+
+	if (old_val <= new_val) {
+		WRITE_ONCE(*old, new_val);
+		return;
+	}
+
+	if (!last_update_time)
+		return;
+	delta = now - last_update_time;
+	if ((s64)delta < 0)
+		return;
+	delta >>= 10;
+	if (!delta)
+		return;
+
+	delta += period_contrib;
+	periods = delta / 1024;
+	if (periods) {
+		u64 diff = old_val - new_val;
+
+		/*
+		 * Let's assume 3 tasks, A, B and C. A is still on rq but B and
+		 * C have just been dequeued. The cfs.avg.util_avg_uclamp has
+		 * become A but root_cfs_util_uclamp just starts to decay and is
+		 * now still A + B + C.
+		 *
+		 * After p periods with y being the decay factor, the new
+		 * root_cfs_util_uclamp should become
+		 *
+		 * A + B * y^p + C * y^p == A + (A + B + C - A) * y^p
+		 *     == cfs.avg.util_avg_uclamp +
+		 *        (root_cfs_util_uclamp_at_the_start - cfs.avg.util_avg_uclamp) * y^p
+		 *     == cfs.avg.util_avg_uclamp + diff * y^p
+		 *
+		 * So, instead of summing up each individual decayed values, we
+		 * could just decay the diff and not bother with the summation
+		 * at all. This is why we decay the diff here.
+		 */
+		diff = decay_load(diff, periods);
+		WRITE_ONCE(*old, new_val + diff);
+	}
+}
+
 /* avg must belong to the queue this se is on. */
 void ___update_util_avg_uclamp(struct sched_avg *avg, struct sched_entity *se)
 {
@@ -336,17 +387,33 @@  ___update_util_avg_uclamp(struct sched_avg *avg, struct sched_entity *se)
 
 int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
 {
+	u64 last_update_time = se->avg.last_update_time;
+	u32 period_contrib = se->avg.period_contrib;
+	int ret = 0;
+
 	if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
 		___update_load_avg(&se->avg, se_weight(se));
 		trace_pelt_se_tp(se);
-		return 1;
+		ret = 1;
 	}
 
-	return 0;
+#ifdef CONFIG_UCLAMP_TASK
+	if (entity_is_task(se))
+		___decay_util_avg_uclamp_towards(now,
+						 last_update_time,
+						 period_contrib,
+						 &se->avg.util_avg_uclamp,
+						 0);
+#endif
+	return ret;
 }
 
 int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+	u64 last_update_time = se->avg.last_update_time;
+	u32 period_contrib = se->avg.period_contrib;
+	int ret = 0;
+
 	if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se),
 				cfs_rq->curr == se)) {
 
@@ -354,14 +421,26 @@  int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
 		___update_util_avg_uclamp(&cfs_rq->avg, se);
 		cfs_se_util_change(&se->avg);
 		trace_pelt_se_tp(se);
-		return 1;
+		ret = 1;
 	}
 
-	return 0;
+#ifdef CONFIG_UCLAMP_TASK
+	if (!se->on_rq && entity_is_task(se))
+		___decay_util_avg_uclamp_towards(now,
+						 last_update_time,
+						 period_contrib,
+						 &se->avg.util_avg_uclamp,
+						 0);
+#endif
+	return ret;
 }
 
 int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
 {
+	u64 last_update_time = cfs_rq->avg.last_update_time;
+	u32 period_contrib = cfs_rq->avg.period_contrib;
+	int ret = 0;
+
 	if (___update_load_sum(now, &cfs_rq->avg,
 				scale_load_down(cfs_rq->load.weight),
 				cfs_rq->h_nr_running,
@@ -369,10 +448,22 @@  int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
 
 		___update_load_avg(&cfs_rq->avg, 1);
 		trace_pelt_cfs_tp(cfs_rq);
-		return 1;
+		ret = 1;
 	}
 
-	return 0;
+#ifdef CONFIG_UCLAMP_TASK
+	if (&rq_of(cfs_rq)->cfs == cfs_rq) {
+		unsigned int target = READ_ONCE(cfs_rq->avg.util_avg_uclamp);
+
+		___decay_util_avg_uclamp_towards(now,
+				last_update_time,
+				period_contrib,
+				&rq_of(cfs_rq)->root_cfs_util_uclamp,
+				target);
+	}
+#endif
+
+	return ret;
 }
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2eefcdb0c3b0..98fa5e79f4e9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -992,6 +992,8 @@  struct rq {
 	/* Utilization clamp values based on CPU's RUNNABLE tasks */
 	struct uclamp_rq	uclamp[UCLAMP_CNT] ____cacheline_aligned;
 	unsigned int		uclamp_flags;
+	unsigned int		root_cfs_util_uclamp;
+	unsigned int		root_cfs_util_uclamp_removed;
 #define UCLAMP_FLAG_IDLE 0x01
 #endif