[v1] Revert "cpufreq: schedutil: Move max CPU capacity to sugov_policy"

Message ID 20221110195732.1382314-1-wusamuel@google.com
State New
Headers
Series [v1] Revert "cpufreq: schedutil: Move max CPU capacity to sugov_policy" |

Commit Message

Sam Wu Nov. 10, 2022, 7:57 p.m. UTC
  This reverts commit 6d5afdc97ea71958287364a1f1d07e59ef151b11.

On a Pixel 6 device, it is observed that this commit increases
latency by approximately 50ms, or 20%, in migrating a task
that requires full CPU utilization from a LITTLE CPU to Fmax
on a big CPU. Reverting this change restores the latency back
to its original baseline value.

Fixes: 6d5afdc97ea7 ("cpufreq: schedutil: Move max CPU capacity to sugov_policy")
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Saravana Kannan <saravanak@google.com>
Cc: Isaac J. Manjarres <isaacmanjarres@google.com>
Signed-off-by: Sam Wu <wusamuel@google.com>
---
 kernel/sched/cpufreq_schedutil.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)
  

Comments

Saravana Kannan Nov. 15, 2022, 10:35 p.m. UTC | #1
On Thu, Nov 10, 2022 at 11:57 AM Sam Wu <wusamuel@google.com> wrote:
>
> This reverts commit 6d5afdc97ea71958287364a1f1d07e59ef151b11.
>
> On a Pixel 6 device, it is observed that this commit increases
> latency by approximately 50ms, or 20%, in migrating a task
> that requires full CPU utilization from a LITTLE CPU to Fmax
> on a big CPU. Reverting this change restores the latency back
> to its original baseline value.
>
> Fixes: 6d5afdc97ea7 ("cpufreq: schedutil: Move max CPU capacity to sugov_policy")
> Cc: Lukasz Luba <lukasz.luba@arm.com>
> Cc: Saravana Kannan <saravanak@google.com>
> Cc: Isaac J. Manjarres <isaacmanjarres@google.com>
> Signed-off-by: Sam Wu <wusamuel@google.com>

Rafael, can we pick this up please?

Lukasz, no objections to the idea itself, but it's causing regression
and we'd like to revert and then fix it.

-Saravana

> ---
>  kernel/sched/cpufreq_schedutil.c | 30 +++++++++++++++---------------
>  1 file changed, 15 insertions(+), 15 deletions(-)
>
> diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
> index 9161d1136d01..1207c78f85c1 100644
> --- a/kernel/sched/cpufreq_schedutil.c
> +++ b/kernel/sched/cpufreq_schedutil.c
> @@ -25,9 +25,6 @@ struct sugov_policy {
>         unsigned int            next_freq;
>         unsigned int            cached_raw_freq;
>
> -       /* max CPU capacity, which is equal for all CPUs in freq. domain */
> -       unsigned long           max;
> -
>         /* The next fields are only needed if fast switch cannot be used: */
>         struct                  irq_work irq_work;
>         struct                  kthread_work work;
> @@ -51,6 +48,7 @@ struct sugov_cpu {
>
>         unsigned long           util;
>         unsigned long           bw_dl;
> +       unsigned long           max;
>
>         /* The field below is for single-CPU policies only: */
>  #ifdef CONFIG_NO_HZ_COMMON
> @@ -160,6 +158,7 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
>  {
>         struct rq *rq = cpu_rq(sg_cpu->cpu);
>
> +       sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu);
>         sg_cpu->bw_dl = cpu_bw_dl(rq);
>         sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
>                                           FREQUENCY_UTIL, NULL);
> @@ -254,7 +253,6 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
>   */
>  static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
>  {
> -       struct sugov_policy *sg_policy = sg_cpu->sg_policy;
>         unsigned long boost;
>
>         /* No boost currently required */
> @@ -282,8 +280,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
>          * sg_cpu->util is already in capacity scale; convert iowait_boost
>          * into the same scale so we can compare.
>          */
> -       boost = sg_cpu->iowait_boost * sg_policy->max;
> -       boost >>= SCHED_CAPACITY_SHIFT;
> +       boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT;
>         boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL);
>         if (sg_cpu->util < boost)
>                 sg_cpu->util = boost;
> @@ -340,7 +337,7 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
>         if (!sugov_update_single_common(sg_cpu, time, flags))
>                 return;
>
> -       next_f = get_next_freq(sg_policy, sg_cpu->util, sg_policy->max);
> +       next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max);
>         /*
>          * Do not reduce the frequency if the CPU has not been idle
>          * recently, as the reduction is likely to be premature then.
> @@ -376,7 +373,6 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
>                                      unsigned int flags)
>  {
>         struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
> -       struct sugov_policy *sg_policy = sg_cpu->sg_policy;
>         unsigned long prev_util = sg_cpu->util;
>
>         /*
> @@ -403,8 +399,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
>                 sg_cpu->util = prev_util;
>
>         cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl),
> -                                  map_util_perf(sg_cpu->util),
> -                                  sg_policy->max);
> +                                  map_util_perf(sg_cpu->util), sg_cpu->max);
>
>         sg_cpu->sg_policy->last_freq_update_time = time;
>  }
> @@ -413,19 +408,25 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
>  {
>         struct sugov_policy *sg_policy = sg_cpu->sg_policy;
>         struct cpufreq_policy *policy = sg_policy->policy;
> -       unsigned long util = 0;
> +       unsigned long util = 0, max = 1;
>         unsigned int j;
>
>         for_each_cpu(j, policy->cpus) {
>                 struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
> +               unsigned long j_util, j_max;
>
>                 sugov_get_util(j_sg_cpu);
>                 sugov_iowait_apply(j_sg_cpu, time);
> +               j_util = j_sg_cpu->util;
> +               j_max = j_sg_cpu->max;
>
> -               util = max(j_sg_cpu->util, util);
> +               if (j_util * max > j_max * util) {
> +                       util = j_util;
> +                       max = j_max;
> +               }
>         }
>
> -       return get_next_freq(sg_policy, util, sg_policy->max);
> +       return get_next_freq(sg_policy, util, max);
>  }
>
>  static void
> @@ -751,7 +752,7 @@ static int sugov_start(struct cpufreq_policy *policy)
>  {
>         struct sugov_policy *sg_policy = policy->governor_data;
>         void (*uu)(struct update_util_data *data, u64 time, unsigned int flags);
> -       unsigned int cpu = cpumask_first(policy->cpus);
> +       unsigned int cpu;
>
>         sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
>         sg_policy->last_freq_update_time        = 0;
> @@ -759,7 +760,6 @@ static int sugov_start(struct cpufreq_policy *policy)
>         sg_policy->work_in_progress             = false;
>         sg_policy->limits_changed               = false;
>         sg_policy->cached_raw_freq              = 0;
> -       sg_policy->max                          = arch_scale_cpu_capacity(cpu);
>
>         sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
>
> --
> 2.38.1.431.g37b22c650d-goog
>
  
Lukasz Luba Nov. 16, 2022, 11:35 a.m. UTC | #2
Hi Sam,

On 11/10/22 19:57, Sam Wu wrote:
> This reverts commit 6d5afdc97ea71958287364a1f1d07e59ef151b11.
> 
> On a Pixel 6 device, it is observed that this commit increases
> latency by approximately 50ms, or 20%, in migrating a task
> that requires full CPU utilization from a LITTLE CPU to Fmax
> on a big CPU. Reverting this change restores the latency back
> to its original baseline value.

Which mainline kernel version you use in pixel6?

Could you elaborate a bit how is it possible?
Do you have the sg_policy setup properly (and at right time)?
Do you have the cpu capacity from arch_scale_cpu_capacity()
set correctly and at the right time during this cpufreq
governor setup?

IIRC in Android there is a different code for setting up the
cpufreq sched governor clones. In mainline we don't have to do
those tricks, so this might be the main difference.

Could you trace the value that is read from
arch_scale_cpu_capacity() and share it with us?
I suspect this value changes in time in your kernel.

Regards,
Lukasz
  
Lukasz Luba Nov. 16, 2022, 11:43 a.m. UTC | #3
On 11/15/22 22:35, Saravana Kannan wrote:
> On Thu, Nov 10, 2022 at 11:57 AM Sam Wu <wusamuel@google.com> wrote:
>>
>> This reverts commit 6d5afdc97ea71958287364a1f1d07e59ef151b11.
>>
>> On a Pixel 6 device, it is observed that this commit increases
>> latency by approximately 50ms, or 20%, in migrating a task
>> that requires full CPU utilization from a LITTLE CPU to Fmax
>> on a big CPU. Reverting this change restores the latency back
>> to its original baseline value.
>>
>> Fixes: 6d5afdc97ea7 ("cpufreq: schedutil: Move max CPU capacity to sugov_policy")
>> Cc: Lukasz Luba <lukasz.luba@arm.com>
>> Cc: Saravana Kannan <saravanak@google.com>
>> Cc: Isaac J. Manjarres <isaacmanjarres@google.com>
>> Signed-off-by: Sam Wu <wusamuel@google.com>
> 
> Rafael, can we pick this up please?
> 
> Lukasz, no objections to the idea itself, but it's causing regression
> and we'd like to revert and then fix it.

If you see this in mainline kernel, then I'm fine with reverting it.
Then I will have to trace why this CPU capacity value can change over
time in mainline kernel (while it shouldn't, because we register the
cpufreq policy and the governor later, after we calculate the capacity
in arch_topology.c). Maybe something has changed in mainline in the
meantime in this CPU capacity setup code, which caused this side effect.

I know that android-mainline has some different setup code for those
custom vendor governors. I just want to eliminate this bit and be on the
same page.

Regards,
Lukasz

> 
> -Saravana
>
  
Rafael J. Wysocki Nov. 16, 2022, 12:17 p.m. UTC | #4
On Wed, Nov 16, 2022 at 12:43 PM Lukasz Luba <lukasz.luba@arm.com> wrote:
>
>
>
> On 11/15/22 22:35, Saravana Kannan wrote:
> > On Thu, Nov 10, 2022 at 11:57 AM Sam Wu <wusamuel@google.com> wrote:
> >>
> >> This reverts commit 6d5afdc97ea71958287364a1f1d07e59ef151b11.
> >>
> >> On a Pixel 6 device, it is observed that this commit increases
> >> latency by approximately 50ms, or 20%, in migrating a task
> >> that requires full CPU utilization from a LITTLE CPU to Fmax
> >> on a big CPU. Reverting this change restores the latency back
> >> to its original baseline value.
> >>
> >> Fixes: 6d5afdc97ea7 ("cpufreq: schedutil: Move max CPU capacity to sugov_policy")
> >> Cc: Lukasz Luba <lukasz.luba@arm.com>
> >> Cc: Saravana Kannan <saravanak@google.com>
> >> Cc: Isaac J. Manjarres <isaacmanjarres@google.com>
> >> Signed-off-by: Sam Wu <wusamuel@google.com>
> >
> > Rafael, can we pick this up please?
> >
> > Lukasz, no objections to the idea itself, but it's causing regression
> > and we'd like to revert and then fix it.
>
> If you see this in mainline kernel, then I'm fine with reverting it.

OK, I'll wait for the confirmation of this.

> Then I will have to trace why this CPU capacity value can change over
> time in mainline kernel (while it shouldn't, because we register the
> cpufreq policy and the governor later, after we calculate the capacity
> in arch_topology.c). Maybe something has changed in mainline in the
> meantime in this CPU capacity setup code, which caused this side effect.
>
> I know that android-mainline has some different setup code for those
> custom vendor governors. I just want to eliminate this bit and be on the
> same page.
  
Sam Wu Nov. 18, 2022, 1 a.m. UTC | #5
On Wed, Nov 16, 2022 at 3:35 AM Lukasz Luba <lukasz.luba@arm.com> wrote:
> Which mainline kernel version you use in pixel6?
I am using kernel version 6.1-rc5.
>
> Could you elaborate a bit how is it possible?
> Do you have the sg_policy setup properly (and at right time)?
> Do you have the cpu capacity from arch_scale_cpu_capacity()
> set correctly and at the right time during this cpufreq
> governor setup?
>
> IIRC in Android there is a different code for setting up the
> cpufreq sched governor clones. In mainline we don't have to do
> those tricks, so this might be the main difference.
This behavior is seen on the mainline kernel. There isn't any vendor code
modifying the behavior, and the schedutil governor is being used.
>
> Could you trace the value that is read from
> arch_scale_cpu_capacity() and share it with us?
> I suspect this value changes in time in your kernel.
There's an additional CPU capacity normalization step during
init_cpu_capacity_callback() that does not happen until all the CPUs come
online. However, the sugov_start() function can be called for a subset of
CPUs before all the CPUs are brought up and before the normalization of
the CPU capacity values, so there could be a stale value stored
in sugov_policy.max field.

Best,
Sam
  
Linux regression tracking (Thorsten Leemhuis) Nov. 20, 2022, 6:07 p.m. UTC | #6
[Note: this mail is primarily send for documentation purposes and/or for
regzbot, my Linux kernel regression tracking bot. That's why I removed
most or all folks from the list of recipients, but left any that looked
like a mailing lists. These mails usually contain '#forregzbot' in the
subject, to make them easy to spot and filter out.]

[TLDR: I'm adding this regression report to the list of tracked
regressions; all text from me you find below is based on a few templates
paragraphs you might have encountered already already in similar form.]

Hi, this is your Linux kernel regression tracker.

On 10.11.22 20:57, Sam Wu wrote:
> This reverts commit 6d5afdc97ea71958287364a1f1d07e59ef151b11.
> 
> On a Pixel 6 device, it is observed that this commit increases
> latency by approximately 50ms, or 20%, in migrating a task
> that requires full CPU utilization from a LITTLE CPU to Fmax
> on a big CPU. Reverting this change restores the latency back
> to its original baseline value.
> 
> Fixes: 6d5afdc97ea7 ("cpufreq: schedutil: Move max CPU capacity to sugov_policy")
> Cc: Lukasz Luba <lukasz.luba@arm.com>
> Cc: Saravana Kannan <saravanak@google.com>
> Cc: Isaac J. Manjarres <isaacmanjarres@google.com>
> Signed-off-by: Sam Wu <wusamuel@google.com>

Thanks for the report. To be sure below issue doesn't fall through the
cracks unnoticed, I'm adding it to regzbot, my Linux kernel regression
tracking bot:

#regzbot ^introduced 6d5afdc97ea7
#regzbot title cpufreq: schedutil: improved latency on Pixel 6
#regzbot ignore-activity

This isn't a regression? This issue or a fix for it are already
discussed somewhere else? It was fixed already? You want to clarify when
the regression started to happen? Or point out I got the title or
something else totally wrong? Then just reply -- ideally with also
telling regzbot about it, as explained here:
https://linux-regtracking.leemhuis.info/tracked-regression/

Reminder for developers: When fixing the issue, add 'Link:' tags
pointing to the report (the mail this one replies to), as explained for
in the Linux kernel's documentation; above webpage explains why this is
important for tracked regressions.

Ciao, Thorsten (wearing his 'the Linux kernel's regression tracker' hat)

P.S.: As the Linux kernel's regression tracker I deal with a lot of
reports and sometimes miss something important when writing mails like
this. If that's the case here, don't hesitate to tell me in a public
reply, it's in everyone's interest to set the public record straight.
  
Rafael J. Wysocki Nov. 21, 2022, 7:18 p.m. UTC | #7
On Fri, Nov 18, 2022 at 2:00 AM Sam Wu <wusamuel@google.com> wrote:
>
> On Wed, Nov 16, 2022 at 3:35 AM Lukasz Luba <lukasz.luba@arm.com> wrote:
> > Which mainline kernel version you use in pixel6?
> I am using kernel version 6.1-rc5.
> >
> > Could you elaborate a bit how is it possible?
> > Do you have the sg_policy setup properly (and at right time)?
> > Do you have the cpu capacity from arch_scale_cpu_capacity()
> > set correctly and at the right time during this cpufreq
> > governor setup?
> >
> > IIRC in Android there is a different code for setting up the
> > cpufreq sched governor clones. In mainline we don't have to do
> > those tricks, so this might be the main difference.
> This behavior is seen on the mainline kernel. There isn't any vendor code
> modifying the behavior, and the schedutil governor is being used.
> >
> > Could you trace the value that is read from
> > arch_scale_cpu_capacity() and share it with us?
> > I suspect this value changes in time in your kernel.
> There's an additional CPU capacity normalization step during
> init_cpu_capacity_callback() that does not happen until all the CPUs come
> online. However, the sugov_start() function can be called for a subset of
> CPUs before all the CPUs are brought up and before the normalization of
> the CPU capacity values, so there could be a stale value stored
> in sugov_policy.max field.

OK, the revert has been applied as 6.1-rc material, thanks!
  
Lukasz Luba Nov. 22, 2022, 8:58 a.m. UTC | #8
Hi Rafael and Sam

On 11/21/22 19:18, Rafael J. Wysocki wrote:
> On Fri, Nov 18, 2022 at 2:00 AM Sam Wu <wusamuel@google.com> wrote:
>>
>> On Wed, Nov 16, 2022 at 3:35 AM Lukasz Luba <lukasz.luba@arm.com> wrote:
>>> Which mainline kernel version you use in pixel6?
>> I am using kernel version 6.1-rc5.
>>>
>>> Could you elaborate a bit how is it possible?
>>> Do you have the sg_policy setup properly (and at right time)?
>>> Do you have the cpu capacity from arch_scale_cpu_capacity()
>>> set correctly and at the right time during this cpufreq
>>> governor setup?
>>>
>>> IIRC in Android there is a different code for setting up the
>>> cpufreq sched governor clones. In mainline we don't have to do
>>> those tricks, so this might be the main difference.
>> This behavior is seen on the mainline kernel. There isn't any vendor code
>> modifying the behavior, and the schedutil governor is being used.
>>>
>>> Could you trace the value that is read from
>>> arch_scale_cpu_capacity() and share it with us?
>>> I suspect this value changes in time in your kernel.
>> There's an additional CPU capacity normalization step during
>> init_cpu_capacity_callback() that does not happen until all the CPUs come
>> online. However, the sugov_start() function can be called for a subset of
>> CPUs before all the CPUs are brought up and before the normalization of
>> the CPU capacity values, so there could be a stale value stored
>> in sugov_policy.max field.
> 
> OK, the revert has been applied as 6.1-rc material, thanks!

I was on a business trip last week so couldn't check this.
Now I'm back and I've checked the booting sequence.
Yes, there is some race condition and the mechanism
using blocking_notifier_call_chain() in the cpufreq_online()
doesn't help while we are registering that schedutil
new policy.

I will have to go through those mechanisms and check them.
I agree, for now the best option is to revert the patch.

My apologies for introducing this issues.
Thanks Sam for capturing it.

Regards,
Lukasz
  
Linux regression tracking (Thorsten Leemhuis) Nov. 27, 2022, 12:06 p.m. UTC | #9
On 20.11.22 19:07, Thorsten Leemhuis wrote:
> On 10.11.22 20:57, Sam Wu wrote:
>> This reverts commit 6d5afdc97ea71958287364a1f1d07e59ef151b11.
>>
>> On a Pixel 6 device, it is observed that this commit increases
>> latency by approximately 50ms, or 20%, in migrating a task
>> that requires full CPU utilization from a LITTLE CPU to Fmax
>> on a big CPU. Reverting this change restores the latency back
>> to its original baseline value.
>>
>> Fixes: 6d5afdc97ea7 ("cpufreq: schedutil: Move max CPU capacity to sugov_policy")
>> Cc: Lukasz Luba <lukasz.luba@arm.com>
>> Cc: Saravana Kannan <saravanak@google.com>
>> Cc: Isaac J. Manjarres <isaacmanjarres@google.com>
>> Signed-off-by: Sam Wu <wusamuel@google.com>
> 
> Thanks for the report. To be sure below issue doesn't fall through the
> cracks unnoticed, I'm adding it to regzbot, my Linux kernel regression
> tracking bot:
> 
> #regzbot ^introduced 6d5afdc97ea7
> #regzbot title cpufreq: schedutil: improved latency on Pixel 6
> #regzbot ignore-activity

#regzbot fixed-by: cdcc5ef26b3
  
Vincent Guittot Nov. 30, 2022, 10:42 a.m. UTC | #10
Hi All

Just for the log and because it took me a while to figure out the root
cause of the problem: This patch also creates a regression for
snapdragon845 based systems and probably on any QC chipsets that use a
LUT to update the OPP table at boot. The behavior is the same as
described by Sam with a staled value in sugov_policy.max field.

Regards,
Vincent

On Tue, 22 Nov 2022 at 09:58, Lukasz Luba <lukasz.luba@arm.com> wrote:
>
> Hi Rafael and Sam
>
> On 11/21/22 19:18, Rafael J. Wysocki wrote:
> > On Fri, Nov 18, 2022 at 2:00 AM Sam Wu <wusamuel@google.com> wrote:
> >>
> >> On Wed, Nov 16, 2022 at 3:35 AM Lukasz Luba <lukasz.luba@arm.com> wrote:
> >>> Which mainline kernel version you use in pixel6?
> >> I am using kernel version 6.1-rc5.
> >>>
> >>> Could you elaborate a bit how is it possible?
> >>> Do you have the sg_policy setup properly (and at right time)?
> >>> Do you have the cpu capacity from arch_scale_cpu_capacity()
> >>> set correctly and at the right time during this cpufreq
> >>> governor setup?
> >>>
> >>> IIRC in Android there is a different code for setting up the
> >>> cpufreq sched governor clones. In mainline we don't have to do
> >>> those tricks, so this might be the main difference.
> >> This behavior is seen on the mainline kernel. There isn't any vendor code
> >> modifying the behavior, and the schedutil governor is being used.
> >>>
> >>> Could you trace the value that is read from
> >>> arch_scale_cpu_capacity() and share it with us?
> >>> I suspect this value changes in time in your kernel.
> >> There's an additional CPU capacity normalization step during
> >> init_cpu_capacity_callback() that does not happen until all the CPUs come
> >> online. However, the sugov_start() function can be called for a subset of
> >> CPUs before all the CPUs are brought up and before the normalization of
> >> the CPU capacity values, so there could be a stale value stored
> >> in sugov_policy.max field.
> >
> > OK, the revert has been applied as 6.1-rc material, thanks!
>
> I was on a business trip last week so couldn't check this.
> Now I'm back and I've checked the booting sequence.
> Yes, there is some race condition and the mechanism
> using blocking_notifier_call_chain() in the cpufreq_online()
> doesn't help while we are registering that schedutil
> new policy.
>
> I will have to go through those mechanisms and check them.
> I agree, for now the best option is to revert the patch.
>
> My apologies for introducing this issues.
> Thanks Sam for capturing it.
>
> Regards,
> Lukasz
  
Lukasz Luba Nov. 30, 2022, 2:04 p.m. UTC | #11
Hi Vincent,

On 11/30/22 10:42, Vincent Guittot wrote:
> Hi All
> 
> Just for the log and because it took me a while to figure out the root
> cause of the problem: This patch also creates a regression for
> snapdragon845 based systems and probably on any QC chipsets that use a
> LUT to update the OPP table at boot. The behavior is the same as
> described by Sam with a staled value in sugov_policy.max field.

Thanks for sharing this info and apologies that you spent cycles
on it.

I have checked that whole setup code (capacity + cpufreq policy and
governor). It looks like to have a proper capacity of CPUs, we need
to wait till the last policy is created. It's due to the arch_topology.c
mechanism which is only triggered after all CPUs' got the policy.
Unfortunately, this leads to a chicken & egg situation for this
schedutil setup of max capacity.

I have experimented with this code, which triggers an update in
the schedutil, when all CPUs got the policy and sugov gov
(with trace_printk() to mach the output below)

-------------------------8<-----------------------------------------
diff --git a/kernel/sched/cpufreq_schedutil.c 
b/kernel/sched/cpufreq_schedutil.c
index 9161d1136d01..f1913a857218 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -59,6 +59,7 @@ struct sugov_cpu {
  };

  static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
+static cpumask_var_t cpus_to_visit;

  /************************ Governor internals ***********************/

@@ -783,6 +784,22 @@ static int sugov_start(struct cpufreq_policy *policy)

                 cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, 
uu);
         }
+
+       cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus);
+
+       if (cpumask_empty(cpus_to_visit)) {
+               trace_printk("schedutil the visit cpu mask is empty now\n");
+               for_each_possible_cpu(cpu) {
+                       struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
+                       struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+
+                       sg_policy->max = arch_scale_cpu_capacity(cpu);
+
+                       trace_printk("SCHEDUTIL: NEW  CPU%u 
cpu_capacity=%lu\n",
+                               cpu, sg_policy->max);
+               }
+       }
+
         return 0;
  }

@@ -800,6 +817,8 @@ static void sugov_stop(struct cpufreq_policy *policy)
                 irq_work_sync(&sg_policy->irq_work);
                 kthread_cancel_work_sync(&sg_policy->work);
         }
+
+       cpumask_or(cpus_to_visit, cpus_to_visit, policy->related_cpus);
  }

  static void sugov_limits(struct cpufreq_policy *policy)
@@ -829,6 +848,11 @@ struct cpufreq_governor schedutil_gov = {
  #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
  struct cpufreq_governor *cpufreq_default_governor(void)
  {
+       if (!alloc_cpumask_var(&cpus_to_visit, GFP_KERNEL))
+               return NULL;
+
+       cpumask_copy(cpus_to_visit, cpu_possible_mask);
+
         return &schedutil_gov;
  }
  #endif

---------------------------------->8---------------------------------


That simple approach fixes the issue. I have also tested it with
governor change a few times and setting back the schedutil.

-------------------------------------------
    kworker/u12:1-48      [004] .....     2.208847: sugov_start: 
schedutil the visit cpu mask is empty now
    kworker/u12:1-48      [004] .....     2.208854: sugov_start: 
SCHEDUTIL: NEW  CPU0 cpu_capacity=381
    kworker/u12:1-48      [004] .....     2.208857: sugov_start: 
SCHEDUTIL: NEW  CPU1 cpu_capacity=381
    kworker/u12:1-48      [004] .....     2.208860: sugov_start: 
SCHEDUTIL: NEW  CPU2 cpu_capacity=381
    kworker/u12:1-48      [004] .....     2.208862: sugov_start: 
SCHEDUTIL: NEW  CPU3 cpu_capacity=381
    kworker/u12:1-48      [004] .....     2.208864: sugov_start: 
SCHEDUTIL: NEW  CPU4 cpu_capacity=1024
    kworker/u12:1-48      [004] .....     2.208866: sugov_start: 
SCHEDUTIL: NEW  CPU5 cpu_capacity=1024
             bash-615     [005] .....    35.317113: sugov_start: 
schedutil the visit cpu mask is empty now
             bash-615     [005] .....    35.317120: sugov_start: 
SCHEDUTIL: NEW  CPU0 cpu_capacity=381
             bash-615     [005] .....    35.317123: sugov_start: 
SCHEDUTIL: NEW  CPU1 cpu_capacity=381
             bash-615     [005] .....    35.317125: sugov_start: 
SCHEDUTIL: NEW  CPU2 cpu_capacity=381
             bash-615     [005] .....    35.317127: sugov_start: 
SCHEDUTIL: NEW  CPU3 cpu_capacity=381
             bash-615     [005] .....    35.317129: sugov_start: 
SCHEDUTIL: NEW  CPU4 cpu_capacity=1024
             bash-615     [005] .....    35.317131: sugov_start: 
SCHEDUTIL: NEW  CPU5 cpu_capacity=1024
             bash-623     [003] .....    57.633328: sugov_start: 
schedutil the visit cpu mask is empty now
             bash-623     [003] .....    57.633336: sugov_start: 
SCHEDUTIL: NEW  CPU0 cpu_capacity=381
             bash-623     [003] .....    57.633339: sugov_start: 
SCHEDUTIL: NEW  CPU1 cpu_capacity=381
             bash-623     [003] .....    57.633340: sugov_start: 
SCHEDUTIL: NEW  CPU2 cpu_capacity=381
             bash-623     [003] .....    57.633342: sugov_start: 
SCHEDUTIL: NEW  CPU3 cpu_capacity=381
             bash-623     [003] .....    57.633343: sugov_start: 
SCHEDUTIL: NEW  CPU4 cpu_capacity=1024
             bash-623     [003] .....    57.633344: sugov_start: 
SCHEDUTIL: NEW  CPU5 cpu_capacity=1024
----------------------------------------------------

It should work.

Regards,
Lukasz
  
Vincent Guittot Nov. 30, 2022, 2:29 p.m. UTC | #12
On Wed, 30 Nov 2022 at 15:04, Lukasz Luba <lukasz.luba@arm.com> wrote:
>
> Hi Vincent,
>
> On 11/30/22 10:42, Vincent Guittot wrote:
> > Hi All
> >
> > Just for the log and because it took me a while to figure out the root
> > cause of the problem: This patch also creates a regression for
> > snapdragon845 based systems and probably on any QC chipsets that use a
> > LUT to update the OPP table at boot. The behavior is the same as
> > described by Sam with a staled value in sugov_policy.max field.
>
> Thanks for sharing this info and apologies that you spent cycles
> on it.
>
> I have checked that whole setup code (capacity + cpufreq policy and
> governor). It looks like to have a proper capacity of CPUs, we need
> to wait till the last policy is created. It's due to the arch_topology.c
> mechanism which is only triggered after all CPUs' got the policy.
> Unfortunately, this leads to a chicken & egg situation for this
> schedutil setup of max capacity.
>
> I have experimented with this code, which triggers an update in
> the schedutil, when all CPUs got the policy and sugov gov
> (with trace_printk() to mach the output below)

Your proposal below looks similar to what is done in arch_topology.c.
arch_topology.c triggers a rebuild of sched_domain and removes its
cpufreq notifier cb once it has visited all CPUs, could it also
trigger an update of CPU's policy with cpufreq_update_policy() ?

At this point you will be sure that the normalization has happened and
the max capacity will not change.

I don't know if it's a global problem or only for systems using arch_topology

>
> -------------------------8<-----------------------------------------
> diff --git a/kernel/sched/cpufreq_schedutil.c
> b/kernel/sched/cpufreq_schedutil.c
> index 9161d1136d01..f1913a857218 100644
> --- a/kernel/sched/cpufreq_schedutil.c
> +++ b/kernel/sched/cpufreq_schedutil.c
> @@ -59,6 +59,7 @@ struct sugov_cpu {
>   };
>
>   static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
> +static cpumask_var_t cpus_to_visit;
>
>   /************************ Governor internals ***********************/
>
> @@ -783,6 +784,22 @@ static int sugov_start(struct cpufreq_policy *policy)
>
>                  cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
> uu);
>          }
> +
> +       cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus);
> +
> +       if (cpumask_empty(cpus_to_visit)) {
> +               trace_printk("schedutil the visit cpu mask is empty now\n");
> +               for_each_possible_cpu(cpu) {
> +                       struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
> +                       struct sugov_policy *sg_policy = sg_cpu->sg_policy;
> +
> +                       sg_policy->max = arch_scale_cpu_capacity(cpu);
> +
> +                       trace_printk("SCHEDUTIL: NEW  CPU%u
> cpu_capacity=%lu\n",
> +                               cpu, sg_policy->max);
> +               }
> +       }
> +
>          return 0;
>   }
>
> @@ -800,6 +817,8 @@ static void sugov_stop(struct cpufreq_policy *policy)
>                  irq_work_sync(&sg_policy->irq_work);
>                  kthread_cancel_work_sync(&sg_policy->work);
>          }
> +
> +       cpumask_or(cpus_to_visit, cpus_to_visit, policy->related_cpus);
>   }
>
>   static void sugov_limits(struct cpufreq_policy *policy)
> @@ -829,6 +848,11 @@ struct cpufreq_governor schedutil_gov = {
>   #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
>   struct cpufreq_governor *cpufreq_default_governor(void)
>   {
> +       if (!alloc_cpumask_var(&cpus_to_visit, GFP_KERNEL))
> +               return NULL;
> +
> +       cpumask_copy(cpus_to_visit, cpu_possible_mask);
> +
>          return &schedutil_gov;
>   }
>   #endif
>
> ---------------------------------->8---------------------------------
>
>
> That simple approach fixes the issue. I have also tested it with
> governor change a few times and setting back the schedutil.
>
> -------------------------------------------
>     kworker/u12:1-48      [004] .....     2.208847: sugov_start:
> schedutil the visit cpu mask is empty now
>     kworker/u12:1-48      [004] .....     2.208854: sugov_start:
> SCHEDUTIL: NEW  CPU0 cpu_capacity=381
>     kworker/u12:1-48      [004] .....     2.208857: sugov_start:
> SCHEDUTIL: NEW  CPU1 cpu_capacity=381
>     kworker/u12:1-48      [004] .....     2.208860: sugov_start:
> SCHEDUTIL: NEW  CPU2 cpu_capacity=381
>     kworker/u12:1-48      [004] .....     2.208862: sugov_start:
> SCHEDUTIL: NEW  CPU3 cpu_capacity=381
>     kworker/u12:1-48      [004] .....     2.208864: sugov_start:
> SCHEDUTIL: NEW  CPU4 cpu_capacity=1024
>     kworker/u12:1-48      [004] .....     2.208866: sugov_start:
> SCHEDUTIL: NEW  CPU5 cpu_capacity=1024
>              bash-615     [005] .....    35.317113: sugov_start:
> schedutil the visit cpu mask is empty now
>              bash-615     [005] .....    35.317120: sugov_start:
> SCHEDUTIL: NEW  CPU0 cpu_capacity=381
>              bash-615     [005] .....    35.317123: sugov_start:
> SCHEDUTIL: NEW  CPU1 cpu_capacity=381
>              bash-615     [005] .....    35.317125: sugov_start:
> SCHEDUTIL: NEW  CPU2 cpu_capacity=381
>              bash-615     [005] .....    35.317127: sugov_start:
> SCHEDUTIL: NEW  CPU3 cpu_capacity=381
>              bash-615     [005] .....    35.317129: sugov_start:
> SCHEDUTIL: NEW  CPU4 cpu_capacity=1024
>              bash-615     [005] .....    35.317131: sugov_start:
> SCHEDUTIL: NEW  CPU5 cpu_capacity=1024
>              bash-623     [003] .....    57.633328: sugov_start:
> schedutil the visit cpu mask is empty now
>              bash-623     [003] .....    57.633336: sugov_start:
> SCHEDUTIL: NEW  CPU0 cpu_capacity=381
>              bash-623     [003] .....    57.633339: sugov_start:
> SCHEDUTIL: NEW  CPU1 cpu_capacity=381
>              bash-623     [003] .....    57.633340: sugov_start:
> SCHEDUTIL: NEW  CPU2 cpu_capacity=381
>              bash-623     [003] .....    57.633342: sugov_start:
> SCHEDUTIL: NEW  CPU3 cpu_capacity=381
>              bash-623     [003] .....    57.633343: sugov_start:
> SCHEDUTIL: NEW  CPU4 cpu_capacity=1024
>              bash-623     [003] .....    57.633344: sugov_start:
> SCHEDUTIL: NEW  CPU5 cpu_capacity=1024
> ----------------------------------------------------
>
> It should work.
>
> Regards,
> Lukasz
  
Lukasz Luba Nov. 30, 2022, 3 p.m. UTC | #13
On 11/30/22 14:29, Vincent Guittot wrote:
> On Wed, 30 Nov 2022 at 15:04, Lukasz Luba <lukasz.luba@arm.com> wrote:
>>
>> Hi Vincent,
>>
>> On 11/30/22 10:42, Vincent Guittot wrote:
>>> Hi All
>>>
>>> Just for the log and because it took me a while to figure out the root
>>> cause of the problem: This patch also creates a regression for
>>> snapdragon845 based systems and probably on any QC chipsets that use a
>>> LUT to update the OPP table at boot. The behavior is the same as
>>> described by Sam with a staled value in sugov_policy.max field.
>>
>> Thanks for sharing this info and apologies that you spent cycles
>> on it.
>>
>> I have checked that whole setup code (capacity + cpufreq policy and
>> governor). It looks like to have a proper capacity of CPUs, we need
>> to wait till the last policy is created. It's due to the arch_topology.c
>> mechanism which is only triggered after all CPUs' got the policy.
>> Unfortunately, this leads to a chicken & egg situation for this
>> schedutil setup of max capacity.
>>
>> I have experimented with this code, which triggers an update in
>> the schedutil, when all CPUs got the policy and sugov gov
>> (with trace_printk() to mach the output below)
> 
> Your proposal below looks similar to what is done in arch_topology.c.

Yes, even the name 'cpus_to_visit' looks similar ;)

> arch_topology.c triggers a rebuild of sched_domain and removes its
> cpufreq notifier cb once it has visited all CPUs, could it also
> trigger an update of CPU's policy with cpufreq_update_policy() ?
> 
> At this point you will be sure that the normalization has happened and
> the max capacity will not change.

True, they are done under that blocking notification chain, for the
last policy init. This is before the last time we call the
schedutil sugov_start with that last policy. That's why this code
is able to see that properly normalized max capacity under the:
trace_printk("schedutil the visit cpu mask is empty now\n");


> 
> I don't know if it's a global problem or only for systems using arch_topology
> 

It would only be for those with arch_topology, so only our asymmetric
systems AFAICS.
  
Viresh Kumar Dec. 5, 2022, 9:18 a.m. UTC | #14
Lukasz,

On 10-11-22, 19:57, Sam Wu wrote:
> diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
> index 9161d1136d01..1207c78f85c1 100644
> --- a/kernel/sched/cpufreq_schedutil.c
> +++ b/kernel/sched/cpufreq_schedutil.c
> @@ -25,9 +25,6 @@ struct sugov_policy {
>  	unsigned int		next_freq;
>  	unsigned int		cached_raw_freq;
>  
> -	/* max CPU capacity, which is equal for all CPUs in freq. domain */
> -	unsigned long		max;
> -
>  	/* The next fields are only needed if fast switch cannot be used: */
>  	struct			irq_work irq_work;
>  	struct			kthread_work work;
> @@ -51,6 +48,7 @@ struct sugov_cpu {
>  
>  	unsigned long		util;
>  	unsigned long		bw_dl;
> +	unsigned long		max;

IIUC, this part, i.e. moving max to sugov_policy, wasn't the problem
here, right ? Can you send a patch for that at least first, since this
is fully reverted now.

Or it doesn't make sense?
  
Lukasz Luba Dec. 6, 2022, 8:17 a.m. UTC | #15
Hi Viresh,

On 12/5/22 09:18, Viresh Kumar wrote:
> Lukasz,
> 
> On 10-11-22, 19:57, Sam Wu wrote:
>> diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
>> index 9161d1136d01..1207c78f85c1 100644
>> --- a/kernel/sched/cpufreq_schedutil.c
>> +++ b/kernel/sched/cpufreq_schedutil.c
>> @@ -25,9 +25,6 @@ struct sugov_policy {
>>   	unsigned int		next_freq;
>>   	unsigned int		cached_raw_freq;
>>   
>> -	/* max CPU capacity, which is equal for all CPUs in freq. domain */
>> -	unsigned long		max;
>> -
>>   	/* The next fields are only needed if fast switch cannot be used: */
>>   	struct			irq_work irq_work;
>>   	struct			kthread_work work;
>> @@ -51,6 +48,7 @@ struct sugov_cpu {
>>   
>>   	unsigned long		util;
>>   	unsigned long		bw_dl;
>> +	unsigned long		max;
> 
> IIUC, this part, i.e. moving max to sugov_policy, wasn't the problem
> here, right ? Can you send a patch for that at least first, since this
> is fully reverted now.
> 
> Or it doesn't make sense?
> 

Yes, that still could make sense. We could still optimize a bit that
code in the sugov_next_freq_shared(). Look at that function. It loops
over all CPUs in the policy and calls sugov_get_util() which calls
this arch_scale_cpu_capacity() N times. Then it does those
multiplications below:

if (j_util * max > j_max * util)

which will be 2*N mul operations...
IMO this is pointless and heavy for LITTLE cores which are 4 or
sometimes 6 in the policy.

As you could see, my code just left that loop with a simple
max() operation.

I might just attack this code differently. Switch to that
sugov_policy::max, fetch the cpu capacity only once, before that loop.
I will rewrite a bit the sugov_get_util() and adjust the
2nd user of it: sugov_update_single_common()

Regards,
Lukasz
  

Patch

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 9161d1136d01..1207c78f85c1 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -25,9 +25,6 @@  struct sugov_policy {
 	unsigned int		next_freq;
 	unsigned int		cached_raw_freq;
 
-	/* max CPU capacity, which is equal for all CPUs in freq. domain */
-	unsigned long		max;
-
 	/* The next fields are only needed if fast switch cannot be used: */
 	struct			irq_work irq_work;
 	struct			kthread_work work;
@@ -51,6 +48,7 @@  struct sugov_cpu {
 
 	unsigned long		util;
 	unsigned long		bw_dl;
+	unsigned long		max;
 
 	/* The field below is for single-CPU policies only: */
 #ifdef CONFIG_NO_HZ_COMMON
@@ -160,6 +158,7 @@  static void sugov_get_util(struct sugov_cpu *sg_cpu)
 {
 	struct rq *rq = cpu_rq(sg_cpu->cpu);
 
+	sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu);
 	sg_cpu->bw_dl = cpu_bw_dl(rq);
 	sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
 					  FREQUENCY_UTIL, NULL);
@@ -254,7 +253,6 @@  static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
  */
 static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
 {
-	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
 	unsigned long boost;
 
 	/* No boost currently required */
@@ -282,8 +280,7 @@  static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
 	 * sg_cpu->util is already in capacity scale; convert iowait_boost
 	 * into the same scale so we can compare.
 	 */
-	boost = sg_cpu->iowait_boost * sg_policy->max;
-	boost >>= SCHED_CAPACITY_SHIFT;
+	boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT;
 	boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL);
 	if (sg_cpu->util < boost)
 		sg_cpu->util = boost;
@@ -340,7 +337,7 @@  static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
 	if (!sugov_update_single_common(sg_cpu, time, flags))
 		return;
 
-	next_f = get_next_freq(sg_policy, sg_cpu->util, sg_policy->max);
+	next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max);
 	/*
 	 * Do not reduce the frequency if the CPU has not been idle
 	 * recently, as the reduction is likely to be premature then.
@@ -376,7 +373,6 @@  static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
 				     unsigned int flags)
 {
 	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
-	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
 	unsigned long prev_util = sg_cpu->util;
 
 	/*
@@ -403,8 +399,7 @@  static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
 		sg_cpu->util = prev_util;
 
 	cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl),
-				   map_util_perf(sg_cpu->util),
-				   sg_policy->max);
+				   map_util_perf(sg_cpu->util), sg_cpu->max);
 
 	sg_cpu->sg_policy->last_freq_update_time = time;
 }
@@ -413,19 +408,25 @@  static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
 {
 	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
 	struct cpufreq_policy *policy = sg_policy->policy;
-	unsigned long util = 0;
+	unsigned long util = 0, max = 1;
 	unsigned int j;
 
 	for_each_cpu(j, policy->cpus) {
 		struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
+		unsigned long j_util, j_max;
 
 		sugov_get_util(j_sg_cpu);
 		sugov_iowait_apply(j_sg_cpu, time);
+		j_util = j_sg_cpu->util;
+		j_max = j_sg_cpu->max;
 
-		util = max(j_sg_cpu->util, util);
+		if (j_util * max > j_max * util) {
+			util = j_util;
+			max = j_max;
+		}
 	}
 
-	return get_next_freq(sg_policy, util, sg_policy->max);
+	return get_next_freq(sg_policy, util, max);
 }
 
 static void
@@ -751,7 +752,7 @@  static int sugov_start(struct cpufreq_policy *policy)
 {
 	struct sugov_policy *sg_policy = policy->governor_data;
 	void (*uu)(struct update_util_data *data, u64 time, unsigned int flags);
-	unsigned int cpu = cpumask_first(policy->cpus);
+	unsigned int cpu;
 
 	sg_policy->freq_update_delay_ns	= sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
 	sg_policy->last_freq_update_time	= 0;
@@ -759,7 +760,6 @@  static int sugov_start(struct cpufreq_policy *policy)
 	sg_policy->work_in_progress		= false;
 	sg_policy->limits_changed		= false;
 	sg_policy->cached_raw_freq		= 0;
-	sg_policy->max				= arch_scale_cpu_capacity(cpu);
 
 	sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);