[1/4] cpufreq: Add a cpufreq pressure feedback for the scheduler

Message ID 20231212142730.998913-2-vincent.guittot@linaro.org
State New
Headers
Series Rework system pressure interface to the scheduler |

Commit Message

Vincent Guittot Dec. 12, 2023, 2:27 p.m. UTC
  Provide to the scheduler a feedback about the temporary max available
capacity. Unlike arch_update_thermal_pressure, this doesn't need to be
filtered as the pressure will happen for dozens ms or more.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
 drivers/cpufreq/cpufreq.c | 48 +++++++++++++++++++++++++++++++++++++++
 include/linux/cpufreq.h   | 10 ++++++++
 2 files changed, 58 insertions(+)
  

Comments

Viresh Kumar Dec. 13, 2023, 7:17 a.m. UTC | #1
On 12-12-23, 15:27, Vincent Guittot wrote:
> Provide to the scheduler a feedback about the temporary max available
> capacity. Unlike arch_update_thermal_pressure, this doesn't need to be
> filtered as the pressure will happen for dozens ms or more.
> 
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
>  drivers/cpufreq/cpufreq.c | 48 +++++++++++++++++++++++++++++++++++++++
>  include/linux/cpufreq.h   | 10 ++++++++
>  2 files changed, 58 insertions(+)
> 
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index 44db4f59c4cc..7d5f71be8d29 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -2563,6 +2563,50 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu)
>  }
>  EXPORT_SYMBOL(cpufreq_get_policy);
>  
> +DEFINE_PER_CPU(unsigned long, cpufreq_pressure);
> +EXPORT_PER_CPU_SYMBOL_GPL(cpufreq_pressure);
> +
> +/**
> + * cpufreq_update_pressure() - Update cpufreq pressure for CPUs
> + * @cpus        : The related CPUs for which max capacity has been reduced
> + * @capped_freq : The maximum allowed frequency that CPUs can run at
> + *
> + * Update the value of cpufreq pressure for all @cpus in the mask. The
> + * cpumask should include all (online+offline) affected CPUs, to avoid
> + * operating on stale data when hot-plug is used for some CPUs. The
> + * @capped_freq reflects the currently allowed max CPUs frequency due to
> + * freq_qos capping. It might be also a boost frequency value, which is bigger
> + * than the internal 'capacity_freq_ref' max frequency. In such case the
> + * pressure value should simply be removed, since this is an indication that
> + * there is no capping. The @capped_freq must be provided in kHz.
> + */
> +static void cpufreq_update_pressure(const struct cpumask *cpus,

Since this is defined as 'static', why not just pass policy here ?

> +				      unsigned long capped_freq)
> +{
> +	unsigned long max_capacity, capacity, pressure;
> +	u32 max_freq;
> +	int cpu;
> +
> +	cpu = cpumask_first(cpus);
> +	max_capacity = arch_scale_cpu_capacity(cpu);

This anyway expects all of them to be from the same policy ..

> +	max_freq = arch_scale_freq_ref(cpu);
> +
> +	/*
> +	 * Handle properly the boost frequencies, which should simply clean
> +	 * the thermal pressure value.
> +	 */
> +	if (max_freq <= capped_freq)
> +		capacity = max_capacity;
> +	else
> +		capacity = mult_frac(max_capacity, capped_freq, max_freq);
> +
> +	pressure = max_capacity - capacity;
> +

Extra blank line here.

> +
> +	for_each_cpu(cpu, cpus)
> +		WRITE_ONCE(per_cpu(cpufreq_pressure, cpu), pressure);
> +}
> +
>  /**
>   * cpufreq_set_policy - Modify cpufreq policy parameters.
>   * @policy: Policy object to modify.
> @@ -2584,6 +2628,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
>  {
>  	struct cpufreq_policy_data new_data;
>  	struct cpufreq_governor *old_gov;
> +	struct cpumask *cpus;
>  	int ret;
>  
>  	memcpy(&new_data.cpuinfo, &policy->cpuinfo, sizeof(policy->cpuinfo));
> @@ -2618,6 +2663,9 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
>  	policy->max = __resolve_freq(policy, policy->max, CPUFREQ_RELATION_H);
>  	trace_cpu_frequency_limits(policy);
>  
> +	cpus = policy->related_cpus;

You don't need the extra variable anyway, but lets just pass policy
instead to the routine.

> +	cpufreq_update_pressure(cpus, policy->max);
> +
>  	policy->cached_target_freq = UINT_MAX;
>  
>  	pr_debug("new min and max freqs are %u - %u kHz\n",
> diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
> index afda5f24d3dd..b1d97edd3253 100644
> --- a/include/linux/cpufreq.h
> +++ b/include/linux/cpufreq.h
> @@ -241,6 +241,12 @@ struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
>  void cpufreq_enable_fast_switch(struct cpufreq_policy *policy);
>  void cpufreq_disable_fast_switch(struct cpufreq_policy *policy);
>  bool has_target_index(void);
> +
> +DECLARE_PER_CPU(unsigned long, cpufreq_pressure);
> +static inline unsigned long cpufreq_get_pressure(int cpu)
> +{
> +	return per_cpu(cpufreq_pressure, cpu);
> +}
>  #else
>  static inline unsigned int cpufreq_get(unsigned int cpu)
>  {
> @@ -263,6 +269,10 @@ static inline bool cpufreq_supports_freq_invariance(void)
>  	return false;
>  }
>  static inline void disable_cpufreq(void) { }
> +static inline unsigned long cpufreq_get_pressure(int cpu)
> +{
> +	return 0;
> +}
>  #endif
>  
>  #ifdef CONFIG_CPU_FREQ_STAT
> -- 
> 2.34.1
  
Vincent Guittot Dec. 13, 2023, 8:05 a.m. UTC | #2
On Wed, 13 Dec 2023 at 08:17, Viresh Kumar <viresh.kumar@linaro.org> wrote:
>
> On 12-12-23, 15:27, Vincent Guittot wrote:
> > Provide to the scheduler a feedback about the temporary max available
> > capacity. Unlike arch_update_thermal_pressure, this doesn't need to be
> > filtered as the pressure will happen for dozens ms or more.
> >
> > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> > ---
> >  drivers/cpufreq/cpufreq.c | 48 +++++++++++++++++++++++++++++++++++++++
> >  include/linux/cpufreq.h   | 10 ++++++++
> >  2 files changed, 58 insertions(+)
> >
> > diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> > index 44db4f59c4cc..7d5f71be8d29 100644
> > --- a/drivers/cpufreq/cpufreq.c
> > +++ b/drivers/cpufreq/cpufreq.c
> > @@ -2563,6 +2563,50 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu)
> >  }
> >  EXPORT_SYMBOL(cpufreq_get_policy);
> >
> > +DEFINE_PER_CPU(unsigned long, cpufreq_pressure);
> > +EXPORT_PER_CPU_SYMBOL_GPL(cpufreq_pressure);
> > +
> > +/**
> > + * cpufreq_update_pressure() - Update cpufreq pressure for CPUs
> > + * @cpus        : The related CPUs for which max capacity has been reduced
> > + * @capped_freq : The maximum allowed frequency that CPUs can run at
> > + *
> > + * Update the value of cpufreq pressure for all @cpus in the mask. The
> > + * cpumask should include all (online+offline) affected CPUs, to avoid
> > + * operating on stale data when hot-plug is used for some CPUs. The
> > + * @capped_freq reflects the currently allowed max CPUs frequency due to
> > + * freq_qos capping. It might be also a boost frequency value, which is bigger
> > + * than the internal 'capacity_freq_ref' max frequency. In such case the
> > + * pressure value should simply be removed, since this is an indication that
> > + * there is no capping. The @capped_freq must be provided in kHz.
> > + */
> > +static void cpufreq_update_pressure(const struct cpumask *cpus,
>
> Since this is defined as 'static', why not just pass policy here ?

Mainly because we only need the cpumask and also because this follows
the same pattern as other place like arch_topology.c

>
> > +                                   unsigned long capped_freq)
> > +{
> > +     unsigned long max_capacity, capacity, pressure;
> > +     u32 max_freq;
> > +     int cpu;
> > +
> > +     cpu = cpumask_first(cpus);
> > +     max_capacity = arch_scale_cpu_capacity(cpu);
>
> This anyway expects all of them to be from the same policy ..
>
> > +     max_freq = arch_scale_freq_ref(cpu);
> > +
> > +     /*
> > +      * Handle properly the boost frequencies, which should simply clean
> > +      * the thermal pressure value.
> > +      */
> > +     if (max_freq <= capped_freq)
> > +             capacity = max_capacity;
> > +     else
> > +             capacity = mult_frac(max_capacity, capped_freq, max_freq);
> > +
> > +     pressure = max_capacity - capacity;
> > +
>
> Extra blank line here.
>
> > +
> > +     for_each_cpu(cpu, cpus)
> > +             WRITE_ONCE(per_cpu(cpufreq_pressure, cpu), pressure);
> > +}
> > +
> >  /**
> >   * cpufreq_set_policy - Modify cpufreq policy parameters.
> >   * @policy: Policy object to modify.
> > @@ -2584,6 +2628,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
> >  {
> >       struct cpufreq_policy_data new_data;
> >       struct cpufreq_governor *old_gov;
> > +     struct cpumask *cpus;
> >       int ret;
> >
> >       memcpy(&new_data.cpuinfo, &policy->cpuinfo, sizeof(policy->cpuinfo));
> > @@ -2618,6 +2663,9 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
> >       policy->max = __resolve_freq(policy, policy->max, CPUFREQ_RELATION_H);
> >       trace_cpu_frequency_limits(policy);
> >
> > +     cpus = policy->related_cpus;
>
> You don't need the extra variable anyway, but lets just pass policy
> instead to the routine.

In fact I have followed what was done in cpufreq_cooling.c with
arch_update_thermal_pressure().

Will remove it

>
> > +     cpufreq_update_pressure(cpus, policy->max);
> > +
> >       policy->cached_target_freq = UINT_MAX;
> >
> >       pr_debug("new min and max freqs are %u - %u kHz\n",
> > diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
> > index afda5f24d3dd..b1d97edd3253 100644
> > --- a/include/linux/cpufreq.h
> > +++ b/include/linux/cpufreq.h
> > @@ -241,6 +241,12 @@ struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
> >  void cpufreq_enable_fast_switch(struct cpufreq_policy *policy);
> >  void cpufreq_disable_fast_switch(struct cpufreq_policy *policy);
> >  bool has_target_index(void);
> > +
> > +DECLARE_PER_CPU(unsigned long, cpufreq_pressure);
> > +static inline unsigned long cpufreq_get_pressure(int cpu)
> > +{
> > +     return per_cpu(cpufreq_pressure, cpu);
> > +}
> >  #else
> >  static inline unsigned int cpufreq_get(unsigned int cpu)
> >  {
> > @@ -263,6 +269,10 @@ static inline bool cpufreq_supports_freq_invariance(void)
> >       return false;
> >  }
> >  static inline void disable_cpufreq(void) { }
> > +static inline unsigned long cpufreq_get_pressure(int cpu)
> > +{
> > +     return 0;
> > +}
> >  #endif
> >
> >  #ifdef CONFIG_CPU_FREQ_STAT
> > --
> > 2.34.1
>
> --
> viresh
  
Tim Chen Dec. 14, 2023, 12:41 a.m. UTC | #3
On Tue, 2023-12-12 at 15:27 +0100, Vincent Guittot wrote:
> Provide to the scheduler a feedback about the temporary max available
> capacity. Unlike arch_update_thermal_pressure, this doesn't need to be
> filtered as the pressure will happen for dozens ms or more.
> 
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
>  drivers/cpufreq/cpufreq.c | 48 +++++++++++++++++++++++++++++++++++++++
>  include/linux/cpufreq.h   | 10 ++++++++
>  2 files changed, 58 insertions(+)
> 
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index 44db4f59c4cc..7d5f71be8d29 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -2563,6 +2563,50 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu)
>  }
>  EXPORT_SYMBOL(cpufreq_get_policy);
>  
> +DEFINE_PER_CPU(unsigned long, cpufreq_pressure);
> +EXPORT_PER_CPU_SYMBOL_GPL(cpufreq_pressure);
> +
> +/**
> + * cpufreq_update_pressure() - Update cpufreq pressure for CPUs
> + * @cpus        : The related CPUs for which max capacity has been reduced
> + * @capped_freq : The maximum allowed frequency that CPUs can run at
> + *
> + * Update the value of cpufreq pressure for all @cpus in the mask. The
> + * cpumask should include all (online+offline) affected CPUs, to avoid
> + * operating on stale data when hot-plug is used for some CPUs. The
> + * @capped_freq reflects the currently allowed max CPUs frequency due to
> + * freq_qos capping. It might be also a boost frequency value, which is bigger
> + * than the internal 'capacity_freq_ref' max frequency. In such case the
> + * pressure value should simply be removed, since this is an indication that
> + * there is no capping. The @capped_freq must be provided in kHz.
> + */
> +static void cpufreq_update_pressure(const struct cpumask *cpus,
> +				      unsigned long capped_freq)
> +{
> +	unsigned long max_capacity, capacity, pressure;
> +	u32 max_freq;
> +	int cpu;
> +
> +	cpu = cpumask_first(cpus);
> +	max_capacity = arch_scale_cpu_capacity(cpu);
> +	max_freq = arch_scale_freq_ref(cpu);
> +
> +	/*
> +	 * Handle properly the boost frequencies, which should simply clean
> +	 * the thermal pressure value.
> +	 */
> +	if (max_freq <= capped_freq)
> +		capacity = max_capacity;
> +	else
> +		capacity = mult_frac(max_capacity, capped_freq, max_freq);
> +
> +	pressure = max_capacity - capacity;
> +
> +
> +	for_each_cpu(cpu, cpus)
> +		WRITE_ONCE(per_cpu(cpufreq_pressure, cpu), pressure);

Seems like the pressure value computed from the first CPU applies to all CPU.
Will this be valid for non-homogeneous CPUs that could have different
max_freq and max_capacity?

Tim
  
Viresh Kumar Dec. 14, 2023, 5:36 a.m. UTC | #4
On 13-12-23, 16:41, Tim Chen wrote:
> Seems like the pressure value computed from the first CPU applies to all CPU.
> Will this be valid for non-homogeneous CPUs that could have different
> max_freq and max_capacity?

The will be part of different cpufreq policies and so it will work
fine.
  
Viresh Kumar Dec. 14, 2023, 5:43 a.m. UTC | #5
On 12-12-23, 15:27, Vincent Guittot wrote:
> @@ -2618,6 +2663,9 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
>  	policy->max = __resolve_freq(policy, policy->max, CPUFREQ_RELATION_H);
>  	trace_cpu_frequency_limits(policy);
>  
> +	cpus = policy->related_cpus;
> +	cpufreq_update_pressure(cpus, policy->max);
> +
>  	policy->cached_target_freq = UINT_MAX;

One more question, why are you doing this from cpufreq_set_policy ? If
due to cpufreq cooling or from userspace, we end up limiting the
maximum possible frequency, will this routine always get called ?
  
Vincent Guittot Dec. 14, 2023, 7:57 a.m. UTC | #6
On Thu, 14 Dec 2023 at 06:43, Viresh Kumar <viresh.kumar@linaro.org> wrote:
>
> On 12-12-23, 15:27, Vincent Guittot wrote:
> > @@ -2618,6 +2663,9 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
> >       policy->max = __resolve_freq(policy, policy->max, CPUFREQ_RELATION_H);
> >       trace_cpu_frequency_limits(policy);
> >
> > +     cpus = policy->related_cpus;
> > +     cpufreq_update_pressure(cpus, policy->max);
> > +
> >       policy->cached_target_freq = UINT_MAX;
>
> One more question, why are you doing this from cpufreq_set_policy ? If
> due to cpufreq cooling or from userspace, we end up limiting the
> maximum possible frequency, will this routine always get called ?

Yes, any update of a FREQ_QOS_MAX ends up calling cpufreq_set_policy()
to update the policy->max


>
> --
> viresh
  
Lukasz Luba Dec. 14, 2023, 9:08 a.m. UTC | #7
On 12/14/23 07:57, Vincent Guittot wrote:
> On Thu, 14 Dec 2023 at 06:43, Viresh Kumar <viresh.kumar@linaro.org> wrote:
>>
>> On 12-12-23, 15:27, Vincent Guittot wrote:
>>> @@ -2618,6 +2663,9 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
>>>        policy->max = __resolve_freq(policy, policy->max, CPUFREQ_RELATION_H);
>>>        trace_cpu_frequency_limits(policy);
>>>
>>> +     cpus = policy->related_cpus;
>>> +     cpufreq_update_pressure(cpus, policy->max);
>>> +
>>>        policy->cached_target_freq = UINT_MAX;
>>
>> One more question, why are you doing this from cpufreq_set_policy ? If
>> due to cpufreq cooling or from userspace, we end up limiting the
>> maximum possible frequency, will this routine always get called ?
> 
> Yes, any update of a FREQ_QOS_MAX ends up calling cpufreq_set_policy()
> to update the policy->max
> 

Agree, cpufreq sysfs scaling_max_freq is also important to handle
in this new design. Currently we don't reflect that as reduced CPU
capacity in the scheduler. There was discussion when I proposed to feed
that CPU frequency reduction into thermal_pressure [1].

The same applies for the DTPM which is missing currently the proper
impact to the CPU reduced capacity in the scheduler.

IMHO any limit set into FREQ_QOS_MAX should be visible in this
new design of capacity reduction signaling.

[1] https://lore.kernel.org/lkml/20220930094821.31665-2-lukasz.luba@arm.com/
  
Lukasz Luba Dec. 14, 2023, 9:21 a.m. UTC | #8
On 12/12/23 14:27, Vincent Guittot wrote:
> Provide to the scheduler a feedback about the temporary max available
> capacity. Unlike arch_update_thermal_pressure, this doesn't need to be
> filtered as the pressure will happen for dozens ms or more.
> 
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
>   drivers/cpufreq/cpufreq.c | 48 +++++++++++++++++++++++++++++++++++++++
>   include/linux/cpufreq.h   | 10 ++++++++
>   2 files changed, 58 insertions(+)
> 
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index 44db4f59c4cc..7d5f71be8d29 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -2563,6 +2563,50 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu)
>   }
>   EXPORT_SYMBOL(cpufreq_get_policy);
>   
> +DEFINE_PER_CPU(unsigned long, cpufreq_pressure);
> +EXPORT_PER_CPU_SYMBOL_GPL(cpufreq_pressure);

Why do we export this variable when we have get/update functions?
Do we expect modules would manipulate those per-cpu variables
independently and not like we do per-cpumask in the update func.?
  
Rafael J. Wysocki Dec. 14, 2023, 9:40 a.m. UTC | #9
On Thu, Dec 14, 2023 at 10:07 AM Lukasz Luba <lukasz.luba@arm.com> wrote:
>
> On 12/14/23 07:57, Vincent Guittot wrote:
> > On Thu, 14 Dec 2023 at 06:43, Viresh Kumar <viresh.kumar@linaro.org> wrote:
> >>
> >> On 12-12-23, 15:27, Vincent Guittot wrote:
> >>> @@ -2618,6 +2663,9 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
> >>>        policy->max = __resolve_freq(policy, policy->max, CPUFREQ_RELATION_H);
> >>>        trace_cpu_frequency_limits(policy);
> >>>
> >>> +     cpus = policy->related_cpus;
> >>> +     cpufreq_update_pressure(cpus, policy->max);
> >>> +
> >>>        policy->cached_target_freq = UINT_MAX;
> >>
> >> One more question, why are you doing this from cpufreq_set_policy ? If
> >> due to cpufreq cooling or from userspace, we end up limiting the
> >> maximum possible frequency, will this routine always get called ?
> >
> > Yes, any update of a FREQ_QOS_MAX ends up calling cpufreq_set_policy()
> > to update the policy->max
> >
>
> Agree, cpufreq sysfs scaling_max_freq is also important to handle
> in this new design. Currently we don't reflect that as reduced CPU
> capacity in the scheduler. There was discussion when I proposed to feed
> that CPU frequency reduction into thermal_pressure [1].
>
> The same applies for the DTPM which is missing currently the proper
> impact to the CPU reduced capacity in the scheduler.
>
> IMHO any limit set into FREQ_QOS_MAX should be visible in this
> new design of capacity reduction signaling.
>
> [1] https://lore.kernel.org/lkml/20220930094821.31665-2-lukasz.luba@arm.com/

Actually, freq_qos_read_value(&policy->constraints, FREQ_QOS_MAX) will
return the requisite limit.
  
Lukasz Luba Dec. 14, 2023, 10:41 a.m. UTC | #10
On 12/14/23 09:40, Rafael J. Wysocki wrote:
> On Thu, Dec 14, 2023 at 10:07 AM Lukasz Luba <lukasz.luba@arm.com> wrote:
>>
>> On 12/14/23 07:57, Vincent Guittot wrote:
>>> On Thu, 14 Dec 2023 at 06:43, Viresh Kumar <viresh.kumar@linaro.org> wrote:
>>>>
>>>> On 12-12-23, 15:27, Vincent Guittot wrote:
>>>>> @@ -2618,6 +2663,9 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
>>>>>         policy->max = __resolve_freq(policy, policy->max, CPUFREQ_RELATION_H);
>>>>>         trace_cpu_frequency_limits(policy);
>>>>>
>>>>> +     cpus = policy->related_cpus;
>>>>> +     cpufreq_update_pressure(cpus, policy->max);
>>>>> +
>>>>>         policy->cached_target_freq = UINT_MAX;
>>>>
>>>> One more question, why are you doing this from cpufreq_set_policy ? If
>>>> due to cpufreq cooling or from userspace, we end up limiting the
>>>> maximum possible frequency, will this routine always get called ?
>>>
>>> Yes, any update of a FREQ_QOS_MAX ends up calling cpufreq_set_policy()
>>> to update the policy->max
>>>
>>
>> Agree, cpufreq sysfs scaling_max_freq is also important to handle
>> in this new design. Currently we don't reflect that as reduced CPU
>> capacity in the scheduler. There was discussion when I proposed to feed
>> that CPU frequency reduction into thermal_pressure [1].
>>
>> The same applies for the DTPM which is missing currently the proper
>> impact to the CPU reduced capacity in the scheduler.
>>
>> IMHO any limit set into FREQ_QOS_MAX should be visible in this
>> new design of capacity reduction signaling.
>>
>> [1] https://lore.kernel.org/lkml/20220930094821.31665-2-lukasz.luba@arm.com/
> 
> Actually, freq_qos_read_value(&policy->constraints, FREQ_QOS_MAX) will
> return the requisite limit.

Yes, but we need to translate that information from freq domain
into capacity domain and plumb ii into scheduler as stolen CPU capacity.
Ideally, w/o any 'smoothing' but just instant value.
That's the hope of this patch set re-design.
  
Vincent Guittot Dec. 14, 2023, 11:06 a.m. UTC | #11
On Thu, 14 Dec 2023 at 10:20, Lukasz Luba <lukasz.luba@arm.com> wrote:
>
>
>
> On 12/12/23 14:27, Vincent Guittot wrote:
> > Provide to the scheduler a feedback about the temporary max available
> > capacity. Unlike arch_update_thermal_pressure, this doesn't need to be
> > filtered as the pressure will happen for dozens ms or more.
> >
> > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> > ---
> >   drivers/cpufreq/cpufreq.c | 48 +++++++++++++++++++++++++++++++++++++++
> >   include/linux/cpufreq.h   | 10 ++++++++
> >   2 files changed, 58 insertions(+)
> >
> > diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> > index 44db4f59c4cc..7d5f71be8d29 100644
> > --- a/drivers/cpufreq/cpufreq.c
> > +++ b/drivers/cpufreq/cpufreq.c
> > @@ -2563,6 +2563,50 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu)
> >   }
> >   EXPORT_SYMBOL(cpufreq_get_policy);
> >
> > +DEFINE_PER_CPU(unsigned long, cpufreq_pressure);
> > +EXPORT_PER_CPU_SYMBOL_GPL(cpufreq_pressure);
>
> Why do we export this variable when we have get/update functions?
> Do we expect modules would manipulate those per-cpu variables
> independently and not like we do per-cpumask in the update func.?

No, I will remove the EXPORT_PER_CPU_SYMBOL_GPL
  

Patch

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 44db4f59c4cc..7d5f71be8d29 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2563,6 +2563,50 @@  int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu)
 }
 EXPORT_SYMBOL(cpufreq_get_policy);
 
+DEFINE_PER_CPU(unsigned long, cpufreq_pressure);
+EXPORT_PER_CPU_SYMBOL_GPL(cpufreq_pressure);
+
+/**
+ * cpufreq_update_pressure() - Update cpufreq pressure for CPUs
+ * @cpus        : The related CPUs for which max capacity has been reduced
+ * @capped_freq : The maximum allowed frequency that CPUs can run at
+ *
+ * Update the value of cpufreq pressure for all @cpus in the mask. The
+ * cpumask should include all (online+offline) affected CPUs, to avoid
+ * operating on stale data when hot-plug is used for some CPUs. The
+ * @capped_freq reflects the currently allowed max CPUs frequency due to
+ * freq_qos capping. It might be also a boost frequency value, which is bigger
+ * than the internal 'capacity_freq_ref' max frequency. In such case the
+ * pressure value should simply be removed, since this is an indication that
+ * there is no capping. The @capped_freq must be provided in kHz.
+ */
+static void cpufreq_update_pressure(const struct cpumask *cpus,
+				      unsigned long capped_freq)
+{
+	unsigned long max_capacity, capacity, pressure;
+	u32 max_freq;
+	int cpu;
+
+	cpu = cpumask_first(cpus);
+	max_capacity = arch_scale_cpu_capacity(cpu);
+	max_freq = arch_scale_freq_ref(cpu);
+
+	/*
+	 * Handle properly the boost frequencies, which should simply clean
+	 * the thermal pressure value.
+	 */
+	if (max_freq <= capped_freq)
+		capacity = max_capacity;
+	else
+		capacity = mult_frac(max_capacity, capped_freq, max_freq);
+
+	pressure = max_capacity - capacity;
+
+
+	for_each_cpu(cpu, cpus)
+		WRITE_ONCE(per_cpu(cpufreq_pressure, cpu), pressure);
+}
+
 /**
  * cpufreq_set_policy - Modify cpufreq policy parameters.
  * @policy: Policy object to modify.
@@ -2584,6 +2628,7 @@  static int cpufreq_set_policy(struct cpufreq_policy *policy,
 {
 	struct cpufreq_policy_data new_data;
 	struct cpufreq_governor *old_gov;
+	struct cpumask *cpus;
 	int ret;
 
 	memcpy(&new_data.cpuinfo, &policy->cpuinfo, sizeof(policy->cpuinfo));
@@ -2618,6 +2663,9 @@  static int cpufreq_set_policy(struct cpufreq_policy *policy,
 	policy->max = __resolve_freq(policy, policy->max, CPUFREQ_RELATION_H);
 	trace_cpu_frequency_limits(policy);
 
+	cpus = policy->related_cpus;
+	cpufreq_update_pressure(cpus, policy->max);
+
 	policy->cached_target_freq = UINT_MAX;
 
 	pr_debug("new min and max freqs are %u - %u kHz\n",
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index afda5f24d3dd..b1d97edd3253 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -241,6 +241,12 @@  struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
 void cpufreq_enable_fast_switch(struct cpufreq_policy *policy);
 void cpufreq_disable_fast_switch(struct cpufreq_policy *policy);
 bool has_target_index(void);
+
+DECLARE_PER_CPU(unsigned long, cpufreq_pressure);
+static inline unsigned long cpufreq_get_pressure(int cpu)
+{
+	return per_cpu(cpufreq_pressure, cpu);
+}
 #else
 static inline unsigned int cpufreq_get(unsigned int cpu)
 {
@@ -263,6 +269,10 @@  static inline bool cpufreq_supports_freq_invariance(void)
 	return false;
 }
 static inline void disable_cpufreq(void) { }
+static inline unsigned long cpufreq_get_pressure(int cpu)
+{
+	return 0;
+}
 #endif
 
 #ifdef CONFIG_CPU_FREQ_STAT