[v2,2/3] cpufreq: qcom-nvmem: Enable virtual power domain devices

Message ID 20231018-msm8909-cpufreq-v2-2-0962df95f654@kernkonzept.com
State New
Headers
Series cpufreq: Add basic cpufreq scaling for Qualcomm MSM8909 |

Commit Message

Stephan Gerhold Oct. 18, 2023, 8:06 a.m. UTC
  The genpd core caches performance state votes from devices that are
runtime suspended as of commit 3c5a272202c2 ("PM: domains: Improve
runtime PM performance state handling"). They get applied once the
device becomes active again.

To attach the power domains needed by qcom-cpufreq-nvmem the OPP core
calls genpd_dev_pm_attach_by_id(). This results in "virtual" dummy
devices that use runtime PM only to control the enable and performance
state for the attached power domain.

However, at the moment nothing ever resumes the virtual devices created
for qcom-cpufreq-nvmem. They remain permanently runtime suspended. This
means that performance state votes made during cpufreq scaling get
always cached and never applied to the hardware.

Fix this by enabling the devices after attaching them and use
dev_pm_syscore_device() to ensure the power domains also stay on when
going to suspend. Since it supplies the CPU we can never turn it off
from Linux. There are other mechanisms to turn it off when needed,
usually in the RPM firmware (RPMPD) or the cpuidle path (CPR genpd).

Without this fix performance states votes are silently ignored, and the
CPU/CPR voltage is never adjusted. This has been broken since 5.14 but
for some reason no one noticed this on QCS404 so far.

Cc: stable@vger.kernel.org
Fixes: 1cb8339ca225 ("cpufreq: qcom: Add support for qcs404 on nvmem driver")
Signed-off-by: Stephan Gerhold <stephan.gerhold@kernkonzept.com>
---
 drivers/cpufreq/qcom-cpufreq-nvmem.c | 49 +++++++++++++++++++++++++++++++++---
 1 file changed, 46 insertions(+), 3 deletions(-)
  

Comments

Ulf Hansson Oct. 19, 2023, 10:24 a.m. UTC | #1
On Wed, 18 Oct 2023 at 10:06, Stephan Gerhold
<stephan.gerhold@kernkonzept.com> wrote:
>
> The genpd core caches performance state votes from devices that are
> runtime suspended as of commit 3c5a272202c2 ("PM: domains: Improve
> runtime PM performance state handling"). They get applied once the
> device becomes active again.
>
> To attach the power domains needed by qcom-cpufreq-nvmem the OPP core
> calls genpd_dev_pm_attach_by_id(). This results in "virtual" dummy
> devices that use runtime PM only to control the enable and performance
> state for the attached power domain.
>
> However, at the moment nothing ever resumes the virtual devices created
> for qcom-cpufreq-nvmem. They remain permanently runtime suspended. This
> means that performance state votes made during cpufreq scaling get
> always cached and never applied to the hardware.
>
> Fix this by enabling the devices after attaching them and use
> dev_pm_syscore_device() to ensure the power domains also stay on when
> going to suspend. Since it supplies the CPU we can never turn it off
> from Linux. There are other mechanisms to turn it off when needed,
> usually in the RPM firmware (RPMPD) or the cpuidle path (CPR genpd).

I believe we discussed using dev_pm_syscore_device() for the previous
version. It's not intended to be used for things like the above.

Moreover, I was under the impression that it wasn't really needed. In
fact, I would think that this actually breaks things for system
suspend/resume, as in this case the cpr driver's genpd
->power_on|off() callbacks are no longer getting called due this,
which means that the cpr state machine isn't going to be restored
properly. Or did I get this wrong?

Kind regards
Uffe

>
> Without this fix performance states votes are silently ignored, and the
> CPU/CPR voltage is never adjusted. This has been broken since 5.14 but
> for some reason no one noticed this on QCS404 so far.
>
> Cc: stable@vger.kernel.org
> Fixes: 1cb8339ca225 ("cpufreq: qcom: Add support for qcs404 on nvmem driver")
> Signed-off-by: Stephan Gerhold <stephan.gerhold@kernkonzept.com>
> ---
>  drivers/cpufreq/qcom-cpufreq-nvmem.c | 49 +++++++++++++++++++++++++++++++++---
>  1 file changed, 46 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/cpufreq/qcom-cpufreq-nvmem.c b/drivers/cpufreq/qcom-cpufreq-nvmem.c
> index 82a244f3fa52..3794390089b0 100644
> --- a/drivers/cpufreq/qcom-cpufreq-nvmem.c
> +++ b/drivers/cpufreq/qcom-cpufreq-nvmem.c
> @@ -25,6 +25,7 @@
>  #include <linux/platform_device.h>
>  #include <linux/pm_domain.h>
>  #include <linux/pm_opp.h>
> +#include <linux/pm_runtime.h>
>  #include <linux/slab.h>
>  #include <linux/soc/qcom/smem.h>
>
> @@ -47,6 +48,7 @@ struct qcom_cpufreq_match_data {
>
>  struct qcom_cpufreq_drv_cpu {
>         int opp_token;
> +       struct device **virt_devs;
>  };
>
>  struct qcom_cpufreq_drv {
> @@ -268,6 +270,18 @@ static const struct qcom_cpufreq_match_data match_data_ipq8074 = {
>         .get_version = qcom_cpufreq_ipq8074_name_version,
>  };
>
> +static void qcom_cpufreq_put_virt_devs(struct qcom_cpufreq_drv *drv, unsigned cpu)
> +{
> +       const char * const *name = drv->data->genpd_names;
> +       int i;
> +
> +       if (!drv->cpus[cpu].virt_devs)
> +               return;
> +
> +       for (i = 0; *name; i++, name++)
> +               pm_runtime_put(drv->cpus[cpu].virt_devs[i]);
> +}
> +
>  static int qcom_cpufreq_probe(struct platform_device *pdev)
>  {
>         struct qcom_cpufreq_drv *drv;
> @@ -321,6 +335,7 @@ static int qcom_cpufreq_probe(struct platform_device *pdev)
>         of_node_put(np);
>
>         for_each_possible_cpu(cpu) {
> +               struct device **virt_devs = NULL;
>                 struct dev_pm_opp_config config = {
>                         .supported_hw = NULL,
>                 };
> @@ -341,7 +356,7 @@ static int qcom_cpufreq_probe(struct platform_device *pdev)
>
>                 if (drv->data->genpd_names) {
>                         config.genpd_names = drv->data->genpd_names;
> -                       config.virt_devs = NULL;
> +                       config.virt_devs = &virt_devs;
>                 }
>
>                 if (config.supported_hw || config.genpd_names) {
> @@ -352,6 +367,30 @@ static int qcom_cpufreq_probe(struct platform_device *pdev)
>                                 goto free_opp;
>                         }
>                 }
> +
> +               if (virt_devs) {
> +                       const char * const *name = config.genpd_names;
> +                       int i, j;
> +
> +                       for (i = 0; *name; i++, name++) {
> +                               ret = pm_runtime_resume_and_get(virt_devs[i]);
> +                               if (ret) {
> +                                       dev_err(cpu_dev, "failed to resume %s: %d\n",
> +                                               *name, ret);
> +
> +                                       /* Rollback previous PM runtime calls */
> +                                       name = config.genpd_names;
> +                                       for (j = 0; *name && j < i; j++, name++)
> +                                               pm_runtime_put(virt_devs[j]);
> +
> +                                       goto free_opp;
> +                               }
> +
> +                               /* Keep CPU power domain always-on */
> +                               dev_pm_syscore_device(virt_devs[i], true);
> +                       }
> +                       drv->cpus[cpu].virt_devs = virt_devs;
> +               }
>         }
>
>         cpufreq_dt_pdev = platform_device_register_simple("cpufreq-dt", -1,
> @@ -365,8 +404,10 @@ static int qcom_cpufreq_probe(struct platform_device *pdev)
>         dev_err(cpu_dev, "Failed to register platform device\n");
>
>  free_opp:
> -       for_each_possible_cpu(cpu)
> +       for_each_possible_cpu(cpu) {
> +               qcom_cpufreq_put_virt_devs(drv, cpu);
>                 dev_pm_opp_clear_config(drv->cpus[cpu].opp_token);
> +       }
>         return ret;
>  }
>
> @@ -377,8 +418,10 @@ static void qcom_cpufreq_remove(struct platform_device *pdev)
>
>         platform_device_unregister(cpufreq_dt_pdev);
>
> -       for_each_possible_cpu(cpu)
> +       for_each_possible_cpu(cpu) {
> +               qcom_cpufreq_put_virt_devs(drv, cpu);
>                 dev_pm_opp_clear_config(drv->cpus[cpu].opp_token);
> +       }
>  }
>
>  static struct platform_driver qcom_cpufreq_driver = {
>
> --
> 2.39.2
>
  
Ulf Hansson Oct. 19, 2023, 11:26 a.m. UTC | #2
On Thu, 19 Oct 2023 at 12:24, Ulf Hansson <ulf.hansson@linaro.org> wrote:
>
> On Wed, 18 Oct 2023 at 10:06, Stephan Gerhold
> <stephan.gerhold@kernkonzept.com> wrote:
> >
> > The genpd core caches performance state votes from devices that are
> > runtime suspended as of commit 3c5a272202c2 ("PM: domains: Improve
> > runtime PM performance state handling"). They get applied once the
> > device becomes active again.
> >
> > To attach the power domains needed by qcom-cpufreq-nvmem the OPP core
> > calls genpd_dev_pm_attach_by_id(). This results in "virtual" dummy
> > devices that use runtime PM only to control the enable and performance
> > state for the attached power domain.
> >
> > However, at the moment nothing ever resumes the virtual devices created
> > for qcom-cpufreq-nvmem. They remain permanently runtime suspended. This
> > means that performance state votes made during cpufreq scaling get
> > always cached and never applied to the hardware.
> >
> > Fix this by enabling the devices after attaching them and use
> > dev_pm_syscore_device() to ensure the power domains also stay on when
> > going to suspend. Since it supplies the CPU we can never turn it off
> > from Linux. There are other mechanisms to turn it off when needed,
> > usually in the RPM firmware (RPMPD) or the cpuidle path (CPR genpd).
>
> I believe we discussed using dev_pm_syscore_device() for the previous
> version. It's not intended to be used for things like the above.
>
> Moreover, I was under the impression that it wasn't really needed. In
> fact, I would think that this actually breaks things for system
> suspend/resume, as in this case the cpr driver's genpd
> ->power_on|off() callbacks are no longer getting called due this,
> which means that the cpr state machine isn't going to be restored
> properly. Or did I get this wrong?

BTW, if you really need something like the above, the proper way to do
it would instead be to call device_set_awake_path() for the device.

This informs genpd that the device needs to stay powered-on during
system suspend (assuming that GENPD_FLAG_ACTIVE_WAKEUP has been set
for it), hence it will keep the corresponding PM domain powered-on
too.

[...]

Kind regards
Uffe
  
Stephan Gerhold Oct. 19, 2023, 1:05 p.m. UTC | #3
On Thu, Oct 19, 2023 at 01:26:19PM +0200, Ulf Hansson wrote:
> On Thu, 19 Oct 2023 at 12:24, Ulf Hansson <ulf.hansson@linaro.org> wrote:
> > On Wed, 18 Oct 2023 at 10:06, Stephan Gerhold
> > <stephan.gerhold@kernkonzept.com> wrote:
> > >
> > > The genpd core caches performance state votes from devices that are
> > > runtime suspended as of commit 3c5a272202c2 ("PM: domains: Improve
> > > runtime PM performance state handling"). They get applied once the
> > > device becomes active again.
> > >
> > > To attach the power domains needed by qcom-cpufreq-nvmem the OPP core
> > > calls genpd_dev_pm_attach_by_id(). This results in "virtual" dummy
> > > devices that use runtime PM only to control the enable and performance
> > > state for the attached power domain.
> > >
> > > However, at the moment nothing ever resumes the virtual devices created
> > > for qcom-cpufreq-nvmem. They remain permanently runtime suspended. This
> > > means that performance state votes made during cpufreq scaling get
> > > always cached and never applied to the hardware.
> > >
> > > Fix this by enabling the devices after attaching them and use
> > > dev_pm_syscore_device() to ensure the power domains also stay on when
> > > going to suspend. Since it supplies the CPU we can never turn it off
> > > from Linux. There are other mechanisms to turn it off when needed,
> > > usually in the RPM firmware (RPMPD) or the cpuidle path (CPR genpd).
> >
> > I believe we discussed using dev_pm_syscore_device() for the previous
> > version. It's not intended to be used for things like the above.
> >

Sorry, looks like we still had a misunderstanding in the conclusion of
the previous discussion. :')

> > Moreover, I was under the impression that it wasn't really needed. In
> > fact, I would think that this actually breaks things for system
> > suspend/resume, as in this case the cpr driver's genpd
> > ->power_on|off() callbacks are no longer getting called due this,
> > which means that the cpr state machine isn't going to be restored
> > properly. Or did I get this wrong?
> 

We strictly need the RPMPDs to be always-on, also across system suspend
[1]. The RPM firmware will drop the votes internally as soon as the
CPU(s) have entered deep cpuidle. We can't do this from Linux, because
we need the CPU to continue running until it was shut down cleanly.

For CPR, we strictly need the backing regulator to be always-on, also
across system suspend. Typically the hardware will turn off the
regulator as soon as the CPU(s) enter deep cpuidle. Similarly, we can't
do this from Linux, because we need the CPU to continue running until it
was shut down cleanly.

My understanding was that we're going to pause the CPR state machine
using the system suspend/resume callbacks on the driver, instead of
using the genpd->power_on|off() callbacks [2]. I can submit a separate
patch for this.

I didn't prioritize this because QCS404 (as the only current user of
CPR) doesn't have proper deep cpuidle/power management set up yet. It's
not entirely clear to me if there is any advantage (or perhaps even
disadvantage) if we pause the CPR state machine while the shared L2
cache is still being actively powered by the CPR power rail during
system suspend. I suspect this is a configuration that was never
considered in the hardware design.

Given the strict requirement for the RPMPDs, I only see two options:

 1. Have an always-on consumer that prevents the power domains to be
    powered off during system suspend. This is what this patch tries to
    achieve.

Or:

 2. Come up with a way to register the RPMPDs used by the CPU with
    GENPD_FLAG_ALWAYS_ON. This would also be doable, but isn't as
    straightfoward as "regulator-always-on" in the DT because the rpmpd
    DT node represents multiple genpds in a single DT node [3].

What do you think? Do you see some other solution perhaps? I hope we can
clear up the misunderstanding. :-)

[1]: https://lore.kernel.org/linux-arm-msm/ZQGqfMigCFZP_HLA@gerhold.net/
[2]: https://lore.kernel.org/linux-arm-msm/CAPDyKFoiup8KNv=1LFGKDdDLA1pHsdJUgTTWMdgxnikEmReXzg@mail.gmail.com/
[3]: https://lore.kernel.org/linux-arm-msm/ZSg-XtwMxg3_fWxc@gerhold.net/

> BTW, if you really need something like the above, the proper way to do
> it would instead be to call device_set_awake_path() for the device.
> 
> This informs genpd that the device needs to stay powered-on during
> system suspend (assuming that GENPD_FLAG_ACTIVE_WAKEUP has been set
> for it), hence it will keep the corresponding PM domain powered-on
> too.
> 

Thanks, I can try if this works as alternative to the
dev_pm_syscore_device()!

I will wait for your thoughts on the above before accidentally going
into the wrong direction again. :-)

Thanks!
Stephan
  
Ulf Hansson Oct. 19, 2023, 2:12 p.m. UTC | #4
On Thu, 19 Oct 2023 at 15:05, Stephan Gerhold <stephan@gerhold.net> wrote:
>
> On Thu, Oct 19, 2023 at 01:26:19PM +0200, Ulf Hansson wrote:
> > On Thu, 19 Oct 2023 at 12:24, Ulf Hansson <ulf.hansson@linaro.org> wrote:
> > > On Wed, 18 Oct 2023 at 10:06, Stephan Gerhold
> > > <stephan.gerhold@kernkonzept.com> wrote:
> > > >
> > > > The genpd core caches performance state votes from devices that are
> > > > runtime suspended as of commit 3c5a272202c2 ("PM: domains: Improve
> > > > runtime PM performance state handling"). They get applied once the
> > > > device becomes active again.
> > > >
> > > > To attach the power domains needed by qcom-cpufreq-nvmem the OPP core
> > > > calls genpd_dev_pm_attach_by_id(). This results in "virtual" dummy
> > > > devices that use runtime PM only to control the enable and performance
> > > > state for the attached power domain.
> > > >
> > > > However, at the moment nothing ever resumes the virtual devices created
> > > > for qcom-cpufreq-nvmem. They remain permanently runtime suspended. This
> > > > means that performance state votes made during cpufreq scaling get
> > > > always cached and never applied to the hardware.
> > > >
> > > > Fix this by enabling the devices after attaching them and use
> > > > dev_pm_syscore_device() to ensure the power domains also stay on when
> > > > going to suspend. Since it supplies the CPU we can never turn it off
> > > > from Linux. There are other mechanisms to turn it off when needed,
> > > > usually in the RPM firmware (RPMPD) or the cpuidle path (CPR genpd).
> > >
> > > I believe we discussed using dev_pm_syscore_device() for the previous
> > > version. It's not intended to be used for things like the above.
> > >
>
> Sorry, looks like we still had a misunderstanding in the conclusion of
> the previous discussion. :')
>
> > > Moreover, I was under the impression that it wasn't really needed. In
> > > fact, I would think that this actually breaks things for system
> > > suspend/resume, as in this case the cpr driver's genpd
> > > ->power_on|off() callbacks are no longer getting called due this,
> > > which means that the cpr state machine isn't going to be restored
> > > properly. Or did I get this wrong?
> >
>
> We strictly need the RPMPDs to be always-on, also across system suspend
> [1]. The RPM firmware will drop the votes internally as soon as the
> CPU(s) have entered deep cpuidle. We can't do this from Linux, because
> we need the CPU to continue running until it was shut down cleanly.
>
> For CPR, we strictly need the backing regulator to be always-on, also
> across system suspend. Typically the hardware will turn off the
> regulator as soon as the CPU(s) enter deep cpuidle. Similarly, we can't
> do this from Linux, because we need the CPU to continue running until it
> was shut down cleanly.
>
> My understanding was that we're going to pause the CPR state machine
> using the system suspend/resume callbacks on the driver, instead of
> using the genpd->power_on|off() callbacks [2]. I can submit a separate
> patch for this.

If we are going to do 1) as described below, this looks to me that
it's going to be needed.

How will otherwise the cpr state machine be saved/restored during
system suspend/resume? Note that, beyond 1), the genpd's
->power_on|off() callbacks are no longer going to be called during
system suspend/resume.

In a way this also means that the cpr genpd provider might as well
also have GENPD_FLAG_ALWAYS_ON set for it.

>
> I didn't prioritize this because QCS404 (as the only current user of
> CPR) doesn't have proper deep cpuidle/power management set up yet. It's
> not entirely clear to me if there is any advantage (or perhaps even
> disadvantage) if we pause the CPR state machine while the shared L2
> cache is still being actively powered by the CPR power rail during
> system suspend. I suspect this is a configuration that was never
> considered in the hardware design.

I see.

>
> Given the strict requirement for the RPMPDs, I only see two options:
>
>  1. Have an always-on consumer that prevents the power domains to be
>     powered off during system suspend. This is what this patch tries to
>     achieve.
>
> Or:
>
>  2. Come up with a way to register the RPMPDs used by the CPU with
>     GENPD_FLAG_ALWAYS_ON. This would also be doable, but isn't as
>     straightfoward as "regulator-always-on" in the DT because the rpmpd
>     DT node represents multiple genpds in a single DT node [3].

Yes, it sounds like it may be easier to do 1).

>
> What do you think? Do you see some other solution perhaps? I hope we can
> clear up the misunderstanding. :-)

Yes, thanks!

>
> [1]: https://lore.kernel.org/linux-arm-msm/ZQGqfMigCFZP_HLA@gerhold.net/
> [2]: https://lore.kernel.org/linux-arm-msm/CAPDyKFoiup8KNv=1LFGKDdDLA1pHsdJUgTTWMdgxnikEmReXzg@mail.gmail.com/
> [3]: https://lore.kernel.org/linux-arm-msm/ZSg-XtwMxg3_fWxc@gerhold.net/
>
> > BTW, if you really need something like the above, the proper way to do
> > it would instead be to call device_set_awake_path() for the device.
> >
> > This informs genpd that the device needs to stay powered-on during
> > system suspend (assuming that GENPD_FLAG_ACTIVE_WAKEUP has been set
> > for it), hence it will keep the corresponding PM domain powered-on
> > too.
> >
>
> Thanks, I can try if this works as alternative to the
> dev_pm_syscore_device()!

Yes, please. We don't want to abuse the dev_pm_syscore_device() thingy.

>
> I will wait for your thoughts on the above before accidentally going
> into the wrong direction again. :-)

No worries, we are moving forward. :-)

Kind regards
Uffe
  
Stephan Gerhold Oct. 19, 2023, 2:48 p.m. UTC | #5
On Thu, Oct 19, 2023 at 04:12:56PM +0200, Ulf Hansson wrote:
> On Thu, 19 Oct 2023 at 15:05, Stephan Gerhold <stephan@gerhold.net> wrote:
> > On Thu, Oct 19, 2023 at 01:26:19PM +0200, Ulf Hansson wrote:
> > > On Thu, 19 Oct 2023 at 12:24, Ulf Hansson <ulf.hansson@linaro.org> wrote:
> > > > On Wed, 18 Oct 2023 at 10:06, Stephan Gerhold
> > > > <stephan.gerhold@kernkonzept.com> wrote:
> > > > >
> > > > > The genpd core caches performance state votes from devices that are
> > > > > runtime suspended as of commit 3c5a272202c2 ("PM: domains: Improve
> > > > > runtime PM performance state handling"). They get applied once the
> > > > > device becomes active again.
> > > > >
> > > > > To attach the power domains needed by qcom-cpufreq-nvmem the OPP core
> > > > > calls genpd_dev_pm_attach_by_id(). This results in "virtual" dummy
> > > > > devices that use runtime PM only to control the enable and performance
> > > > > state for the attached power domain.
> > > > >
> > > > > However, at the moment nothing ever resumes the virtual devices created
> > > > > for qcom-cpufreq-nvmem. They remain permanently runtime suspended. This
> > > > > means that performance state votes made during cpufreq scaling get
> > > > > always cached and never applied to the hardware.
> > > > >
> > > > > Fix this by enabling the devices after attaching them and use
> > > > > dev_pm_syscore_device() to ensure the power domains also stay on when
> > > > > going to suspend. Since it supplies the CPU we can never turn it off
> > > > > from Linux. There are other mechanisms to turn it off when needed,
> > > > > usually in the RPM firmware (RPMPD) or the cpuidle path (CPR genpd).
> > > >
> > > > I believe we discussed using dev_pm_syscore_device() for the previous
> > > > version. It's not intended to be used for things like the above.
> > > >
> >
> > Sorry, looks like we still had a misunderstanding in the conclusion of
> > the previous discussion. :')
> >
> > > > Moreover, I was under the impression that it wasn't really needed. In
> > > > fact, I would think that this actually breaks things for system
> > > > suspend/resume, as in this case the cpr driver's genpd
> > > > ->power_on|off() callbacks are no longer getting called due this,
> > > > which means that the cpr state machine isn't going to be restored
> > > > properly. Or did I get this wrong?
> > >
> >
> > We strictly need the RPMPDs to be always-on, also across system suspend
> > [1]. The RPM firmware will drop the votes internally as soon as the
> > CPU(s) have entered deep cpuidle. We can't do this from Linux, because
> > we need the CPU to continue running until it was shut down cleanly.
> >
> > For CPR, we strictly need the backing regulator to be always-on, also
> > across system suspend. Typically the hardware will turn off the
> > regulator as soon as the CPU(s) enter deep cpuidle. Similarly, we can't
> > do this from Linux, because we need the CPU to continue running until it
> > was shut down cleanly.
> >
> > My understanding was that we're going to pause the CPR state machine
> > using the system suspend/resume callbacks on the driver, instead of
> > using the genpd->power_on|off() callbacks [2]. I can submit a separate
> > patch for this.
> 
> If we are going to do 1) as described below, this looks to me that
> it's going to be needed.
> 

Yep.

> How will otherwise the cpr state machine be saved/restored during
> system suspend/resume? Note that, beyond 1), the genpd's
> ->power_on|off() callbacks are no longer going to be called during
> system suspend/resume.
> 

(Side note: I think "save/restore" might be the wrong words for
 suspend/resume of CPR. Looking at the code most of the configuration
 appears to be preserved across suspend/resume. Nothing is saved, it
 literally just disables the state machine during suspend and re-enables
 it during resume.

 I'm not entirely sure what's the reason for doing this. Perhaps the
 main goal is just to prevent the CPR state machine from getting stuck
 or sending pointless IRQs that won't be handled while Linux is
 suspended.)

> In a way this also means that the cpr genpd provider might as well
> also have GENPD_FLAG_ALWAYS_ON set for it.

Conceptually I would consider CPR to be a generic power domain provider
that could supply any kind of device. I know at least of CPUs and GPUs.
We need "always-on" only for the CPU, but not necessarily for other
devices.

For a GPU, the Linux driver (running on the CPU) can stop the GPU, wait
for completion and then invoke the ->power_off() callback of CPR. In
that case it is also safe to disable the backing regulator from Linux.
(I briefly mentioned this already in the previous discussion I think.)

We could set GENPD_FLAG_ALWAYS_ON for the CPR compatibles where we know
that they are only used to supply CPUs, but if we're going to do (1)
anyway there might not be much of an advantage for the extra complexity.

> 
> >
> > I didn't prioritize this because QCS404 (as the only current user of
> > CPR) doesn't have proper deep cpuidle/power management set up yet. It's
> > not entirely clear to me if there is any advantage (or perhaps even
> > disadvantage) if we pause the CPR state machine while the shared L2
> > cache is still being actively powered by the CPR power rail during
> > system suspend. I suspect this is a configuration that was never
> > considered in the hardware design.
> 
> I see.
> 
> >
> > Given the strict requirement for the RPMPDs, I only see two options:
> >
> >  1. Have an always-on consumer that prevents the power domains to be
> >     powered off during system suspend. This is what this patch tries to
> >     achieve.
> >
> > Or:
> >
> >  2. Come up with a way to register the RPMPDs used by the CPU with
> >     GENPD_FLAG_ALWAYS_ON. This would also be doable, but isn't as
> >     straightfoward as "regulator-always-on" in the DT because the rpmpd
> >     DT node represents multiple genpds in a single DT node [3].
> 
> Yes, it sounds like it may be easier to do 1).
> 
> >
> > What do you think? Do you see some other solution perhaps? I hope we can
> > clear up the misunderstanding. :-)
> 
> Yes, thanks!
> 
> >
> > [1]: https://lore.kernel.org/linux-arm-msm/ZQGqfMigCFZP_HLA@gerhold.net/
> > [2]: https://lore.kernel.org/linux-arm-msm/CAPDyKFoiup8KNv=1LFGKDdDLA1pHsdJUgTTWMdgxnikEmReXzg@mail.gmail.com/
> > [3]: https://lore.kernel.org/linux-arm-msm/ZSg-XtwMxg3_fWxc@gerhold.net/
> >
> > > BTW, if you really need something like the above, the proper way to do
> > > it would instead be to call device_set_awake_path() for the device.
> > >
> > > This informs genpd that the device needs to stay powered-on during
> > > system suspend (assuming that GENPD_FLAG_ACTIVE_WAKEUP has been set
> > > for it), hence it will keep the corresponding PM domain powered-on
> > > too.
> > >
> >
> > Thanks, I can try if this works as alternative to the
> > dev_pm_syscore_device()!
> 
> Yes, please. We don't want to abuse the dev_pm_syscore_device() thingy.
> 

Could you clarify the idea behind GENPD_FLAG_ACTIVE_WAKEUP? Would I set
it conditionally for all RPMPDs or just the ones consumed by the CPU?
How does the genpd *provider* know if one of its *consumer* devices
needs to have its power domain kept on for wakeup?

Thanks!
Stephan
  
Ulf Hansson Oct. 19, 2023, 3:19 p.m. UTC | #6
On Thu, 19 Oct 2023 at 16:49, Stephan Gerhold <stephan@gerhold.net> wrote:
>
> On Thu, Oct 19, 2023 at 04:12:56PM +0200, Ulf Hansson wrote:
> > On Thu, 19 Oct 2023 at 15:05, Stephan Gerhold <stephan@gerhold.net> wrote:
> > > On Thu, Oct 19, 2023 at 01:26:19PM +0200, Ulf Hansson wrote:
> > > > On Thu, 19 Oct 2023 at 12:24, Ulf Hansson <ulf.hansson@linaro.org> wrote:
> > > > > On Wed, 18 Oct 2023 at 10:06, Stephan Gerhold
> > > > > <stephan.gerhold@kernkonzept.com> wrote:
> > > > > >
> > > > > > The genpd core caches performance state votes from devices that are
> > > > > > runtime suspended as of commit 3c5a272202c2 ("PM: domains: Improve
> > > > > > runtime PM performance state handling"). They get applied once the
> > > > > > device becomes active again.
> > > > > >
> > > > > > To attach the power domains needed by qcom-cpufreq-nvmem the OPP core
> > > > > > calls genpd_dev_pm_attach_by_id(). This results in "virtual" dummy
> > > > > > devices that use runtime PM only to control the enable and performance
> > > > > > state for the attached power domain.
> > > > > >
> > > > > > However, at the moment nothing ever resumes the virtual devices created
> > > > > > for qcom-cpufreq-nvmem. They remain permanently runtime suspended. This
> > > > > > means that performance state votes made during cpufreq scaling get
> > > > > > always cached and never applied to the hardware.
> > > > > >
> > > > > > Fix this by enabling the devices after attaching them and use
> > > > > > dev_pm_syscore_device() to ensure the power domains also stay on when
> > > > > > going to suspend. Since it supplies the CPU we can never turn it off
> > > > > > from Linux. There are other mechanisms to turn it off when needed,
> > > > > > usually in the RPM firmware (RPMPD) or the cpuidle path (CPR genpd).
> > > > >
> > > > > I believe we discussed using dev_pm_syscore_device() for the previous
> > > > > version. It's not intended to be used for things like the above.
> > > > >
> > >
> > > Sorry, looks like we still had a misunderstanding in the conclusion of
> > > the previous discussion. :')
> > >
> > > > > Moreover, I was under the impression that it wasn't really needed. In
> > > > > fact, I would think that this actually breaks things for system
> > > > > suspend/resume, as in this case the cpr driver's genpd
> > > > > ->power_on|off() callbacks are no longer getting called due this,
> > > > > which means that the cpr state machine isn't going to be restored
> > > > > properly. Or did I get this wrong?
> > > >
> > >
> > > We strictly need the RPMPDs to be always-on, also across system suspend
> > > [1]. The RPM firmware will drop the votes internally as soon as the
> > > CPU(s) have entered deep cpuidle. We can't do this from Linux, because
> > > we need the CPU to continue running until it was shut down cleanly.
> > >
> > > For CPR, we strictly need the backing regulator to be always-on, also
> > > across system suspend. Typically the hardware will turn off the
> > > regulator as soon as the CPU(s) enter deep cpuidle. Similarly, we can't
> > > do this from Linux, because we need the CPU to continue running until it
> > > was shut down cleanly.
> > >
> > > My understanding was that we're going to pause the CPR state machine
> > > using the system suspend/resume callbacks on the driver, instead of
> > > using the genpd->power_on|off() callbacks [2]. I can submit a separate
> > > patch for this.
> >
> > If we are going to do 1) as described below, this looks to me that
> > it's going to be needed.
> >
>
> Yep.
>
> > How will otherwise the cpr state machine be saved/restored during
> > system suspend/resume? Note that, beyond 1), the genpd's
> > ->power_on|off() callbacks are no longer going to be called during
> > system suspend/resume.
> >
>
> (Side note: I think "save/restore" might be the wrong words for
>  suspend/resume of CPR. Looking at the code most of the configuration
>  appears to be preserved across suspend/resume. Nothing is saved, it
>  literally just disables the state machine during suspend and re-enables
>  it during resume.
>
>  I'm not entirely sure what's the reason for doing this. Perhaps the
>  main goal is just to prevent the CPR state machine from getting stuck
>  or sending pointless IRQs that won't be handled while Linux is
>  suspended.)

If only the latter, that is a very good reason too. Drivers should
take care of their devices to make sure they are not triggering
spurious irqs during system suspend.

>
> > In a way this also means that the cpr genpd provider might as well
> > also have GENPD_FLAG_ALWAYS_ON set for it.
>
> Conceptually I would consider CPR to be a generic power domain provider
> that could supply any kind of device. I know at least of CPUs and GPUs.
> We need "always-on" only for the CPU, but not necessarily for other
> devices.
>
> For a GPU, the Linux driver (running on the CPU) can stop the GPU, wait
> for completion and then invoke the ->power_off() callback of CPR. In
> that case it is also safe to disable the backing regulator from Linux.
> (I briefly mentioned this already in the previous discussion I think.)
>
> We could set GENPD_FLAG_ALWAYS_ON for the CPR compatibles where we know
> that they are only used to supply CPUs, but if we're going to do (1)
> anyway there might not be much of an advantage for the extra complexity.

Okay, fair enough. Let's just stick with 1) and skip using
GENPD_FLAG_ALWAYS_ON bit for the cpr genpd provider.

>
> >
> > >
> > > I didn't prioritize this because QCS404 (as the only current user of
> > > CPR) doesn't have proper deep cpuidle/power management set up yet. It's
> > > not entirely clear to me if there is any advantage (or perhaps even
> > > disadvantage) if we pause the CPR state machine while the shared L2
> > > cache is still being actively powered by the CPR power rail during
> > > system suspend. I suspect this is a configuration that was never
> > > considered in the hardware design.
> >
> > I see.
> >
> > >
> > > Given the strict requirement for the RPMPDs, I only see two options:
> > >
> > >  1. Have an always-on consumer that prevents the power domains to be
> > >     powered off during system suspend. This is what this patch tries to
> > >     achieve.
> > >
> > > Or:
> > >
> > >  2. Come up with a way to register the RPMPDs used by the CPU with
> > >     GENPD_FLAG_ALWAYS_ON. This would also be doable, but isn't as
> > >     straightfoward as "regulator-always-on" in the DT because the rpmpd
> > >     DT node represents multiple genpds in a single DT node [3].
> >
> > Yes, it sounds like it may be easier to do 1).
> >
> > >
> > > What do you think? Do you see some other solution perhaps? I hope we can
> > > clear up the misunderstanding. :-)
> >
> > Yes, thanks!
> >
> > >
> > > [1]: https://lore.kernel.org/linux-arm-msm/ZQGqfMigCFZP_HLA@gerhold.net/
> > > [2]: https://lore.kernel.org/linux-arm-msm/CAPDyKFoiup8KNv=1LFGKDdDLA1pHsdJUgTTWMdgxnikEmReXzg@mail.gmail.com/
> > > [3]: https://lore.kernel.org/linux-arm-msm/ZSg-XtwMxg3_fWxc@gerhold.net/
> > >
> > > > BTW, if you really need something like the above, the proper way to do
> > > > it would instead be to call device_set_awake_path() for the device.
> > > >
> > > > This informs genpd that the device needs to stay powered-on during
> > > > system suspend (assuming that GENPD_FLAG_ACTIVE_WAKEUP has been set
> > > > for it), hence it will keep the corresponding PM domain powered-on
> > > > too.
> > > >
> > >
> > > Thanks, I can try if this works as alternative to the
> > > dev_pm_syscore_device()!
> >
> > Yes, please. We don't want to abuse the dev_pm_syscore_device() thingy.
> >
>
> Could you clarify the idea behind GENPD_FLAG_ACTIVE_WAKEUP? Would I set
> it conditionally for all RPMPDs or just the ones consumed by the CPU?
> How does the genpd *provider* know if one of its *consumer* devices
> needs to have its power domain kept on for wakeup?

We are thinking of the GENPD_FLAG_ACTIVE_WAKEUP as a platform
configuration type of flag for the genpd in question. The consumer
driver shouldn't need to know about the details of what is happening
on the PM domain level - only whether it needs its device to remain
powered-on during system suspend or not.

I suspect that the GENPD_FLAG_ACTIVE_WAKEUP is probably okay to set
for most genpds, but there may be some exceptions.

Kind regards
Uffe
  
Stephan Gerhold Oct. 19, 2023, 5:07 p.m. UTC | #7
On Thu, Oct 19, 2023 at 05:19:53PM +0200, Ulf Hansson wrote:
> On Thu, 19 Oct 2023 at 16:49, Stephan Gerhold <stephan@gerhold.net> wrote:
> > On Thu, Oct 19, 2023 at 04:12:56PM +0200, Ulf Hansson wrote:
> > > On Thu, 19 Oct 2023 at 15:05, Stephan Gerhold <stephan@gerhold.net> wrote:
> > > > On Thu, Oct 19, 2023 at 01:26:19PM +0200, Ulf Hansson wrote:
> > > > > BTW, if you really need something like the above, the proper way to do
> > > > > it would instead be to call device_set_awake_path() for the device.
> > > > >
> > > > > This informs genpd that the device needs to stay powered-on during
> > > > > system suspend (assuming that GENPD_FLAG_ACTIVE_WAKEUP has been set
> > > > > for it), hence it will keep the corresponding PM domain powered-on
> > > > > too.
> > > >
> > > > Thanks, I can try if this works as alternative to the
> > > > dev_pm_syscore_device()!
> > >
> > > Yes, please. We don't want to abuse the dev_pm_syscore_device() thingy.
> >
> > Could you clarify the idea behind GENPD_FLAG_ACTIVE_WAKEUP? Would I set
> > it conditionally for all RPMPDs or just the ones consumed by the CPU?
> > How does the genpd *provider* know if one of its *consumer* devices
> > needs to have its power domain kept on for wakeup?
> 
> We are thinking of the GENPD_FLAG_ACTIVE_WAKEUP as a platform
> configuration type of flag for the genpd in question. The consumer
> driver shouldn't need to know about the details of what is happening
> on the PM domain level - only whether it needs its device to remain
> powered-on during system suspend or not.
> 

Thanks! I will test if this works for RPMPD and post new versions of the
patches. By coincidence I think this flag might actually be useful as
temporary solution for CPR. If I:

 1. Change $subject patch to use device_set_awake_path() instead, and
 2. Set GENPD_FLAG_ACTIVE_WAKEUP for the RPMPD genpds, but
 3. Do *not* set GENPD_FLAG_ACTIVE_WAKEUP for the CPR genpd.

Then the genpd ->power_on|off() callbacks should still be called
for CPR during system suspend, right? :D

> I suspect that the GENPD_FLAG_ACTIVE_WAKEUP is probably okay to set
> for most genpds, but there may be some exceptions.
> 

Out of curiosity, do you have an example for such an exception where
GENPD_FLAG_ACTIVE_WAKEUP shouldn't be set, aside from workarounds like
I just described?

As you said, the consumer device should just say that it wants to stay
powered for wakeup during suspend. But if its power domains get powered
off, I would expect that to break. How could a genpd driver still
provide power without being powered on? Wouldn't that rather be a low
performance state?

Thanks,
Stephan
  
Ulf Hansson Oct. 20, 2023, 10:20 a.m. UTC | #8
On Thu, 19 Oct 2023 at 19:08, Stephan Gerhold <stephan@gerhold.net> wrote:
>
> On Thu, Oct 19, 2023 at 05:19:53PM +0200, Ulf Hansson wrote:
> > On Thu, 19 Oct 2023 at 16:49, Stephan Gerhold <stephan@gerhold.net> wrote:
> > > On Thu, Oct 19, 2023 at 04:12:56PM +0200, Ulf Hansson wrote:
> > > > On Thu, 19 Oct 2023 at 15:05, Stephan Gerhold <stephan@gerhold.net> wrote:
> > > > > On Thu, Oct 19, 2023 at 01:26:19PM +0200, Ulf Hansson wrote:
> > > > > > BTW, if you really need something like the above, the proper way to do
> > > > > > it would instead be to call device_set_awake_path() for the device.
> > > > > >
> > > > > > This informs genpd that the device needs to stay powered-on during
> > > > > > system suspend (assuming that GENPD_FLAG_ACTIVE_WAKEUP has been set
> > > > > > for it), hence it will keep the corresponding PM domain powered-on
> > > > > > too.
> > > > >
> > > > > Thanks, I can try if this works as alternative to the
> > > > > dev_pm_syscore_device()!
> > > >
> > > > Yes, please. We don't want to abuse the dev_pm_syscore_device() thingy.
> > >
> > > Could you clarify the idea behind GENPD_FLAG_ACTIVE_WAKEUP? Would I set
> > > it conditionally for all RPMPDs or just the ones consumed by the CPU?
> > > How does the genpd *provider* know if one of its *consumer* devices
> > > needs to have its power domain kept on for wakeup?
> >
> > We are thinking of the GENPD_FLAG_ACTIVE_WAKEUP as a platform
> > configuration type of flag for the genpd in question. The consumer
> > driver shouldn't need to know about the details of what is happening
> > on the PM domain level - only whether it needs its device to remain
> > powered-on during system suspend or not.
> >
>
> Thanks! I will test if this works for RPMPD and post new versions of the
> patches. By coincidence I think this flag might actually be useful as
> temporary solution for CPR. If I:
>
>  1. Change $subject patch to use device_set_awake_path() instead, and
>  2. Set GENPD_FLAG_ACTIVE_WAKEUP for the RPMPD genpds, but
>  3. Do *not* set GENPD_FLAG_ACTIVE_WAKEUP for the CPR genpd.
>
> Then the genpd ->power_on|off() callbacks should still be called
> for CPR during system suspend, right? :D

Yes, correct, that should work fine!

>
> > I suspect that the GENPD_FLAG_ACTIVE_WAKEUP is probably okay to set
> > for most genpds, but there may be some exceptions.
> >
>
> Out of curiosity, do you have an example for such an exception where
> GENPD_FLAG_ACTIVE_WAKEUP shouldn't be set, aside from workarounds like
> I just described?
>
> As you said, the consumer device should just say that it wants to stay
> powered for wakeup during suspend. But if its power domains get powered
> off, I would expect that to break. How could a genpd driver still
> provide power without being powered on? Wouldn't that rather be a low
> performance state?

I think this boils down to how the power-rail that the genpd manages,
is handled by the platform during system suspend.

In principle there could be some other separate logic that helps a
FW/PMIC to understand whether it needs to keep the power-rail on or
not - no matter whether the genpd releases its vote for it during
system suspend.

This becomes mostly hypothetical, but clearly there are a lot of
genpd/platforms that don't use GENPD_FLAG_ACTIVE_WAKEUP too. If those
are just mistakes or just not needed, I don't actually know.

Kind regards
Uffe
  
Stephan Gerhold Oct. 24, 2023, 12:03 p.m. UTC | #9
On Thu, Oct 19, 2023 at 01:26:19PM +0200, Ulf Hansson wrote:
> On Thu, 19 Oct 2023 at 12:24, Ulf Hansson <ulf.hansson@linaro.org> wrote:
> >
> > On Wed, 18 Oct 2023 at 10:06, Stephan Gerhold
> > <stephan.gerhold@kernkonzept.com> wrote:
> > >
> > > The genpd core caches performance state votes from devices that are
> > > runtime suspended as of commit 3c5a272202c2 ("PM: domains: Improve
> > > runtime PM performance state handling"). They get applied once the
> > > device becomes active again.
> > >
> > > To attach the power domains needed by qcom-cpufreq-nvmem the OPP core
> > > calls genpd_dev_pm_attach_by_id(). This results in "virtual" dummy
> > > devices that use runtime PM only to control the enable and performance
> > > state for the attached power domain.
> > >
> > > However, at the moment nothing ever resumes the virtual devices created
> > > for qcom-cpufreq-nvmem. They remain permanently runtime suspended. This
> > > means that performance state votes made during cpufreq scaling get
> > > always cached and never applied to the hardware.
> > >
> > > Fix this by enabling the devices after attaching them and use
> > > dev_pm_syscore_device() to ensure the power domains also stay on when
> > > going to suspend. Since it supplies the CPU we can never turn it off
> > > from Linux. There are other mechanisms to turn it off when needed,
> > > usually in the RPM firmware (RPMPD) or the cpuidle path (CPR genpd).
> >
> > I believe we discussed using dev_pm_syscore_device() for the previous
> > version. It's not intended to be used for things like the above.
> >
> > Moreover, I was under the impression that it wasn't really needed. In
> > fact, I would think that this actually breaks things for system
> > suspend/resume, as in this case the cpr driver's genpd
> > ->power_on|off() callbacks are no longer getting called due this,
> > which means that the cpr state machine isn't going to be restored
> > properly. Or did I get this wrong?
> 
> BTW, if you really need something like the above, the proper way to do
> it would instead be to call device_set_awake_path() for the device.
> 

Unfortunately this does not work correctly. When I use
device_set_awake_path() it does set dev->power.wakeup_path = true.
However, this flag is cleared again in device_prepare() when entering
suspend. To me it looks a bit like wakeup_path is not supposed to be set
directly by drivers? Before and after your commit 8512220c5782 ("PM /
core: Assign the wakeup_path status flag in __device_prepare()") it
seems to be internally bound to device_may_wakeup().

It works if I make device_may_wakeup() return true, with

	device_set_wakeup_capable(dev, true);
	device_wakeup_enable(dev);

but that also allows *disabling* the wakeup from sysfs which doesn't
really make sense for the CPU.

Any ideas?

Thanks!
--
Stephan Gerhold <stephan.gerhold@kernkonzept.com>
Kernkonzept GmbH at Dresden, Germany, HRB 31129, CEO Dr.-Ing. Michael Hohmuth
  
Ulf Hansson Oct. 24, 2023, 12:49 p.m. UTC | #10
On Tue, 24 Oct 2023 at 14:03, Stephan Gerhold
<stephan.gerhold@kernkonzept.com> wrote:
>
> On Thu, Oct 19, 2023 at 01:26:19PM +0200, Ulf Hansson wrote:
> > On Thu, 19 Oct 2023 at 12:24, Ulf Hansson <ulf.hansson@linaro.org> wrote:
> > >
> > > On Wed, 18 Oct 2023 at 10:06, Stephan Gerhold
> > > <stephan.gerhold@kernkonzept.com> wrote:
> > > >
> > > > The genpd core caches performance state votes from devices that are
> > > > runtime suspended as of commit 3c5a272202c2 ("PM: domains: Improve
> > > > runtime PM performance state handling"). They get applied once the
> > > > device becomes active again.
> > > >
> > > > To attach the power domains needed by qcom-cpufreq-nvmem the OPP core
> > > > calls genpd_dev_pm_attach_by_id(). This results in "virtual" dummy
> > > > devices that use runtime PM only to control the enable and performance
> > > > state for the attached power domain.
> > > >
> > > > However, at the moment nothing ever resumes the virtual devices created
> > > > for qcom-cpufreq-nvmem. They remain permanently runtime suspended. This
> > > > means that performance state votes made during cpufreq scaling get
> > > > always cached and never applied to the hardware.
> > > >
> > > > Fix this by enabling the devices after attaching them and use
> > > > dev_pm_syscore_device() to ensure the power domains also stay on when
> > > > going to suspend. Since it supplies the CPU we can never turn it off
> > > > from Linux. There are other mechanisms to turn it off when needed,
> > > > usually in the RPM firmware (RPMPD) or the cpuidle path (CPR genpd).
> > >
> > > I believe we discussed using dev_pm_syscore_device() for the previous
> > > version. It's not intended to be used for things like the above.
> > >
> > > Moreover, I was under the impression that it wasn't really needed. In
> > > fact, I would think that this actually breaks things for system
> > > suspend/resume, as in this case the cpr driver's genpd
> > > ->power_on|off() callbacks are no longer getting called due this,
> > > which means that the cpr state machine isn't going to be restored
> > > properly. Or did I get this wrong?
> >
> > BTW, if you really need something like the above, the proper way to do
> > it would instead be to call device_set_awake_path() for the device.
> >
>
> Unfortunately this does not work correctly. When I use
> device_set_awake_path() it does set dev->power.wakeup_path = true.
> However, this flag is cleared again in device_prepare() when entering
> suspend. To me it looks a bit like wakeup_path is not supposed to be set
> directly by drivers? Before and after your commit 8512220c5782 ("PM /
> core: Assign the wakeup_path status flag in __device_prepare()") it
> seems to be internally bound to device_may_wakeup().
>
> It works if I make device_may_wakeup() return true, with
>
>         device_set_wakeup_capable(dev, true);
>         device_wakeup_enable(dev);
>
> but that also allows *disabling* the wakeup from sysfs which doesn't
> really make sense for the CPU.
>
> Any ideas?

The device_set_awake_path() should be called from a system suspend
callback. So you need to add that callback for the cpufreq driver.

Sorry, if that wasn't clear.

Kind regards
Uffe
  
Stephan Gerhold Oct. 24, 2023, 1:07 p.m. UTC | #11
On Tue, Oct 24, 2023 at 02:49:32PM +0200, Ulf Hansson wrote:
> On Tue, 24 Oct 2023 at 14:03, Stephan Gerhold
> <stephan.gerhold@kernkonzept.com> wrote:
> >
> > On Thu, Oct 19, 2023 at 01:26:19PM +0200, Ulf Hansson wrote:
> > > On Thu, 19 Oct 2023 at 12:24, Ulf Hansson <ulf.hansson@linaro.org> wrote:
> > > >
> > > > On Wed, 18 Oct 2023 at 10:06, Stephan Gerhold
> > > > <stephan.gerhold@kernkonzept.com> wrote:
> > > > >
> > > > > The genpd core caches performance state votes from devices that are
> > > > > runtime suspended as of commit 3c5a272202c2 ("PM: domains: Improve
> > > > > runtime PM performance state handling"). They get applied once the
> > > > > device becomes active again.
> > > > >
> > > > > To attach the power domains needed by qcom-cpufreq-nvmem the OPP core
> > > > > calls genpd_dev_pm_attach_by_id(). This results in "virtual" dummy
> > > > > devices that use runtime PM only to control the enable and performance
> > > > > state for the attached power domain.
> > > > >
> > > > > However, at the moment nothing ever resumes the virtual devices created
> > > > > for qcom-cpufreq-nvmem. They remain permanently runtime suspended. This
> > > > > means that performance state votes made during cpufreq scaling get
> > > > > always cached and never applied to the hardware.
> > > > >
> > > > > Fix this by enabling the devices after attaching them and use
> > > > > dev_pm_syscore_device() to ensure the power domains also stay on when
> > > > > going to suspend. Since it supplies the CPU we can never turn it off
> > > > > from Linux. There are other mechanisms to turn it off when needed,
> > > > > usually in the RPM firmware (RPMPD) or the cpuidle path (CPR genpd).
> > > >
> > > > I believe we discussed using dev_pm_syscore_device() for the previous
> > > > version. It's not intended to be used for things like the above.
> > > >
> > > > Moreover, I was under the impression that it wasn't really needed. In
> > > > fact, I would think that this actually breaks things for system
> > > > suspend/resume, as in this case the cpr driver's genpd
> > > > ->power_on|off() callbacks are no longer getting called due this,
> > > > which means that the cpr state machine isn't going to be restored
> > > > properly. Or did I get this wrong?
> > >
> > > BTW, if you really need something like the above, the proper way to do
> > > it would instead be to call device_set_awake_path() for the device.
> > >
> >
> > Unfortunately this does not work correctly. When I use
> > device_set_awake_path() it does set dev->power.wakeup_path = true.
> > However, this flag is cleared again in device_prepare() when entering
> > suspend. To me it looks a bit like wakeup_path is not supposed to be set
> > directly by drivers? Before and after your commit 8512220c5782 ("PM /
> > core: Assign the wakeup_path status flag in __device_prepare()") it
> > seems to be internally bound to device_may_wakeup().
> >
> > It works if I make device_may_wakeup() return true, with
> >
> >         device_set_wakeup_capable(dev, true);
> >         device_wakeup_enable(dev);
> >
> > but that also allows *disabling* the wakeup from sysfs which doesn't
> > really make sense for the CPU.
> >
> > Any ideas?
> 
> The device_set_awake_path() should be called from a system suspend
> callback. So you need to add that callback for the cpufreq driver.
> 
> Sorry, if that wasn't clear.
> 

Hmm, but at the moment I'm calling this on the virtual genpd devices.
How would it work for them? I don't have a suspend callback for them.

I guess could loop over the virtual devices in the cpufreq driver
suspend callback, but is my driver suspend callback really guaranteed to
run before the device_prepare() that clears "wakeup_path" on the virtual
devices?

Or is this the point where we need device links to make that work?
A quick look suggests "wakeup_path" is just propagated to parents but
not device links, so I don't think that would help, either.

Thanks,
  
Ulf Hansson Oct. 24, 2023, 4:11 p.m. UTC | #12
On Tue, 24 Oct 2023 at 15:07, Stephan Gerhold
<stephan.gerhold@kernkonzept.com> wrote:
>
> On Tue, Oct 24, 2023 at 02:49:32PM +0200, Ulf Hansson wrote:
> > On Tue, 24 Oct 2023 at 14:03, Stephan Gerhold
> > <stephan.gerhold@kernkonzept.com> wrote:
> > >
> > > On Thu, Oct 19, 2023 at 01:26:19PM +0200, Ulf Hansson wrote:
> > > > On Thu, 19 Oct 2023 at 12:24, Ulf Hansson <ulf.hansson@linaro.org> wrote:
> > > > >
> > > > > On Wed, 18 Oct 2023 at 10:06, Stephan Gerhold
> > > > > <stephan.gerhold@kernkonzept.com> wrote:
> > > > > >
> > > > > > The genpd core caches performance state votes from devices that are
> > > > > > runtime suspended as of commit 3c5a272202c2 ("PM: domains: Improve
> > > > > > runtime PM performance state handling"). They get applied once the
> > > > > > device becomes active again.
> > > > > >
> > > > > > To attach the power domains needed by qcom-cpufreq-nvmem the OPP core
> > > > > > calls genpd_dev_pm_attach_by_id(). This results in "virtual" dummy
> > > > > > devices that use runtime PM only to control the enable and performance
> > > > > > state for the attached power domain.
> > > > > >
> > > > > > However, at the moment nothing ever resumes the virtual devices created
> > > > > > for qcom-cpufreq-nvmem. They remain permanently runtime suspended. This
> > > > > > means that performance state votes made during cpufreq scaling get
> > > > > > always cached and never applied to the hardware.
> > > > > >
> > > > > > Fix this by enabling the devices after attaching them and use
> > > > > > dev_pm_syscore_device() to ensure the power domains also stay on when
> > > > > > going to suspend. Since it supplies the CPU we can never turn it off
> > > > > > from Linux. There are other mechanisms to turn it off when needed,
> > > > > > usually in the RPM firmware (RPMPD) or the cpuidle path (CPR genpd).
> > > > >
> > > > > I believe we discussed using dev_pm_syscore_device() for the previous
> > > > > version. It's not intended to be used for things like the above.
> > > > >
> > > > > Moreover, I was under the impression that it wasn't really needed. In
> > > > > fact, I would think that this actually breaks things for system
> > > > > suspend/resume, as in this case the cpr driver's genpd
> > > > > ->power_on|off() callbacks are no longer getting called due this,
> > > > > which means that the cpr state machine isn't going to be restored
> > > > > properly. Or did I get this wrong?
> > > >
> > > > BTW, if you really need something like the above, the proper way to do
> > > > it would instead be to call device_set_awake_path() for the device.
> > > >
> > >
> > > Unfortunately this does not work correctly. When I use
> > > device_set_awake_path() it does set dev->power.wakeup_path = true.
> > > However, this flag is cleared again in device_prepare() when entering
> > > suspend. To me it looks a bit like wakeup_path is not supposed to be set
> > > directly by drivers? Before and after your commit 8512220c5782 ("PM /
> > > core: Assign the wakeup_path status flag in __device_prepare()") it
> > > seems to be internally bound to device_may_wakeup().
> > >
> > > It works if I make device_may_wakeup() return true, with
> > >
> > >         device_set_wakeup_capable(dev, true);
> > >         device_wakeup_enable(dev);
> > >
> > > but that also allows *disabling* the wakeup from sysfs which doesn't
> > > really make sense for the CPU.
> > >
> > > Any ideas?
> >
> > The device_set_awake_path() should be called from a system suspend
> > callback. So you need to add that callback for the cpufreq driver.
> >
> > Sorry, if that wasn't clear.
> >
>
> Hmm, but at the moment I'm calling this on the virtual genpd devices.
> How would it work for them? I don't have a suspend callback for them.
>
> I guess could loop over the virtual devices in the cpufreq driver
> suspend callback, but is my driver suspend callback really guaranteed to
> run before the device_prepare() that clears "wakeup_path" on the virtual
> devices?

Yes, that's guaranteed. dpm_prepare() (which calls device_prepare())
is always being executed before dpm_suspend().

>
> Or is this the point where we need device links to make that work?
> A quick look suggests "wakeup_path" is just propagated to parents but
> not device links, so I don't think that would help, either.

I don't think we need device-links for this, at least the way things
are working currently.

Kind regards
Uffe
  
Stephan Gerhold Oct. 24, 2023, 4:25 p.m. UTC | #13
On Tue, Oct 24, 2023 at 06:11:34PM +0200, Ulf Hansson wrote:
> On Tue, 24 Oct 2023 at 15:07, Stephan Gerhold
> <stephan.gerhold@kernkonzept.com> wrote:
> >
> > On Tue, Oct 24, 2023 at 02:49:32PM +0200, Ulf Hansson wrote:
> > > On Tue, 24 Oct 2023 at 14:03, Stephan Gerhold
> > > <stephan.gerhold@kernkonzept.com> wrote:
> > > >
> > > > On Thu, Oct 19, 2023 at 01:26:19PM +0200, Ulf Hansson wrote:
> > > > > On Thu, 19 Oct 2023 at 12:24, Ulf Hansson <ulf.hansson@linaro.org> wrote:
> > > > > >
> > > > > > On Wed, 18 Oct 2023 at 10:06, Stephan Gerhold
> > > > > > <stephan.gerhold@kernkonzept.com> wrote:
> > > > > > >
> > > > > > > The genpd core caches performance state votes from devices that are
> > > > > > > runtime suspended as of commit 3c5a272202c2 ("PM: domains: Improve
> > > > > > > runtime PM performance state handling"). They get applied once the
> > > > > > > device becomes active again.
> > > > > > >
> > > > > > > To attach the power domains needed by qcom-cpufreq-nvmem the OPP core
> > > > > > > calls genpd_dev_pm_attach_by_id(). This results in "virtual" dummy
> > > > > > > devices that use runtime PM only to control the enable and performance
> > > > > > > state for the attached power domain.
> > > > > > >
> > > > > > > However, at the moment nothing ever resumes the virtual devices created
> > > > > > > for qcom-cpufreq-nvmem. They remain permanently runtime suspended. This
> > > > > > > means that performance state votes made during cpufreq scaling get
> > > > > > > always cached and never applied to the hardware.
> > > > > > >
> > > > > > > Fix this by enabling the devices after attaching them and use
> > > > > > > dev_pm_syscore_device() to ensure the power domains also stay on when
> > > > > > > going to suspend. Since it supplies the CPU we can never turn it off
> > > > > > > from Linux. There are other mechanisms to turn it off when needed,
> > > > > > > usually in the RPM firmware (RPMPD) or the cpuidle path (CPR genpd).
> > > > > >
> > > > > > I believe we discussed using dev_pm_syscore_device() for the previous
> > > > > > version. It's not intended to be used for things like the above.
> > > > > >
> > > > > > Moreover, I was under the impression that it wasn't really needed. In
> > > > > > fact, I would think that this actually breaks things for system
> > > > > > suspend/resume, as in this case the cpr driver's genpd
> > > > > > ->power_on|off() callbacks are no longer getting called due this,
> > > > > > which means that the cpr state machine isn't going to be restored
> > > > > > properly. Or did I get this wrong?
> > > > >
> > > > > BTW, if you really need something like the above, the proper way to do
> > > > > it would instead be to call device_set_awake_path() for the device.
> > > > >
> > > >
> > > > Unfortunately this does not work correctly. When I use
> > > > device_set_awake_path() it does set dev->power.wakeup_path = true.
> > > > However, this flag is cleared again in device_prepare() when entering
> > > > suspend. To me it looks a bit like wakeup_path is not supposed to be set
> > > > directly by drivers? Before and after your commit 8512220c5782 ("PM /
> > > > core: Assign the wakeup_path status flag in __device_prepare()") it
> > > > seems to be internally bound to device_may_wakeup().
> > > >
> > > > It works if I make device_may_wakeup() return true, with
> > > >
> > > >         device_set_wakeup_capable(dev, true);
> > > >         device_wakeup_enable(dev);
> > > >
> > > > but that also allows *disabling* the wakeup from sysfs which doesn't
> > > > really make sense for the CPU.
> > > >
> > > > Any ideas?
> > >
> > > The device_set_awake_path() should be called from a system suspend
> > > callback. So you need to add that callback for the cpufreq driver.
> > >
> > > Sorry, if that wasn't clear.
> > >
> >
> > Hmm, but at the moment I'm calling this on the virtual genpd devices.
> > How would it work for them? I don't have a suspend callback for them.
> >
> > I guess could loop over the virtual devices in the cpufreq driver
> > suspend callback, but is my driver suspend callback really guaranteed to
> > run before the device_prepare() that clears "wakeup_path" on the virtual
> > devices?
> 
> Yes, that's guaranteed. dpm_prepare() (which calls device_prepare())
> is always being executed before dpm_suspend().
> 

Thanks, I think I understand. Maybe. :-)

Just to confirm, I should call device_set_awake_path() for the virtual
genpd devices as part of the PM ->suspend() callback? And this will be
guaranteed to run after the "prepare" phase but before the
"suspend_noirq" phase where the genpd core will check the wakeup flag?

Thanks,
Stepan
  
Ulf Hansson Oct. 25, 2023, 10:05 a.m. UTC | #14
On Tue, 24 Oct 2023 at 18:25, Stephan Gerhold <stephan@gerhold.net> wrote:
>
> On Tue, Oct 24, 2023 at 06:11:34PM +0200, Ulf Hansson wrote:
> > On Tue, 24 Oct 2023 at 15:07, Stephan Gerhold
> > <stephan.gerhold@kernkonzept.com> wrote:
> > >
> > > On Tue, Oct 24, 2023 at 02:49:32PM +0200, Ulf Hansson wrote:
> > > > On Tue, 24 Oct 2023 at 14:03, Stephan Gerhold
> > > > <stephan.gerhold@kernkonzept.com> wrote:
> > > > >
> > > > > On Thu, Oct 19, 2023 at 01:26:19PM +0200, Ulf Hansson wrote:
> > > > > > On Thu, 19 Oct 2023 at 12:24, Ulf Hansson <ulf.hansson@linaro.org> wrote:
> > > > > > >
> > > > > > > On Wed, 18 Oct 2023 at 10:06, Stephan Gerhold
> > > > > > > <stephan.gerhold@kernkonzept.com> wrote:
> > > > > > > >
> > > > > > > > The genpd core caches performance state votes from devices that are
> > > > > > > > runtime suspended as of commit 3c5a272202c2 ("PM: domains: Improve
> > > > > > > > runtime PM performance state handling"). They get applied once the
> > > > > > > > device becomes active again.
> > > > > > > >
> > > > > > > > To attach the power domains needed by qcom-cpufreq-nvmem the OPP core
> > > > > > > > calls genpd_dev_pm_attach_by_id(). This results in "virtual" dummy
> > > > > > > > devices that use runtime PM only to control the enable and performance
> > > > > > > > state for the attached power domain.
> > > > > > > >
> > > > > > > > However, at the moment nothing ever resumes the virtual devices created
> > > > > > > > for qcom-cpufreq-nvmem. They remain permanently runtime suspended. This
> > > > > > > > means that performance state votes made during cpufreq scaling get
> > > > > > > > always cached and never applied to the hardware.
> > > > > > > >
> > > > > > > > Fix this by enabling the devices after attaching them and use
> > > > > > > > dev_pm_syscore_device() to ensure the power domains also stay on when
> > > > > > > > going to suspend. Since it supplies the CPU we can never turn it off
> > > > > > > > from Linux. There are other mechanisms to turn it off when needed,
> > > > > > > > usually in the RPM firmware (RPMPD) or the cpuidle path (CPR genpd).
> > > > > > >
> > > > > > > I believe we discussed using dev_pm_syscore_device() for the previous
> > > > > > > version. It's not intended to be used for things like the above.
> > > > > > >
> > > > > > > Moreover, I was under the impression that it wasn't really needed. In
> > > > > > > fact, I would think that this actually breaks things for system
> > > > > > > suspend/resume, as in this case the cpr driver's genpd
> > > > > > > ->power_on|off() callbacks are no longer getting called due this,
> > > > > > > which means that the cpr state machine isn't going to be restored
> > > > > > > properly. Or did I get this wrong?
> > > > > >
> > > > > > BTW, if you really need something like the above, the proper way to do
> > > > > > it would instead be to call device_set_awake_path() for the device.
> > > > > >
> > > > >
> > > > > Unfortunately this does not work correctly. When I use
> > > > > device_set_awake_path() it does set dev->power.wakeup_path = true.
> > > > > However, this flag is cleared again in device_prepare() when entering
> > > > > suspend. To me it looks a bit like wakeup_path is not supposed to be set
> > > > > directly by drivers? Before and after your commit 8512220c5782 ("PM /
> > > > > core: Assign the wakeup_path status flag in __device_prepare()") it
> > > > > seems to be internally bound to device_may_wakeup().
> > > > >
> > > > > It works if I make device_may_wakeup() return true, with
> > > > >
> > > > >         device_set_wakeup_capable(dev, true);
> > > > >         device_wakeup_enable(dev);
> > > > >
> > > > > but that also allows *disabling* the wakeup from sysfs which doesn't
> > > > > really make sense for the CPU.
> > > > >
> > > > > Any ideas?
> > > >
> > > > The device_set_awake_path() should be called from a system suspend
> > > > callback. So you need to add that callback for the cpufreq driver.
> > > >
> > > > Sorry, if that wasn't clear.
> > > >
> > >
> > > Hmm, but at the moment I'm calling this on the virtual genpd devices.
> > > How would it work for them? I don't have a suspend callback for them.
> > >
> > > I guess could loop over the virtual devices in the cpufreq driver
> > > suspend callback, but is my driver suspend callback really guaranteed to
> > > run before the device_prepare() that clears "wakeup_path" on the virtual
> > > devices?
> >
> > Yes, that's guaranteed. dpm_prepare() (which calls device_prepare())
> > is always being executed before dpm_suspend().
> >
>
> Thanks, I think I understand. Maybe. :-)
>
> Just to confirm, I should call device_set_awake_path() for the virtual
> genpd devices as part of the PM ->suspend() callback? And this will be
> guaranteed to run after the "prepare" phase but before the
> "suspend_noirq" phase where the genpd core will check the wakeup flag?

Correct!

Kind regards
Uffe
  
Stephan Gerhold Nov. 1, 2023, 2:56 p.m. UTC | #15
On Wed, Oct 25, 2023 at 12:05:49PM +0200, Ulf Hansson wrote:
> On Tue, 24 Oct 2023 at 18:25, Stephan Gerhold <stephan@gerhold.net> wrote:
> > On Tue, Oct 24, 2023 at 06:11:34PM +0200, Ulf Hansson wrote:
> > > On Tue, 24 Oct 2023 at 15:07, Stephan Gerhold
> > > <stephan.gerhold@kernkonzept.com> wrote:
> > > > On Tue, Oct 24, 2023 at 02:49:32PM +0200, Ulf Hansson wrote:
> > > > > On Tue, 24 Oct 2023 at 14:03, Stephan Gerhold
> > > > > <stephan.gerhold@kernkonzept.com> wrote:
> > > > > > On Thu, Oct 19, 2023 at 01:26:19PM +0200, Ulf Hansson wrote:
> > > > > > > On Thu, 19 Oct 2023 at 12:24, Ulf Hansson <ulf.hansson@linaro.org> wrote:
> > > > > > > > On Wed, 18 Oct 2023 at 10:06, Stephan Gerhold
> > > > > > > > <stephan.gerhold@kernkonzept.com> wrote:
> > > > > > > > >
> > > > > > > > > The genpd core caches performance state votes from devices that are
> > > > > > > > > runtime suspended as of commit 3c5a272202c2 ("PM: domains: Improve
> > > > > > > > > runtime PM performance state handling"). They get applied once the
> > > > > > > > > device becomes active again.
> > > > > > > > >
> > > > > > > > > To attach the power domains needed by qcom-cpufreq-nvmem the OPP core
> > > > > > > > > calls genpd_dev_pm_attach_by_id(). This results in "virtual" dummy
> > > > > > > > > devices that use runtime PM only to control the enable and performance
> > > > > > > > > state for the attached power domain.
> > > > > > > > >
> > > > > > > > > However, at the moment nothing ever resumes the virtual devices created
> > > > > > > > > for qcom-cpufreq-nvmem. They remain permanently runtime suspended. This
> > > > > > > > > means that performance state votes made during cpufreq scaling get
> > > > > > > > > always cached and never applied to the hardware.
> > > > > > > > >
> > > > > > > > > Fix this by enabling the devices after attaching them and use
> > > > > > > > > dev_pm_syscore_device() to ensure the power domains also stay on when
> > > > > > > > > going to suspend. Since it supplies the CPU we can never turn it off
> > > > > > > > > from Linux. There are other mechanisms to turn it off when needed,
> > > > > > > > > usually in the RPM firmware (RPMPD) or the cpuidle path (CPR genpd).
> > > > > > > >
> > > > > > > > I believe we discussed using dev_pm_syscore_device() for the previous
> > > > > > > > version. It's not intended to be used for things like the above.
> > > > > > > >
> > > > > > > > Moreover, I was under the impression that it wasn't really needed. In
> > > > > > > > fact, I would think that this actually breaks things for system
> > > > > > > > suspend/resume, as in this case the cpr driver's genpd
> > > > > > > > ->power_on|off() callbacks are no longer getting called due this,
> > > > > > > > which means that the cpr state machine isn't going to be restored
> > > > > > > > properly. Or did I get this wrong?
> > > > > > >
> > > > > > > BTW, if you really need something like the above, the proper way to do
> > > > > > > it would instead be to call device_set_awake_path() for the device.
> > > > > > >
> > > > > >
> > > > > > Unfortunately this does not work correctly. When I use
> > > > > > device_set_awake_path() it does set dev->power.wakeup_path = true.
> > > > > > However, this flag is cleared again in device_prepare() when entering
> > > > > > suspend. To me it looks a bit like wakeup_path is not supposed to be set
> > > > > > directly by drivers? Before and after your commit 8512220c5782 ("PM /
> > > > > > core: Assign the wakeup_path status flag in __device_prepare()") it
> > > > > > seems to be internally bound to device_may_wakeup().
> > > > > >
> > > > > > It works if I make device_may_wakeup() return true, with
> > > > > >
> > > > > >         device_set_wakeup_capable(dev, true);
> > > > > >         device_wakeup_enable(dev);
> > > > > >
> > > > > > but that also allows *disabling* the wakeup from sysfs which doesn't
> > > > > > really make sense for the CPU.
> > > > > >
> > > > > > Any ideas?
> > > > >
> > > > > The device_set_awake_path() should be called from a system suspend
> > > > > callback. So you need to add that callback for the cpufreq driver.
> > > > >
> > > > > Sorry, if that wasn't clear.
> > > > >
> > > >
> > > > Hmm, but at the moment I'm calling this on the virtual genpd devices.
> > > > How would it work for them? I don't have a suspend callback for them.
> > > >
> > > > I guess could loop over the virtual devices in the cpufreq driver
> > > > suspend callback, but is my driver suspend callback really guaranteed to
> > > > run before the device_prepare() that clears "wakeup_path" on the virtual
> > > > devices?
> > >
> > > Yes, that's guaranteed. dpm_prepare() (which calls device_prepare())
> > > is always being executed before dpm_suspend().
> > >
> >
> > Thanks, I think I understand. Maybe. :-)
> >
> > Just to confirm, I should call device_set_awake_path() for the virtual
> > genpd devices as part of the PM ->suspend() callback? And this will be
> > guaranteed to run after the "prepare" phase but before the
> > "suspend_noirq" phase where the genpd core will check the wakeup flag?
> 
> Correct!
> 

Thanks, this seems to works as intended! The diff I tested is below, in
case you already have some comments.

I'll put this into proper patches and will send a v3 after the merge
window.

Stephan

diff --git a/drivers/cpufreq/qcom-cpufreq-nvmem.c b/drivers/cpufreq/qcom-cpufreq-nvmem.c
index 7e9202080c98..e0c82c958018 100644
--- a/drivers/cpufreq/qcom-cpufreq-nvmem.c
+++ b/drivers/cpufreq/qcom-cpufreq-nvmem.c
@@ -23,6 +23,7 @@
 #include <linux/nvmem-consumer.h>
 #include <linux/of.h>
 #include <linux/platform_device.h>
+#include <linux/pm.h>
 #include <linux/pm_domain.h>
 #include <linux/pm_opp.h>
 #include <linux/pm_runtime.h>
@@ -426,6 +427,18 @@ static const struct qcom_cpufreq_match_data match_data_ipq8074 = {
 	.get_version = qcom_cpufreq_ipq8074_name_version,
 };
 
+static void qcom_cpufreq_suspend_virt_devs(struct qcom_cpufreq_drv *drv, unsigned cpu)
+{
+	const char * const *name = drv->data->genpd_names;
+	int i;
+
+	if (!drv->cpus[cpu].virt_devs)
+		return;
+
+	for (i = 0; *name; i++, name++)
+		device_set_awake_path(drv->cpus[cpu].virt_devs[i]);
+}
+
 static void qcom_cpufreq_put_virt_devs(struct qcom_cpufreq_drv *drv, unsigned cpu)
 {
 	const char * const *name = drv->data->genpd_names;
@@ -542,9 +555,6 @@ static int qcom_cpufreq_probe(struct platform_device *pdev)
 
 					goto free_opp;
 				}
-
-				/* Keep CPU power domain always-on */
-				dev_pm_syscore_device(virt_devs[i], true);
 			}
 			drv->cpus[cpu].virt_devs = virt_devs;
 		}
@@ -581,11 +591,25 @@ static void qcom_cpufreq_remove(struct platform_device *pdev)
 	}
 }
 
+static int qcom_cpufreq_suspend(struct device *dev)
+{
+	struct qcom_cpufreq_drv *drv = dev_get_drvdata(dev);
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu)
+		qcom_cpufreq_suspend_virt_devs(drv, cpu);
+
+	return 0;
+}
+
+static DEFINE_SIMPLE_DEV_PM_OPS(qcom_cpufreq_pm_ops, qcom_cpufreq_suspend, NULL);
+
 static struct platform_driver qcom_cpufreq_driver = {
 	.probe = qcom_cpufreq_probe,
 	.remove_new = qcom_cpufreq_remove,
 	.driver = {
 		.name = "qcom-cpufreq-nvmem",
+		.pm = pm_sleep_ptr(&qcom_cpufreq_pm_ops),
 	},
 };
 
diff --git a/drivers/pmdomain/qcom/rpmpd.c b/drivers/pmdomain/qcom/rpmpd.c
index abb62e4a2bdf..0f91e00b5909 100644
--- a/drivers/pmdomain/qcom/rpmpd.c
+++ b/drivers/pmdomain/qcom/rpmpd.c
@@ -1050,6 +1050,7 @@ static int rpmpd_probe(struct platform_device *pdev)
 		rpmpds[i]->pd.power_off = rpmpd_power_off;
 		rpmpds[i]->pd.power_on = rpmpd_power_on;
 		rpmpds[i]->pd.set_performance_state = rpmpd_set_performance;
+		rpmpds[i]->pd.flags = GENPD_FLAG_ACTIVE_WAKEUP;
 		pm_genpd_init(&rpmpds[i]->pd, NULL, true);
 
 		data->domains[i] = &rpmpds[i]->pd;
  

Patch

diff --git a/drivers/cpufreq/qcom-cpufreq-nvmem.c b/drivers/cpufreq/qcom-cpufreq-nvmem.c
index 82a244f3fa52..3794390089b0 100644
--- a/drivers/cpufreq/qcom-cpufreq-nvmem.c
+++ b/drivers/cpufreq/qcom-cpufreq-nvmem.c
@@ -25,6 +25,7 @@ 
 #include <linux/platform_device.h>
 #include <linux/pm_domain.h>
 #include <linux/pm_opp.h>
+#include <linux/pm_runtime.h>
 #include <linux/slab.h>
 #include <linux/soc/qcom/smem.h>
 
@@ -47,6 +48,7 @@  struct qcom_cpufreq_match_data {
 
 struct qcom_cpufreq_drv_cpu {
 	int opp_token;
+	struct device **virt_devs;
 };
 
 struct qcom_cpufreq_drv {
@@ -268,6 +270,18 @@  static const struct qcom_cpufreq_match_data match_data_ipq8074 = {
 	.get_version = qcom_cpufreq_ipq8074_name_version,
 };
 
+static void qcom_cpufreq_put_virt_devs(struct qcom_cpufreq_drv *drv, unsigned cpu)
+{
+	const char * const *name = drv->data->genpd_names;
+	int i;
+
+	if (!drv->cpus[cpu].virt_devs)
+		return;
+
+	for (i = 0; *name; i++, name++)
+		pm_runtime_put(drv->cpus[cpu].virt_devs[i]);
+}
+
 static int qcom_cpufreq_probe(struct platform_device *pdev)
 {
 	struct qcom_cpufreq_drv *drv;
@@ -321,6 +335,7 @@  static int qcom_cpufreq_probe(struct platform_device *pdev)
 	of_node_put(np);
 
 	for_each_possible_cpu(cpu) {
+		struct device **virt_devs = NULL;
 		struct dev_pm_opp_config config = {
 			.supported_hw = NULL,
 		};
@@ -341,7 +356,7 @@  static int qcom_cpufreq_probe(struct platform_device *pdev)
 
 		if (drv->data->genpd_names) {
 			config.genpd_names = drv->data->genpd_names;
-			config.virt_devs = NULL;
+			config.virt_devs = &virt_devs;
 		}
 
 		if (config.supported_hw || config.genpd_names) {
@@ -352,6 +367,30 @@  static int qcom_cpufreq_probe(struct platform_device *pdev)
 				goto free_opp;
 			}
 		}
+
+		if (virt_devs) {
+			const char * const *name = config.genpd_names;
+			int i, j;
+
+			for (i = 0; *name; i++, name++) {
+				ret = pm_runtime_resume_and_get(virt_devs[i]);
+				if (ret) {
+					dev_err(cpu_dev, "failed to resume %s: %d\n",
+						*name, ret);
+
+					/* Rollback previous PM runtime calls */
+					name = config.genpd_names;
+					for (j = 0; *name && j < i; j++, name++)
+						pm_runtime_put(virt_devs[j]);
+
+					goto free_opp;
+				}
+
+				/* Keep CPU power domain always-on */
+				dev_pm_syscore_device(virt_devs[i], true);
+			}
+			drv->cpus[cpu].virt_devs = virt_devs;
+		}
 	}
 
 	cpufreq_dt_pdev = platform_device_register_simple("cpufreq-dt", -1,
@@ -365,8 +404,10 @@  static int qcom_cpufreq_probe(struct platform_device *pdev)
 	dev_err(cpu_dev, "Failed to register platform device\n");
 
 free_opp:
-	for_each_possible_cpu(cpu)
+	for_each_possible_cpu(cpu) {
+		qcom_cpufreq_put_virt_devs(drv, cpu);
 		dev_pm_opp_clear_config(drv->cpus[cpu].opp_token);
+	}
 	return ret;
 }
 
@@ -377,8 +418,10 @@  static void qcom_cpufreq_remove(struct platform_device *pdev)
 
 	platform_device_unregister(cpufreq_dt_pdev);
 
-	for_each_possible_cpu(cpu)
+	for_each_possible_cpu(cpu) {
+		qcom_cpufreq_put_virt_devs(drv, cpu);
 		dev_pm_opp_clear_config(drv->cpus[cpu].opp_token);
+	}
 }
 
 static struct platform_driver qcom_cpufreq_driver = {