ACPI: cpufreq: use a platform device to load ACPI PPC and PCC drivers
Commit Message
The acpi-cpufreq and pcc-cpufreq drivers are loaded through per-CPU
module aliases. This can result in many unnecessary load requests during
boot if another frequency module, such as intel_pstate, is already
active. For instance, on a typical Intel system, one can observe that
udev makes 2x#CPUs attempts to insert acpi_cpufreq and 1x#CPUs attempts
for pcc_cpufreq. All these tries then fail if another frequency module
is already registered.
Both acpi-cpufreq and pcc-cpufreq drivers have their platform firmware
interface defined by ACPI. Allowed performance states and parameters
must be same for each CPU. This makes it possible to model these
interfaces as platform devices.
The patch extends the ACPI parsing logic to check the ACPI namespace if
the PPC or PCC interface is present and creates a virtual platform
device for each if it is available. The acpi-cpufreq and pcc-cpufreq
drivers are then updated to map to these devices.
This allows to try loading acpi-cpufreq and pcc-cpufreq only once during
boot and only if a given interface is available in the firmware.
Signed-off-by: Petr Pavlu <petr.pavlu@suse.com>
---
drivers/acpi/Makefile | 1 +
drivers/acpi/acpi_cpufreq.c | 49 ++++++++++++++++++++++++++++++++++
drivers/acpi/bus.c | 1 +
drivers/acpi/internal.h | 2 ++
drivers/cpufreq/acpi-cpufreq.c | 39 +++++++++++++++------------
drivers/cpufreq/pcc-cpufreq.c | 34 ++++++++++++++++-------
6 files changed, 99 insertions(+), 27 deletions(-)
create mode 100644 drivers/acpi/acpi_cpufreq.c
Comments
On Tue, Jan 31, 2023 at 02:00:41PM +0100, Petr Pavlu wrote:
> The acpi-cpufreq and pcc-cpufreq drivers are loaded through per-CPU
> module aliases.
It gets me to question, what makes this "per-CPU module aliases" and
how do we find other similar uses as they are likely incorrect too?
> This can result in many unnecessary load requests during
> boot if another frequency module, such as intel_pstate, is already
> active.
Perhaps you should mention that in the worst case, without the fix in
commit 0254127ab977e ("module: Don't wait for GOING modules") some of
these module load requests could fail and prevent boot, and that
discussion over these duplicate module reqests ended up in us deciding that
they are not needed, we just need one.
> For instance, on a typical Intel system, one can observe that
> udev makes 2x#CPUs attempts to insert acpi_cpufreq and 1x#CPUs attempts
> for pcc_cpufreq. All these tries then fail if another frequency module
> is already registered.
>
> Both acpi-cpufreq and pcc-cpufreq drivers have their platform firmware
> interface defined by ACPI. Allowed performance states and parameters
> must be same for each CPU. This makes it possible to model these
> interfaces as platform devices.
>
> The patch extends the ACPI parsing logic to check the ACPI namespace if
> the PPC or PCC interface is present and creates a virtual platform
> device for each if it is available. The acpi-cpufreq and pcc-cpufreq
> drivers are then updated to map to these devices.
>
> This allows to try loading acpi-cpufreq and pcc-cpufreq only once during
> boot and only if a given interface is available in the firmware.
That could cut boot time too? If so how much?
Luis
> Signed-off-by: Petr Pavlu <petr.pavlu@suse.com>
> ---
> drivers/acpi/Makefile | 1 +
> drivers/acpi/acpi_cpufreq.c | 49 ++++++++++++++++++++++++++++++++++
> drivers/acpi/bus.c | 1 +
> drivers/acpi/internal.h | 2 ++
> drivers/cpufreq/acpi-cpufreq.c | 39 +++++++++++++++------------
> drivers/cpufreq/pcc-cpufreq.c | 34 ++++++++++++++++-------
> 6 files changed, 99 insertions(+), 27 deletions(-)
> create mode 100644 drivers/acpi/acpi_cpufreq.c
>
> diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
> index feb36c0b9446..880db1082c3e 100644
> --- a/drivers/acpi/Makefile
> +++ b/drivers/acpi/Makefile
> @@ -57,6 +57,7 @@ acpi-y += evged.o
> acpi-y += sysfs.o
> acpi-y += property.o
> acpi-$(CONFIG_X86) += acpi_cmos_rtc.o
> +acpi-$(CONFIG_X86) += acpi_cpufreq.o
> acpi-$(CONFIG_X86) += x86/apple.o
> acpi-$(CONFIG_X86) += x86/utils.o
> acpi-$(CONFIG_X86) += x86/s2idle.o
> diff --git a/drivers/acpi/acpi_cpufreq.c b/drivers/acpi/acpi_cpufreq.c
> new file mode 100644
> index 000000000000..7cf243c67475
> --- /dev/null
> +++ b/drivers/acpi/acpi_cpufreq.c
> @@ -0,0 +1,49 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Registration of platform devices for ACPI Processor Performance Control and
> + * Processor Clocking Control.
> + */
> +
> +#include <linux/acpi.h>
> +#include <linux/platform_device.h>
> +
> +#include "internal.h"
> +
> +static void __init cpufreq_add_device(const char *name)
> +{
> + struct platform_device *pdev;
> +
> + pdev = platform_device_register_simple(name, PLATFORM_DEVID_NONE, NULL,
> + 0);
> + if (IS_ERR(pdev))
> + pr_err("%s device creation failed: %ld\n", name, PTR_ERR(pdev));
> +}
> +
> +static acpi_status __init acpi_pct_match(acpi_handle handle, u32 level,
> + void *context, void **return_value)
> +{
> + bool *pct = context;
> +
> + /* Check if the first CPU has _PCT. The data must be same for all. */
> + *pct = acpi_has_method(handle, "_PCT");
> + return AE_CTRL_TERMINATE;
> +}
> +
> +void __init acpi_cpufreq_init(void)
> +{
> + acpi_status status;
> + acpi_handle handle;
> + bool pct = false;
> +
> + status = acpi_get_handle(NULL, "\\_SB", &handle);
> + if (ACPI_FAILURE(status))
> + return;
> +
> + acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
> + ACPI_UINT32_MAX, acpi_pct_match, NULL, &pct, NULL);
> + if (pct)
> + cpufreq_add_device("acpi-cpufreq");
> +
> + if (acpi_has_method(handle, "PCCH"))
> + cpufreq_add_device("pcc-cpufreq");
> +}
> diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
> index 0c05ccde1f7a..f1559e26d5ff 100644
> --- a/drivers/acpi/bus.c
> +++ b/drivers/acpi/bus.c
> @@ -1428,6 +1428,7 @@ static int __init acpi_init(void)
> acpi_viot_init();
> acpi_agdi_init();
> acpi_apmt_init();
> + acpi_cpufreq_init();
> return 0;
> }
>
> diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h
> index ec584442fb29..c9b1a5f689fa 100644
> --- a/drivers/acpi/internal.h
> +++ b/drivers/acpi/internal.h
> @@ -157,8 +157,10 @@ static inline void acpi_early_processor_set_pdc(void) {}
>
> #ifdef CONFIG_X86
> void acpi_early_processor_osc(void);
> +void acpi_cpufreq_init(void);
> #else
> static inline void acpi_early_processor_osc(void) {}
> +static inline void acpi_cpufreq_init(void) {}
> #endif
>
> /* --------------------------------------------------------------------------
> diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
> index 78adfb2ffff6..e1a5384cf21c 100644
> --- a/drivers/cpufreq/acpi-cpufreq.c
> +++ b/drivers/cpufreq/acpi-cpufreq.c
> @@ -965,7 +965,7 @@ static void __init acpi_cpufreq_boost_init(void)
> acpi_cpufreq_driver.boost_enabled = boost_state(0);
> }
>
> -static int __init acpi_cpufreq_init(void)
> +static int __init acpi_cpufreq_probe(struct platform_device *pdev)
> {
> int ret;
>
> @@ -1010,13 +1010,32 @@ static int __init acpi_cpufreq_init(void)
> return ret;
> }
>
> -static void __exit acpi_cpufreq_exit(void)
> +static int acpi_cpufreq_remove(struct platform_device *pdev)
> {
> pr_debug("%s\n", __func__);
>
> cpufreq_unregister_driver(&acpi_cpufreq_driver);
>
> free_acpi_perf_data();
> +
> + return 0;
> +}
> +
> +static struct platform_driver acpi_cpufreq_platdrv = {
> + .driver = {
> + .name = "acpi-cpufreq",
> + },
> + .remove = acpi_cpufreq_remove,
> +};
> +
> +static int __init acpi_cpufreq_init(void)
> +{
> + return platform_driver_probe(&acpi_cpufreq_platdrv, acpi_cpufreq_probe);
> +}
> +
> +static void __exit acpi_cpufreq_exit(void)
> +{
> + platform_driver_unregister(&acpi_cpufreq_platdrv);
> }
>
> module_param(acpi_pstate_strict, uint, 0644);
> @@ -1027,18 +1046,4 @@ MODULE_PARM_DESC(acpi_pstate_strict,
> late_initcall(acpi_cpufreq_init);
> module_exit(acpi_cpufreq_exit);
>
> -static const struct x86_cpu_id __maybe_unused acpi_cpufreq_ids[] = {
> - X86_MATCH_FEATURE(X86_FEATURE_ACPI, NULL),
> - X86_MATCH_FEATURE(X86_FEATURE_HW_PSTATE, NULL),
> - {}
> -};
> -MODULE_DEVICE_TABLE(x86cpu, acpi_cpufreq_ids);
> -
> -static const struct acpi_device_id __maybe_unused processor_device_ids[] = {
> - {ACPI_PROCESSOR_OBJECT_HID, },
> - {ACPI_PROCESSOR_DEVICE_HID, },
> - {},
> -};
> -MODULE_DEVICE_TABLE(acpi, processor_device_ids);
> -
> -MODULE_ALIAS("acpi");
> +MODULE_ALIAS("platform:acpi-cpufreq");
> diff --git a/drivers/cpufreq/pcc-cpufreq.c b/drivers/cpufreq/pcc-cpufreq.c
> index 9f3fc7a073d0..0c362932ca60 100644
> --- a/drivers/cpufreq/pcc-cpufreq.c
> +++ b/drivers/cpufreq/pcc-cpufreq.c
> @@ -384,7 +384,7 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
> return ret;
> }
>
> -static int __init pcc_cpufreq_probe(void)
> +static int __init pcc_cpufreq_evaluate(void)
> {
> acpi_status status;
> struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
> @@ -576,7 +576,7 @@ static struct cpufreq_driver pcc_cpufreq_driver = {
> .name = "pcc-cpufreq",
> };
>
> -static int __init pcc_cpufreq_init(void)
> +static int __init pcc_cpufreq_probe(struct platform_device *pdev)
> {
> int ret;
>
> @@ -587,9 +587,9 @@ static int __init pcc_cpufreq_init(void)
> if (acpi_disabled)
> return -ENODEV;
>
> - ret = pcc_cpufreq_probe();
> + ret = pcc_cpufreq_evaluate();
> if (ret) {
> - pr_debug("pcc_cpufreq_init: PCCH evaluation failed\n");
> + pr_debug("pcc_cpufreq_probe: PCCH evaluation failed\n");
> return ret;
> }
>
> @@ -607,21 +607,35 @@ static int __init pcc_cpufreq_init(void)
> return ret;
> }
>
> -static void __exit pcc_cpufreq_exit(void)
> +static int pcc_cpufreq_remove(struct platform_device *pdev)
> {
> cpufreq_unregister_driver(&pcc_cpufreq_driver);
>
> pcc_clear_mapping();
>
> free_percpu(pcc_cpu_info);
> +
> + return 0;
> }
>
> -static const struct acpi_device_id __maybe_unused processor_device_ids[] = {
> - {ACPI_PROCESSOR_OBJECT_HID, },
> - {ACPI_PROCESSOR_DEVICE_HID, },
> - {},
> +static struct platform_driver pcc_cpufreq_platdrv = {
> + .driver = {
> + .name = "pcc-cpufreq",
> + },
> + .remove = pcc_cpufreq_remove,
> };
> -MODULE_DEVICE_TABLE(acpi, processor_device_ids);
> +
> +static int __init pcc_cpufreq_init(void)
> +{
> + return platform_driver_probe(&pcc_cpufreq_platdrv, pcc_cpufreq_probe);
> +}
> +
> +static void __exit pcc_cpufreq_exit(void)
> +{
> + platform_driver_unregister(&pcc_cpufreq_platdrv);
> +}
> +
> +MODULE_ALIAS("platform:pcc-cpufreq");
>
> MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar");
> MODULE_VERSION(PCC_VERSION);
>
On Tue, Jan 31, 2023 at 2:01 PM Petr Pavlu <petr.pavlu@suse.com> wrote:
>
> The acpi-cpufreq and pcc-cpufreq drivers are loaded through per-CPU
> module aliases. This can result in many unnecessary load requests during
> boot if another frequency module, such as intel_pstate, is already
> active. For instance, on a typical Intel system, one can observe that
> udev makes 2x#CPUs attempts to insert acpi_cpufreq and 1x#CPUs attempts
> for pcc_cpufreq. All these tries then fail if another frequency module
> is already registered.
Right, which is unnecessary overhead.
> Both acpi-cpufreq and pcc-cpufreq drivers have their platform firmware
> interface defined by ACPI. Allowed performance states and parameters
> must be same for each CPU.
This is an assumption made by the code in those drivers, the
specification doesn't actually require this IIRC.
> This makes it possible to model these
> interfaces as platform devices.
>
> The patch extends the ACPI parsing logic to check the ACPI namespace if
> the PPC or PCC interface is present and creates a virtual platform
> device for each if it is available. The acpi-cpufreq and pcc-cpufreq
> drivers are then updated to map to these devices.
>
> This allows to try loading acpi-cpufreq and pcc-cpufreq only once during
> boot and only if a given interface is available in the firmware.
This is clever, but will it always work?
> Signed-off-by: Petr Pavlu <petr.pavlu@suse.com>
> ---
> drivers/acpi/Makefile | 1 +
> drivers/acpi/acpi_cpufreq.c | 49 ++++++++++++++++++++++++++++++++++
> drivers/acpi/bus.c | 1 +
> drivers/acpi/internal.h | 2 ++
> drivers/cpufreq/acpi-cpufreq.c | 39 +++++++++++++++------------
> drivers/cpufreq/pcc-cpufreq.c | 34 ++++++++++++++++-------
> 6 files changed, 99 insertions(+), 27 deletions(-)
> create mode 100644 drivers/acpi/acpi_cpufreq.c
>
> diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
> index feb36c0b9446..880db1082c3e 100644
> --- a/drivers/acpi/Makefile
> +++ b/drivers/acpi/Makefile
> @@ -57,6 +57,7 @@ acpi-y += evged.o
> acpi-y += sysfs.o
> acpi-y += property.o
> acpi-$(CONFIG_X86) += acpi_cmos_rtc.o
> +acpi-$(CONFIG_X86) += acpi_cpufreq.o
> acpi-$(CONFIG_X86) += x86/apple.o
> acpi-$(CONFIG_X86) += x86/utils.o
> acpi-$(CONFIG_X86) += x86/s2idle.o
> diff --git a/drivers/acpi/acpi_cpufreq.c b/drivers/acpi/acpi_cpufreq.c
> new file mode 100644
> index 000000000000..7cf243c67475
> --- /dev/null
> +++ b/drivers/acpi/acpi_cpufreq.c
> @@ -0,0 +1,49 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Registration of platform devices for ACPI Processor Performance Control and
> + * Processor Clocking Control.
> + */
> +
> +#include <linux/acpi.h>
> +#include <linux/platform_device.h>
> +
> +#include "internal.h"
> +
> +static void __init cpufreq_add_device(const char *name)
> +{
> + struct platform_device *pdev;
> +
> + pdev = platform_device_register_simple(name, PLATFORM_DEVID_NONE, NULL,
> + 0);
> + if (IS_ERR(pdev))
> + pr_err("%s device creation failed: %ld\n", name, PTR_ERR(pdev));
> +}
> +
> +static acpi_status __init acpi_pct_match(acpi_handle handle, u32 level,
> + void *context, void **return_value)
> +{
> + bool *pct = context;
> +
> + /* Check if the first CPU has _PCT. The data must be same for all. */
> + *pct = acpi_has_method(handle, "_PCT");
> + return AE_CTRL_TERMINATE;
> +}
> +
> +void __init acpi_cpufreq_init(void)
> +{
> + acpi_status status;
> + acpi_handle handle;
> + bool pct = false;
> +
> + status = acpi_get_handle(NULL, "\\_SB", &handle);
> + if (ACPI_FAILURE(status))
> + return;
> +
> + acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
> + ACPI_UINT32_MAX, acpi_pct_match, NULL, &pct, NULL);
Well, one more full ACPI Namespace walk at init time.
Also, this only covers processor objects and what about processor
device objects?
> + if (pct)
> + cpufreq_add_device("acpi-cpufreq");
> +
> + if (acpi_has_method(handle, "PCCH"))
> + cpufreq_add_device("pcc-cpufreq");
> +}
> diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
> index 0c05ccde1f7a..f1559e26d5ff 100644
> --- a/drivers/acpi/bus.c
> +++ b/drivers/acpi/bus.c
> @@ -1428,6 +1428,7 @@ static int __init acpi_init(void)
> acpi_viot_init();
> acpi_agdi_init();
> acpi_apmt_init();
> + acpi_cpufreq_init();
> return 0;
> }
>
> diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h
> index ec584442fb29..c9b1a5f689fa 100644
> --- a/drivers/acpi/internal.h
> +++ b/drivers/acpi/internal.h
> @@ -157,8 +157,10 @@ static inline void acpi_early_processor_set_pdc(void) {}
>
> #ifdef CONFIG_X86
> void acpi_early_processor_osc(void);
> +void acpi_cpufreq_init(void);
> #else
> static inline void acpi_early_processor_osc(void) {}
> +static inline void acpi_cpufreq_init(void) {}
> #endif
>
> /* --------------------------------------------------------------------------
> diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
> index 78adfb2ffff6..e1a5384cf21c 100644
> --- a/drivers/cpufreq/acpi-cpufreq.c
> +++ b/drivers/cpufreq/acpi-cpufreq.c
> @@ -965,7 +965,7 @@ static void __init acpi_cpufreq_boost_init(void)
> acpi_cpufreq_driver.boost_enabled = boost_state(0);
> }
>
> -static int __init acpi_cpufreq_init(void)
> +static int __init acpi_cpufreq_probe(struct platform_device *pdev)
> {
> int ret;
>
> @@ -1010,13 +1010,32 @@ static int __init acpi_cpufreq_init(void)
> return ret;
> }
>
> -static void __exit acpi_cpufreq_exit(void)
> +static int acpi_cpufreq_remove(struct platform_device *pdev)
> {
> pr_debug("%s\n", __func__);
>
> cpufreq_unregister_driver(&acpi_cpufreq_driver);
>
> free_acpi_perf_data();
> +
> + return 0;
> +}
> +
> +static struct platform_driver acpi_cpufreq_platdrv = {
> + .driver = {
> + .name = "acpi-cpufreq",
> + },
> + .remove = acpi_cpufreq_remove,
> +};
> +
> +static int __init acpi_cpufreq_init(void)
> +{
> + return platform_driver_probe(&acpi_cpufreq_platdrv, acpi_cpufreq_probe);
> +}
> +
> +static void __exit acpi_cpufreq_exit(void)
> +{
> + platform_driver_unregister(&acpi_cpufreq_platdrv);
> }
>
> module_param(acpi_pstate_strict, uint, 0644);
> @@ -1027,18 +1046,4 @@ MODULE_PARM_DESC(acpi_pstate_strict,
> late_initcall(acpi_cpufreq_init);
> module_exit(acpi_cpufreq_exit);
>
> -static const struct x86_cpu_id __maybe_unused acpi_cpufreq_ids[] = {
> - X86_MATCH_FEATURE(X86_FEATURE_ACPI, NULL),
> - X86_MATCH_FEATURE(X86_FEATURE_HW_PSTATE, NULL),
> - {}
> -};
> -MODULE_DEVICE_TABLE(x86cpu, acpi_cpufreq_ids);
> -
> -static const struct acpi_device_id __maybe_unused processor_device_ids[] = {
> - {ACPI_PROCESSOR_OBJECT_HID, },
> - {ACPI_PROCESSOR_DEVICE_HID, },
> - {},
> -};
> -MODULE_DEVICE_TABLE(acpi, processor_device_ids);
> -
> -MODULE_ALIAS("acpi");
> +MODULE_ALIAS("platform:acpi-cpufreq");
> diff --git a/drivers/cpufreq/pcc-cpufreq.c b/drivers/cpufreq/pcc-cpufreq.c
> index 9f3fc7a073d0..0c362932ca60 100644
> --- a/drivers/cpufreq/pcc-cpufreq.c
> +++ b/drivers/cpufreq/pcc-cpufreq.c
> @@ -384,7 +384,7 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
> return ret;
> }
>
> -static int __init pcc_cpufreq_probe(void)
> +static int __init pcc_cpufreq_evaluate(void)
> {
> acpi_status status;
> struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
> @@ -576,7 +576,7 @@ static struct cpufreq_driver pcc_cpufreq_driver = {
> .name = "pcc-cpufreq",
> };
>
> -static int __init pcc_cpufreq_init(void)
> +static int __init pcc_cpufreq_probe(struct platform_device *pdev)
> {
> int ret;
>
> @@ -587,9 +587,9 @@ static int __init pcc_cpufreq_init(void)
> if (acpi_disabled)
> return -ENODEV;
>
> - ret = pcc_cpufreq_probe();
> + ret = pcc_cpufreq_evaluate();
> if (ret) {
> - pr_debug("pcc_cpufreq_init: PCCH evaluation failed\n");
> + pr_debug("pcc_cpufreq_probe: PCCH evaluation failed\n");
> return ret;
> }
>
> @@ -607,21 +607,35 @@ static int __init pcc_cpufreq_init(void)
> return ret;
> }
>
> -static void __exit pcc_cpufreq_exit(void)
> +static int pcc_cpufreq_remove(struct platform_device *pdev)
> {
> cpufreq_unregister_driver(&pcc_cpufreq_driver);
>
> pcc_clear_mapping();
>
> free_percpu(pcc_cpu_info);
> +
> + return 0;
> }
>
> -static const struct acpi_device_id __maybe_unused processor_device_ids[] = {
> - {ACPI_PROCESSOR_OBJECT_HID, },
> - {ACPI_PROCESSOR_DEVICE_HID, },
> - {},
> +static struct platform_driver pcc_cpufreq_platdrv = {
> + .driver = {
> + .name = "pcc-cpufreq",
> + },
> + .remove = pcc_cpufreq_remove,
> };
> -MODULE_DEVICE_TABLE(acpi, processor_device_ids);
> +
> +static int __init pcc_cpufreq_init(void)
> +{
> + return platform_driver_probe(&pcc_cpufreq_platdrv, pcc_cpufreq_probe);
> +}
> +
> +static void __exit pcc_cpufreq_exit(void)
> +{
> + platform_driver_unregister(&pcc_cpufreq_platdrv);
> +}
> +
> +MODULE_ALIAS("platform:pcc-cpufreq");
>
> MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar");
> MODULE_VERSION(PCC_VERSION);
> --
On 2/6/23 17:28, Luis Chamberlain wrote:
> On Tue, Jan 31, 2023 at 02:00:41PM +0100, Petr Pavlu wrote:
>> The acpi-cpufreq and pcc-cpufreq drivers are loaded through per-CPU
>> module aliases.
>
> It gets me to question, what makes this "per-CPU module aliases" and
> how do we find other similar uses as they are likely incorrect too?
The acpi-cpufreq module aliases:
* 'cpu:type:x86,ven*fam*mod*:feature:*0016*' (X86_FEATURE_HW_PSTATE),
* 'cpu:type:x86,ven*fam*mod*:feature:*00E8*' (X86_FEATURE_ACPI),
* 'acpi*:LNXCPU:*' (ACPI_PROCESSOR_OBJECT_HID),
* 'acpi*:ACPI0007:*' (ACPI_PROCESSOR_DEVICE_HID).
The pcc-cpufreq similarly aliases:
* 'acpi*:LNXCPU:*' (ACPI_PROCESSOR_OBJECT_HID),
* 'acpi*:ACPI0007:*' (ACPI_PROCESSOR_DEVICE_HID).
These per-CPU aliases can be found as follows:
$ grep -E '(cpu:type|ACPI0007|LNXCPU)' /lib/modules/$(uname -r)/modules.alias
I think it is generally ok for modules to use per-CPU aliases. The normal case
should be that the first load succeeds and any subsequent one is then only
a trivial check by libkmod that a given module has been already inserted.
This breaks when a module repeatedly fails to load. In particular, arrangement
of x86 cpufreq and edac drivers is done in a way that they cascade several
modules and once the first one becomes a selected cpufreq/edac driver then any
further module load fails.
If a subsystem decides to use this kind of pattern then it should really limit
the number of such failed loads. The proposed patch improves the situation for
cpufreq. I've not yet looked at the edac stuff.
The second problem with per-CPU aliases is a burst of same loads created by
udevd (as also discussed on the linux-modules mailing list). The daemon
normally processes devices in parallel and as it goes through CPUs, it will
create a number of requests to load the same per-CPU module. This problem is
present even if the needed module is loaded successfully. A lot of unnecessary
CPU work and memory allocations is needed to handle each duplicate load.
My thinking is that this second problem is something that should be addressed
in udevd. The same situation can happen for other hardware devices when they
all have the same type and need same modules. Udevd could avoid attempting
a second parallel load if it recognizes that one of its workers is already
doing the same insert. I hope to have a closer look at this.
There is clearly some overlap in sense that if udevd was improved in this way
then it makes this patch less useful. However, even if this would get
implemented at some point, I think it's still good if acpi-cpufreq and
pcc-cpufreq is on a given system explicitly attempted to be loaded only once.
>> This can result in many unnecessary load requests during
>> boot if another frequency module, such as intel_pstate, is already
>> active.
>
> Perhaps you should mention that in the worst case, without the fix in
> commit 0254127ab977e ("module: Don't wait for GOING modules") some of
> these module load requests could fail and prevent boot, and that
> discussion over these duplicate module reqests ended up in us deciding that
> they are not needed, we just need one.
Ok, I'll add it in v2.
>> For instance, on a typical Intel system, one can observe that
>> udev makes 2x#CPUs attempts to insert acpi_cpufreq and 1x#CPUs attempts
>> for pcc_cpufreq. All these tries then fail if another frequency module
>> is already registered.
>>
>> Both acpi-cpufreq and pcc-cpufreq drivers have their platform firmware
>> interface defined by ACPI. Allowed performance states and parameters
>> must be same for each CPU. This makes it possible to model these
>> interfaces as platform devices.
>>
>> The patch extends the ACPI parsing logic to check the ACPI namespace if
>> the PPC or PCC interface is present and creates a virtual platform
>> device for each if it is available. The acpi-cpufreq and pcc-cpufreq
>> drivers are then updated to map to these devices.
>>
>> This allows to try loading acpi-cpufreq and pcc-cpufreq only once during
>> boot and only if a given interface is available in the firmware.
>
> That could cut boot time too? If so how much?
The following command should provide an approximate measurement of what
happens during the boot:
# perf stat --null --repeat 10 --post 'sleep 5' -- \
udevadm trigger --action=add --subsystem-match=acpi --sysname-match=LNXCPU* --subsystem-match=cpu --sysname-match=cpu* --settle
My test was with 6.2-rc7 which already contains commit 0254127ab977e ("module:
Don't wait for GOING modules"). Kernel modules were signed and compressed with
zstd. Test machines had intel_pstate loaded and so acpi-cpufreq and
pcc-cpufreq always failed to initialize.
Test machine A, with 16 CPUs + 62 GiB memory:
* vanilla: 0.20334 +- 0.00566 seconds time elapsed ( +- 2.79% )
* with the patch: 0.08939 +- 0.00308 seconds time elapsed ( +- 3.44% )
Test machine B, with 288 CPUs + 110 GiB memory.
* vanilla: 1.3439 +- 0.0232 seconds time elapsed ( +- 1.72% )
* with the patch: 0.74916 +- 0.00253 seconds time elapsed ( +- 0.34% )
Udevd processing was parallel, maximum number of its workers was in both cases
2*cpu_count+16. It means the reduced time roughly says how much less the whole
system (all CPUs) was needed to be occupied.
The smaller system A sees a reduction of ~100 ms with the patch while the
bigger one B shows an improvement of ~600 ms.
Thanks,
Petr
On 2/9/23 18:20, Rafael J. Wysocki wrote:
> On Tue, Jan 31, 2023 at 2:01 PM Petr Pavlu <petr.pavlu@suse.com> wrote:
>>
>> The acpi-cpufreq and pcc-cpufreq drivers are loaded through per-CPU
>> module aliases. This can result in many unnecessary load requests during
>> boot if another frequency module, such as intel_pstate, is already
>> active. For instance, on a typical Intel system, one can observe that
>> udev makes 2x#CPUs attempts to insert acpi_cpufreq and 1x#CPUs attempts
>> for pcc_cpufreq. All these tries then fail if another frequency module
>> is already registered.
>
> Right, which is unnecessary overhead.
>
>> Both acpi-cpufreq and pcc-cpufreq drivers have their platform firmware
>> interface defined by ACPI. Allowed performance states and parameters
>> must be same for each CPU.
>
> This is an assumption made by the code in those drivers, the
> specification doesn't actually require this IIRC.
That statement is based on the ACPI Specification Version 6.5, section 8.4.5.
Processor Performance Control [1]:
"In a multiprocessing environment, all CPUs must support the same number of
performance states and each processor performance state must have identical
performance and power-consumption parameters. Performance objects must be
present under each processor object in the system for OSPM to utilize this
feature."
The same paragraph appears already in ACPI Specification Version 2.0 [2],
section 8.3.3 Processor Performance Control, where PPC was first added.
PCC [3] is simpler and uses a single header to describe properties
(frequencies) for all processors.
>> This makes it possible to model these
>> interfaces as platform devices.
>>
>> The patch extends the ACPI parsing logic to check the ACPI namespace if
>> the PPC or PCC interface is present and creates a virtual platform
>> device for each if it is available. The acpi-cpufreq and pcc-cpufreq
>> drivers are then updated to map to these devices.
>>
>> This allows to try loading acpi-cpufreq and pcc-cpufreq only once during
>> boot and only if a given interface is available in the firmware.
>
> This is clever, but will it always work?
I'm not sure if you have any specific problem in mind, for example, some
systems with broken ACPI tables?
When it comes to testing, I ran the patch on a few older Intel machines with
PPC and one with PCC. The acpi-cpufreq and pcc-cpufreq drivers were tested in
both built-in and loadable module configurations. It was checked that
intel_pstate remains the preferred option and is attempted to be initialized
first.
It looks though I should have also done testing on some newer machine as
I missed the ACPI0007 scenario you point out below.
>
>> Signed-off-by: Petr Pavlu <petr.pavlu@suse.com>
>> ---
>> drivers/acpi/Makefile | 1 +
>> drivers/acpi/acpi_cpufreq.c | 49 ++++++++++++++++++++++++++++++++++
>> drivers/acpi/bus.c | 1 +
>> drivers/acpi/internal.h | 2 ++
>> drivers/cpufreq/acpi-cpufreq.c | 39 +++++++++++++++------------
>> drivers/cpufreq/pcc-cpufreq.c | 34 ++++++++++++++++-------
>> 6 files changed, 99 insertions(+), 27 deletions(-)
>> create mode 100644 drivers/acpi/acpi_cpufreq.c
>>
>> diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
>> index feb36c0b9446..880db1082c3e 100644
>> --- a/drivers/acpi/Makefile
>> +++ b/drivers/acpi/Makefile
>> @@ -57,6 +57,7 @@ acpi-y += evged.o
>> acpi-y += sysfs.o
>> acpi-y += property.o
>> acpi-$(CONFIG_X86) += acpi_cmos_rtc.o
>> +acpi-$(CONFIG_X86) += acpi_cpufreq.o
>> acpi-$(CONFIG_X86) += x86/apple.o
>> acpi-$(CONFIG_X86) += x86/utils.o
>> acpi-$(CONFIG_X86) += x86/s2idle.o
>> diff --git a/drivers/acpi/acpi_cpufreq.c b/drivers/acpi/acpi_cpufreq.c
>> new file mode 100644
>> index 000000000000..7cf243c67475
>> --- /dev/null
>> +++ b/drivers/acpi/acpi_cpufreq.c
>> @@ -0,0 +1,49 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/*
>> + * Registration of platform devices for ACPI Processor Performance Control and
>> + * Processor Clocking Control.
>> + */
>> +
>> +#include <linux/acpi.h>
>> +#include <linux/platform_device.h>
>> +
>> +#include "internal.h"
>> +
>> +static void __init cpufreq_add_device(const char *name)
>> +{
>> + struct platform_device *pdev;
>> +
>> + pdev = platform_device_register_simple(name, PLATFORM_DEVID_NONE, NULL,
>> + 0);
>> + if (IS_ERR(pdev))
>> + pr_err("%s device creation failed: %ld\n", name, PTR_ERR(pdev));
>> +}
>> +
>> +static acpi_status __init acpi_pct_match(acpi_handle handle, u32 level,
>> + void *context, void **return_value)
>> +{
>> + bool *pct = context;
>> +
>> + /* Check if the first CPU has _PCT. The data must be same for all. */
>> + *pct = acpi_has_method(handle, "_PCT");
>> + return AE_CTRL_TERMINATE;
>> +}
>> +
>> +void __init acpi_cpufreq_init(void)
>> +{
>> + acpi_status status;
>> + acpi_handle handle;
>> + bool pct = false;
>> +
>> + status = acpi_get_handle(NULL, "\\_SB", &handle);
>> + if (ACPI_FAILURE(status))
>> + return;
>> +
>> + acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
>> + ACPI_UINT32_MAX, acpi_pct_match, NULL, &pct, NULL);
>
> Well, one more full ACPI Namespace walk at init time.
The code tries to do only a partial walk of the ACPI namespace by terminating
at the first found processor node.
> Also, this only covers processor objects and what about processor
> device objects?
I'll extend the scan to cover also ACPI0007 devices.
[1] https://uefi.org/specs/ACPI/6.5/08_Processor_Configuration_and_Control.html#processor-performance-control
[2] https://uefi.org/sites/default/files/resources/ACPI_2.pdf
[3] https://acpica.org/sites/acpica/files/Processor-Clocking-Control-v1p0.pdf
Thanks,
Petr
@@ -57,6 +57,7 @@ acpi-y += evged.o
acpi-y += sysfs.o
acpi-y += property.o
acpi-$(CONFIG_X86) += acpi_cmos_rtc.o
+acpi-$(CONFIG_X86) += acpi_cpufreq.o
acpi-$(CONFIG_X86) += x86/apple.o
acpi-$(CONFIG_X86) += x86/utils.o
acpi-$(CONFIG_X86) += x86/s2idle.o
new file mode 100644
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Registration of platform devices for ACPI Processor Performance Control and
+ * Processor Clocking Control.
+ */
+
+#include <linux/acpi.h>
+#include <linux/platform_device.h>
+
+#include "internal.h"
+
+static void __init cpufreq_add_device(const char *name)
+{
+ struct platform_device *pdev;
+
+ pdev = platform_device_register_simple(name, PLATFORM_DEVID_NONE, NULL,
+ 0);
+ if (IS_ERR(pdev))
+ pr_err("%s device creation failed: %ld\n", name, PTR_ERR(pdev));
+}
+
+static acpi_status __init acpi_pct_match(acpi_handle handle, u32 level,
+ void *context, void **return_value)
+{
+ bool *pct = context;
+
+ /* Check if the first CPU has _PCT. The data must be same for all. */
+ *pct = acpi_has_method(handle, "_PCT");
+ return AE_CTRL_TERMINATE;
+}
+
+void __init acpi_cpufreq_init(void)
+{
+ acpi_status status;
+ acpi_handle handle;
+ bool pct = false;
+
+ status = acpi_get_handle(NULL, "\\_SB", &handle);
+ if (ACPI_FAILURE(status))
+ return;
+
+ acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
+ ACPI_UINT32_MAX, acpi_pct_match, NULL, &pct, NULL);
+ if (pct)
+ cpufreq_add_device("acpi-cpufreq");
+
+ if (acpi_has_method(handle, "PCCH"))
+ cpufreq_add_device("pcc-cpufreq");
+}
@@ -1428,6 +1428,7 @@ static int __init acpi_init(void)
acpi_viot_init();
acpi_agdi_init();
acpi_apmt_init();
+ acpi_cpufreq_init();
return 0;
}
@@ -157,8 +157,10 @@ static inline void acpi_early_processor_set_pdc(void) {}
#ifdef CONFIG_X86
void acpi_early_processor_osc(void);
+void acpi_cpufreq_init(void);
#else
static inline void acpi_early_processor_osc(void) {}
+static inline void acpi_cpufreq_init(void) {}
#endif
/* --------------------------------------------------------------------------
@@ -965,7 +965,7 @@ static void __init acpi_cpufreq_boost_init(void)
acpi_cpufreq_driver.boost_enabled = boost_state(0);
}
-static int __init acpi_cpufreq_init(void)
+static int __init acpi_cpufreq_probe(struct platform_device *pdev)
{
int ret;
@@ -1010,13 +1010,32 @@ static int __init acpi_cpufreq_init(void)
return ret;
}
-static void __exit acpi_cpufreq_exit(void)
+static int acpi_cpufreq_remove(struct platform_device *pdev)
{
pr_debug("%s\n", __func__);
cpufreq_unregister_driver(&acpi_cpufreq_driver);
free_acpi_perf_data();
+
+ return 0;
+}
+
+static struct platform_driver acpi_cpufreq_platdrv = {
+ .driver = {
+ .name = "acpi-cpufreq",
+ },
+ .remove = acpi_cpufreq_remove,
+};
+
+static int __init acpi_cpufreq_init(void)
+{
+ return platform_driver_probe(&acpi_cpufreq_platdrv, acpi_cpufreq_probe);
+}
+
+static void __exit acpi_cpufreq_exit(void)
+{
+ platform_driver_unregister(&acpi_cpufreq_platdrv);
}
module_param(acpi_pstate_strict, uint, 0644);
@@ -1027,18 +1046,4 @@ MODULE_PARM_DESC(acpi_pstate_strict,
late_initcall(acpi_cpufreq_init);
module_exit(acpi_cpufreq_exit);
-static const struct x86_cpu_id __maybe_unused acpi_cpufreq_ids[] = {
- X86_MATCH_FEATURE(X86_FEATURE_ACPI, NULL),
- X86_MATCH_FEATURE(X86_FEATURE_HW_PSTATE, NULL),
- {}
-};
-MODULE_DEVICE_TABLE(x86cpu, acpi_cpufreq_ids);
-
-static const struct acpi_device_id __maybe_unused processor_device_ids[] = {
- {ACPI_PROCESSOR_OBJECT_HID, },
- {ACPI_PROCESSOR_DEVICE_HID, },
- {},
-};
-MODULE_DEVICE_TABLE(acpi, processor_device_ids);
-
-MODULE_ALIAS("acpi");
+MODULE_ALIAS("platform:acpi-cpufreq");
@@ -384,7 +384,7 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
return ret;
}
-static int __init pcc_cpufreq_probe(void)
+static int __init pcc_cpufreq_evaluate(void)
{
acpi_status status;
struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
@@ -576,7 +576,7 @@ static struct cpufreq_driver pcc_cpufreq_driver = {
.name = "pcc-cpufreq",
};
-static int __init pcc_cpufreq_init(void)
+static int __init pcc_cpufreq_probe(struct platform_device *pdev)
{
int ret;
@@ -587,9 +587,9 @@ static int __init pcc_cpufreq_init(void)
if (acpi_disabled)
return -ENODEV;
- ret = pcc_cpufreq_probe();
+ ret = pcc_cpufreq_evaluate();
if (ret) {
- pr_debug("pcc_cpufreq_init: PCCH evaluation failed\n");
+ pr_debug("pcc_cpufreq_probe: PCCH evaluation failed\n");
return ret;
}
@@ -607,21 +607,35 @@ static int __init pcc_cpufreq_init(void)
return ret;
}
-static void __exit pcc_cpufreq_exit(void)
+static int pcc_cpufreq_remove(struct platform_device *pdev)
{
cpufreq_unregister_driver(&pcc_cpufreq_driver);
pcc_clear_mapping();
free_percpu(pcc_cpu_info);
+
+ return 0;
}
-static const struct acpi_device_id __maybe_unused processor_device_ids[] = {
- {ACPI_PROCESSOR_OBJECT_HID, },
- {ACPI_PROCESSOR_DEVICE_HID, },
- {},
+static struct platform_driver pcc_cpufreq_platdrv = {
+ .driver = {
+ .name = "pcc-cpufreq",
+ },
+ .remove = pcc_cpufreq_remove,
};
-MODULE_DEVICE_TABLE(acpi, processor_device_ids);
+
+static int __init pcc_cpufreq_init(void)
+{
+ return platform_driver_probe(&pcc_cpufreq_platdrv, pcc_cpufreq_probe);
+}
+
+static void __exit pcc_cpufreq_exit(void)
+{
+ platform_driver_unregister(&pcc_cpufreq_platdrv);
+}
+
+MODULE_ALIAS("platform:pcc-cpufreq");
MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar");
MODULE_VERSION(PCC_VERSION);