arm64: implement arm64 specific cpu_to_node

Message ID 20240115095931.53765-1-shijie@os.amperecomputing.com
State New
Headers
Series arm64: implement arm64 specific cpu_to_node |

Commit Message

Huang Shijie Jan. 15, 2024, 9:59 a.m. UTC
  After setting the right NUMA node for VMAP stack,
  https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git/commit/?id=75b5e0bf90bf

I found there are at least four places in the common code where
the cpu_to_node() is called before it is initialized:
   0.) early_trace_init()         in kernel/trace/trace.c
   1.) sched_init()               in kernel/sched/core.c
   2.) init_sched_fair_class()    in kernel/sched/fair.c
   3.) workqueue_init_early()     in kernel/workqueue.c

We cannot use early_cpu_to_node() for them, since early_cpu_to_node()
does not work for some ARCHs, such as x86, riscv, etc.

So we have to implement the arm64 specific cpu_to_node().
This patch
   0.) introduces the __cpu_to_node function pointer,
       and exports it for kernel modules.

   1.) defines a macro cpu_to_node to override the
       generic percpu implementation of cpu_to_node.

   2.) __cpu_to_node is initialized with early_cpu_to_node() before
       numa_node is initialized.

   3.) __cpu_to_node is set to arm64_cpu_to_node() when numa_node is ready.
      arm64_cpu_to_node() is a clone of the generic cpu_to_node().

Signed-off-by: Huang Shijie <shijie@os.amperecomputing.com>
---
 arch/arm64/include/asm/topology.h |  4 ++++
 arch/arm64/kernel/smp.c           | 21 ++++++++++++++++++++-
 2 files changed, 24 insertions(+), 1 deletion(-)
  

Comments

Mark Rutland Jan. 15, 2024, 11:44 a.m. UTC | #1
On Mon, Jan 15, 2024 at 05:59:31PM +0800, Huang Shijie wrote:
> After setting the right NUMA node for VMAP stack,
>   https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git/commit/?id=75b5e0bf90bf
> 
> I found there are at least four places in the common code where
> the cpu_to_node() is called before it is initialized:
>    0.) early_trace_init()         in kernel/trace/trace.c
>    1.) sched_init()               in kernel/sched/core.c
>    2.) init_sched_fair_class()    in kernel/sched/fair.c
>    3.) workqueue_init_early()     in kernel/workqueue.c
> 
> We cannot use early_cpu_to_node() for them, since early_cpu_to_node()
> does not work for some ARCHs, such as x86, riscv, etc.

I spot that x86 seems to have an implementation of early_cpu_to_node(); what's
wrong with it?

> So we have to implement the arm64 specific cpu_to_node().

Surely those early uses of cpu_to_node() are equally broken on those other
architectures, so why should this be specific to arm64?

> This patch
>    0.) introduces the __cpu_to_node function pointer,
>        and exports it for kernel modules.
> 
>    1.) defines a macro cpu_to_node to override the
>        generic percpu implementation of cpu_to_node.
> 
>    2.) __cpu_to_node is initialized with early_cpu_to_node() before
>        numa_node is initialized.
> 
>    3.) __cpu_to_node is set to arm64_cpu_to_node() when numa_node is ready.
>       arm64_cpu_to_node() is a clone of the generic cpu_to_node().

I don't think this is the right approach. Regardlesss of anything else, we
shouldn't have a solution that only fixes arm64.

Why can't we mandate an early_cpu_to_node(), and have the other architectures
implement that?

Why can't we change cpu_to_node() to automatically do the right thing?

Mark.

> 
> Signed-off-by: Huang Shijie <shijie@os.amperecomputing.com>
> ---
>  arch/arm64/include/asm/topology.h |  4 ++++
>  arch/arm64/kernel/smp.c           | 21 ++++++++++++++++++++-
>  2 files changed, 24 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
> index a323b109b9c4..3e99b40b5f9f 100644
> --- a/arch/arm64/include/asm/topology.h
> +++ b/arch/arm64/include/asm/topology.h
> @@ -12,6 +12,10 @@ int pcibus_to_node(struct pci_bus *bus);
>  				 cpu_all_mask :				\
>  				 cpumask_of_node(pcibus_to_node(bus)))
>  
> +/* override generic percpu implementation of cpu_to_node */
> +extern int (*__cpu_to_node)(int cpu);
> +#define cpu_to_node(cpu) __cpu_to_node(cpu)
> +
>  #endif /* CONFIG_NUMA */
>  
>  #include <linux/arch_topology.h>
> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> index 4ced34f62dab..8a3bc101eaed 100644
> --- a/arch/arm64/kernel/smp.c
> +++ b/arch/arm64/kernel/smp.c
> @@ -90,6 +90,21 @@ static struct irq_desc *ipi_desc[MAX_IPI] __ro_after_init;
>  
>  static void ipi_setup(int cpu);
>  
> +int (*__cpu_to_node)(int cpu);
> +EXPORT_SYMBOL(__cpu_to_node);
> +
> +#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
> +static int arm64_cpu_to_node(int cpu)
> +{
> +	return per_cpu(numa_node, cpu);
> +}
> +#else
> +static int arm64_cpu_to_node(int cpu)
> +{
> +	return 0;
> +}
> +#endif
> +
>  #ifdef CONFIG_HOTPLUG_CPU
>  static void ipi_teardown(int cpu);
>  static int op_cpu_kill(unsigned int cpu);
> @@ -613,6 +628,7 @@ static void __init acpi_parse_and_init_cpus(void)
>  
>  	for (i = 0; i < nr_cpu_ids; i++)
>  		early_map_cpu_to_node(i, acpi_numa_get_nid(i));
> +	__cpu_to_node = early_cpu_to_node;
>  }
>  #else
>  #define acpi_parse_and_init_cpus(...)	do { } while (0)
> @@ -674,6 +690,7 @@ static void __init of_parse_and_init_cpus(void)
>  next:
>  		cpu_count++;
>  	}
> +	__cpu_to_node = early_cpu_to_node;
>  }
>  
>  /*
> @@ -733,7 +750,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>  	 * secondary CPUs present.
>  	 */
>  	if (max_cpus == 0)
> -		return;
> +		goto out;
>  
>  	/*
>  	 * Initialise the present map (which describes the set of CPUs
> @@ -758,6 +775,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>  		set_cpu_present(cpu, true);
>  		numa_store_cpu_info(cpu);
>  	}
> +out:
> +	__cpu_to_node = arm64_cpu_to_node;
>  }
>  
>  static const char *ipi_types[NR_IPI] __tracepoint_string = {
> -- 
> 2.40.1
>
  
Shijie Huang Jan. 16, 2024, 1:47 a.m. UTC | #2
Hi Mark,

在 2024/1/15 19:44, Mark Rutland 写道:
> On Mon, Jan 15, 2024 at 05:59:31PM +0800, Huang Shijie wrote:
>> After setting the right NUMA node for VMAP stack,
>>    https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git/commit/?id=75b5e0bf90bf
>>
>> I found there are at least four places in the common code where
>> the cpu_to_node() is called before it is initialized:
>>     0.) early_trace_init()         in kernel/trace/trace.c
>>     1.) sched_init()               in kernel/sched/core.c
>>     2.) init_sched_fair_class()    in kernel/sched/fair.c
>>     3.) workqueue_init_early()     in kernel/workqueue.c
>>
>> We cannot use early_cpu_to_node() for them, since early_cpu_to_node()
>> does not work for some ARCHs, such as x86, riscv, etc.
> I spot that x86 seems to have an implementation of early_cpu_to_node(); what's
> wrong with it?

Yes, you are right. I check the code again, x86 has its own 
early_cpu_to_node().


>
>> So we have to implement the arm64 specific cpu_to_node().
> Surely those early uses of cpu_to_node() are equally broken on those other

yes. some ARCHs also has the same issue. But I am not sure if all ARCHs 
support the NUMA.


> architectures, so why should this be specific to arm64?

I just want to fix the arm64 first. :)

But if you think we should fix the common code firstly, I am okay.


>
>> This patch
>>     0.) introduces the __cpu_to_node function pointer,
>>         and exports it for kernel modules.
>>
>>     1.) defines a macro cpu_to_node to override the
>>         generic percpu implementation of cpu_to_node.
>>
>>     2.) __cpu_to_node is initialized with early_cpu_to_node() before
>>         numa_node is initialized.
>>
>>     3.) __cpu_to_node is set to arm64_cpu_to_node() when numa_node is ready.
>>        arm64_cpu_to_node() is a clone of the generic cpu_to_node().
> I don't think this is the right approach. Regardlesss of anything else, we
> shouldn't have a solution that only fixes arm64.
>
> Why can't we mandate an early_cpu_to_node(), and have the other architectures
> implement that?
>
> Why can't we change cpu_to_node() to automatically do the right thing?

ok, I will change the common code.


Thanks

Huang Shijie
  

Patch

diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index a323b109b9c4..3e99b40b5f9f 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -12,6 +12,10 @@  int pcibus_to_node(struct pci_bus *bus);
 				 cpu_all_mask :				\
 				 cpumask_of_node(pcibus_to_node(bus)))
 
+/* override generic percpu implementation of cpu_to_node */
+extern int (*__cpu_to_node)(int cpu);
+#define cpu_to_node(cpu) __cpu_to_node(cpu)
+
 #endif /* CONFIG_NUMA */
 
 #include <linux/arch_topology.h>
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 4ced34f62dab..8a3bc101eaed 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -90,6 +90,21 @@  static struct irq_desc *ipi_desc[MAX_IPI] __ro_after_init;
 
 static void ipi_setup(int cpu);
 
+int (*__cpu_to_node)(int cpu);
+EXPORT_SYMBOL(__cpu_to_node);
+
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+static int arm64_cpu_to_node(int cpu)
+{
+	return per_cpu(numa_node, cpu);
+}
+#else
+static int arm64_cpu_to_node(int cpu)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_HOTPLUG_CPU
 static void ipi_teardown(int cpu);
 static int op_cpu_kill(unsigned int cpu);
@@ -613,6 +628,7 @@  static void __init acpi_parse_and_init_cpus(void)
 
 	for (i = 0; i < nr_cpu_ids; i++)
 		early_map_cpu_to_node(i, acpi_numa_get_nid(i));
+	__cpu_to_node = early_cpu_to_node;
 }
 #else
 #define acpi_parse_and_init_cpus(...)	do { } while (0)
@@ -674,6 +690,7 @@  static void __init of_parse_and_init_cpus(void)
 next:
 		cpu_count++;
 	}
+	__cpu_to_node = early_cpu_to_node;
 }
 
 /*
@@ -733,7 +750,7 @@  void __init smp_prepare_cpus(unsigned int max_cpus)
 	 * secondary CPUs present.
 	 */
 	if (max_cpus == 0)
-		return;
+		goto out;
 
 	/*
 	 * Initialise the present map (which describes the set of CPUs
@@ -758,6 +775,8 @@  void __init smp_prepare_cpus(unsigned int max_cpus)
 		set_cpu_present(cpu, true);
 		numa_store_cpu_info(cpu);
 	}
+out:
+	__cpu_to_node = arm64_cpu_to_node;
 }
 
 static const char *ipi_types[NR_IPI] __tracepoint_string = {