[v7,5/9] x86/smpboot: Split up native_cpu_up into separate phases and document them
Commit Message
From: David Woodhouse <dwmw@amazon.co.uk>
There are four logical parts to what native_cpu_up() does on the BSP (or
on the controlling CPU for a later hotplug):
1) Wake the AP by sending the INIT/SIPI/SIPI sequence.
2) Wait for the AP to make it as far as wait_for_master_cpu(), which
sets that CPU's bit in cpu_initialized_mask; then set the bit in
cpu_callout_mask to let the AP proceed through cpu_init().
3) Wait for the AP to finish cpu_init() and get as far as the
smp_callin() call, which sets that CPU's bit in cpu_callin_mask.
4) Perform the TSC synchronization and wait for the AP to actually
mark itself online in cpu_online_mask.
In preparation to allow these phases to operate in parallel on multiple
APs, split them out into separate functions and document the interactions
a little more clearly in both the BSP and AP code paths.
No functional change intended.
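As an illustrative timeline (not literal code), the sync points between the
controlling CPU and the AP line up like this after the split:

   controlling CPU (BSP)                 AP
   ---------------------                 --
   do_cpu_up()
     INIT/SIPI/SIPI ----------------->   start_secondary()
   do_wait_cpu_initialized()               cpu_init_secondary()
     wait for cpu_initialized_mask <---     wait_for_master_cpu() sets it...
     set cpu_callout_mask ------------>     ...then spins until it is set
   do_wait_cpu_callin()                    smp_callin()
     wait for cpu_callin_mask <--------     sets cpu_callin_mask
   do_wait_cpu_online()                    check_tsc_sync_target()
     check_tsc_sync_source() <------->      (TSC sync rendezvous)
     wait for cpu_online_mask <--------   set_cpu_online()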
[Usama Arif: fixed rebase conflict]
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Usama Arif <usama.arif@bytedance.com>
---
arch/x86/kernel/smpboot.c | 182 +++++++++++++++++++++++++++-----------
1 file changed, 128 insertions(+), 54 deletions(-)
Comments
On Tue, Feb 07, 2023 at 11:04:32PM +0000, Usama Arif wrote:
> From: David Woodhouse <dwmw@amazon.co.uk>
>
> There are four logical parts to what native_cpu_up() does on the BSP (or
> on the controlling CPU for a later hotplug):
>
> 1) Wake the AP by sending the INIT/SIPI/SIPI sequence.
>
> 2) Wait for the AP to make it as far as wait_for_master_cpu() which
> sets that CPU's bit in cpu_initialized_mask, then sets the bit in
> cpu_callout_mask to let the AP proceed through cpu_init().
>
> 3) Waits for the AP to finish cpu_init() and get as far as the
> smp_callin() call, which sets that CPU's bit in cpu_callin_mask.
>
> 4) Perform the TSC synchronization and wait for the AP to actually
> mark itself online in cpu_online_mask.
>
> In preparation to allow these phases to operate in parallel on multiple
> APs, split them out into separate functions and document the interactions
> a little more clearly in both the BSP and AP code paths.
>
> No functional change intended.
>
> [Usama Arif: fixed rebase conflict]
> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
> Signed-off-by: Usama Arif <usama.arif@bytedance.com>
> ---
> arch/x86/kernel/smpboot.c | 182 +++++++++++++++++++++++++++-----------
> 1 file changed, 128 insertions(+), 54 deletions(-)
>
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index 3a793772a2aa..eee717086ab7 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -204,6 +204,10 @@ static void smp_callin(void)
>
> wmb();
>
> + /*
> + * This runs the AP through all the cpuhp states to its target
> + * state (CPUHP_ONLINE in the case of serial bringup).
> + */
> notify_cpu_starting(cpuid);
>
> /*
> @@ -231,17 +235,33 @@ static void notrace start_secondary(void *unused)
> load_cr3(swapper_pg_dir);
> __flush_tlb_all();
> #endif
> + /*
> + * Sync point with do_wait_cpu_initialized(). On boot, all secondary
> + * CPUs reach this stage after receiving INIT/SIPI from do_cpu_up()
> + * in the x86/cpu:kick cpuhp stage. At the start of cpu_init() they
> + * will wait for do_wait_cpu_initialized() to set their bit in
> + * smp_callout_mask to release them.
The last sentence of the comment looks confusing. The fact is:

In the serial case, the BSP waits for the AP to set cpu_initialized_mask
from wait_for_master_cpu() after firing INIT/SIPI, then the AP waits for
cpu_callout_mask to be set by the BSP from do_boot_cpu().

Alternatively, the comment below ("Bringup step two: ...") already looks
clear enough.
> + */
> cpu_init_secondary();
> rcu_cpu_starting(raw_smp_processor_id());
> x86_cpuinit.early_percpu_clock_init();
> +
> + /*
> + * Sync point with do_wait_cpu_callin(). The AP doesn't wait here
> + * but just sets the bit to let the controlling CPU (BSP) know that
> + * it's got this far.
> + */
> smp_callin();
>
> enable_start_cpu0 = 0;
>
> /* otherwise gcc will move up smp_processor_id before the cpu_init */
> barrier();
> +
> /*
> - * Check TSC synchronization with the boot CPU:
> + * Check TSC synchronization with the boot CPU (or whichever CPU
> + * is controlling the bringup). It will do its part of this from
> + * do_wait_cpu_online(), making it an implicit sync point.
> */
> check_tsc_sync_target();
>
> @@ -254,6 +274,7 @@ static void notrace start_secondary(void *unused)
> * half valid vector space.
> */
> lock_vector_lock();
> + /* Sync point with do_wait_cpu_online() */
> set_cpu_online(smp_processor_id(), true);
> lapic_online();
> unlock_vector_lock();
> @@ -1083,7 +1104,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
> unsigned long start_ip = real_mode_header->trampoline_start;
>
> unsigned long boot_error = 0;
> - unsigned long timeout;
>
> #ifdef CONFIG_X86_64
> /* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */
> @@ -1144,55 +1164,94 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
> boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid,
> cpu0_nmi_registered);
>
> - if (!boot_error) {
> - /*
> - * Wait 10s total for first sign of life from AP
> - */
> - boot_error = -1;
> - timeout = jiffies + 10*HZ;
> - while (time_before(jiffies, timeout)) {
> - if (cpumask_test_cpu(cpu, cpu_initialized_mask)) {
> - /*
> - * Tell AP to proceed with initialization
> - */
> - cpumask_set_cpu(cpu, cpu_callout_mask);
> - boot_error = 0;
> - break;
> - }
> - schedule();
> - }
> - }
> + return boot_error;
> +}
>
> - if (!boot_error) {
> - /*
> - * Wait till AP completes initial initialization
> - */
> - while (!cpumask_test_cpu(cpu, cpu_callin_mask)) {
> - /*
> - * Allow other tasks to run while we wait for the
> - * AP to come online. This also gives a chance
> - * for the MTRR work(triggered by the AP coming online)
> - * to be completed in the stop machine context.
> - */
> - schedule();
> - }
> +static int do_wait_cpu_cpumask(unsigned int cpu, const struct cpumask *mask)
> +{
> + unsigned long timeout;
> +
> + /*
> + * Wait up to 10s for the CPU to report in.
> + */
> + timeout = jiffies + 10*HZ;
> + while (time_before(jiffies, timeout)) {
> + if (cpumask_test_cpu(cpu, mask))
> + return 0;
> +
> + schedule();
> }
> + return -1;
> +}
>
> - if (x86_platform.legacy.warm_reset) {
> - /*
> - * Cleanup possible dangling ends...
> - */
> - smpboot_restore_warm_reset_vector();
> +/*
> + * Bringup step two: Wait for the target AP to reach cpu_init_secondary()
> + * and thus wait_for_master_cpu(), then set cpu_callout_mask to allow it
> + * to proceed. The AP will then proceed past setting its 'callin' bit
> + * and end up waiting in check_tsc_sync_target() until we reach
> + * do_wait_cpu_online() to tend to it.
> + */
> +static int do_wait_cpu_initialized(unsigned int cpu)
> +{
> + /*
> + * Wait for first sign of life from AP.
> + */
> + if (do_wait_cpu_cpumask(cpu, cpu_initialized_mask))
> + return -1;
> +
> + cpumask_set_cpu(cpu, cpu_callout_mask);
> + return 0;
> +}
> +
> +/*
> + * Bringup step three: Wait for the target AP to reach smp_callin().
> + * The AP is not waiting for us here so we don't need to parallelise
> + * this step. Not entirely clear why we care about this, since we just
> + * proceed directly to TSC synchronization which is the next sync
> + * point with the AP anyway.
> + */
> +static int do_wait_cpu_callin(unsigned int cpu)
> +{
> + /*
> + * Wait till AP completes initial initialization.
> + */
> + return do_wait_cpu_cpumask(cpu, cpu_callin_mask);
> +}
> +
> +/*
> + * Bringup step four: Synchronize the TSC and wait for the target AP
> + * to reach set_cpu_online() in start_secondary().
> + */
> +static int do_wait_cpu_online(unsigned int cpu)
> +{
> + unsigned long flags;
> +
> + /*
> + * Check TSC synchronization with the AP (keep irqs disabled
> + * while doing so):
> + */
> + local_irq_save(flags);
> + check_tsc_sync_source(cpu);
> + local_irq_restore(flags);
> +
> + /*
> + * Wait for the AP to mark itself online. Not entirely
> + * clear why we care, since the generic cpuhp code will
> + * wait for it to reach CPUHP_AP_ONLINE_IDLE before going
> + * ahead with the rest of the bringup anyway.
> + */
> + while (!cpu_online(cpu)) {
> + cpu_relax();
> + touch_nmi_watchdog();
> }
>
> - return boot_error;
> + return 0;
> }
>
> -int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
> +static int do_cpu_up(unsigned int cpu, struct task_struct *tidle)
> {
> int apicid = apic->cpu_present_to_apicid(cpu);
> int cpu0_nmi_registered = 0;
> - unsigned long flags;
> int err, ret = 0;
>
> lockdep_assert_irqs_enabled();
> @@ -1239,19 +1298,6 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
> goto unreg_nmi;
> }
>
> - /*
> - * Check TSC synchronization with the AP (keep irqs disabled
> - * while doing so):
> - */
> - local_irq_save(flags);
> - check_tsc_sync_source(cpu);
> - local_irq_restore(flags);
> -
> - while (!cpu_online(cpu)) {
> - cpu_relax();
> - touch_nmi_watchdog();
> - }
> -
> unreg_nmi:
> /*
> * Clean up the nmi handler. Do this after the callin and callout sync
> @@ -1263,6 +1309,34 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
> return ret;
> }
>
> +int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
> +{
> + int ret;
> +
> + ret = do_cpu_up(cpu, tidle);
> + if (ret)
> + return ret;
> +
> + ret = do_wait_cpu_initialized(cpu);
> + if (ret)
> + return ret;
> +
> + ret = do_wait_cpu_callin(cpu);
> + if (ret)
> + return ret;
> +
> + ret = do_wait_cpu_online(cpu);
> +
> + if (x86_platform.legacy.warm_reset) {
> + /*
> + * Cleanup possible dangling ends...
> + */
> + smpboot_restore_warm_reset_vector();
> + }
> +
> + return ret;
> +}
> +
> /**
> * arch_disable_smp_support() - disables SMP support for x86 at runtime
> */
> --
> 2.25.1
>
On Wed, 2023-02-08 at 18:03 +0800, Yuan Yao wrote:
>
> > #endif
> > + /*
> > + * Sync point with do_wait_cpu_initialized(). On boot, all secondary
> > + * CPUs reach this stage after receiving INIT/SIPI from do_cpu_up()
> > + * in the x86/cpu:kick cpuhp stage. At the start of cpu_init() they
> > + * will wait for do_wait_cpu_initialized() to set their bit in
> > + * smp_callout_mask to release them.
>
> The last sentence of the comment looks confusing. The fact is:
>
> In the serial case, the BSP waits for the AP to set cpu_initialized_mask
> from wait_for_master_cpu() after firing INIT/SIPI, then the AP waits for
> cpu_callout_mask to be set by the BSP from do_boot_cpu().
>
> Alternatively, the comment below ("Bringup step two: ...") already looks
> clear enough.
/*
* Sync point with do_wait_cpu_initialized(). Before proceeding through
* cpu_init(), the AP will call wait_for_master_cpu() which sets its
* bit in cpu_initialized_mask and then waits for the BSP to set its
* bit in cpu_callout_mask to release it.
*/
cpu_init_secondary();
Better?
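For reference, the AP side that this comment describes, wait_for_master_cpu()
in arch/x86/kernel/cpu/common.c, looks roughly like this at the time of this
series (comments paraphrased):

    static void wait_for_master_cpu(int cpu)
    {
    #ifdef CONFIG_SMP
            /*
             * Publish first sign of life: this is the bit that
             * do_wait_cpu_initialized() polls for on the controlling CPU...
             */
            WARN_ON(cpumask_test_and_set_cpu(cpu, cpu_initialized_mask));

            /* ...then spin until the controlling CPU releases this AP. */
            while (!cpumask_test_cpu(cpu, cpu_callout_mask))
                    cpu_relax();
    #endif
    }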
On Wed, Feb 08, 2023 at 12:02:55PM +0000, David Woodhouse wrote:
> On Wed, 2023-02-08 at 18:03 +0800, Yuan Yao wrote:
> >
> > > #endif
> > > + /*
> > > + * Sync point with do_wait_cpu_initialized(). On boot, all secondary
> > > + * CPUs reach this stage after receiving INIT/SIPI from do_cpu_up()
> > > + * in the x86/cpu:kick cpuhp stage. At the start of cpu_init() they
> > > + * will wait for do_wait_cpu_initialized() to set their bit in
> > > + * smp_callout_mask to release them.
> >
> > The last sentence of the comment looks confusing. The fact is:
> >
> > In the serial case, the BSP waits for the AP to set cpu_initialized_mask
> > from wait_for_master_cpu() after firing INIT/SIPI, then the AP waits for
> > cpu_callout_mask to be set by the BSP from do_boot_cpu().
> >
> > Alternatively, the comment below ("Bringup step two: ...") already looks
> > clear enough.
>
>
> /*
> * Sync point with do_wait_cpu_initialized(). Before proceeding through
> * cpu_init(), the AP will call wait_for_master_cpu() which sets its
> * bit in cpu_initialized_mask and then waits for the BSP to set its
> * bit in cpu_callout_mask to release it.
> */
> cpu_init_secondary();
>
>
> Better?
Yes, it looks better to me, thanks.
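The payoff of this split comes later in the series: with the phases as
separate functions, the controlling CPU can in principle run each phase
across all APs before starting the next, along these lines (a hypothetical
sketch only; error handling is omitted, and the series itself drives the
first phase from the x86/cpu:kick cpuhp stage rather than an open-coded
loop):

    unsigned int cpu;

    /* Phase 1: kick every AP with INIT/SIPI/SIPI. */
    for_each_present_cpu(cpu)
            do_cpu_up(cpu, idle_thread_get(cpu));

    /* Phase 2: release each AP through cpu_init(). */
    for_each_present_cpu(cpu)
            do_wait_cpu_initialized(cpu);

    /* Phase 3: wait for each AP to reach smp_callin(). */
    for_each_present_cpu(cpu)
            do_wait_cpu_callin(cpu);

    /* Phase 4: TSC sync and wait for each AP to mark itself online. */
    for_each_present_cpu(cpu)
            do_wait_cpu_online(cpu);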