[v7,5/9] x86/smpboot: Split up native_cpu_up into separate phases and document them

Message ID 20230207230436.2690891-6-usama.arif@bytedance.com
State New
Headers
Series Parallel CPU bringup for x86_64 |

Commit Message

Usama Arif Feb. 7, 2023, 11:04 p.m. UTC
  From: David Woodhouse <dwmw@amazon.co.uk>

There are four logical parts to what native_cpu_up() does on the BSP (or
on the controlling CPU for a later hotplug):

 1) Wake the AP by sending the INIT/SIPI/SIPI sequence.

 2) Wait for the AP to make it as far as wait_for_master_cpu() which
    sets that CPU's bit in cpu_initialized_mask, then sets the bit in
    cpu_callout_mask to let the AP proceed through cpu_init().

 3) Waits for the AP to finish cpu_init() and get as far as the
    smp_callin() call, which sets that CPU's bit in cpu_callin_mask.

 4) Perform the TSC synchronization and wait for the AP to actually
    mark itself online in cpu_online_mask.

In preparation to allow these phases to operate in parallel on multiple
APs, split them out into separate functions and document the interactions
a little more clearly in both the BSP and AP code paths.

No functional change intended.

[Usama Arif: fixed rebase conflict]
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Usama Arif <usama.arif@bytedance.com>
---
 arch/x86/kernel/smpboot.c | 182 +++++++++++++++++++++++++++-----------
 1 file changed, 128 insertions(+), 54 deletions(-)
  

Comments

Yuan Yao Feb. 8, 2023, 10:03 a.m. UTC | #1
On Tue, Feb 07, 2023 at 11:04:32PM +0000, Usama Arif wrote:
> From: David Woodhouse <dwmw@amazon.co.uk>
>
> There are four logical parts to what native_cpu_up() does on the BSP (or
> on the controlling CPU for a later hotplug):
>
>  1) Wake the AP by sending the INIT/SIPI/SIPI sequence.
>
>  2) Wait for the AP to make it as far as wait_for_master_cpu() which
>     sets that CPU's bit in cpu_initialized_mask, then sets the bit in
>     cpu_callout_mask to let the AP proceed through cpu_init().
>
>  3) Waits for the AP to finish cpu_init() and get as far as the
>     smp_callin() call, which sets that CPU's bit in cpu_callin_mask.
>
>  4) Perform the TSC synchronization and wait for the AP to actually
>     mark itself online in cpu_online_mask.
>
> In preparation to allow these phases to operate in parallel on multiple
> APs, split them out into separate functions and document the interactions
> a little more clearly in both the BSP and AP code paths.
>
> No functional change intended.
>
> [Usama Arif: fixed rebase conflict]
> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
> Signed-off-by: Usama Arif <usama.arif@bytedance.com>
> ---
>  arch/x86/kernel/smpboot.c | 182 +++++++++++++++++++++++++++-----------
>  1 file changed, 128 insertions(+), 54 deletions(-)
>
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index 3a793772a2aa..eee717086ab7 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -204,6 +204,10 @@ static void smp_callin(void)
>
>  	wmb();
>
> +	/*
> +	 * This runs the AP through all the cpuhp states to its target
> +	 * state (CPUHP_ONLINE in the case of serial bringup).
> +	 */
>  	notify_cpu_starting(cpuid);
>
>  	/*
> @@ -231,17 +235,33 @@ static void notrace start_secondary(void *unused)
>  	load_cr3(swapper_pg_dir);
>  	__flush_tlb_all();
>  #endif
> +	/*
> +	 * Sync point with do_wait_cpu_initialized(). On boot, all secondary
> +	 * CPUs reach this stage after receiving INIT/SIPI from do_cpu_up()
> +	 * in the x86/cpu:kick cpuhp stage. At the start of cpu_init() they
> +	 * will wait for do_wait_cpu_initialized() to set their bit in
> +	 * smp_callout_mask to release them.

The last sentence of the comment looks confused. The fact is:

For serial case, The BSP waits AP to set cpu_initialized_mask from
wait_for_master_cpu() after fired INIT/SIPI, then AP starts to wait
cpu_callout_mask set by BSP from do_boot_cpu().

Or the comments below "Bringup step two:..." which also looks clear
enough then above.

> +	 */
>  	cpu_init_secondary();
>  	rcu_cpu_starting(raw_smp_processor_id());
>  	x86_cpuinit.early_percpu_clock_init();
> +
> +	/*
> +	 * Sync point with do_wait_cpu_callin(). The AP doesn't wait here
> +	 * but just sets the bit to let the controlling CPU (BSP) know that
> +	 * it's got this far.
> +	 */
>  	smp_callin();
>
>  	enable_start_cpu0 = 0;
>
>  	/* otherwise gcc will move up smp_processor_id before the cpu_init */
>  	barrier();
> +
>  	/*
> -	 * Check TSC synchronization with the boot CPU:
> +	 * Check TSC synchronization with the boot CPU (or whichever CPU
> +	 * is controlling the bringup). It will do its part of this from
> +	 * do_wait_cpu_online(), making it an implicit sync point.
>  	 */
>  	check_tsc_sync_target();
>
> @@ -254,6 +274,7 @@ static void notrace start_secondary(void *unused)
>  	 * half valid vector space.
>  	 */
>  	lock_vector_lock();
> +	/* Sync point with do_wait_cpu_online() */
>  	set_cpu_online(smp_processor_id(), true);
>  	lapic_online();
>  	unlock_vector_lock();
> @@ -1083,7 +1104,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
>  	unsigned long start_ip = real_mode_header->trampoline_start;
>
>  	unsigned long boot_error = 0;
> -	unsigned long timeout;
>
>  #ifdef CONFIG_X86_64
>  	/* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */
> @@ -1144,55 +1164,94 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
>  		boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid,
>  						     cpu0_nmi_registered);
>
> -	if (!boot_error) {
> -		/*
> -		 * Wait 10s total for first sign of life from AP
> -		 */
> -		boot_error = -1;
> -		timeout = jiffies + 10*HZ;
> -		while (time_before(jiffies, timeout)) {
> -			if (cpumask_test_cpu(cpu, cpu_initialized_mask)) {
> -				/*
> -				 * Tell AP to proceed with initialization
> -				 */
> -				cpumask_set_cpu(cpu, cpu_callout_mask);
> -				boot_error = 0;
> -				break;
> -			}
> -			schedule();
> -		}
> -	}
> +	return boot_error;
> +}
>
> -	if (!boot_error) {
> -		/*
> -		 * Wait till AP completes initial initialization
> -		 */
> -		while (!cpumask_test_cpu(cpu, cpu_callin_mask)) {
> -			/*
> -			 * Allow other tasks to run while we wait for the
> -			 * AP to come online. This also gives a chance
> -			 * for the MTRR work(triggered by the AP coming online)
> -			 * to be completed in the stop machine context.
> -			 */
> -			schedule();
> -		}
> +static int do_wait_cpu_cpumask(unsigned int cpu, const struct cpumask *mask)
> +{
> +	unsigned long timeout;
> +
> +	/*
> +	 * Wait up to 10s for the CPU to report in.
> +	 */
> +	timeout = jiffies + 10*HZ;
> +	while (time_before(jiffies, timeout)) {
> +		if (cpumask_test_cpu(cpu, mask))
> +			return 0;
> +
> +		schedule();
>  	}
> +	return -1;
> +}
>
> -	if (x86_platform.legacy.warm_reset) {
> -		/*
> -		 * Cleanup possible dangling ends...
> -		 */
> -		smpboot_restore_warm_reset_vector();
> +/*
> + * Bringup step two: Wait for the target AP to reach cpu_init_secondary()
> + * and thus wait_for_master_cpu(), then set cpu_callout_mask to allow it
> + * to proceed.  The AP will then proceed past setting its 'callin' bit
> + * and end up waiting in check_tsc_sync_target() until we reach
> + * do_wait_cpu_online() to tend to it.
> + */
> +static int do_wait_cpu_initialized(unsigned int cpu)
> +{
> +	/*
> +	 * Wait for first sign of life from AP.
> +	 */
> +	if (do_wait_cpu_cpumask(cpu, cpu_initialized_mask))
> +		return -1;
> +
> +	cpumask_set_cpu(cpu, cpu_callout_mask);
> +	return 0;
> +}
> +
> +/*
> + * Bringup step three: Wait for the target AP to reach smp_callin().
> + * The AP is not waiting for us here so we don't need to parallelise
> + * this step. Not entirely clear why we care about this, since we just
> + * proceed directly to TSC synchronization which is the next sync
> + * point with the AP anyway.
> + */
> +static int do_wait_cpu_callin(unsigned int cpu)
> +{
> +	/*
> +	 * Wait till AP completes initial initialization.
> +	 */
> +	return do_wait_cpu_cpumask(cpu, cpu_callin_mask);
> +}
> +
> +/*
> + * Bringup step four: Synchronize the TSC and wait for the target AP
> + * to reach set_cpu_online() in start_secondary().
> + */
> +static int do_wait_cpu_online(unsigned int cpu)
> +{
> +	unsigned long flags;
> +
> +	/*
> +	 * Check TSC synchronization with the AP (keep irqs disabled
> +	 * while doing so):
> +	 */
> +	local_irq_save(flags);
> +	check_tsc_sync_source(cpu);
> +	local_irq_restore(flags);
> +
> +	/*
> +	 * Wait for the AP to mark itself online. Not entirely
> +	 * clear why we care, since the generic cpuhp code will
> +	 * wait for it to each CPUHP_AP_ONLINE_IDLE before going
> +	 * ahead with the rest of the bringup anyway.
> +	 */
> +	while (!cpu_online(cpu)) {
> +		cpu_relax();
> +		touch_nmi_watchdog();
>  	}
>
> -	return boot_error;
> +	return 0;
>  }
>
> -int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
> +static int do_cpu_up(unsigned int cpu, struct task_struct *tidle)
>  {
>  	int apicid = apic->cpu_present_to_apicid(cpu);
>  	int cpu0_nmi_registered = 0;
> -	unsigned long flags;
>  	int err, ret = 0;
>
>  	lockdep_assert_irqs_enabled();
> @@ -1239,19 +1298,6 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
>  		goto unreg_nmi;
>  	}
>
> -	/*
> -	 * Check TSC synchronization with the AP (keep irqs disabled
> -	 * while doing so):
> -	 */
> -	local_irq_save(flags);
> -	check_tsc_sync_source(cpu);
> -	local_irq_restore(flags);
> -
> -	while (!cpu_online(cpu)) {
> -		cpu_relax();
> -		touch_nmi_watchdog();
> -	}
> -
>  unreg_nmi:
>  	/*
>  	 * Clean up the nmi handler. Do this after the callin and callout sync
> @@ -1263,6 +1309,34 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
>  	return ret;
>  }
>
> +int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
> +{
> +	int ret;
> +
> +	ret = do_cpu_up(cpu, tidle);
> +	if (ret)
> +		return ret;
> +
> +	ret = do_wait_cpu_initialized(cpu);
> +	if (ret)
> +		return ret;
> +
> +	ret = do_wait_cpu_callin(cpu);
> +	if (ret)
> +		return ret;
> +
> +	ret = do_wait_cpu_online(cpu);
> +
> +	if (x86_platform.legacy.warm_reset) {
> +		/*
> +		 * Cleanup possible dangling ends...
> +		 */
> +		smpboot_restore_warm_reset_vector();
> +	}
> +
> +	return ret;
> +}
> +
>  /**
>   * arch_disable_smp_support() - disables SMP support for x86 at runtime
>   */
> --
> 2.25.1
>
  
David Woodhouse Feb. 8, 2023, 12:02 p.m. UTC | #2
On Wed, 2023-02-08 at 18:03 +0800, Yuan Yao wrote:
> 
> >   #endif
> > +       /*
> > +        * Sync point with do_wait_cpu_initialized(). On boot, all secondary
> > +        * CPUs reach this stage after receiving INIT/SIPI from do_cpu_up()
> > +        * in the x86/cpu:kick cpuhp stage. At the start of cpu_init() they
> > +        * will wait for do_wait_cpu_initialized() to set their bit in
> > +        * smp_callout_mask to release them.
> 
> The last sentence of the comment looks confused. The fact is:
> 
> For serial case, The BSP waits AP to set cpu_initialized_mask from
> wait_for_master_cpu() after fired INIT/SIPI, then AP starts to wait
> cpu_callout_mask set by BSP from do_boot_cpu().
> 
> Or the comments below "Bringup step two:..." which also looks clear
> enough then above.


	/*
	 * Sync point with do_wait_cpu_initialized(). Before proceeding through
	 * cpu_init(), the AP will call wait_for_master_cpu() which sets its
	 * bit in cpu_initialized_mask and then waits for the BSP to set its
	 * bit in cpu_callout_mask to release it.
	 */
	cpu_init_secondary();


Better?
  
Yuan Yao Feb. 9, 2023, 3:06 a.m. UTC | #3
On Wed, Feb 08, 2023 at 12:02:55PM +0000, David Woodhouse wrote:
> On Wed, 2023-02-08 at 18:03 +0800, Yuan Yao wrote:
> >
> > >   #endif
> > > +       /*
> > > +        * Sync point with do_wait_cpu_initialized(). On boot, all secondary
> > > +        * CPUs reach this stage after receiving INIT/SIPI from do_cpu_up()
> > > +        * in the x86/cpu:kick cpuhp stage. At the start of cpu_init() they
> > > +        * will wait for do_wait_cpu_initialized() to set their bit in
> > > +        * smp_callout_mask to release them.
> >
> > The last sentence of the comment looks confused. The fact is:
> >
> > For serial case, The BSP waits AP to set cpu_initialized_mask from
> > wait_for_master_cpu() after fired INIT/SIPI, then AP starts to wait
> > cpu_callout_mask set by BSP from do_boot_cpu().
> >
> > Or the comments below "Bringup step two:..." which also looks clear
> > enough then above.
>
>
> 	/*
> 	 * Sync point with do_wait_cpu_initialized(). Before proceeding through
> 	 * cpu_init(), the AP will call wait_for_master_cpu() which sets its
> 	 * bit in cpu_initialized_mask and then waits for the BSP to set its
> 	 * bit in cpu_callout_mask to release it.
> 	 */
> 	cpu_init_secondary();
>
>
> Better?

Yes, it's better to me, thanks.
  

Patch

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 3a793772a2aa..eee717086ab7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -204,6 +204,10 @@  static void smp_callin(void)
 
 	wmb();
 
+	/*
+	 * This runs the AP through all the cpuhp states to its target
+	 * state (CPUHP_ONLINE in the case of serial bringup).
+	 */
 	notify_cpu_starting(cpuid);
 
 	/*
@@ -231,17 +235,33 @@  static void notrace start_secondary(void *unused)
 	load_cr3(swapper_pg_dir);
 	__flush_tlb_all();
 #endif
+	/*
+	 * Sync point with do_wait_cpu_initialized(). On boot, all secondary
+	 * CPUs reach this stage after receiving INIT/SIPI from do_cpu_up()
+	 * in the x86/cpu:kick cpuhp stage. At the start of cpu_init() they
+	 * will wait for do_wait_cpu_initialized() to set their bit in
+	 * smp_callout_mask to release them.
+	 */
 	cpu_init_secondary();
 	rcu_cpu_starting(raw_smp_processor_id());
 	x86_cpuinit.early_percpu_clock_init();
+
+	/*
+	 * Sync point with do_wait_cpu_callin(). The AP doesn't wait here
+	 * but just sets the bit to let the controlling CPU (BSP) know that
+	 * it's got this far.
+	 */
 	smp_callin();
 
 	enable_start_cpu0 = 0;
 
 	/* otherwise gcc will move up smp_processor_id before the cpu_init */
 	barrier();
+
 	/*
-	 * Check TSC synchronization with the boot CPU:
+	 * Check TSC synchronization with the boot CPU (or whichever CPU
+	 * is controlling the bringup). It will do its part of this from
+	 * do_wait_cpu_online(), making it an implicit sync point.
 	 */
 	check_tsc_sync_target();
 
@@ -254,6 +274,7 @@  static void notrace start_secondary(void *unused)
 	 * half valid vector space.
 	 */
 	lock_vector_lock();
+	/* Sync point with do_wait_cpu_online() */
 	set_cpu_online(smp_processor_id(), true);
 	lapic_online();
 	unlock_vector_lock();
@@ -1083,7 +1104,6 @@  static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
 	unsigned long start_ip = real_mode_header->trampoline_start;
 
 	unsigned long boot_error = 0;
-	unsigned long timeout;
 
 #ifdef CONFIG_X86_64
 	/* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */
@@ -1144,55 +1164,94 @@  static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
 		boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid,
 						     cpu0_nmi_registered);
 
-	if (!boot_error) {
-		/*
-		 * Wait 10s total for first sign of life from AP
-		 */
-		boot_error = -1;
-		timeout = jiffies + 10*HZ;
-		while (time_before(jiffies, timeout)) {
-			if (cpumask_test_cpu(cpu, cpu_initialized_mask)) {
-				/*
-				 * Tell AP to proceed with initialization
-				 */
-				cpumask_set_cpu(cpu, cpu_callout_mask);
-				boot_error = 0;
-				break;
-			}
-			schedule();
-		}
-	}
+	return boot_error;
+}
 
-	if (!boot_error) {
-		/*
-		 * Wait till AP completes initial initialization
-		 */
-		while (!cpumask_test_cpu(cpu, cpu_callin_mask)) {
-			/*
-			 * Allow other tasks to run while we wait for the
-			 * AP to come online. This also gives a chance
-			 * for the MTRR work(triggered by the AP coming online)
-			 * to be completed in the stop machine context.
-			 */
-			schedule();
-		}
+static int do_wait_cpu_cpumask(unsigned int cpu, const struct cpumask *mask)
+{
+	unsigned long timeout;
+
+	/*
+	 * Wait up to 10s for the CPU to report in.
+	 */
+	timeout = jiffies + 10*HZ;
+	while (time_before(jiffies, timeout)) {
+		if (cpumask_test_cpu(cpu, mask))
+			return 0;
+
+		schedule();
 	}
+	return -1;
+}
 
-	if (x86_platform.legacy.warm_reset) {
-		/*
-		 * Cleanup possible dangling ends...
-		 */
-		smpboot_restore_warm_reset_vector();
+/*
+ * Bringup step two: Wait for the target AP to reach cpu_init_secondary()
+ * and thus wait_for_master_cpu(), then set cpu_callout_mask to allow it
+ * to proceed.  The AP will then proceed past setting its 'callin' bit
+ * and end up waiting in check_tsc_sync_target() until we reach
+ * do_wait_cpu_online() to tend to it.
+ */
+static int do_wait_cpu_initialized(unsigned int cpu)
+{
+	/*
+	 * Wait for first sign of life from AP.
+	 */
+	if (do_wait_cpu_cpumask(cpu, cpu_initialized_mask))
+		return -1;
+
+	cpumask_set_cpu(cpu, cpu_callout_mask);
+	return 0;
+}
+
+/*
+ * Bringup step three: Wait for the target AP to reach smp_callin().
+ * The AP is not waiting for us here so we don't need to parallelise
+ * this step. Not entirely clear why we care about this, since we just
+ * proceed directly to TSC synchronization which is the next sync
+ * point with the AP anyway.
+ */
+static int do_wait_cpu_callin(unsigned int cpu)
+{
+	/*
+	 * Wait till AP completes initial initialization.
+	 */
+	return do_wait_cpu_cpumask(cpu, cpu_callin_mask);
+}
+
+/*
+ * Bringup step four: Synchronize the TSC and wait for the target AP
+ * to reach set_cpu_online() in start_secondary().
+ */
+static int do_wait_cpu_online(unsigned int cpu)
+{
+	unsigned long flags;
+
+	/*
+	 * Check TSC synchronization with the AP (keep irqs disabled
+	 * while doing so):
+	 */
+	local_irq_save(flags);
+	check_tsc_sync_source(cpu);
+	local_irq_restore(flags);
+
+	/*
+	 * Wait for the AP to mark itself online. Not entirely
+	 * clear why we care, since the generic cpuhp code will
+	 * wait for it to each CPUHP_AP_ONLINE_IDLE before going
+	 * ahead with the rest of the bringup anyway.
+	 */
+	while (!cpu_online(cpu)) {
+		cpu_relax();
+		touch_nmi_watchdog();
 	}
 
-	return boot_error;
+	return 0;
 }
 
-int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
+static int do_cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	int apicid = apic->cpu_present_to_apicid(cpu);
 	int cpu0_nmi_registered = 0;
-	unsigned long flags;
 	int err, ret = 0;
 
 	lockdep_assert_irqs_enabled();
@@ -1239,19 +1298,6 @@  int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 		goto unreg_nmi;
 	}
 
-	/*
-	 * Check TSC synchronization with the AP (keep irqs disabled
-	 * while doing so):
-	 */
-	local_irq_save(flags);
-	check_tsc_sync_source(cpu);
-	local_irq_restore(flags);
-
-	while (!cpu_online(cpu)) {
-		cpu_relax();
-		touch_nmi_watchdog();
-	}
-
 unreg_nmi:
 	/*
 	 * Clean up the nmi handler. Do this after the callin and callout sync
@@ -1263,6 +1309,34 @@  int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 	return ret;
 }
 
+int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
+{
+	int ret;
+
+	ret = do_cpu_up(cpu, tidle);
+	if (ret)
+		return ret;
+
+	ret = do_wait_cpu_initialized(cpu);
+	if (ret)
+		return ret;
+
+	ret = do_wait_cpu_callin(cpu);
+	if (ret)
+		return ret;
+
+	ret = do_wait_cpu_online(cpu);
+
+	if (x86_platform.legacy.warm_reset) {
+		/*
+		 * Cleanup possible dangling ends...
+		 */
+		smpboot_restore_warm_reset_vector();
+	}
+
+	return ret;
+}
+
 /**
  * arch_disable_smp_support() - disables SMP support for x86 at runtime
  */