[v12,06/11] x86/smpboot: Remove initial_stack on 64-bit

Message ID 20230226110802.103134-7-usama.arif@bytedance.com
State New
Series Parallel CPU bringup for x86_64

Commit Message

Usama Arif Feb. 26, 2023, 11:07 a.m. UTC
  From: Brian Gerst <brgerst@gmail.com>

Eliminating global variables from the CPU startup path in order to simplify
it and facilitate parallel startup.

Remove initial_stack, and load RSP from current_task->thread.sp instead.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Tested-by: Usama Arif <usama.arif@bytedance.com>
Signed-off-by: Usama Arif <usama.arif@bytedance.com>
---
 arch/x86/include/asm/processor.h |  6 ++++-
 arch/x86/include/asm/smp.h       |  5 +++-
 arch/x86/kernel/acpi/sleep.c     |  5 ++--
 arch/x86/kernel/asm-offsets.c    |  1 +
 arch/x86/kernel/head_64.S        | 43 +++++++++++++++++++++-----------
 arch/x86/kernel/smpboot.c        |  7 +++++-
 arch/x86/xen/xen-head.S          |  2 +-
 7 files changed, 48 insertions(+), 21 deletions(-)
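
Conceptually, the stack discovery this patch adds to secondary_startup_64
amounts to the following C-level sketch (illustrative only; the real code is
the head_64.S assembly shown below, which runs before the normal kernel
environment exists, and load_rsp() is a hypothetical stand-in for loading
the %rsp register):

	/* Sketch: how an AP finds its stack without initial_stack */
	unsigned int cpu  = smpboot_control;          /* CPU# set by do_boot_cpu() */
	unsigned long off = __per_cpu_offset[cpu];    /* this CPU's per-cpu base */

	/* pcpu_hot + X86_current_task, read through the per-cpu offset */
	struct task_struct *task =
		*(struct task_struct **)((unsigned long)&pcpu_hot.current_task + off);

	load_rsp(task->thread.sp);                    /* idle thread's stack pointer */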
  

Comments

Thomas Gleixner Feb. 28, 2023, 4:13 p.m. UTC | #1
On Sun, Feb 26 2023 at 11:07, Usama Arif wrote:
> From: Brian Gerst <brgerst@gmail.com>
>
> Eliminating global variables from the CPU startup path in order to simplify
> it and facilitate parallel startup.

As this patch is now part of the parallel boot series and actually
introduces smpboot_control, the above is neither accurate nor useful.

Folks, really.

> Remove initial_stack, and load RSP from current_task->thread.sp instead.

  
>  #ifdef CONFIG_SMP
> -	initial_stack = (unsigned long)temp_stack + sizeof(temp_stack);
> +	current->thread.sp = (unsigned long)temp_stack + sizeof(temp_stack);

This lacks a comment about the temporary (ab)use of current->thread.sp

>  	early_gdt_descr.address =
>  			(unsigned long)get_cpu_gdt_rw(smp_processor_id());
>  	initial_gs = per_cpu_offset(smp_processor_id());
> +	smpboot_control = smp_processor_id();
>  #endif

> @@ -241,6 +241,24 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
>  	UNWIND_HINT_EMPTY
>  	ANNOTATE_NOENDBR // above
>  
> +#ifdef CONFIG_SMP
> +	movl	smpboot_control(%rip), %ecx
> +
> +	/* Get the per cpu offset for the given CPU# which is in ECX */
> +	movq	__per_cpu_offset(,%rcx,8), %rdx
> +#else
> +	xorl	%edx, %edx
> +#endif /* CONFIG_SMP */

Sigh, we should finally make CONFIG_SMP def_bool y ...

> +	/*
> +	 * Setup a boot time stack - Any secondary CPU will have lost its stack
> +	 * by now because the cr3-switch above unmaps the real-mode stack.
> +	 *
> +	 * RDX contains the per-cpu offset
> +	 */
> +	movq	pcpu_hot + X86_current_task(%rdx), %rax
> +	movq	TASK_threadsp(%rax), %rsp
> +
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index b18c1385e181..62e3bf37f0b8 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -1112,7 +1112,12 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
>  	idle->thread.sp = (unsigned long)task_pt_regs(idle);
>  	early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
>  	initial_code = (unsigned long)start_secondary;
> -	initial_stack  = idle->thread.sp;
> +
> +	if (IS_ENABLED(CONFIG_X86_32)) {
> +		initial_stack  = idle->thread.sp;
> +	} else {
> +		smpboot_control = cpu;
> +	}

Please remove the pointless brackets.
  
Thanks,

        tglx
  
David Woodhouse Feb. 28, 2023, 4:25 p.m. UTC | #2
On Tue, 2023-02-28 at 17:13 +0100, Thomas Gleixner wrote:
> On Sun, Feb 26 2023 at 11:07, Usama Arif wrote:
> > From: Brian Gerst <brgerst@gmail.com>
> > 
> > Eliminating global variables from the CPU startup path in order to simplify
> > it and facilitate parallel startup.
> 
> As this patch is now part of the parallel boot series and actually
> introduces smpboot_control, the above is neither accurate nor useful.

I neglected to mention adding smpboot_control, but the above *is*
accurate. This patch, and the next two, are eliminating global
variables to simplify the startup path and make it possible to run it
in parallel.

Now it's slightly harder to phrase that without the Verbotene Worte
'this patch', I'll grant you. But it's trying to explain *why* we're
eliminating those global variables.

I'll try again.

> Folks, really.

Also, while those two lines *happen* to be my addition to Brian's
commit message, I don't know if you knew that. Speak to me how you
like; you know I'll still love you. But be nicer to Brian and Usama.

> > Remove initial_stack, and load RSP from current_task->thread.sp instead.
> 
>   
> >  #ifdef CONFIG_SMP
> > -       initial_stack = (unsigned long)temp_stack + sizeof(temp_stack);
> > +       current->thread.sp = (unsigned long)temp_stack + sizeof(temp_stack);
> 
> This lacks a comment about the temporary (ab)use of current->thread.sp

Ack.

> >         early_gdt_descr.address =
> >                         (unsigned long)get_cpu_gdt_rw(smp_processor_id());
> >         initial_gs = per_cpu_offset(smp_processor_id());
> > +       smpboot_control = smp_processor_id();
> >  #endif
> 
> > @@ -241,6 +241,24 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
> >         UNWIND_HINT_EMPTY
> >         ANNOTATE_NOENDBR // above
> >  
> > +#ifdef CONFIG_SMP
> > +       movl    smpboot_control(%rip), %ecx
> > +
> > +       /* Get the per cpu offset for the given CPU# which is in ECX */
> > +       movq    __per_cpu_offset(,%rcx,8), %rdx
> > +#else
> > +       xorl    %edx, %edx
> > +#endif /* CONFIG_SMP */
> 
> Sigh, we should finally make CONFIG_SMP def_bool y ...

Not today :)

> > +       /*
> > +        * Setup a boot time stack - Any secondary CPU will have lost its stack
> > +        * by now because the cr3-switch above unmaps the real-mode stack.
> > +        *
> > +        * RDX contains the per-cpu offset
> > +        */
> > +       movq    pcpu_hot + X86_current_task(%rdx), %rax
> > +       movq    TASK_threadsp(%rax), %rsp
> > +
> > diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> > index b18c1385e181..62e3bf37f0b8 100644
> > --- a/arch/x86/kernel/smpboot.c
> > +++ b/arch/x86/kernel/smpboot.c
> > @@ -1112,7 +1112,12 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
> >         idle->thread.sp = (unsigned long)task_pt_regs(idle);
> >         early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
> >         initial_code = (unsigned long)start_secondary;
> > -       initial_stack  = idle->thread.sp;
> > +
> > +       if (IS_ENABLED(CONFIG_X86_32)) {
> > +               initial_stack  = idle->thread.sp;
> > +       } else {
> > +               smpboot_control = cpu;
> > +       }
> 
> Please remove the pointless brackets.

I pondered that, but they only get added back again in the next patch.
It just seemed like adding pointless churn.
  
David Woodhouse Feb. 28, 2023, 5:09 p.m. UTC | #3
On Tue, 2023-02-28 at 17:13 +0100, Thomas Gleixner wrote:
> On Sun, Feb 26 2023 at 11:07, Usama Arif wrote:
> > From: Brian Gerst <brgerst@gmail.com>
> > 
> > Eliminating global variables from the CPU startup path in order to
> > simplify
> > it and facilitate parallel startup.
> 
> As this patch is now part of the parallel boot series and actually
> introduces smpboot_control, the above is neither accurate nor useful.

Better commit message, add a comment where we abuse current->thread.sp
in the sleep path. Didn't remove the {} which would be added back in
the very next patch. Pushed to my tree for Usama's next round.


From b5b9a2ab146cfd840f115b0455930767b6f6fcea Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Fri, 24 Feb 2023 10:42:31 -0500
Subject: [PATCH 06/11] x86/smpboot: Remove initial_stack on 64-bit

In order to facilitate parallel startup, start to eliminate global variables
from the CPU startup path.

However, we start by introducing one more: smpboot_control. For now this
merely holds the CPU# of the CPU which is coming up. That CPU can then
find its own per-cpu data, and everything else it needs can be found from
there, allowing the other global variables to be removed.

First to be removed is initial_stack. Each CPU can load %rsp from its
current_task->thread.sp instead. That is already set up with the correct
idle thread for APs. Set up the .sp field in INIT_THREAD on x86 so that
the BSP also finds a suitable stack pointer in the static per-cpu data
when coming up on first boot.

On resume from S3, the CPU needs a temporary stack because its idle task
is already active. Instead of setting initial_stack, the sleep code can
simply set its own current->thread.sp to point to the temporary stack.
The true stack pointer will get restored with the rest of the CPU
context in do_suspend_lowlevel().

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Tested-by: Usama Arif <usama.arif@bytedance.com>
Signed-off-by: Usama Arif <usama.arif@bytedance.com>
Tested-by: Guilherme G. Piccoli <gpiccoli@igalia.com>
---
 arch/x86/include/asm/processor.h |  6 ++++-
 arch/x86/include/asm/smp.h       |  5 +++-
 arch/x86/kernel/acpi/sleep.c     | 17 +++++++++++--
 arch/x86/kernel/asm-offsets.c    |  1 +
 arch/x86/kernel/head_64.S        | 43 +++++++++++++++++++++-----------
 arch/x86/kernel/smpboot.c        |  7 +++++-
 arch/x86/xen/xen-head.S          |  2 +-
 7 files changed, 60 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 4e35c66edeb7..bdde7316e75b 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -648,7 +648,11 @@ static inline void spin_lock_prefetch(const void *x)
 #define KSTK_ESP(task)		(task_pt_regs(task)->sp)
 
 #else
-#define INIT_THREAD { }
+extern unsigned long __end_init_task[];
+
+#define INIT_THREAD {							    \
+	.sp	= (unsigned long)&__end_init_task - sizeof(struct pt_regs), \
+}
 
 extern unsigned long KSTK_ESP(struct task_struct *task);
 
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index b4dbb20dab1a..bf2c51df9e0b 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -199,5 +199,8 @@ extern void nmi_selftest(void);
 #define nmi_selftest() do { } while (0)
 #endif
 
-#endif /* __ASSEMBLY__ */
+extern unsigned int smpboot_control;
+
+#endif /* !__ASSEMBLY__ */
+
 #endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 3b7f4cdbf2e0..23e44416dc04 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -111,13 +111,26 @@ int x86_acpi_suspend_lowlevel(void)
 	saved_magic = 0x12345678;
 #else /* CONFIG_64BIT */
 #ifdef CONFIG_SMP
-	initial_stack = (unsigned long)temp_stack + sizeof(temp_stack);
+	/*
+	 * As each CPU starts up, it will find its own stack pointer
+	 * from its current_task->thread.sp. Typically that will be
+	 * the idle thread for a newly-started AP, or even the boot
+	 * CPU which will find it set to &init_task in the static
+	 * per-cpu data.
+	 *
+	 * Make the resuming CPU use the temporary stack at startup
+	 * by setting current->thread.sp to point to that. The true
+	 * %rsp will be restored with the rest of the CPU context,
+	 * by do_suspend_lowlevel().
+	 */
+	current->thread.sp = (unsigned long)temp_stack + sizeof(temp_stack);
 	early_gdt_descr.address =
 			(unsigned long)get_cpu_gdt_rw(smp_processor_id());
 	initial_gs = per_cpu_offset(smp_processor_id());
+	smpboot_control = smp_processor_id();
 #endif
 	initial_code = (unsigned long)wakeup_long64;
-       saved_magic = 0x123456789abcdef0L;
+	saved_magic = 0x123456789abcdef0L;
 #endif /* CONFIG_64BIT */
 
 	/*
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 82c783da16a8..797ae1a15c91 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -108,6 +108,7 @@ static void __used common(void)
 	OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
 	OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
 	OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack);
+	OFFSET(X86_current_task, pcpu_hot, current_task);
 #ifdef CONFIG_CALL_DEPTH_TRACKING
 	OFFSET(X86_call_depth, pcpu_hot, call_depth);
 #endif
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 222efd4a09bc..5a2417d788d1 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -61,8 +61,8 @@ SYM_CODE_START_NOALIGN(startup_64)
 	 * tables and then reload them.
 	 */
 
-	/* Set up the stack for verify_cpu(), similar to initial_stack below */
-	leaq	(__end_init_task - FRAME_SIZE)(%rip), %rsp
+	/* Set up the stack for verify_cpu() */
+	leaq	(__end_init_task - PTREGS_SIZE)(%rip), %rsp
 
 	leaq	_text(%rip), %rdi
 
@@ -241,6 +241,24 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 	UNWIND_HINT_EMPTY
 	ANNOTATE_NOENDBR // above
 
+#ifdef CONFIG_SMP
+	movl	smpboot_control(%rip), %ecx
+
+	/* Get the per cpu offset for the given CPU# which is in ECX */
+	movq	__per_cpu_offset(,%rcx,8), %rdx
+#else
+	xorl	%edx, %edx
+#endif /* CONFIG_SMP */
+
+	/*
+	 * Setup a boot time stack - Any secondary CPU will have lost its stack
+	 * by now because the cr3-switch above unmaps the real-mode stack.
+	 *
+	 * RDX contains the per-cpu offset
+	 */
+	movq	pcpu_hot + X86_current_task(%rdx), %rax
+	movq	TASK_threadsp(%rax), %rsp
+
 	/*
 	 * We must switch to a new descriptor in kernel space for the GDT
 	 * because soon the kernel won't have access anymore to the userspace
@@ -275,12 +293,6 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 	movl	initial_gs+4(%rip),%edx
 	wrmsr
 
-	/*
-	 * Setup a boot time stack - Any secondary CPU will have lost its stack
-	 * by now because the cr3-switch above unmaps the real-mode stack
-	 */
-	movq initial_stack(%rip), %rsp
-
 	/* Setup and Load IDT */
 	pushq	%rsi
 	call	early_setup_idt
@@ -372,7 +384,11 @@ SYM_CODE_END(secondary_startup_64)
 SYM_CODE_START(start_cpu0)
 	ANNOTATE_NOENDBR
 	UNWIND_HINT_EMPTY
-	movq	initial_stack(%rip), %rsp
+
+	/* Find the idle task stack */
+	movq	PER_CPU_VAR(pcpu_hot) + X86_current_task, %rcx
+	movq	TASK_threadsp(%rcx), %rsp
+
 	jmp	.Ljump_to_C_code
 SYM_CODE_END(start_cpu0)
 #endif
@@ -420,12 +436,6 @@ SYM_DATA(initial_gs,	.quad INIT_PER_CPU_VAR(fixed_percpu_data))
 #ifdef CONFIG_AMD_MEM_ENCRYPT
 SYM_DATA(initial_vc_handler,	.quad handle_vc_boot_ghcb)
 #endif
-
-/*
- * The FRAME_SIZE gap is a convention which helps the in-kernel unwinder
- * reliably detect the end of the stack.
- */
-SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - FRAME_SIZE)
 	__FINITDATA
 
 	__INIT
@@ -660,6 +670,9 @@ SYM_DATA_END(level1_fixmap_pgt)
 SYM_DATA(early_gdt_descr,		.word GDT_ENTRIES*8-1)
 SYM_DATA_LOCAL(early_gdt_descr_base,	.quad INIT_PER_CPU_VAR(gdt_page))
 
+	.align 16
+SYM_DATA(smpboot_control,		.long 0)
+
 	.align 16
 /* This must match the first entry in level2_kernel_pgt */
 SYM_DATA(phys_base, .quad 0x0)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b18c1385e181..62e3bf37f0b8 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1112,7 +1112,12 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
 	idle->thread.sp = (unsigned long)task_pt_regs(idle);
 	early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
 	initial_code = (unsigned long)start_secondary;
-	initial_stack  = idle->thread.sp;
+
+	if (IS_ENABLED(CONFIG_X86_32)) {
+		initial_stack  = idle->thread.sp;
+	} else {
+		smpboot_control = cpu;
+	}
 
 	/* Enable the espfix hack for this CPU */
 	init_espfix_ap(cpu);
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index ffaa62167f6e..6bd391476656 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -49,7 +49,7 @@ SYM_CODE_START(startup_xen)
 	ANNOTATE_NOENDBR
 	cld
 
-	mov initial_stack(%rip), %rsp
+	leaq	(__end_init_task - PTREGS_SIZE)(%rip), %rsp
 
 	/* Set up %gs.
 	 *
  
Thomas Gleixner Feb. 28, 2023, 8:17 p.m. UTC | #4
On Tue, Feb 28 2023 at 17:09, David Woodhouse wrote:
> On Tue, 2023-02-28 at 17:13 +0100, Thomas Gleixner wrote:
>> As this patch is now part of the parallel boot series and actually
>> introduces smpboot_control, the above is neither accurate nor useful.
>
> Better commit message, add a comment where we abuse current->thread.sp
> in the sleep path. Didn't remove the {} which would be added back in
> the very next patch. Pushed to my tree for Usama's next round.

Ok.

> However, we start by introducing one more: smpboot_control. For now this

s/we// :)

> merely holds the CPU# of the CPU which is coming up. That CPU can then
> find its own per-cpu data, and everything else it needs can be found from
> there, allowing the other global variables to be removed.
>
> First to be removed is initial_stack. Each CPU can load %rsp from its
> current_task->thread.sp instead. That is already set up with the correct
> idle thread for APs. Set up the .sp field in INIT_THREAD on x86 so that
> the BSP also finds a suitable stack pointer in the static per-cpu data
> when coming up on first boot.
>
> On resume from S3, the CPU needs a temporary stack because its idle task
> is already active. Instead of setting initial_stack, the sleep code can
> simply set its own current->thread.sp to point to the temporary stack.
> The true stack pointer will get restored with the rest of the CPU
> context in do_suspend_lowlevel().

Thanks for writing this up!

> +	/*
> +	 * As each CPU starts up, it will find its own stack pointer
> +	 * from its current_task->thread.sp. Typically that will be
> +	 * the idle thread for a newly-started AP, or even the boot
> +	 * CPU which will find it set to &init_task in the static
> +	 * per-cpu data.
> +	 *
> +	 * Make the resuming CPU use the temporary stack at startup
> +	 * by setting current->thread.sp to point to that. The true
> +	 * %rsp will be restored with the rest of the CPU context,
> +	 * by do_suspend_lowlevel().

Right, but what restores current->thread.sp? thread.sp is used by
unwinders...
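
For reference, an unwinder looking at a task that is not on a CPU seeds its
state from thread.sp, roughly like this (an abridged sketch along the lines
of __unwind_start() in arch/x86/kernel/unwind_orc.c; exact details vary by
kernel version):

	struct inactive_task_frame *frame = (void *)task->thread.sp;

	state->sp = task->thread.sp + sizeof(*frame);
	state->bp = READ_ONCE_NOCHECK(frame->bp);
	state->ip = READ_ONCE_NOCHECK(frame->ret_addr);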

Thanks,

        tglx
  
Thomas Gleixner Feb. 28, 2023, 8:32 p.m. UTC | #5
On Tue, Feb 28 2023 at 16:25, David Woodhouse wrote:
> On Tue, 2023-02-28 at 17:13 +0100, Thomas Gleixner wrote:
>> Folks, really.
>
> Also, while those two lines *happen* to be my addition to Brian's
> commit message, I don't know if you knew that.

I knew because I read Brian's patches _and_ I know your quick changelog
style by heart.

> Speak to me how you like; you know I'll still love you. But be nicer
> to Brian and Usama.

Hmm. I was not aware that 'Folks, really.' qualifies as not nice
nowadays.

>> > +#endif /* CONFIG_SMP */
>> 
>> Sigh, we should finally make CONFIG_SMP def_bool y ...
>
> Not today :)

Right, but it's nevertheless overdue to adjust to reality :)

>> > +       if (IS_ENABLED(CONFIG_X86_32)) {
>> > +               initial_stack  = idle->thread.sp;
>> > +       } else {
>> > +               smpboot_control = cpu;
>> > +       }
>> 
>> Please remove the pointless brackets.
>
> I pondered that, but they only get added back again in the next patch.
> It just seemed like adding pointless churn.

Fair enough.

Thanks,

        tglx
  
David Woodhouse Feb. 28, 2023, 8:43 p.m. UTC | #6
On 28 February 2023 20:17:19 GMT, Thomas Gleixner <tglx@linutronix.de> wrote:
>On Tue, Feb 28 2023 at 17:09, David Woodhouse wrote:
>> On Tue, 2023-02-28 at 17:13 +0100, Thomas Gleixner wrote:
>>> As this patch is now part of the parallel boot series and actually
>>> introduces smpboot_control, the above is neither accurate nor useful.
>>
>> Better commit message, add a comment where we abuse current->thread.sp
>> in the sleep path. Didn't remove the {} which would be added back in
>> the very next patch. Pushed to my tree for Usama's next round.
>
>Ok.
>
>> However, we start by introducing one more: smpboot_control. For now this
>
>s/we// :)

Yeah, actually spotted that one as I hit send and it's different in the git tree already.


>> merely holds the CPU# of the CPU which is coming up. That CPU can then
>> find its own per-cpu data, and everything else it needs can be found from
>> there, allowing the other global variables to be removed.
>>
>> First to be removed is initial_stack. Each CPU can load %rsp from its
>> current_task->thread.sp instead. That is already set up with the correct
>> idle thread for APs. Set up the .sp field in INIT_THREAD on x86 so that
>> the BSP also finds a suitable stack pointer in the static per-cpu data
>> when coming up on first boot.
>>
>> On resume from S3, the CPU needs a temporary stack because its idle task
>> is already active. Instead of setting initial_stack, the sleep code can
>> simply set its own current->thread.sp to point to the temporary stack.
>> The true stack pointer will get restored with the rest of the CPU
>> context in do_suspend_lowlevel().
>
>Thanks for writing this up!
>
>> +	/*
>> +	 * As each CPU starts up, it will find its own stack pointer
>> +	 * from its current_task->thread.sp. Typically that will be
>> +	 * the idle thread for a newly-started AP, or even the boot
>> +	 * CPU which will find it set to &init_task in the static
>> +	 * per-cpu data.
>> +	 *
>> +	 * Make the resuming CPU use the temporary stack at startup
>> +	 * by setting current->thread.sp to point to that. The true
>> +	 * %rsp will be restored with the rest of the CPU context,
>> +	 * by do_suspend_lowlevel().
>
>Right, but what restores current->thread.sp? thread.sp is used by
>unwinders...

Unwinding a thread that is actually *on* the CPU? By the time it's taken off, won't ->thread.sp have been written out again? I figured it was just a dead variable while the actual %rsp was in use?
  
Brian Gerst Feb. 28, 2023, 10:22 p.m. UTC | #7
On Tue, Feb 28, 2023 at 11:13 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> On Sun, Feb 26 2023 at 11:07, Usama Arif wrote:
> > From: Brian Gerst <brgerst@gmail.com>
> >
> > Eliminating global variables from the CPU startup path in order to simplify
> > it and facilitate parallel startup.
>
> As this patch is now part of the parallel boot series and actually
> introduces smpboot_control, the above is neither accurate nor useful.
>
> Folks, really.
>
> > Remove initial_stack, and load RSP from current_task->thread.sp instead.
>
>
> >  #ifdef CONFIG_SMP
> > -     initial_stack = (unsigned long)temp_stack + sizeof(temp_stack);
> > +     current->thread.sp = (unsigned long)temp_stack + sizeof(temp_stack);
>
> This lacks a comment about the temporary (ab)use of current->thread.sp

task->thread.sp represents the saved stack pointer of a sleeping or
newly forked task (see __switch_to_asm()), and the boot CPU's idle
task is effectively going to sleep.  Using a temporary stack for
resume is kind of a hack to work around the fact that the idle task
stack is already in use, but improving that is out of scope for this
series.
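
For reference, the save/restore mentioned above looks roughly like this in
__switch_to_asm() (abridged from arch/x86/entry/entry_64.S; the full
function also does speculation-related RSB stuffing before jumping to
__switch_to()):

	/* Save callee-saved registers; order must match struct inactive_task_frame */
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	/* Switch stacks: park prev's %rsp in its thread.sp, load next's */
	movq	%rsp, TASK_threadsp(%rdi)
	movq	TASK_threadsp(%rsi), %rsp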

--
Brian Gerst
  
Thomas Gleixner March 1, 2023, 10:18 a.m. UTC | #8
On Tue, Feb 28 2023 at 20:43, David Woodhouse wrote:
> On 28 February 2023 20:17:19 GMT, Thomas Gleixner <tglx@linutronix.de> wrote:
>>> +	 * Make the resuming CPU use the temporary stack at startup
>>> +	 * by setting current->thread.sp to point to that. The true
>>> +	 * %rsp will be restored with the rest of the CPU context,
>>> +	 * by do_suspend_lowlevel().
>>
>>Right, but what restores current->thread.sp? thread.sp is used by
>>unwinders...
>
> Unwinding a thread that is actually *on* the CPU?

No.

> By the time it's taken off, won't ->thread.sp have been written out
> again? I figured it was just a dead variable while the actual %rsp was
> in use?

Yes. It's not used when the thread is on the CPU. And you are right,
it's saved and restored in switch_to(). Can you please add a comment to
that effect?

Thanks,

        tglx
  
David Woodhouse March 1, 2023, 10:42 a.m. UTC | #9
On Wed, 2023-03-01 at 11:18 +0100, Thomas Gleixner wrote:
> 
> > By the time it's taken off, won't ->thread.sp have been written out
> > again? I figured it was just a dead variable while the actual %rsp was
> > in use?
> 
> Yes. It's not used when the thread is on the CPU. And you are right,
> it's saved and restored in switch_to(). Can you please add a comment to
> that effect?

Pushed to my v12bis branch:

From ac86ad32434d4661db0bef6791b7953d369585e2 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Fri, 24 Feb 2023 10:42:31 -0500
Subject: [PATCH 06/11] x86/smpboot: Remove initial_stack on 64-bit

In order to facilitate parallel startup, start to eliminate some of the
global variables passing information to CPUs in the startup path.

However, start by introducing one more: smpboot_control. For now this
merely holds the CPU# of the CPU which is coming up. Each CPU can then
find its own per-cpu data, and everything else it needs can be found
from there, allowing the other global variables to be removed.

First to be removed is initial_stack. Each CPU can load %rsp from its
current_task->thread.sp instead. That is already set up with the correct
idle thread for APs. Set up the .sp field in INIT_THREAD on x86 so that
the BSP also finds a suitable stack pointer in the static per-cpu data
when coming up on first boot.

On resume from S3, the CPU needs a temporary stack because its idle task
is already active. Instead of setting initial_stack, the sleep code can
simply set its own current->thread.sp to point to the temporary stack.
Nobody else cares about ->thread.sp for a thread which is currently on
a CPU, because the true value is actually in the %rsp register, which
is restored with the rest of the CPU context in do_suspend_lowlevel().

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Tested-by: Usama Arif <usama.arif@bytedance.com>
Signed-off-by: Usama Arif <usama.arif@bytedance.com>
Tested-by: Guilherme G. Piccoli <gpiccoli@igalia.com>
---
 arch/x86/include/asm/processor.h |  6 ++++-
 arch/x86/include/asm/smp.h       |  5 +++-
 arch/x86/kernel/acpi/sleep.c     | 20 +++++++++++++--
 arch/x86/kernel/asm-offsets.c    |  1 +
 arch/x86/kernel/head_64.S        | 43 +++++++++++++++++++++-----------
 arch/x86/kernel/smpboot.c        |  7 +++++-
 arch/x86/xen/xen-head.S          |  2 +-
 7 files changed, 63 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 4e35c66edeb7..bdde7316e75b 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -648,7 +648,11 @@ static inline void spin_lock_prefetch(const void *x)
 #define KSTK_ESP(task)		(task_pt_regs(task)->sp)
 
 #else
-#define INIT_THREAD { }
+extern unsigned long __end_init_task[];
+
+#define INIT_THREAD {							    \
+	.sp	= (unsigned long)&__end_init_task - sizeof(struct pt_regs), \
+}
 
 extern unsigned long KSTK_ESP(struct task_struct *task);
 
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index b4dbb20dab1a..bf2c51df9e0b 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -199,5 +199,8 @@ extern void nmi_selftest(void);
 #define nmi_selftest() do { } while (0)
 #endif
 
-#endif /* __ASSEMBLY__ */
+extern unsigned int smpboot_control;
+
+#endif /* !__ASSEMBLY__ */
+
 #endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 3b7f4cdbf2e0..1b4c43d0819a 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -111,13 +111,29 @@ int x86_acpi_suspend_lowlevel(void)
 	saved_magic = 0x12345678;
 #else /* CONFIG_64BIT */
 #ifdef CONFIG_SMP
-	initial_stack = (unsigned long)temp_stack + sizeof(temp_stack);
+	/*
+	 * As each CPU starts up, it will find its own stack pointer
+	 * from its current_task->thread.sp. Typically that will be
+	 * the idle thread for a newly-started AP, or even the boot
+	 * CPU which will find it set to &init_task in the static
+	 * per-cpu data.
+	 *
+	 * Make the resuming CPU use the temporary stack at startup
+	 * by setting current->thread.sp to point to that. The true
+	 * %rsp will be restored with the rest of the CPU context,
+	 * by do_suspend_lowlevel(). And unwinders don't care about
+	 * the abuse of ->thread.sp because it's a dead variable
+	 * while the thread is running on the CPU anyway; the true
+	 * value is in the actual %rsp register.
+	 */
+	current->thread.sp = (unsigned long)temp_stack + sizeof(temp_stack);
 	early_gdt_descr.address =
 			(unsigned long)get_cpu_gdt_rw(smp_processor_id());
 	initial_gs = per_cpu_offset(smp_processor_id());
+	smpboot_control = smp_processor_id();
 #endif
 	initial_code = (unsigned long)wakeup_long64;
-       saved_magic = 0x123456789abcdef0L;
+	saved_magic = 0x123456789abcdef0L;
 #endif /* CONFIG_64BIT */
 
 	/*
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 82c783da16a8..797ae1a15c91 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -108,6 +108,7 @@ static void __used common(void)
 	OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
 	OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
 	OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack);
+	OFFSET(X86_current_task, pcpu_hot, current_task);
 #ifdef CONFIG_CALL_DEPTH_TRACKING
 	OFFSET(X86_call_depth, pcpu_hot, call_depth);
 #endif
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 222efd4a09bc..0b34f6a9221b 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -61,8 +61,8 @@ SYM_CODE_START_NOALIGN(startup_64)
 	 * tables and then reload them.
 	 */
 
-	/* Set up the stack for verify_cpu(), similar to initial_stack below */
-	leaq	(__end_init_task - FRAME_SIZE)(%rip), %rsp
+	/* Set up the stack for verify_cpu() */
+	leaq	(__end_init_task - PTREGS_SIZE)(%rip), %rsp
 
 	leaq	_text(%rip), %rdi
 
@@ -241,6 +241,24 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 	UNWIND_HINT_EMPTY
 	ANNOTATE_NOENDBR // above
 
+#ifdef CONFIG_SMP
+	movl	smpboot_control(%rip), %ecx
+
+	/* Get the per cpu offset for the given CPU# which is in ECX */
+	movq	__per_cpu_offset(,%rcx,8), %rdx
+#else
+	xorl	%edx, %edx /* zero-extends to clear all of RDX */
+#endif /* CONFIG_SMP */
+
+	/*
+	 * Setup a boot time stack - Any secondary CPU will have lost its stack
+	 * by now because the cr3-switch above unmaps the real-mode stack.
+	 *
+	 * RDX contains the per-cpu offset
+	 */
+	movq	pcpu_hot + X86_current_task(%rdx), %rax
+	movq	TASK_threadsp(%rax), %rsp
+
 	/*
 	 * We must switch to a new descriptor in kernel space for the GDT
 	 * because soon the kernel won't have access anymore to the userspace
@@ -275,12 +293,6 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 	movl	initial_gs+4(%rip),%edx
 	wrmsr
 
-	/*
-	 * Setup a boot time stack - Any secondary CPU will have lost its stack
-	 * by now because the cr3-switch above unmaps the real-mode stack
-	 */
-	movq initial_stack(%rip), %rsp
-
 	/* Setup and Load IDT */
 	pushq	%rsi
 	call	early_setup_idt
@@ -372,7 +384,11 @@ SYM_CODE_END(secondary_startup_64)
 SYM_CODE_START(start_cpu0)
 	ANNOTATE_NOENDBR
 	UNWIND_HINT_EMPTY
-	movq	initial_stack(%rip), %rsp
+
+	/* Find the idle task stack */
+	movq	PER_CPU_VAR(pcpu_hot) + X86_current_task, %rcx
+	movq	TASK_threadsp(%rcx), %rsp
+
 	jmp	.Ljump_to_C_code
 SYM_CODE_END(start_cpu0)
 #endif
@@ -420,12 +436,6 @@ SYM_DATA(initial_gs,	.quad INIT_PER_CPU_VAR(fixed_percpu_data))
 #ifdef CONFIG_AMD_MEM_ENCRYPT
 SYM_DATA(initial_vc_handler,	.quad handle_vc_boot_ghcb)
 #endif
-
-/*
- * The FRAME_SIZE gap is a convention which helps the in-kernel unwinder
- * reliably detect the end of the stack.
- */
-SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - FRAME_SIZE)
 	__FINITDATA
 
 	__INIT
@@ -660,6 +670,9 @@ SYM_DATA_END(level1_fixmap_pgt)
 SYM_DATA(early_gdt_descr,		.word GDT_ENTRIES*8-1)
 SYM_DATA_LOCAL(early_gdt_descr_base,	.quad INIT_PER_CPU_VAR(gdt_page))
 
+	.align 16
+SYM_DATA(smpboot_control,		.long 0)
+
 	.align 16
 /* This must match the first entry in level2_kernel_pgt */
 SYM_DATA(phys_base, .quad 0x0)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b18c1385e181..62e3bf37f0b8 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1112,7 +1112,12 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
 	idle->thread.sp = (unsigned long)task_pt_regs(idle);
 	early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
 	initial_code = (unsigned long)start_secondary;
-	initial_stack  = idle->thread.sp;
+
+	if (IS_ENABLED(CONFIG_X86_32)) {
+		initial_stack  = idle->thread.sp;
+	} else {
+		smpboot_control = cpu;
+	}
 
 	/* Enable the espfix hack for this CPU */
 	init_espfix_ap(cpu);
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index ffaa62167f6e..6bd391476656 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -49,7 +49,7 @@ SYM_CODE_START(startup_xen)
 	ANNOTATE_NOENDBR
 	cld
 
-	mov initial_stack(%rip), %rsp
+	leaq	(__end_init_task - PTREGS_SIZE)(%rip), %rsp
 
 	/* Set up %gs.
 	 *
  

Patch

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 4e35c66edeb7..bdde7316e75b 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -648,7 +648,11 @@  static inline void spin_lock_prefetch(const void *x)
 #define KSTK_ESP(task)		(task_pt_regs(task)->sp)
 
 #else
-#define INIT_THREAD { }
+extern unsigned long __end_init_task[];
+
+#define INIT_THREAD {							    \
+	.sp	= (unsigned long)&__end_init_task - sizeof(struct pt_regs), \
+}
 
 extern unsigned long KSTK_ESP(struct task_struct *task);
 
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index b4dbb20dab1a..bf2c51df9e0b 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -199,5 +199,8 @@  extern void nmi_selftest(void);
 #define nmi_selftest() do { } while (0)
 #endif
 
-#endif /* __ASSEMBLY__ */
+extern unsigned int smpboot_control;
+
+#endif /* !__ASSEMBLY__ */
+
 #endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 3b7f4cdbf2e0..ab6e29b32c04 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -111,13 +111,14 @@  int x86_acpi_suspend_lowlevel(void)
 	saved_magic = 0x12345678;
 #else /* CONFIG_64BIT */
 #ifdef CONFIG_SMP
-	initial_stack = (unsigned long)temp_stack + sizeof(temp_stack);
+	current->thread.sp = (unsigned long)temp_stack + sizeof(temp_stack);
 	early_gdt_descr.address =
 			(unsigned long)get_cpu_gdt_rw(smp_processor_id());
 	initial_gs = per_cpu_offset(smp_processor_id());
+	smpboot_control = smp_processor_id();
 #endif
 	initial_code = (unsigned long)wakeup_long64;
-       saved_magic = 0x123456789abcdef0L;
+	saved_magic = 0x123456789abcdef0L;
 #endif /* CONFIG_64BIT */
 
 	/*
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 82c783da16a8..797ae1a15c91 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -108,6 +108,7 @@  static void __used common(void)
 	OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
 	OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
 	OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack);
+	OFFSET(X86_current_task, pcpu_hot, current_task);
 #ifdef CONFIG_CALL_DEPTH_TRACKING
 	OFFSET(X86_call_depth, pcpu_hot, call_depth);
 #endif
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 222efd4a09bc..5a2417d788d1 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -61,8 +61,8 @@  SYM_CODE_START_NOALIGN(startup_64)
 	 * tables and then reload them.
 	 */
 
-	/* Set up the stack for verify_cpu(), similar to initial_stack below */
-	leaq	(__end_init_task - FRAME_SIZE)(%rip), %rsp
+	/* Set up the stack for verify_cpu() */
+	leaq	(__end_init_task - PTREGS_SIZE)(%rip), %rsp
 
 	leaq	_text(%rip), %rdi
 
@@ -241,6 +241,24 @@  SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 	UNWIND_HINT_EMPTY
 	ANNOTATE_NOENDBR // above
 
+#ifdef CONFIG_SMP
+	movl	smpboot_control(%rip), %ecx
+
+	/* Get the per cpu offset for the given CPU# which is in ECX */
+	movq	__per_cpu_offset(,%rcx,8), %rdx
+#else
+	xorl	%edx, %edx
+#endif /* CONFIG_SMP */
+
+	/*
+	 * Setup a boot time stack - Any secondary CPU will have lost its stack
+	 * by now because the cr3-switch above unmaps the real-mode stack.
+	 *
+	 * RDX contains the per-cpu offset
+	 */
+	movq	pcpu_hot + X86_current_task(%rdx), %rax
+	movq	TASK_threadsp(%rax), %rsp
+
 	/*
 	 * We must switch to a new descriptor in kernel space for the GDT
 	 * because soon the kernel won't have access anymore to the userspace
@@ -275,12 +293,6 @@  SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 	movl	initial_gs+4(%rip),%edx
 	wrmsr
 
-	/*
-	 * Setup a boot time stack - Any secondary CPU will have lost its stack
-	 * by now because the cr3-switch above unmaps the real-mode stack
-	 */
-	movq initial_stack(%rip), %rsp
-
 	/* Setup and Load IDT */
 	pushq	%rsi
 	call	early_setup_idt
@@ -372,7 +384,11 @@  SYM_CODE_END(secondary_startup_64)
 SYM_CODE_START(start_cpu0)
 	ANNOTATE_NOENDBR
 	UNWIND_HINT_EMPTY
-	movq	initial_stack(%rip), %rsp
+
+	/* Find the idle task stack */
+	movq	PER_CPU_VAR(pcpu_hot) + X86_current_task, %rcx
+	movq	TASK_threadsp(%rcx), %rsp
+
 	jmp	.Ljump_to_C_code
 SYM_CODE_END(start_cpu0)
 #endif
@@ -420,12 +436,6 @@  SYM_DATA(initial_gs,	.quad INIT_PER_CPU_VAR(fixed_percpu_data))
 #ifdef CONFIG_AMD_MEM_ENCRYPT
 SYM_DATA(initial_vc_handler,	.quad handle_vc_boot_ghcb)
 #endif
-
-/*
- * The FRAME_SIZE gap is a convention which helps the in-kernel unwinder
- * reliably detect the end of the stack.
- */
-SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - FRAME_SIZE)
 	__FINITDATA
 
 	__INIT
@@ -660,6 +670,9 @@  SYM_DATA_END(level1_fixmap_pgt)
 SYM_DATA(early_gdt_descr,		.word GDT_ENTRIES*8-1)
 SYM_DATA_LOCAL(early_gdt_descr_base,	.quad INIT_PER_CPU_VAR(gdt_page))
 
+	.align 16
+SYM_DATA(smpboot_control,		.long 0)
+
 	.align 16
 /* This must match the first entry in level2_kernel_pgt */
 SYM_DATA(phys_base, .quad 0x0)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b18c1385e181..62e3bf37f0b8 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1112,7 +1112,12 @@  static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
 	idle->thread.sp = (unsigned long)task_pt_regs(idle);
 	early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
 	initial_code = (unsigned long)start_secondary;
-	initial_stack  = idle->thread.sp;
+
+	if (IS_ENABLED(CONFIG_X86_32)) {
+		initial_stack  = idle->thread.sp;
+	} else {
+		smpboot_control = cpu;
+	}
 
 	/* Enable the espfix hack for this CPU */
 	init_espfix_ap(cpu);
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index ffaa62167f6e..6bd391476656 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -49,7 +49,7 @@  SYM_CODE_START(startup_xen)
 	ANNOTATE_NOENDBR
 	cld
 
-	mov initial_stack(%rip), %rsp
+	leaq	(__end_init_task - PTREGS_SIZE)(%rip), %rsp
 
 	/* Set up %gs.
 	 *