[v12,07/11] x86/smpboot: Remove early_gdt_descr on 64-bit

Message ID 20230226110802.103134-8-usama.arif@bytedance.com
State New
Headers
Series Parallel CPU bringup for x86_64 |

Commit Message

Usama Arif Feb. 26, 2023, 11:07 a.m. UTC
  From: Brian Gerst <brgerst@gmail.com>

Build the GDT descriptor on the stack instead.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Tested-by: Usama Arif <usama.arif@bytedance.com>
Signed-off-by: Usama Arif <usama.arif@bytedance.com>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/kernel/acpi/sleep.c |  2 --
 arch/x86/kernel/head_64.S    | 11 ++++++-----
 arch/x86/kernel/smpboot.c    |  2 +-
 3 files changed, 7 insertions(+), 8 deletions(-)
  

Comments

Thomas Gleixner Feb. 28, 2023, 9:01 p.m. UTC | #1
On Sun, Feb 26 2023 at 11:07, Usama Arif wrote:
> @@ -265,7 +265,12 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
>  	 * addresses where we're currently running on. We have to do that here
>  	 * because in 32bit we couldn't load a 64bit linear address.
>  	 */
> -	lgdt	early_gdt_descr(%rip)
> +	subq	$16, %rsp
> +	movw	$(GDT_SIZE-1), (%rsp)
> +	leaq	gdt_page(%rdx), %rax

Even on !SMP gdt_page is in the 0...__per_cpu_end range. Which means
that on !SMP this results in:

      leaq    0xb000(%rdx),%rax

and RDX is 0. That's not really a valid GDT pointer, right?

> +	movq	%rax, 2(%rsp)
> +	lgdt	(%rsp)

and obviously that's equally broken for the task stack part:

>	movq	pcpu_hot + X86_current_task(%rdx), %rax

This needs:

--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -239,7 +239,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_
 	/* Get the per cpu offset for the given CPU# which is in ECX */
 	movq	__per_cpu_offset(,%rcx,8), %rdx
 #else
-	xorl	%edx, %edx
+	leaq	INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx
 #endif /* CONFIG_SMP */
 
 	/*

in the initial_stack patch, which then allows to remove this hunk in the
initial_gs patch:

@@ -286,9 +286,6 @@ SYM_INNER_LABEL(secondary_startup_64_no_
 	 * the per cpu areas are set up.
 	 */
 	movl	$MSR_GS_BASE,%ecx
-#ifndef CONFIG_SMP
-	leaq	INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx
-#endif
 	movl	%edx, %eax
 	shrq	$32, %rdx
 	wrmsr

Maybe we should enforce CONFIG_SMP=y first :)

Thanks,

        tglx
  
Arjan van de Ven Feb. 28, 2023, 9:02 p.m. UTC | #2
> 
> Maybe we should enforce CONFIG_SMP=y first :)
> 
> Thanks,

for 64 bit I can see the point of removing the !SMP case entirely from arch/x86 .
maybe even for 32 bit if it just makes the code simpler I suppose
  
David Woodhouse Feb. 28, 2023, 9:57 p.m. UTC | #3
On Tue, 2023-02-28 at 22:01 +0100, Thomas Gleixner wrote:
> 
> This needs:
> 
> --- a/arch/x86/kernel/head_64.S
> +++ b/arch/x86/kernel/head_64.S
> @@ -239,7 +239,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_
>         /* Get the per cpu offset for the given CPU# which is in ECX */
>         movq    __per_cpu_offset(,%rcx,8), %rdx
>  #else
> -       xorl    %edx, %edx
> +       leaq    INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx
>  #endif /* CONFIG_SMP */
>  
>         /*
> 

So yeah, I'd frowned at the zero %edx there for a while (but evidently
not long enough to realise that even if it's right it should be %rdx).

But then figured it was an *offset*, that __per_cpu_offset[0] would be
zero too at least on first boot and probably forever, did a quick grep
for pcpu_hot and figured my brain hurt and Brian probably knew what he
was doing.

Empirically, yours doesn't boot and Brian's does. If I just fix it to
zero all of %rdx and then run (just up to the initial_gs patch) with
qemu -d in_asm ...

 grep gdt_page System.map 
ffffffff05d7a000 A init_per_cpu__gdt_page
ffffffff825cfeb0 r __ksymtab_gdt_page
ffffffff82811000 D gdt_page


0x060000a9:  48 c7 c0 b2 00 00 a2     movq     $-0x5dffff4e, %rax
0x060000b0:  ff e0                    jmpq     *%rax

----------------
IN: 
0xffffffffa20000b2:  48 31 d2                 xorq     %rdx, %rdx
0xffffffffa20000b5:  48 8b 82 c0 74 d5 a3     movq     -0x5c2a8b40(%rdx), %rax
0xffffffffa20000bc:  48 8b a0 58 14 00 00     movq     0x1458(%rax), %rsp
0xffffffffa20000c3:  48 83 ec 10              subq     $0x10, %rsp
0xffffffffa20000c7:  66 c7 04 24 7f 00        movw     $0x7f, (%rsp)
0xffffffffa20000cd:  48 8d 82 00 10 81 a3     leaq     -0x5c7ef000(%rdx), %rax
0xffffffffa20000d4:  48 89 44 24 02           movq     %rax, 2(%rsp)
0xffffffffa20000d9:  0f 01 14 24              lgdtq    (%rsp)
0xffffffffa20000dd:  48 83 c4 10              addq     $0x10, %rsp
0xffffffffa20000e1:  31 c0                    xorl     %eax, %eax
0xffffffffa20000e3:  8e d8                    movl     %eax, %ds

I cannot work out where the value -0x5c7ef000 comes from, but it
doesn't seem to be the 0xb000 you claimed, and my brain is hurting
again...
  
Brian Gerst Feb. 28, 2023, 10:09 p.m. UTC | #4
On Tue, Feb 28, 2023 at 4:01 PM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> On Sun, Feb 26 2023 at 11:07, Usama Arif wrote:
> > @@ -265,7 +265,12 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
> >        * addresses where we're currently running on. We have to do that here
> >        * because in 32bit we couldn't load a 64bit linear address.
> >        */
> > -     lgdt    early_gdt_descr(%rip)
> > +     subq    $16, %rsp
> > +     movw    $(GDT_SIZE-1), (%rsp)
> > +     leaq    gdt_page(%rdx), %rax
>
> Even on !SMP gdt_page is in the 0...__per_cpu_end range. Which means
> that on !SMP this results in:
>
>       leaq    0xb000(%rdx),%rax
>
> and RDX is 0. That's not really a valid GDT pointer, right?

No.  On !SMP per-cpu variables are normal variables in the .data
section.  They are not gathered together in the per-cpu section and
are not accessed with the GS prefix.

ffffffff810000c9:       48 8d 82 00 10 81 82    lea    0x82811000(%rdx),%rax
                        ffffffff810000cc: R_X86_64_32S  gdt_page

ffffffff82811000 D gdt_page

So RDX=0 is correct.

> > +     movq    %rax, 2(%rsp)
> > +     lgdt    (%rsp)
>
> and obviously that's equally broken for the task stack part:
>
> >       movq    pcpu_hot + X86_current_task(%rdx), %rax

Same as gdt_page:

ffffffff810000b1:       48 8b 82 00 88 a8 82    mov    0x82a88800(%rdx),%rax
                        ffffffff810000b4: R_X86_64_32S  pcpu_hot

ffffffff82a88800 D pcpu_hot

> This needs:
>
> --- a/arch/x86/kernel/head_64.S
> +++ b/arch/x86/kernel/head_64.S
> @@ -239,7 +239,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_
>         /* Get the per cpu offset for the given CPU# which is in ECX */
>         movq    __per_cpu_offset(,%rcx,8), %rdx
>  #else
> -       xorl    %edx, %edx
> +       leaq    INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx
>  #endif /* CONFIG_SMP */
>
>         /*
>
> in the initial_stack patch, which then allows to remove this hunk in the
> initial_gs patch:
>
> @@ -286,9 +286,6 @@ SYM_INNER_LABEL(secondary_startup_64_no_
>          * the per cpu areas are set up.
>          */
>         movl    $MSR_GS_BASE,%ecx
> -#ifndef CONFIG_SMP
> -       leaq    INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx
> -#endif

On !SMP the only thing GSBASE is used for is the stack protector
canary, which is in fixed_percpu_data.  There is no per-cpu section.

FWIW, I posted a patch set a while back that switched x86-64 to use
the options added to newer compilers controlling where the canary is
located, allowing it to become a standard per-cpu variable and
removing the need to force the per-cpu section to be zero-based.
However it was not accepted at that time, due to removing support for
stack protector on older compilers (GCC < 8.1).

>         movl    %edx, %eax
>         shrq    $32, %rdx
>         wrmsr
>
> Maybe we should enforce CONFIG_SMP=y first :)

Makes sense, only the earliest generations of x86-64 processors have a
single core/thread, and an SMP kernel can still run on them.

--
Brian Gerst
  
David Woodhouse Feb. 28, 2023, 10:41 p.m. UTC | #5
On Tue, 2023-02-28 at 21:57 +0000, David Woodhouse wrote:
> 
> ----------------
> IN: 
> 0xffffffffa20000b2:  48 31 d2                 xorq     %rdx, %rdx
> 0xffffffffa20000b5:  48 8b 82 c0 74 d5 a3     movq     -0x5c2a8b40(%rdx), %rax
> 0xffffffffa20000bc:  48 8b a0 58 14 00 00     movq     0x1458(%rax), %rsp
> 0xffffffffa20000c3:  48 83 ec 10              subq     $0x10, %rsp
> 0xffffffffa20000c7:  66 c7 04 24 7f 00        movw     $0x7f, (%rsp)
> 0xffffffffa20000cd:  48 8d 82 00 10 81 a3     leaq     -0x5c7ef000(%rdx), %rax
> 0xffffffffa20000d4:  48 89 44 24 02           movq     %rax, 2(%rsp)
> 0xffffffffa20000d9:  0f 01 14 24              lgdtq    (%rsp)
> 0xffffffffa20000dd:  48 83 c4 10              addq     $0x10, %rsp
> 0xffffffffa20000e1:  31 c0                    xorl     %eax, %eax
> 0xffffffffa20000e3:  8e d8                    movl     %eax, %ds
> 
> I cannot work out where the value -0x5c7ef000 comes from, but it
> doesn't seem to be the 0xb000 you claimed, and my brain is hurting
> again...

Turning off CONFIG_RANDOMIZE_BASE (or just looking at the vmlinux
disassembly instead as Brian did) helps to resolve that FWIW.

I've changed it to zero all of %rdx and pushed it back to the v12bis
branch.
  
Brian Gerst Feb. 28, 2023, 10:48 p.m. UTC | #6
On Tue, Feb 28, 2023 at 5:41 PM David Woodhouse <dwmw2@infradead.org> wrote:
>
> On Tue, 2023-02-28 at 21:57 +0000, David Woodhouse wrote:
> >
> > ----------------
> > IN:
> > 0xffffffffa20000b2:  48 31 d2                 xorq     %rdx, %rdx
> > 0xffffffffa20000b5:  48 8b 82 c0 74 d5 a3     movq     -0x5c2a8b40(%rdx), %rax
> > 0xffffffffa20000bc:  48 8b a0 58 14 00 00     movq     0x1458(%rax), %rsp
> > 0xffffffffa20000c3:  48 83 ec 10              subq     $0x10, %rsp
> > 0xffffffffa20000c7:  66 c7 04 24 7f 00        movw     $0x7f, (%rsp)
> > 0xffffffffa20000cd:  48 8d 82 00 10 81 a3     leaq     -0x5c7ef000(%rdx), %rax
> > 0xffffffffa20000d4:  48 89 44 24 02           movq     %rax, 2(%rsp)
> > 0xffffffffa20000d9:  0f 01 14 24              lgdtq    (%rsp)
> > 0xffffffffa20000dd:  48 83 c4 10              addq     $0x10, %rsp
> > 0xffffffffa20000e1:  31 c0                    xorl     %eax, %eax
> > 0xffffffffa20000e3:  8e d8                    movl     %eax, %ds
> >
> > I cannot work out where the value -0x5c7ef000 comes from, but it
> > doesn't seem to be the 0xb000 you claimed, and my brain is hurting
> > again...
>
> Turning off CONFIG_RANDOMIZE_BASE (or just looking at the vmlinux
> disassembly instead as Brian did) helps to resolve that FWIW.
>
> I've changed it to zero all of %rdx and pushed it back to the v12bis
> branch.

xorl %edx, %edx is preferred, as a 32-bit operation zero-extends to
the full 64-bit register.  Using xorq to clear any of the lower 8
registers adds an unnecessary REX prefix.  Just one of many quirks of
the x86 instruction set...

--
Brian Gerst
  
David Woodhouse Feb. 28, 2023, 11:42 p.m. UTC | #7
On 28 February 2023 22:48:42 GMT, Brian Gerst <brgerst@gmail.com> wrote:
>On Tue, Feb 28, 2023 at 5:41 PM David Woodhouse <dwmw2@infradead.org> wrote:
>>
>> On Tue, 2023-02-28 at 21:57 +0000, David Woodhouse wrote:
>> >
>> > ----------------
>> > IN:
>> > 0xffffffffa20000b2:  48 31 d2                 xorq     %rdx, %rdx
>> > 0xffffffffa20000b5:  48 8b 82 c0 74 d5 a3     movq     -0x5c2a8b40(%rdx), %rax
>> > 0xffffffffa20000bc:  48 8b a0 58 14 00 00     movq     0x1458(%rax), %rsp
>> > 0xffffffffa20000c3:  48 83 ec 10              subq     $0x10, %rsp
>> > 0xffffffffa20000c7:  66 c7 04 24 7f 00        movw     $0x7f, (%rsp)
>> > 0xffffffffa20000cd:  48 8d 82 00 10 81 a3     leaq     -0x5c7ef000(%rdx), %rax
>> > 0xffffffffa20000d4:  48 89 44 24 02           movq     %rax, 2(%rsp)
>> > 0xffffffffa20000d9:  0f 01 14 24              lgdtq    (%rsp)
>> > 0xffffffffa20000dd:  48 83 c4 10              addq     $0x10, %rsp
>> > 0xffffffffa20000e1:  31 c0                    xorl     %eax, %eax
>> > 0xffffffffa20000e3:  8e d8                    movl     %eax, %ds
>> >
>> > I cannot work out where the value -0x5c7ef000 comes from, but it
>> > doesn't seem to be the 0xb000 you claimed, and my brain is hurting
>> > again...
>>
>> Turning off CONFIG_RANDOMIZE_BASE (or just looking at the vmlinux
>> disassembly instead as Brian did) helps to resolve that FWIW.
>>
>> I've changed it to zero all of %rdx and pushed it back to the v12bis
>> branch.
>
>xorl %edx, %edx is preferred, as a 32-bit operation zero-extends to
>the full 64-bit register.  Using xorq to clear any of the lower 8
>registers adds an unnecessary REX prefix.  Just one of many quirks of
>the x86 instruction set...

Ewww. Couldn't the assembler choose to omit the REX prefix then? It does more tricksy things than that already.

I almost prefer having the prefix but (in the morning) if you prefer I can put it back as it was with a comment about the zero-extension.
  
H. Peter Anvin March 1, 2023, 12:02 a.m. UTC | #8
On February 28, 2023 3:42:55 PM PST, David Woodhouse <dwmw2@infradead.org> wrote:
>
>
>On 28 February 2023 22:48:42 GMT, Brian Gerst <brgerst@gmail.com> wrote:
>>On Tue, Feb 28, 2023 at 5:41 PM David Woodhouse <dwmw2@infradead.org> wrote:
>>>
>>> On Tue, 2023-02-28 at 21:57 +0000, David Woodhouse wrote:
>>> >
>>> > ----------------
>>> > IN:
>>> > 0xffffffffa20000b2:  48 31 d2                 xorq     %rdx, %rdx
>>> > 0xffffffffa20000b5:  48 8b 82 c0 74 d5 a3     movq     -0x5c2a8b40(%rdx), %rax
>>> > 0xffffffffa20000bc:  48 8b a0 58 14 00 00     movq     0x1458(%rax), %rsp
>>> > 0xffffffffa20000c3:  48 83 ec 10              subq     $0x10, %rsp
>>> > 0xffffffffa20000c7:  66 c7 04 24 7f 00        movw     $0x7f, (%rsp)
>>> > 0xffffffffa20000cd:  48 8d 82 00 10 81 a3     leaq     -0x5c7ef000(%rdx), %rax
>>> > 0xffffffffa20000d4:  48 89 44 24 02           movq     %rax, 2(%rsp)
>>> > 0xffffffffa20000d9:  0f 01 14 24              lgdtq    (%rsp)
>>> > 0xffffffffa20000dd:  48 83 c4 10              addq     $0x10, %rsp
>>> > 0xffffffffa20000e1:  31 c0                    xorl     %eax, %eax
>>> > 0xffffffffa20000e3:  8e d8                    movl     %eax, %ds
>>> >
>>> > I cannot work out where the value -0x5c7ef000 comes from, but it
>>> > doesn't seem to be the 0xb000 you claimed, and my brain is hurting
>>> > again...
>>>
>>> Turning off CONFIG_RANDOMIZE_BASE (or just looking at the vmlinux
>>> disassembly instead as Brian did) helps to resolve that FWIW.
>>>
>>> I've changed it to zero all of %rdx and pushed it back to the v12bis
>>> branch.
>>
>>xorl %edx, %edx is preferred, as a 32-bit operation zero-extends to
>>the full 64-bit register.  Using xorq to clear any of the lower 8
>>registers adds an unnecessary REX prefix.  Just one of many quirks of
>>the x86 instruction set...
>
>Ewww. Couldn't the assembler choose to omit the REX prefix then? It does more tricksy things than that already.
>
>I almost prefer having the prefix but (in the morning) if you prefer I can put it back as it was with a comment about the zero-extension.
>

Like it or not, that's how the assembler currently works.
  
Brian Gerst March 1, 2023, 12:03 a.m. UTC | #9
On Tue, Feb 28, 2023 at 6:43 PM David Woodhouse <dwmw2@infradead.org> wrote:
>
>
>
> On 28 February 2023 22:48:42 GMT, Brian Gerst <brgerst@gmail.com> wrote:
> >On Tue, Feb 28, 2023 at 5:41 PM David Woodhouse <dwmw2@infradead.org> wrote:
> >>
> >> On Tue, 2023-02-28 at 21:57 +0000, David Woodhouse wrote:
> >> >
> >> > ----------------
> >> > IN:
> >> > 0xffffffffa20000b2:  48 31 d2                 xorq     %rdx, %rdx
> >> > 0xffffffffa20000b5:  48 8b 82 c0 74 d5 a3     movq     -0x5c2a8b40(%rdx), %rax
> >> > 0xffffffffa20000bc:  48 8b a0 58 14 00 00     movq     0x1458(%rax), %rsp
> >> > 0xffffffffa20000c3:  48 83 ec 10              subq     $0x10, %rsp
> >> > 0xffffffffa20000c7:  66 c7 04 24 7f 00        movw     $0x7f, (%rsp)
> >> > 0xffffffffa20000cd:  48 8d 82 00 10 81 a3     leaq     -0x5c7ef000(%rdx), %rax
> >> > 0xffffffffa20000d4:  48 89 44 24 02           movq     %rax, 2(%rsp)
> >> > 0xffffffffa20000d9:  0f 01 14 24              lgdtq    (%rsp)
> >> > 0xffffffffa20000dd:  48 83 c4 10              addq     $0x10, %rsp
> >> > 0xffffffffa20000e1:  31 c0                    xorl     %eax, %eax
> >> > 0xffffffffa20000e3:  8e d8                    movl     %eax, %ds
> >> >
> >> > I cannot work out where the value -0x5c7ef000 comes from, but it
> >> > doesn't seem to be the 0xb000 you claimed, and my brain is hurting
> >> > again...
> >>
> >> Turning off CONFIG_RANDOMIZE_BASE (or just looking at the vmlinux
> >> disassembly instead as Brian did) helps to resolve that FWIW.
> >>
> >> I've changed it to zero all of %rdx and pushed it back to the v12bis
> >> branch.
> >
> >xorl %edx, %edx is preferred, as a 32-bit operation zero-extends to
> >the full 64-bit register.  Using xorq to clear any of the lower 8
> >registers adds an unnecessary REX prefix.  Just one of many quirks of
> >the x86 instruction set...
>
> Ewww. Couldn't the assembler choose to omit the REX prefix then? It does more tricksy things than that already.
>
> I almost prefer having the prefix but (in the morning) if you prefer I can put it back as it was with a comment about the zero-extension.

commit a7bea8308933aaeea76dad7d42a6e51000417626
Author: Jan Beulich <JBeulich@suse.com>
Date:   Mon Jul 2 04:31:54 2018 -0600

    x86/asm/64: Use 32-bit XOR to zero registers

    Some Intel CPUs don't recognize 64-bit XORs as zeroing idioms. Zeroing
    idioms don't require execution bandwidth, as they're being taken care
    of in the frontend (through register renaming). Use 32-bit XORs instead.

Not that speed is important here, but it's good to be consistent
across the whole kernel.  Someone will eventually come by and fix it
up anyways, as there have been a few of these patches already in the
git history.

--
Brian Gerst
  
David Woodhouse March 1, 2023, 7:39 a.m. UTC | #10
On Tue, 2023-02-28 at 16:02 -0800, H. Peter Anvin wrote:
> 
> > Ewww. Couldn't the assembler choose to omit the REX prefix then? It
> > does more tricksy things than that already.
> > 
> > I almost prefer having the prefix but (in the morning) if you
> > prefer I can put it back as it was with a comment about the zero-
> > extension.
> > 
> 
> Like it or not, that's how the assembler currently works.

Last time someone told me "you can't tell the assembler what you
actually mean; you have to tell it something different" I threw my toys
out of the pram a little bit and implemented all of the 16-bit '-m16'
support in LLVM/clang.

On Tue, 2023-02-28 at 19:03 -0500, Brian Gerst wrote:
> 
> commit a7bea8308933aaeea76dad7d42a6e51000417626
> Author: Jan Beulich <JBeulich@suse.com>
> Date:   Mon Jul 2 04:31:54 2018 -0600
> 
>     x86/asm/64: Use 32-bit XOR to zero registers
> 
>     Some Intel CPUs don't recognize 64-bit XORs as zeroing idioms. Zeroing
>     idioms don't require execution bandwidth, as they're being taken care
>     of in the frontend (through register renaming). Use 32-bit XORs instead.
> 
> Not that speed is important here, but it's good to be consistent
> across the whole kernel.  Someone will eventually come by and fix it
> up anyways, as there have been a few of these patches already in the
> git history.

So there literally is a better encoding of this mnemonic which isn't
just a byte shorter, but is also faster to execute. And one *hopes*
that the compiler knows about it... yep... so why in $DEITY's name
can't I type what I mean into an asm file? Isn't asm intricate enough
already?

Meh, life's too short. Not shaving that particular yak today. I've put
the git tree back to using %edx, with a comment that it zero-extends.
  
Thomas Gleixner March 1, 2023, 10:27 a.m. UTC | #11
On Tue, Feb 28 2023 at 21:57, David Woodhouse wrote:
> On Tue, 2023-02-28 at 22:01 +0100, Thomas Gleixner wrote:
> Empirically, yours doesn't boot and Brian's does.

Correct. It was just my sleep deprived brain which got lost in assembly
and #ifdef maze.

On UP per_cpu variables just become regular variables and gdt_page(%rdx)
with %rdx = 0 will just resolve that address.

Sorry for the noise.

Thanks,

        tglx
  
Josh Triplett March 1, 2023, 8:32 p.m. UTC | #12
On Tue, Feb 28, 2023 at 01:02:33PM -0800, Arjan van de Ven wrote:
  Thomas Gleixner wrote:
> > 
> > Maybe we should enforce CONFIG_SMP=y first :)
> > 
> > Thanks,
> 
> for 64 bit I can see the point of removing the !SMP case entirely from arch/x86 .
> maybe even for 32 bit if it just makes the code simpler I suppose

As one of the folks keeping an eye on tinyconfig and kernel size, I
actually think we *should* make this change and rip out !CONFIG_SMP,
albeit carefully.

In particular, I would propose that we rip out !CONFIG_SMP, *but* we
allow building with CONFIG_NR_CPUS=1. (And we could make sure in that
case that the compiler can recognize that at compile time and optimize
accordingly, so that it might provide some of the UP optimizations for
us.)

Then, any *optimizations* for the "will only have one CPU, ever" case
can move to CONFIG_NR_CPUS=1 rather than !CONFIG_SMP. I think many of
those optimizations may be worth keeping for small embedded systems, or
for cases like Linux-as-bootloader or similar.

The difference here would be that code written for !CONFIG_SMP today
needs to account for the UP case for *correctness*, whereas code written
for CONFIG_SMP can *optionally* consider CONFIG_NR_CPUS=1 for
*performance*.
  
Paul E. McKenney March 1, 2023, 10:16 p.m. UTC | #13
On Wed, Mar 01, 2023 at 12:32:26PM -0800, Josh Triplett wrote:
> On Tue, Feb 28, 2023 at 01:02:33PM -0800, Arjan van de Ven wrote:
>   Thomas Gleixner wrote:
> > > 
> > > Maybe we should enforce CONFIG_SMP=y first :)
> > > 
> > > Thanks,
> > 
> > for 64 bit I can see the point of removing the !SMP case entirely from arch/x86 .
> > maybe even for 32 bit if it just makes the code simpler I suppose
> 
> As one of the folks keeping an eye on tinyconfig and kernel size, I
> actually think we *should* make this change and rip out !CONFIG_SMP,
> albeit carefully.
> 
> In particular, I would propose that we rip out !CONFIG_SMP, *but* we
> allow building with CONFIG_NR_CPUS=1. (And we could make sure in that
> case that the compiler can recognize that at compile time and optimize
> accordingly, so that it might provide some of the UP optimizations for
> us.)
> 
> Then, any *optimizations* for the "will only have one CPU, ever" case
> can move to CONFIG_NR_CPUS=1 rather than !CONFIG_SMP. I think many of
> those optimizations may be worth keeping for small embedded systems, or
> for cases like Linux-as-bootloader or similar.
> 
> The difference here would be that code written for !CONFIG_SMP today
> needs to account for the UP case for *correctness*, whereas code written
> for CONFIG_SMP can *optionally* consider CONFIG_NR_CPUS=1 for
> *performance*.

It certainly would not make much sense to keep Tiny RCU and Tiny SRCU
around if there was no CONFIG_SMP=n.

It should be interesting seeing what comes up out of the IoT space.  ;-)

							Thanx, Paul
  
Josh Triplett March 1, 2023, 10:25 p.m. UTC | #14
On Wed, Mar 01, 2023 at 02:16:32PM -0800, Paul E. McKenney wrote:
> On Wed, Mar 01, 2023 at 12:32:26PM -0800, Josh Triplett wrote:
> > On Tue, Feb 28, 2023 at 01:02:33PM -0800, Arjan van de Ven wrote:
> >   Thomas Gleixner wrote:
> > > > 
> > > > Maybe we should enforce CONFIG_SMP=y first :)
> > > > 
> > > > Thanks,
> > > 
> > > for 64 bit I can see the point of removing the !SMP case entirely from arch/x86 .
> > > maybe even for 32 bit if it just makes the code simpler I suppose
> > 
> > As one of the folks keeping an eye on tinyconfig and kernel size, I
> > actually think we *should* make this change and rip out !CONFIG_SMP,
> > albeit carefully.
> > 
> > In particular, I would propose that we rip out !CONFIG_SMP, *but* we
> > allow building with CONFIG_NR_CPUS=1. (And we could make sure in that
> > case that the compiler can recognize that at compile time and optimize
> > accordingly, so that it might provide some of the UP optimizations for
> > us.)
> > 
> > Then, any *optimizations* for the "will only have one CPU, ever" case
> > can move to CONFIG_NR_CPUS=1 rather than !CONFIG_SMP. I think many of
> > those optimizations may be worth keeping for small embedded systems, or
> > for cases like Linux-as-bootloader or similar.
> > 
> > The difference here would be that code written for !CONFIG_SMP today
> > needs to account for the UP case for *correctness*, whereas code written
> > for CONFIG_SMP can *optionally* consider CONFIG_NR_CPUS=1 for
> > *performance*.
> 
> It certainly would not make much sense to keep Tiny RCU and Tiny SRCU
> around if there was no CONFIG_SMP=n.

On the contrary, I think it's entirely appropriate to keep them for
CONFIG_NR_CPUS=1; that's exactly the kind of simple optimization that
seems well worth having. (Ideal optimization: "very very simple for UP,
complex for SMP"; non-ideal optimization: "complex for SMP, differently
complex for UP".)

- Josh
  
Paul E. McKenney March 2, 2023, 12:28 a.m. UTC | #15
On Wed, Mar 01, 2023 at 02:25:36PM -0800, Josh Triplett wrote:
> On Wed, Mar 01, 2023 at 02:16:32PM -0800, Paul E. McKenney wrote:
> > On Wed, Mar 01, 2023 at 12:32:26PM -0800, Josh Triplett wrote:
> > > On Tue, Feb 28, 2023 at 01:02:33PM -0800, Arjan van de Ven wrote:
> > >   Thomas Gleixner wrote:
> > > > > 
> > > > > Maybe we should enforce CONFIG_SMP=y first :)
> > > > > 
> > > > > Thanks,
> > > > 
> > > > for 64 bit I can see the point of removing the !SMP case entirely from arch/x86 .
> > > > maybe even for 32 bit if it just makes the code simpler I suppose
> > > 
> > > As one of the folks keeping an eye on tinyconfig and kernel size, I
> > > actually think we *should* make this change and rip out !CONFIG_SMP,
> > > albeit carefully.
> > > 
> > > In particular, I would propose that we rip out !CONFIG_SMP, *but* we
> > > allow building with CONFIG_NR_CPUS=1. (And we could make sure in that
> > > case that the compiler can recognize that at compile time and optimize
> > > accordingly, so that it might provide some of the UP optimizations for
> > > us.)
> > > 
> > > Then, any *optimizations* for the "will only have one CPU, ever" case
> > > can move to CONFIG_NR_CPUS=1 rather than !CONFIG_SMP. I think many of
> > > those optimizations may be worth keeping for small embedded systems, or
> > > for cases like Linux-as-bootloader or similar.
> > > 
> > > The difference here would be that code written for !CONFIG_SMP today
> > > needs to account for the UP case for *correctness*, whereas code written
> > > for CONFIG_SMP can *optionally* consider CONFIG_NR_CPUS=1 for
> > > *performance*.
> > 
> > It certainly would not make much sense to keep Tiny RCU and Tiny SRCU
> > around if there was no CONFIG_SMP=n.
> 
> On the contrary, I think it's entirely appropriate to keep them for
> CONFIG_NR_CPUS=1; that's exactly the kind of simple optimization that
> seems well worth having. (Ideal optimization: "very very simple for UP,
> complex for SMP"; non-ideal optimization: "complex for SMP, differently
> complex for UP".)

Fair enough, but how does removing CONFIG_SMP help with that?  Given that
it is not all that hard to work around the lack of CONFIG_SMP for Tiny
RCU and Tiny SRCU, then it cannot be all that hard to work around that
lack for the use cases that you are trying to get rid of, right?

							Thanx, Paul

------------------------------------------------------------------------

diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 9071182b1284b..7487bee3d4341 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -7,7 +7,7 @@ menu "RCU Subsystem"
 
 config TREE_RCU
 	bool
-	default y if SMP
+	default y if CONFIG_NR_CPUS = 1
 	# Dynticks-idle tracking
 	select CONTEXT_TRACKING_IDLE
 	help
@@ -31,7 +31,7 @@ config PREEMPT_RCU
 
 config TINY_RCU
 	bool
-	default y if !PREEMPTION && !SMP
+	default y if !PREEMPTION && CONFIG_NR_CPUS != 1
 	help
 	  This option selects the RCU implementation that is
 	  designed for UP systems from which real-time response
  
Randy Dunlap March 2, 2023, 1:05 a.m. UTC | #16
On 3/1/23 16:28, Paul E. McKenney wrote:
> On Wed, Mar 01, 2023 at 02:25:36PM -0800, Josh Triplett wrote:
>> On Wed, Mar 01, 2023 at 02:16:32PM -0800, Paul E. McKenney wrote:
>>> On Wed, Mar 01, 2023 at 12:32:26PM -0800, Josh Triplett wrote:
>>>> On Tue, Feb 28, 2023 at 01:02:33PM -0800, Arjan van de Ven wrote:
>>>>   Thomas Gleixner wrote:
>>>>>>
>>>>>> Maybe we should enforce CONFIG_SMP=y first :)
>>>>>>
>>>>>> Thanks,
>>>>>
>>>>> for 64 bit I can see the point of removing the !SMP case entirely from arch/x86 .
>>>>> maybe even for 32 bit if it just makes the code simpler I suppose
>>>>
>>>> As one of the folks keeping an eye on tinyconfig and kernel size, I
>>>> actually think we *should* make this change and rip out !CONFIG_SMP,
>>>> albeit carefully.
>>>>
>>>> In particular, I would propose that we rip out !CONFIG_SMP, *but* we
>>>> allow building with CONFIG_NR_CPUS=1. (And we could make sure in that
>>>> case that the compiler can recognize that at compile time and optimize
>>>> accordingly, so that it might provide some of the UP optimizations for
>>>> us.)
>>>>
>>>> Then, any *optimizations* for the "will only have one CPU, ever" case
>>>> can move to CONFIG_NR_CPUS=1 rather than !CONFIG_SMP. I think many of
>>>> those optimizations may be worth keeping for small embedded systems, or
>>>> for cases like Linux-as-bootloader or similar.
>>>>
>>>> The difference here would be that code written for !CONFIG_SMP today
>>>> needs to account for the UP case for *correctness*, whereas code written
>>>> for CONFIG_SMP can *optionally* consider CONFIG_NR_CPUS=1 for
>>>> *performance*.
>>>
>>> It certainly would not make much sense to keep Tiny RCU and Tiny SRCU
>>> around if there was no CONFIG_SMP=n.
>>
>> On the contrary, I think it's entirely appropriate to keep them for
>> CONFIG_NR_CPUS=1; that's exactly the kind of simple optimization that
>> seems well worth having. (Ideal optimization: "very very simple for UP,
>> complex for SMP"; non-ideal optimization: "complex for SMP, differently
>> complex for UP".)
> 
> Fair enough, but how does removing CONFIG_SMP help with that?  Given that
> it is not all that hard to work around the lack of CONFIG_SMP for Tiny
> RCU and Tiny SRCU, then it cannot be all that hard to work around that
> lack for the use cases that you are trying to get rid of, right?
> 
> 							Thanx, Paul
> 
> ------------------------------------------------------------------------
> 
> diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
> index 9071182b1284b..7487bee3d4341 100644
> --- a/kernel/rcu/Kconfig
> +++ b/kernel/rcu/Kconfig
> @@ -7,7 +7,7 @@ menu "RCU Subsystem"
>  
>  config TREE_RCU
>  	bool
> -	default y if SMP
> +	default y if CONFIG_NR_CPUS = 1
>  	# Dynticks-idle tracking
>  	select CONTEXT_TRACKING_IDLE
>  	help
> @@ -31,7 +31,7 @@ config PREEMPT_RCU
>  
>  config TINY_RCU
>  	bool
> -	default y if !PREEMPTION && !SMP
> +	default y if !PREEMPTION && CONFIG_NR_CPUS != 1
>  	help
>  	  This option selects the RCU implementation that is
>  	  designed for UP systems from which real-time response

but drop the CONFIG_ prefixes...
  
Paul E. McKenney March 2, 2023, 1:37 a.m. UTC | #17
On Wed, Mar 01, 2023 at 05:05:39PM -0800, Randy Dunlap wrote:
> 
> 
> On 3/1/23 16:28, Paul E. McKenney wrote:
> > On Wed, Mar 01, 2023 at 02:25:36PM -0800, Josh Triplett wrote:
> >> On Wed, Mar 01, 2023 at 02:16:32PM -0800, Paul E. McKenney wrote:
> >>> On Wed, Mar 01, 2023 at 12:32:26PM -0800, Josh Triplett wrote:
> >>>> On Tue, Feb 28, 2023 at 01:02:33PM -0800, Arjan van de Ven wrote:
> >>>>   Thomas Gleixner wrote:
> >>>>>>
> >>>>>> Maybe we should enforce CONFIG_SMP=y first :)
> >>>>>>
> >>>>>> Thanks,
> >>>>>
> >>>>> for 64 bit I can see the point of removing the !SMP case entirely from arch/x86 .
> >>>>> maybe even for 32 bit if it just makes the code simpler I suppose
> >>>>
> >>>> As one of the folks keeping an eye on tinyconfig and kernel size, I
> >>>> actually think we *should* make this change and rip out !CONFIG_SMP,
> >>>> albeit carefully.
> >>>>
> >>>> In particular, I would propose that we rip out !CONFIG_SMP, *but* we
> >>>> allow building with CONFIG_NR_CPUS=1. (And we could make sure in that
> >>>> case that the compiler can recognize that at compile time and optimize
> >>>> accordingly, so that it might provide some of the UP optimizations for
> >>>> us.)
> >>>>
> >>>> Then, any *optimizations* for the "will only have one CPU, ever" case
> >>>> can move to CONFIG_NR_CPUS=1 rather than !CONFIG_SMP. I think many of
> >>>> those optimizations may be worth keeping for small embedded systems, or
> >>>> for cases like Linux-as-bootloader or similar.
> >>>>
> >>>> The difference here would be that code written for !CONFIG_SMP today
> >>>> needs to account for the UP case for *correctness*, whereas code written
> >>>> for CONFIG_SMP can *optionally* consider CONFIG_NR_CPUS=1 for
> >>>> *performance*.
> >>>
> >>> It certainly would not make much sense to keep Tiny RCU and Tiny SRCU
> >>> around if there was no CONFIG_SMP=n.
> >>
> >> On the contrary, I think it's entirely appropriate to keep them for
> >> CONFIG_NR_CPUS=1; that's exactly the kind of simple optimization that
> >> seems well worth having. (Ideal optimization: "very very simple for UP,
> >> complex for SMP"; non-ideal optimization: "complex for SMP, differently
> >> complex for UP".)
> > 
> > Fair enough, but how does removing CONFIG_SMP help with that?  Given that
> > it is not all that hard to work around the lack of CONFIG_SMP for Tiny
> > RCU and Tiny SRCU, then it cannot be all that hard to work around that
> > lack for the use cases that you are trying to get rid of, right?
> > 
> > 							Thanx, Paul
> > 
> > ------------------------------------------------------------------------
> > 
> > diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
> > index 9071182b1284b..7487bee3d4341 100644
> > --- a/kernel/rcu/Kconfig
> > +++ b/kernel/rcu/Kconfig
> > @@ -7,7 +7,7 @@ menu "RCU Subsystem"
> >  
> >  config TREE_RCU
> >  	bool
> > -	default y if SMP
> > +	default y if CONFIG_NR_CPUS = 1
> >  	# Dynticks-idle tracking
> >  	select CONTEXT_TRACKING_IDLE
> >  	help
> > @@ -31,7 +31,7 @@ config PREEMPT_RCU
> >  
> >  config TINY_RCU
> >  	bool
> > -	default y if !PREEMPTION && !SMP
> > +	default y if !PREEMPTION && CONFIG_NR_CPUS != 1
> >  	help
> >  	  This option selects the RCU implementation that is
> >  	  designed for UP systems from which real-time response
> 
> but drop the CONFIG_ prefixes...

Indeed.  What I don't understand is how the above passed some light
rcutorture testing.  And the comparisons are backwards as well.
Perhaps this time two bugs make working code?

How about the following?

							Thanx, Paul

------------------------------------------------------------------------

diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 9071182b1284b..a2ba97b949498 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -7,7 +7,7 @@ menu "RCU Subsystem"
 
 config TREE_RCU
 	bool
-	default y if SMP
+	default y if NR_CPUS != 1
 	# Dynticks-idle tracking
 	select CONTEXT_TRACKING_IDLE
 	help
@@ -31,7 +31,7 @@ config PREEMPT_RCU
 
 config TINY_RCU
 	bool
-	default y if !PREEMPTION && !SMP
+	default y if !PREEMPTION && NR_CPUS = 1
 	help
 	  This option selects the RCU implementation that is
 	  designed for UP systems from which real-time response
  

Patch

diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index ab6e29b32c04..236f2423454d 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -112,8 +112,6 @@  int x86_acpi_suspend_lowlevel(void)
 #else /* CONFIG_64BIT */
 #ifdef CONFIG_SMP
 	current->thread.sp = (unsigned long)temp_stack + sizeof(temp_stack);
-	early_gdt_descr.address =
-			(unsigned long)get_cpu_gdt_rw(smp_processor_id());
 	initial_gs = per_cpu_offset(smp_processor_id());
 	smpboot_control = smp_processor_id();
 #endif
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 5a2417d788d1..0ccca297e90e 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -265,7 +265,12 @@  SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 	 * addresses where we're currently running on. We have to do that here
 	 * because in 32bit we couldn't load a 64bit linear address.
 	 */
-	lgdt	early_gdt_descr(%rip)
+	subq	$16, %rsp
+	movw	$(GDT_SIZE-1), (%rsp)
+	leaq	gdt_page(%rdx), %rax
+	movq	%rax, 2(%rsp)
+	lgdt	(%rsp)
+	addq	$16, %rsp
 
 	/* set up data segments */
 	xorl %eax,%eax
@@ -667,10 +672,6 @@  SYM_DATA_END(level1_fixmap_pgt)
 	.data
 	.align 16
 
-SYM_DATA(early_gdt_descr,		.word GDT_ENTRIES*8-1)
-SYM_DATA_LOCAL(early_gdt_descr_base,	.quad INIT_PER_CPU_VAR(gdt_page))
-
-	.align 16
 SYM_DATA(smpboot_control,		.long 0)
 
 	.align 16
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 62e3bf37f0b8..a22460a07cf8 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1110,10 +1110,10 @@  static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
 		start_ip = real_mode_header->trampoline_start64;
 #endif
 	idle->thread.sp = (unsigned long)task_pt_regs(idle);
-	early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
 	initial_code = (unsigned long)start_secondary;
 
 	if (IS_ENABLED(CONFIG_X86_32)) {
+		early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
 		initial_stack  = idle->thread.sp;
 	} else {
 		smpboot_control = cpu;