[v2,07/11] x86/percpu/64: Use relative percpu offsets
Commit Message
The percpu section is currently linked at virtual address 0, because
older compilers hardcoded the stack protector canary value at a fixed
offset from the start of the GS segment. Now that the canary is a
normal percpu variable, the percpu section can be linked normally.
This means that x86-64 will calculate percpu offsets like most other
architectures, as the delta between the initial percpu address and the
dynamically allocated memory.
Signed-off-by: Brian Gerst <brgerst@gmail.com>
---
arch/x86/kernel/head_64.S | 6 ------
arch/x86/kernel/setup_percpu.c | 12 ++----------
arch/x86/kernel/vmlinux.lds.S | 24 +-----------------------
arch/x86/platform/pvh/head.S | 6 ------
arch/x86/tools/relocs.c | 10 +++-------
arch/x86/xen/xen-head.S | 6 ------
init/Kconfig | 2 +-
7 files changed, 7 insertions(+), 59 deletions(-)
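For context, a minimal sketch (not part of this patch) of what "relative percpu offsets" means in code, modelled on the generic setup_per_cpu_areas() pattern. The helper name and call site below are illustrative assumptions; only the symbols __per_cpu_start, __per_cpu_offset and this_cpu_off are real kernel symbols:

    /*
     * Illustrative sketch only: with .data..percpu linked at its natural
     * address, a CPU's per-CPU offset is just the delta between the chunk
     * allocated for that CPU and the initial per-CPU area in the image.
     */
    static void __init set_cpu_percpu_offset(unsigned int cpu, void *cpu_chunk)
    {
            unsigned long delta;

            delta = (unsigned long)cpu_chunk - (unsigned long)__per_cpu_start;

            __per_cpu_offset[cpu] = delta;
            per_cpu(this_cpu_off, cpu) = delta;
    }

The real assignments stay in the existing setup_per_cpu_areas() path; the point is only that the x86-64 arithmetic now matches what most other architectures already do.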
Comments
On Thu, Oct 26, 2023 at 6:01 PM Brian Gerst <brgerst@gmail.com> wrote:
>
> The percpu section is currently linked at virtual address 0, because
> older compilers hardcoded the stack protector canary value at a fixed
> offset from the start of the GS segment. Now that the canary is a
> normal percpu variable, the percpu section can be linked normally.
> This means that x86-64 will calculate percpu offsets like most other
> architectures, as the delta between the initial percpu address and the
> dynamically allocated memory.
The comments above MSR_GS_BASE setup should be reviewed or removed. I
don't think they need to be set up to access stack canary, they are
just clearing MSR now.
>
> Signed-off-by: Brian Gerst <brgerst@gmail.com>
With fixed comments:
Reviewed-by: Uros Bizjak <ubizjak@gmail.com>
> ---
> arch/x86/kernel/head_64.S | 6 ------
> arch/x86/kernel/setup_percpu.c | 12 ++----------
> arch/x86/kernel/vmlinux.lds.S | 24 +-----------------------
> arch/x86/platform/pvh/head.S | 6 ------
> arch/x86/tools/relocs.c | 10 +++-------
> arch/x86/xen/xen-head.S | 6 ------
> init/Kconfig | 2 +-
> 7 files changed, 7 insertions(+), 59 deletions(-)
>
> diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
> index f2453eb38417..b35f74e58dd7 100644
> --- a/arch/x86/kernel/head_64.S
> +++ b/arch/x86/kernel/head_64.S
> @@ -72,14 +72,8 @@ SYM_CODE_START_NOALIGN(startup_64)
>
> /* Setup GSBASE to allow stack canary access for C code */
> movl $MSR_GS_BASE, %ecx
> -#ifdef CONFIG_SMP
> - leaq __per_cpu_load(%rip), %rdx
> - movl %edx, %eax
> - shrq $32, %rdx
> -#else
> xorl %eax, %eax
> xorl %edx, %edx
> -#endif
> wrmsr
>
> call startup_64_setup_env
> diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
> index 2c97bf7b56ae..8707dd07b9ce 100644
> --- a/arch/x86/kernel/setup_percpu.c
> +++ b/arch/x86/kernel/setup_percpu.c
> @@ -23,18 +23,10 @@
> #include <asm/cpumask.h>
> #include <asm/cpu.h>
>
> -#ifdef CONFIG_X86_64
> -#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
> -#else
> -#define BOOT_PERCPU_OFFSET 0
> -#endif
> -
> -DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
> +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off);
> EXPORT_PER_CPU_SYMBOL(this_cpu_off);
>
> -unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = {
> - [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
> -};
> +unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init;
> EXPORT_SYMBOL(__per_cpu_offset);
>
> /*
> diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
> index e6126cd21615..efa4885060b5 100644
> --- a/arch/x86/kernel/vmlinux.lds.S
> +++ b/arch/x86/kernel/vmlinux.lds.S
> @@ -103,12 +103,6 @@ const_pcpu_hot = pcpu_hot;
> PHDRS {
> text PT_LOAD FLAGS(5); /* R_E */
> data PT_LOAD FLAGS(6); /* RW_ */
> -#ifdef CONFIG_X86_64
> -#ifdef CONFIG_SMP
> - percpu PT_LOAD FLAGS(6); /* RW_ */
> -#endif
> - init PT_LOAD FLAGS(7); /* RWE */
> -#endif
> note PT_NOTE FLAGS(0); /* ___ */
> }
>
> @@ -224,21 +218,7 @@ SECTIONS
> __init_begin = .; /* paired with __init_end */
> }
>
> -#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
> - /*
> - * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
> - * output PHDR, so the next output section - .init.text - should
> - * start another segment - init.
> - */
> - PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu)
> - ASSERT(SIZEOF(.data..percpu) < CONFIG_PHYSICAL_START,
> - "per-CPU data too large - increase CONFIG_PHYSICAL_START")
> -#endif
> -
> INIT_TEXT_SECTION(PAGE_SIZE)
> -#ifdef CONFIG_X86_64
> - :init
> -#endif
>
> /*
> * Section for code used exclusively before alternatives are run. All
> @@ -368,9 +348,7 @@ SECTIONS
> EXIT_DATA
> }
>
> -#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
> PERCPU_SECTION(INTERNODE_CACHE_BYTES)
> -#endif
>
> . = ALIGN(PAGE_SIZE);
>
> @@ -508,7 +486,7 @@ SECTIONS
> * Per-cpu symbols which need to be offset from __per_cpu_load
> * for the boot processor.
> */
> -#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load
> +#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x)
> INIT_PER_CPU(gdt_page);
> INIT_PER_CPU(irq_stack_backing_store);
>
> diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S
> index d215b16bf89f..4bd925b23436 100644
> --- a/arch/x86/platform/pvh/head.S
> +++ b/arch/x86/platform/pvh/head.S
> @@ -96,14 +96,8 @@ SYM_CODE_START_LOCAL(pvh_start_xen)
> 1:
> /* Set base address in stack canary descriptor. */
> mov $MSR_GS_BASE,%ecx
> -#ifdef CONFIG_SMP
> - lea __per_cpu_load(%rip), %rdx
> - mov %edx, %eax
> - shr $32, %rdx
> -#else
> xor %eax, %eax
> xor %edx, %edx
> -#endif
> wrmsr
>
> call xen_prepare_pvh
> diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
> index 3ccd9d4fcf9c..01efbfdd3eb3 100644
> --- a/arch/x86/tools/relocs.c
> +++ b/arch/x86/tools/relocs.c
> @@ -815,12 +815,7 @@ static void percpu_init(void)
> */
> static int is_percpu_sym(ElfW(Sym) *sym, const char *symname)
> {
> - int shndx = sym_index(sym);
> -
> - return (shndx == per_cpu_shndx) &&
> - strcmp(symname, "__init_begin") &&
> - strcmp(symname, "__per_cpu_load") &&
> - strncmp(symname, "init_per_cpu_", 13);
> + return 0;
> }
>
>
> @@ -1043,7 +1038,8 @@ static int cmp_relocs(const void *va, const void *vb)
>
> static void sort_relocs(struct relocs *r)
> {
> - qsort(r->offset, r->count, sizeof(r->offset[0]), cmp_relocs);
> + if (r->count)
> + qsort(r->offset, r->count, sizeof(r->offset[0]), cmp_relocs);
> }
>
> static int write32(uint32_t v, FILE *f)
> diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
> index 9ce0d9d268bb..c1d9c92b417a 100644
> --- a/arch/x86/xen/xen-head.S
> +++ b/arch/x86/xen/xen-head.S
> @@ -57,14 +57,8 @@ SYM_CODE_START(startup_xen)
> * the per cpu areas are set up.
> */
> movl $MSR_GS_BASE,%ecx
> -#ifdef CONFIG_SMP
> - leaq __per_cpu_load(%rip), %rdx
> - movl %edx, %eax
> - shrq $32, %rdx
> -#else
> xorl %eax, %eax
> xorl %edx, %edx
> -#endif
> wrmsr
>
> mov %rsi, %rdi
> diff --git a/init/Kconfig b/init/Kconfig
> index 6d35728b94b2..1af31b23e376 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1718,7 +1718,7 @@ config KALLSYMS_ALL
> config KALLSYMS_ABSOLUTE_PERCPU
> bool
> depends on KALLSYMS
> - default X86_64 && SMP
> + default n
>
> config KALLSYMS_BASE_RELATIVE
> bool
> --
> 2.41.0
>
On Thu, Oct 26, 2023 at 2:47 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Thu, Oct 26, 2023 at 6:01 PM Brian Gerst <brgerst@gmail.com> wrote:
> >
> > The percpu section is currently linked at virtual address 0, because
> > older compilers hardcoded the stack protector canary value at a fixed
> > offset from the start of the GS segment. Now that the canary is a
> > normal percpu variable, the percpu section can be linked normally.
> > This means that x86-64 will calculate percpu offsets like most other
> > architectures, as the delta between the initial percpu address and the
> > dynamically allocated memory.
>
> The comments above MSR_GS_BASE setup should be reviewed or removed. I
> don't think they need to be set up to access stack canary, they are
> just clearing MSR now.
GSBASE is deliberately set to zero offset on SMP for boot because we
want any percpu accesses (including stack protector) to use the
initial percpu area until the full percpu memory is allocated. It's
possible that more stack protector checks could sneak back into the
early boot code, and after the conversion to relative percpu offsets
they would work properly again. I just didn't reenable them because
they are unnecessary that early.
Brian Gerst
On Fri, Oct 27, 2023 at 4:09 AM Brian Gerst <brgerst@gmail.com> wrote:
>
> On Thu, Oct 26, 2023 at 2:47 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Thu, Oct 26, 2023 at 6:01 PM Brian Gerst <brgerst@gmail.com> wrote:
> > >
> > > The percpu section is currently linked at virtual address 0, because
> > > older compilers hardcoded the stack protector canary value at a fixed
> > > offset from the start of the GS segment. Now that the canary is a
> > > normal percpu variable, the percpu section can be linked normally.
> > > This means that x86-64 will calculate percpu offsets like most other
> > > architectures, as the delta between the initial percpu address and the
> > > dynamically allocated memory.
> >
> > The comments above MSR_GS_BASE setup should be reviewed or removed. I
> > don't think they need to be set up to access stack canary, they are
> > just clearing MSR now.
>
> GSBASE is deliberately set to zero offset on SMP for boot because we
> want any percpu accesses (including stack protector) to use the
> initial percpu area until the full percpu memory is allocated. It's
> possible that more stack protector checks could sneak back into the
> early boot code, and after the conversion to relative percpu offsets
> they would work properly again. I just didn't reenable them because
> they are unnecessary that early.
Thanks for the explanation, perhaps this non-obvious fact should be
mentioned in the comment.
Thanks,
Uros.
> Brian Gerst
On Fri, Oct 27, 2023 at 2:09 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Fri, Oct 27, 2023 at 4:09 AM Brian Gerst <brgerst@gmail.com> wrote:
> >
> > On Thu, Oct 26, 2023 at 2:47 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Thu, Oct 26, 2023 at 6:01 PM Brian Gerst <brgerst@gmail.com> wrote:
> > > >
> > > > The percpu section is currently linked at virtual address 0, because
> > > > older compilers hardcoded the stack protector canary value at a fixed
> > > > offset from the start of the GS segment. Now that the canary is a
> > > > normal percpu variable, the percpu section can be linked normally.
> > > > This means that x86-64 will calculate percpu offsets like most other
> > > > architectures, as the delta between the initial percpu address and the
> > > > dynamically allocated memory.
> > >
> > > The comments above MSR_GS_BASE setup should be reviewed or removed. I
> > > don't think they need to be set up to access stack canary, they are
> > > just clearing MSR now.
> >
> > GSBASE is deliberately set to zero offset on SMP for boot because we
> > want any percpu accesses (including stack protector) to use the
> > initial percpu area until the full percpu memory is allocated. It's
> > possible that more stack protector checks could sneak back into the
> > early boot code, and after the conversion to relative percpu offsets
> > they would work properly again. I just didn't reenable them because
> > they are unnecessary that early.
>
> Thanks for the explanation, perhaps this non-obvious fact should be
> mentioned in the comment.
I will update the comments if I need to send a v3 for some other
reason, otherwise I'll do a followup patch.
Brian Gerst
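For illustration only, one possible shape for the reworked comment around the GSBASE setup in head_64.S, based on Brian's explanation above; the actual wording is up to the v3 or follow-up patch he mentions:

    /*
     * Clear GSBASE. With relative percpu offsets, a zero base makes any
     * early per-CPU access (including stack protector checks that might
     * sneak back into early boot code) hit the initial per-CPU area
     * linked into the kernel image, until the real per-CPU memory is
     * allocated and GSBASE is switched to this CPU's offset.
     */
    movl    $MSR_GS_BASE, %ecx
    xorl    %eax, %eax
    xorl    %edx, %edx
    wrmsr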