[v2,05/11] x86/stackprotector/64: Convert stack protector to normal percpu variable

Message ID 20231026160100.195099-6-brgerst@gmail.com
State New
Headers
Series x86-64: Stack protector and percpu improvements |

Commit Message

Brian Gerst Oct. 26, 2023, 4 p.m. UTC
  Older versions of GCC fixed the location of the stack protector canary
at %gs:40.  This constraint forced the percpu section to be linked at
virtual address 0 so that the canary could be the first data object in
the percpu section.  Supporting the zero-based percpu section requires
additional code to handle relocations for RIP-relative references to
percpu data, extra complexity to kallsyms, and workarounds for linker
bugs due to the use of absolute symbols.

Since version 8.1, GCC has options to configure the location of the
canary value.  This allows the canary to be turned into a normal
percpu variable and removes the constraint that the percpu section
be zero-based.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
---
 arch/x86/Kconfig                      |  5 ++--
 arch/x86/Makefile                     | 19 +++++++++-----
 arch/x86/entry/entry_64.S             |  2 +-
 arch/x86/include/asm/processor.h      | 15 +----------
 arch/x86/include/asm/stackprotector.h | 37 +++++----------------------
 arch/x86/kernel/asm-offsets_64.c      |  6 -----
 arch/x86/kernel/cpu/common.c          |  4 +--
 arch/x86/kernel/head_64.S             |  3 +--
 arch/x86/xen/xen-head.S               |  3 +--
 9 files changed, 26 insertions(+), 68 deletions(-)
  

Comments

Uros Bizjak Oct. 26, 2023, 6:16 p.m. UTC | #1
On Thu, Oct 26, 2023 at 6:01 PM Brian Gerst <brgerst@gmail.com> wrote:
>
> Older versions of GCC fixed the location of the stack protector canary
> at %gs:40.  This constraint forced the percpu section to be linked at
> virtual address 0 so that the canary could be the first data object in
> the percpu section.  Supporting the zero-based percpu section requires
> additional code to handle relocations for RIP-relative references to
> percpu data, extra complexity to kallsyms, and workarounds for linker
> bugs due to the use of absolute symbols.
>
> Since version 8.1, GCC has options to configure the location of the
> canary value.  This allows the canary to be turned into a normal
> percpu variable and removes the constraint that the percpu section
> be zero-based.
>
> Signed-off-by: Brian Gerst <brgerst@gmail.com>

Reviewed-by: Uros Bizjak <ubizjak@gmail.com>

> ---
>  arch/x86/Kconfig                      |  5 ++--
>  arch/x86/Makefile                     | 19 +++++++++-----
>  arch/x86/entry/entry_64.S             |  2 +-
>  arch/x86/include/asm/processor.h      | 15 +----------
>  arch/x86/include/asm/stackprotector.h | 37 +++++----------------------
>  arch/x86/kernel/asm-offsets_64.c      |  6 -----
>  arch/x86/kernel/cpu/common.c          |  4 +--
>  arch/x86/kernel/head_64.S             |  3 +--
>  arch/x86/xen/xen-head.S               |  3 +--
>  9 files changed, 26 insertions(+), 68 deletions(-)
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 92144c6f26d2..c95e0ce557da 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -410,12 +410,11 @@ config PGTABLE_LEVELS
>
>  config CC_HAS_SANE_STACKPROTECTOR
>         bool
> -       default y if 64BIT
> +       default $(cc-option,-mstack-protector-guard-reg=gs -mstack-protector-guard-symbol=__stack_chk_guard) if 64BIT
>         default $(cc-option,-mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard)
>         help
>           We have to make sure stack protector is unconditionally disabled if
> -         the compiler produces broken code or if it does not let us control
> -         the segment on 32-bit kernels.
> +         the compiler does not allow control of the segment and symbol.
>
>  menu "Processor type and features"
>
> diff --git a/arch/x86/Makefile b/arch/x86/Makefile
> index 22e41d9dbc23..6ab8b4419f41 100644
> --- a/arch/x86/Makefile
> +++ b/arch/x86/Makefile
> @@ -111,13 +111,7 @@ ifeq ($(CONFIG_X86_32),y)
>          # temporary until string.h is fixed
>          KBUILD_CFLAGS += -ffreestanding
>
> -       ifeq ($(CONFIG_STACKPROTECTOR),y)
> -               ifeq ($(CONFIG_SMP),y)
> -                       KBUILD_CFLAGS += -mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard
> -               else
> -                       KBUILD_CFLAGS += -mstack-protector-guard=global
> -               endif
> -       endif
> +       percpu_seg := fs
>  else
>          BITS := 64
>          UTS_MACHINE := x86_64
> @@ -167,6 +161,17 @@ else
>          KBUILD_CFLAGS += -mcmodel=kernel
>          KBUILD_RUSTFLAGS += -Cno-redzone=y
>          KBUILD_RUSTFLAGS += -Ccode-model=kernel
> +
> +       percpu_seg := gs
> +endif
> +
> +ifeq ($(CONFIG_STACKPROTECTOR),y)
> +       ifeq ($(CONFIG_SMP),y)
> +               KBUILD_CFLAGS += -mstack-protector-guard-reg=$(percpu_seg)
> +               KBUILD_CFLAGS += -mstack-protector-guard-symbol=__stack_chk_guard
> +       else
> +               KBUILD_CFLAGS += -mstack-protector-guard=global
> +       endif
>  endif
>
>  #
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index 1a88ad8a7b48..cddcc236aaae 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -190,7 +190,7 @@ SYM_FUNC_START(__switch_to_asm)
>
>  #ifdef CONFIG_STACKPROTECTOR
>         movq    TASK_stack_canary(%rsi), %rbx
> -       movq    %rbx, PER_CPU_VAR(fixed_percpu_data + FIXED_stack_canary)
> +       movq    %rbx, PER_CPU_VAR(__stack_chk_guard)
>  #endif
>
>         /*
> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
> index 4b130d894cb6..2b6531d90273 100644
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -394,16 +394,7 @@ struct irq_stack {
>
>  #ifdef CONFIG_X86_64
>  struct fixed_percpu_data {
> -       /*
> -        * GCC hardcodes the stack canary as %gs:40.  Since the
> -        * irq_stack is the object at %gs:0, we reserve the bottom
> -        * 48 bytes of the irq stack for the canary.
> -        *
> -        * Once we are willing to require -mstack-protector-guard-symbol=
> -        * support for x86_64 stackprotector, we can get rid of this.
> -        */
>         char            gs_base[40];
> -       unsigned long   stack_canary;
>  };
>
>  DECLARE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __visible;
> @@ -418,11 +409,7 @@ extern asmlinkage void entry_SYSCALL32_ignore(void);
>
>  /* Save actual FS/GS selectors and bases to current->thread */
>  void current_save_fsgs(void);
> -#else  /* X86_64 */
> -#ifdef CONFIG_STACKPROTECTOR
> -DECLARE_PER_CPU(unsigned long, __stack_chk_guard);
> -#endif
> -#endif /* !X86_64 */
> +#endif /* X86_64 */
>
>  struct perf_event;
>
> diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
> index 00473a650f51..33abbd29ea26 100644
> --- a/arch/x86/include/asm/stackprotector.h
> +++ b/arch/x86/include/asm/stackprotector.h
> @@ -2,26 +2,13 @@
>  /*
>   * GCC stack protector support.
>   *
> - * Stack protector works by putting predefined pattern at the start of
> + * Stack protector works by putting a predefined pattern at the start of
>   * the stack frame and verifying that it hasn't been overwritten when
> - * returning from the function.  The pattern is called stack canary
> - * and unfortunately gcc historically required it to be at a fixed offset
> - * from the percpu segment base.  On x86_64, the offset is 40 bytes.
> + * returning from the function.  The pattern is called the stack canary
> + * and is a unique value for each task.
>   *
> - * The same segment is shared by percpu area and stack canary.  On
> - * x86_64, percpu symbols are zero based and %gs (64-bit) points to the
> - * base of percpu area.  The first occupant of the percpu area is always
> - * fixed_percpu_data which contains stack_canary at the appropriate
> - * offset.  On x86_32, the stack canary is just a regular percpu
> - * variable.
> - *
> - * Putting percpu data in %fs on 32-bit is a minor optimization compared to
> - * using %gs.  Since 32-bit userspace normally has %fs == 0, we are likely
> - * to load 0 into %fs on exit to usermode, whereas with percpu data in
> - * %gs, we are likely to load a non-null %gs on return to user mode.
> - *
> - * Once we are willing to require GCC 8.1 or better for 64-bit stackprotector
> - * support, we can remove some of this complexity.
> + * GCC is configured to read the stack canary value from the __stack_chk_guard
> + * per-cpu variable, which is changed on task switch.
>   */
>
>  #ifndef _ASM_STACKPROTECTOR_H
> @@ -36,6 +23,8 @@
>
>  #include <linux/sched.h>
>
> +DECLARE_PER_CPU(unsigned long, __stack_chk_guard);
> +
>  /*
>   * Initialize the stackprotector canary value.
>   *
> @@ -51,25 +40,13 @@ static __always_inline void boot_init_stack_canary(void)
>  {
>         unsigned long canary = get_random_canary();
>
> -#ifdef CONFIG_X86_64
> -       BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40);
> -#endif
> -
>         current->stack_canary = canary;
> -#ifdef CONFIG_X86_64
> -       this_cpu_write(fixed_percpu_data.stack_canary, canary);
> -#else
>         this_cpu_write(__stack_chk_guard, canary);
> -#endif
>  }
>
>  static inline void cpu_init_stack_canary(int cpu, struct task_struct *idle)
>  {
> -#ifdef CONFIG_X86_64
> -       per_cpu(fixed_percpu_data.stack_canary, cpu) = idle->stack_canary;
> -#else
>         per_cpu(__stack_chk_guard, cpu) = idle->stack_canary;
> -#endif
>  }
>
>  #else  /* STACKPROTECTOR */
> diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
> index bb65371ea9df..590b6cd0eac0 100644
> --- a/arch/x86/kernel/asm-offsets_64.c
> +++ b/arch/x86/kernel/asm-offsets_64.c
> @@ -54,11 +54,5 @@ int main(void)
>         BLANK();
>  #undef ENTRY
>
> -       BLANK();
> -
> -#ifdef CONFIG_STACKPROTECTOR
> -       OFFSET(FIXED_stack_canary, fixed_percpu_data, stack_canary);
> -       BLANK();
> -#endif
>         return 0;
>  }
> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
> index 9058da9ae011..fb8f0371ffc3 100644
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -2104,15 +2104,13 @@ void syscall_init(void)
>                X86_EFLAGS_AC|X86_EFLAGS_ID);
>  }
>
> -#else  /* CONFIG_X86_64 */
> +#endif /* CONFIG_X86_64 */
>
>  #ifdef CONFIG_STACKPROTECTOR
>  DEFINE_PER_CPU(unsigned long, __stack_chk_guard);
>  EXPORT_PER_CPU_SYMBOL(__stack_chk_guard);
>  #endif
>
> -#endif /* CONFIG_X86_64 */
> -
>  /*
>   * Clear all 6 debug registers:
>   */
> diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
> index 3dcabbc49149..0d94d2a091fe 100644
> --- a/arch/x86/kernel/head_64.S
> +++ b/arch/x86/kernel/head_64.S
> @@ -345,8 +345,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
>
>         /* Set up %gs.
>          *
> -        * The base of %gs always points to fixed_percpu_data. If the
> -        * stack protector canary is enabled, it is located at %gs:40.
> +        * The base of %gs always points to fixed_percpu_data.
>          * Note that, on SMP, the boot cpu uses init data section until
>          * the per cpu areas are set up.
>          */
> diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
> index a0ea285878db..30f27e757354 100644
> --- a/arch/x86/xen/xen-head.S
> +++ b/arch/x86/xen/xen-head.S
> @@ -53,8 +53,7 @@ SYM_CODE_START(startup_xen)
>
>         /* Set up %gs.
>          *
> -        * The base of %gs always points to fixed_percpu_data.  If the
> -        * stack protector canary is enabled, it is located at %gs:40.
> +        * The base of %gs always points to fixed_percpu_data.
>          * Note that, on SMP, the boot cpu uses init data section until
>          * the per cpu areas are set up.
>          */
> --
> 2.41.0
>
  
kernel test robot Oct. 29, 2023, 1:26 a.m. UTC | #2
Hi Brian,

kernel test robot noticed the following build errors:

[auto build test ERROR on tip/master]
[also build test ERROR on next-20231027]
[cannot apply to tip/x86/core dennis-percpu/for-next linus/master tip/auto-latest v6.6-rc7]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Brian-Gerst/x86-stackprotector-32-Remove-stack-protector-test-script/20231027-000533
base:   tip/master
patch link:    https://lore.kernel.org/r/20231026160100.195099-6-brgerst%40gmail.com
patch subject: [PATCH v2 05/11] x86/stackprotector/64: Convert stack protector to normal percpu variable
config: x86_64-rhel-8.3-rust (https://download.01.org/0day-ci/archive/20231029/202310290927.2MuJJdu9-lkp@intel.com/config)
compiler: clang version 16.0.4 (https://github.com/llvm/llvm-project.git ae42196bc493ffe877a7e3dff8be32035dea4d07)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231029/202310290927.2MuJJdu9-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202310290927.2MuJJdu9-lkp@intel.com/

All errors (new ones prefixed by >>):

>> Unsupported relocation type: unknown type rel type name (42)
  
Brian Gerst Oct. 29, 2023, 6:56 a.m. UTC | #3
On Sat, Oct 28, 2023 at 9:26 PM kernel test robot <lkp@intel.com> wrote:
>
> Hi Brian,
>
> kernel test robot noticed the following build errors:
>
> [auto build test ERROR on tip/master]
> [also build test ERROR on next-20231027]
> [cannot apply to tip/x86/core dennis-percpu/for-next linus/master tip/auto-latest v6.6-rc7]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
>
> url:    https://github.com/intel-lab-lkp/linux/commits/Brian-Gerst/x86-stackprotector-32-Remove-stack-protector-test-script/20231027-000533
> base:   tip/master
> patch link:    https://lore.kernel.org/r/20231026160100.195099-6-brgerst%40gmail.com
> patch subject: [PATCH v2 05/11] x86/stackprotector/64: Convert stack protector to normal percpu variable
> config: x86_64-rhel-8.3-rust (https://download.01.org/0day-ci/archive/20231029/202310290927.2MuJJdu9-lkp@intel.com/config)
> compiler: clang version 16.0.4 (https://github.com/llvm/llvm-project.git ae42196bc493ffe877a7e3dff8be32035dea4d07)
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231029/202310290927.2MuJJdu9-lkp@intel.com/reproduce)
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202310290927.2MuJJdu9-lkp@intel.com/
>
> All errors (new ones prefixed by >>):
>
> >> Unsupported relocation type: unknown type rel type name (42)

Clang is generating a new relocation type (R_X86_64_REX_GOTPCRELX)
that the relocs tool doesn't know about.  This is supposed to allow
        movq    __stack_chk_guard@GOTPCREL(%rip), %rax
        movq    %gs:(%rax), %rax
to be relaxed to
        leaq    __stack_chk_guard(%rip), %rax
        movq    %gs:(%rax), %rax

But why is clang doing this instead of what GCC does?
        movq    %gs:__stack_chk_guard(%rip), %rax

Brian Gerst
  
Brian Gerst Oct. 29, 2023, 5 p.m. UTC | #4
On Sun, Oct 29, 2023 at 2:56 AM Brian Gerst <brgerst@gmail.com> wrote:
>
> On Sat, Oct 28, 2023 at 9:26 PM kernel test robot <lkp@intel.com> wrote:
> >
> > Hi Brian,
> >
> > kernel test robot noticed the following build errors:
> >
> > [auto build test ERROR on tip/master]
> > [also build test ERROR on next-20231027]
> > [cannot apply to tip/x86/core dennis-percpu/for-next linus/master tip/auto-latest v6.6-rc7]
> > [If your patch is applied to the wrong git tree, kindly drop us a note.
> > And when submitting patch, we suggest to use '--base' as documented in
> > https://git-scm.com/docs/git-format-patch#_base_tree_information]
> >
> > url:    https://github.com/intel-lab-lkp/linux/commits/Brian-Gerst/x86-stackprotector-32-Remove-stack-protector-test-script/20231027-000533
> > base:   tip/master
> > patch link:    https://lore.kernel.org/r/20231026160100.195099-6-brgerst%40gmail.com
> > patch subject: [PATCH v2 05/11] x86/stackprotector/64: Convert stack protector to normal percpu variable
> > config: x86_64-rhel-8.3-rust (https://download.01.org/0day-ci/archive/20231029/202310290927.2MuJJdu9-lkp@intel.com/config)
> > compiler: clang version 16.0.4 (https://github.com/llvm/llvm-project.git ae42196bc493ffe877a7e3dff8be32035dea4d07)
> > reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231029/202310290927.2MuJJdu9-lkp@intel.com/reproduce)
> >
> > If you fix the issue in a separate patch/commit (i.e. not just a new version of
> > the same patch/commit), kindly add following tags
> > | Reported-by: kernel test robot <lkp@intel.com>
> > | Closes: https://lore.kernel.org/oe-kbuild-all/202310290927.2MuJJdu9-lkp@intel.com/
> >
> > All errors (new ones prefixed by >>):
> >
> > >> Unsupported relocation type: unknown type rel type name (42)
>
> Clang is generating a new relocation type (R_X86_64_REX_GOTPCRELX)
> that the relocs tool doesn't know about.  This is supposed to allow
>         movq    __stack_chk_guard@GOTPCREL(%rip), %rax
>         movq    %gs:(%rax), %rax
> to be relaxed to
>         leaq    __stack_chk_guard(%rip), %rax
>         movq    %gs:(%rax), %rax
>
> But why is clang doing this instead of what GCC does?
>         movq    %gs:__stack_chk_guard(%rip), %rax

Digging a bit deeper, there also appears to be differences in how the
linkers behave with this new relocation:

make CC=clang LD=ld:
ffffffff81002838:       48 c7 c0 c0 5c 42 83    mov    $0xffffffff83425cc0,%rax
                        ffffffff8100283b: R_X86_64_32S  __stack_chk_guard
ffffffff8100283f:       65 48 8b 00             mov    %gs:(%rax),%rax

make CC=clang LD=ld.lld:
ffffffff81002838:       48 8d 05 81 34 42 02    lea
0x2423481(%rip),%rax        # ffffffff83425cc0 <__stack_chk_guard>
                        ffffffff8100283b: R_X86_64_REX_GOTPCRELX
 __stack_chk_guard-0x4
ffffffff8100283f:       65 48 8b 00             mov    %gs:(%rax),%rax

The LLVM linker keeps the R_X86_64_REX_GOTPCRELX even after performing
the relaxation.  It should be R_X86_64_32S based on it changing to an
LEA instruction.  The GNU linker changes it to R_X86_64_32S and a MOV
immediate.

So I think there are two issues here.  1) clang is producing code
referencing the GOT for stack protector accesses, despite -fno-PIE on
the command line and no other GOT references, and 2) ld.lld is using
the wrong relocation type after the relaxation step is performed.

I think the quick fix here is to teach the relocs tool about this new
relocation.  It should be able to be safely ignored since it's
PC-relative.  The code clang produces is functionally correct,
although not optimal.

Brian Gerst
  
Nick Desaulniers Oct. 30, 2023, 3:24 p.m. UTC | #5
On Sun, Oct 29, 2023 at 10:01 AM Brian Gerst <brgerst@gmail.com> wrote:
>
> On Sun, Oct 29, 2023 at 2:56 AM Brian Gerst <brgerst@gmail.com> wrote:
> >
> > On Sat, Oct 28, 2023 at 9:26 PM kernel test robot <lkp@intel.com> wrote:
> > >
> > > Hi Brian,
> > >
> > > kernel test robot noticed the following build errors:
> > >
> > > [auto build test ERROR on tip/master]
> > > [also build test ERROR on next-20231027]
> > > [cannot apply to tip/x86/core dennis-percpu/for-next linus/master tip/auto-latest v6.6-rc7]
> > > [If your patch is applied to the wrong git tree, kindly drop us a note.
> > > And when submitting patch, we suggest to use '--base' as documented in
> > > https://git-scm.com/docs/git-format-patch#_base_tree_information]
> > >
> > > url:    https://github.com/intel-lab-lkp/linux/commits/Brian-Gerst/x86-stackprotector-32-Remove-stack-protector-test-script/20231027-000533
> > > base:   tip/master
> > > patch link:    https://lore.kernel.org/r/20231026160100.195099-6-brgerst%40gmail.com
> > > patch subject: [PATCH v2 05/11] x86/stackprotector/64: Convert stack protector to normal percpu variable
> > > config: x86_64-rhel-8.3-rust (https://download.01.org/0day-ci/archive/20231029/202310290927.2MuJJdu9-lkp@intel.com/config)
> > > compiler: clang version 16.0.4 (https://github.com/llvm/llvm-project.git ae42196bc493ffe877a7e3dff8be32035dea4d07)
> > > reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231029/202310290927.2MuJJdu9-lkp@intel.com/reproduce)
> > >
> > > If you fix the issue in a separate patch/commit (i.e. not just a new version of
> > > the same patch/commit), kindly add following tags
> > > | Reported-by: kernel test robot <lkp@intel.com>
> > > | Closes: https://lore.kernel.org/oe-kbuild-all/202310290927.2MuJJdu9-lkp@intel.com/
> > >
> > > All errors (new ones prefixed by >>):
> > >
> > > >> Unsupported relocation type: unknown type rel type name (42)
> >
> > Clang is generating a new relocation type (R_X86_64_REX_GOTPCRELX)
> > that the relocs tool doesn't know about.  This is supposed to allow
> >         movq    __stack_chk_guard@GOTPCREL(%rip), %rax
> >         movq    %gs:(%rax), %rax
> > to be relaxed to
> >         leaq    __stack_chk_guard(%rip), %rax
> >         movq    %gs:(%rax), %rax
> >
> > But why is clang doing this instead of what GCC does?
> >         movq    %gs:__stack_chk_guard(%rip), %rax
>
> Digging a bit deeper, there also appears to be differences in how the
> linkers behave with this new relocation:
>
> make CC=clang LD=ld:
> ffffffff81002838:       48 c7 c0 c0 5c 42 83    mov    $0xffffffff83425cc0,%rax
>                         ffffffff8100283b: R_X86_64_32S  __stack_chk_guard
> ffffffff8100283f:       65 48 8b 00             mov    %gs:(%rax),%rax
>
> make CC=clang LD=ld.lld:
> ffffffff81002838:       48 8d 05 81 34 42 02    lea
> 0x2423481(%rip),%rax        # ffffffff83425cc0 <__stack_chk_guard>
>                         ffffffff8100283b: R_X86_64_REX_GOTPCRELX
>  __stack_chk_guard-0x4
> ffffffff8100283f:       65 48 8b 00             mov    %gs:(%rax),%rax
>
> The LLVM linker keeps the R_X86_64_REX_GOTPCRELX even after performing
> the relaxation.  It should be R_X86_64_32S based on it changing to an
> LEA instruction.  The GNU linker changes it to R_X86_64_32S and a MOV
> immediate.
>
> So I think there are two issues here.  1) clang is producing code
> referencing the GOT for stack protector accesses, despite -fno-PIE on
> the command line and no other GOT references, and 2) ld.lld is using
> the wrong relocation type after the relaxation step is performed.
>
> I think the quick fix here is to teach the relocs tool about this new
> relocation.  It should be able to be safely ignored since it's
> PC-relative.  The code clang produces is functionally correct,
> although not optimal.

Thanks for the report.  + Fangrui for thoughts on relocations against
__stack_chk_guard; clang has similar issues for 32b x86 as well.

>
> Brian Gerst
>
  
Brian Gerst Oct. 30, 2023, 5:19 p.m. UTC | #6
On Mon, Oct 30, 2023 at 11:24 AM Nick Desaulniers
<ndesaulniers@google.com> wrote:
>
> On Sun, Oct 29, 2023 at 10:01 AM Brian Gerst <brgerst@gmail.com> wrote:
> >
> > On Sun, Oct 29, 2023 at 2:56 AM Brian Gerst <brgerst@gmail.com> wrote:
> > >
> > > On Sat, Oct 28, 2023 at 9:26 PM kernel test robot <lkp@intel.com> wrote:
> > > >
> > > > Hi Brian,
> > > >
> > > > kernel test robot noticed the following build errors:
> > > >
> > > > [auto build test ERROR on tip/master]
> > > > [also build test ERROR on next-20231027]
> > > > [cannot apply to tip/x86/core dennis-percpu/for-next linus/master tip/auto-latest v6.6-rc7]
> > > > [If your patch is applied to the wrong git tree, kindly drop us a note.
> > > > And when submitting patch, we suggest to use '--base' as documented in
> > > > https://git-scm.com/docs/git-format-patch#_base_tree_information]
> > > >
> > > > url:    https://github.com/intel-lab-lkp/linux/commits/Brian-Gerst/x86-stackprotector-32-Remove-stack-protector-test-script/20231027-000533
> > > > base:   tip/master
> > > > patch link:    https://lore.kernel.org/r/20231026160100.195099-6-brgerst%40gmail.com
> > > > patch subject: [PATCH v2 05/11] x86/stackprotector/64: Convert stack protector to normal percpu variable
> > > > config: x86_64-rhel-8.3-rust (https://download.01.org/0day-ci/archive/20231029/202310290927.2MuJJdu9-lkp@intel.com/config)
> > > > compiler: clang version 16.0.4 (https://github.com/llvm/llvm-project.git ae42196bc493ffe877a7e3dff8be32035dea4d07)
> > > > reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231029/202310290927.2MuJJdu9-lkp@intel.com/reproduce)
> > > >
> > > > If you fix the issue in a separate patch/commit (i.e. not just a new version of
> > > > the same patch/commit), kindly add following tags
> > > > | Reported-by: kernel test robot <lkp@intel.com>
> > > > | Closes: https://lore.kernel.org/oe-kbuild-all/202310290927.2MuJJdu9-lkp@intel.com/
> > > >
> > > > All errors (new ones prefixed by >>):
> > > >
> > > > >> Unsupported relocation type: unknown type rel type name (42)
> > >
> > > Clang is generating a new relocation type (R_X86_64_REX_GOTPCRELX)
> > > that the relocs tool doesn't know about.  This is supposed to allow
> > >         movq    __stack_chk_guard@GOTPCREL(%rip), %rax
> > >         movq    %gs:(%rax), %rax
> > > to be relaxed to
> > >         leaq    __stack_chk_guard(%rip), %rax
> > >         movq    %gs:(%rax), %rax
> > >
> > > But why is clang doing this instead of what GCC does?
> > >         movq    %gs:__stack_chk_guard(%rip), %rax
> >
> > Digging a bit deeper, there also appears to be differences in how the
> > linkers behave with this new relocation:
> >
> > make CC=clang LD=ld:
> > ffffffff81002838:       48 c7 c0 c0 5c 42 83    mov    $0xffffffff83425cc0,%rax
> >                         ffffffff8100283b: R_X86_64_32S  __stack_chk_guard
> > ffffffff8100283f:       65 48 8b 00             mov    %gs:(%rax),%rax
> >
> > make CC=clang LD=ld.lld:
> > ffffffff81002838:       48 8d 05 81 34 42 02    lea
> > 0x2423481(%rip),%rax        # ffffffff83425cc0 <__stack_chk_guard>
> >                         ffffffff8100283b: R_X86_64_REX_GOTPCRELX
> >  __stack_chk_guard-0x4
> > ffffffff8100283f:       65 48 8b 00             mov    %gs:(%rax),%rax
> >
> > The LLVM linker keeps the R_X86_64_REX_GOTPCRELX even after performing
> > the relaxation.  It should be R_X86_64_32S based on it changing to an
> > LEA instruction.  The GNU linker changes it to R_X86_64_32S and a MOV
> > immediate.

Correction:  It should be R_X86_64_PC32 for the LEA instruction.

Brian Gerst
  
Fangrui Song Nov. 1, 2023, 9:21 p.m. UTC | #7
On Mon, Oct 30, 2023 at 10:19 AM Brian Gerst <brgerst@gmail.com> wrote:
>
> On Mon, Oct 30, 2023 at 11:24 AM Nick Desaulniers
> <ndesaulniers@google.com> wrote:
> >
> > On Sun, Oct 29, 2023 at 10:01 AM Brian Gerst <brgerst@gmail.com> wrote:
> > >
> > > On Sun, Oct 29, 2023 at 2:56 AM Brian Gerst <brgerst@gmail.com> wrote:
> > > >
> > > > On Sat, Oct 28, 2023 at 9:26 PM kernel test robot <lkp@intel.com> wrote:
> > > > >
> > > > > Hi Brian,
> > > > >
> > > > > kernel test robot noticed the following build errors:
> > > > >
> > > > > [auto build test ERROR on tip/master]
> > > > > [also build test ERROR on next-20231027]
> > > > > [cannot apply to tip/x86/core dennis-percpu/for-next linus/master tip/auto-latest v6.6-rc7]
> > > > > [If your patch is applied to the wrong git tree, kindly drop us a note.
> > > > > And when submitting patch, we suggest to use '--base' as documented in
> > > > > https://git-scm.com/docs/git-format-patch#_base_tree_information]
> > > > >
> > > > > url:    https://github.com/intel-lab-lkp/linux/commits/Brian-Gerst/x86-stackprotector-32-Remove-stack-protector-test-script/20231027-000533
> > > > > base:   tip/master
> > > > > patch link:    https://lore.kernel.org/r/20231026160100.195099-6-brgerst%40gmail.com
> > > > > patch subject: [PATCH v2 05/11] x86/stackprotector/64: Convert stack protector to normal percpu variable
> > > > > config: x86_64-rhel-8.3-rust (https://download.01.org/0day-ci/archive/20231029/202310290927.2MuJJdu9-lkp@intel.com/config)
> > > > > compiler: clang version 16.0.4 (https://github.com/llvm/llvm-project.git ae42196bc493ffe877a7e3dff8be32035dea4d07)
> > > > > reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231029/202310290927.2MuJJdu9-lkp@intel.com/reproduce)
> > > > >
> > > > > If you fix the issue in a separate patch/commit (i.e. not just a new version of
> > > > > the same patch/commit), kindly add following tags
> > > > > | Reported-by: kernel test robot <lkp@intel.com>
> > > > > | Closes: https://lore.kernel.org/oe-kbuild-all/202310290927.2MuJJdu9-lkp@intel.com/
> > > > >
> > > > > All errors (new ones prefixed by >>):
> > > > >
> > > > > >> Unsupported relocation type: unknown type rel type name (42)
> > > >
> > > > Clang is generating a new relocation type (R_X86_64_REX_GOTPCRELX)
> > > > that the relocs tool doesn't know about.  This is supposed to allow
> > > >         movq    __stack_chk_guard@GOTPCREL(%rip), %rax
> > > >         movq    %gs:(%rax), %rax
> > > > to be relaxed to
> > > >         leaq    __stack_chk_guard(%rip), %rax
> > > >         movq    %gs:(%rax), %rax
> > > >
> > > > But why is clang doing this instead of what GCC does?
> > > >         movq    %gs:__stack_chk_guard(%rip), %rax

https://github.com/llvm/llvm-project/issues/60116 has some discussions
on this topic.

clang-16 -fno-pic -fstack-protector -mstack-protector-guard-reg=gs
-mstack-protector-guard-symbol=__stack_chk_guard
uses a GOT-generating relocation for __stack_chk_guard. This avoids a
copy relocation for userspace but the kernel seems to really want an
absolute relocation,
so https://reviews.llvm.org/D150841 (milestone: clang 17) has implemented it.

> If an `R_X86_64_32` relocation is used and `__stack_chk_guard` is defined by a shared object, copy relocation. We will need an ELF hack called [copy relocation](https://maskray.me/blog/2021-01-09-copy-relocations-canonical-plt-entries-and-protected).

> > > Digging a bit deeper, there also appears to be differences in how the
> > > linkers behave with this new relocation:
> > >
> > > make CC=clang LD=ld:
> > > ffffffff81002838:       48 c7 c0 c0 5c 42 83    mov    $0xffffffff83425cc0,%rax
> > >                         ffffffff8100283b: R_X86_64_32S  __stack_chk_guard
> > > ffffffff8100283f:       65 48 8b 00             mov    %gs:(%rax),%rax
> > >
> > > make CC=clang LD=ld.lld:
> > > ffffffff81002838:       48 8d 05 81 34 42 02    lea
> > > 0x2423481(%rip),%rax        # ffffffff83425cc0 <__stack_chk_guard>
> > >                         ffffffff8100283b: R_X86_64_REX_GOTPCRELX
> > >  __stack_chk_guard-0x4
> > > ffffffff8100283f:       65 48 8b 00             mov    %gs:(%rax),%rax
> > >
> > > The LLVM linker keeps the R_X86_64_REX_GOTPCRELX even after performing
> > > the relaxation.  It should be R_X86_64_32S based on it changing to an
> > > LEA instruction.  The GNU linker changes it to R_X86_64_32S and a MOV
> > > immediate.
>
> Correction:  It should be R_X86_64_PC32 for the LEA instruction.
>
> Brian Gerst

Whether --emit-relocs converts the original relocation type is debatable.
I have some comments on a similar topic on RISC-V:
https://sourceware.org/bugzilla/show_bug.cgi?id=30844#c6

> So it seems that ppc performed conversion can all be described by existing relocation types, which is nice.
>
> However, I do not know whether the property will hold for all current and future RISC-V relaxation schemes.
>
> When investigating relocation overflow pressure for x86-64 small code model, I have found that preserving the original relocation type gives me more information: I can tell how
many R_X86_64_PC32/R_X86_64_GOTPCRELX/R_X86_64_REX_GOTPCRELX are
problematic. If they are converted to R_X86_64_PC32/R_X86_64_32, I'd
lose some information.
>
> Perhaps whether the --emit-relocs uses the original relocation type or the transformed relocation type , does not matter for the majority of use cases. E.g. Linux kernel's objtool, seems to perform a sanity check on relocations. It just needs to know the categories of relocations, e.g. absolute/PC-relative, not the exact type.
  

Patch

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 92144c6f26d2..c95e0ce557da 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -410,12 +410,11 @@  config PGTABLE_LEVELS
 
 config CC_HAS_SANE_STACKPROTECTOR
 	bool
-	default y if 64BIT
+	default $(cc-option,-mstack-protector-guard-reg=gs -mstack-protector-guard-symbol=__stack_chk_guard) if 64BIT
 	default $(cc-option,-mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard)
 	help
 	  We have to make sure stack protector is unconditionally disabled if
-	  the compiler produces broken code or if it does not let us control
-	  the segment on 32-bit kernels.
+	  the compiler does not allow control of the segment and symbol.
 
 menu "Processor type and features"
 
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 22e41d9dbc23..6ab8b4419f41 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -111,13 +111,7 @@  ifeq ($(CONFIG_X86_32),y)
         # temporary until string.h is fixed
         KBUILD_CFLAGS += -ffreestanding
 
-	ifeq ($(CONFIG_STACKPROTECTOR),y)
-		ifeq ($(CONFIG_SMP),y)
-			KBUILD_CFLAGS += -mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard
-		else
-			KBUILD_CFLAGS += -mstack-protector-guard=global
-		endif
-	endif
+	percpu_seg := fs
 else
         BITS := 64
         UTS_MACHINE := x86_64
@@ -167,6 +161,17 @@  else
         KBUILD_CFLAGS += -mcmodel=kernel
         KBUILD_RUSTFLAGS += -Cno-redzone=y
         KBUILD_RUSTFLAGS += -Ccode-model=kernel
+
+	percpu_seg := gs
+endif
+
+ifeq ($(CONFIG_STACKPROTECTOR),y)
+	ifeq ($(CONFIG_SMP),y)
+		KBUILD_CFLAGS += -mstack-protector-guard-reg=$(percpu_seg)
+		KBUILD_CFLAGS += -mstack-protector-guard-symbol=__stack_chk_guard
+	else
+		KBUILD_CFLAGS += -mstack-protector-guard=global
+	endif
 endif
 
 #
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 1a88ad8a7b48..cddcc236aaae 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -190,7 +190,7 @@  SYM_FUNC_START(__switch_to_asm)
 
 #ifdef CONFIG_STACKPROTECTOR
 	movq	TASK_stack_canary(%rsi), %rbx
-	movq	%rbx, PER_CPU_VAR(fixed_percpu_data + FIXED_stack_canary)
+	movq	%rbx, PER_CPU_VAR(__stack_chk_guard)
 #endif
 
 	/*
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 4b130d894cb6..2b6531d90273 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -394,16 +394,7 @@  struct irq_stack {
 
 #ifdef CONFIG_X86_64
 struct fixed_percpu_data {
-	/*
-	 * GCC hardcodes the stack canary as %gs:40.  Since the
-	 * irq_stack is the object at %gs:0, we reserve the bottom
-	 * 48 bytes of the irq stack for the canary.
-	 *
-	 * Once we are willing to require -mstack-protector-guard-symbol=
-	 * support for x86_64 stackprotector, we can get rid of this.
-	 */
 	char		gs_base[40];
-	unsigned long	stack_canary;
 };
 
 DECLARE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __visible;
@@ -418,11 +409,7 @@  extern asmlinkage void entry_SYSCALL32_ignore(void);
 
 /* Save actual FS/GS selectors and bases to current->thread */
 void current_save_fsgs(void);
-#else	/* X86_64 */
-#ifdef CONFIG_STACKPROTECTOR
-DECLARE_PER_CPU(unsigned long, __stack_chk_guard);
-#endif
-#endif	/* !X86_64 */
+#endif /* X86_64 */
 
 struct perf_event;
 
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index 00473a650f51..33abbd29ea26 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -2,26 +2,13 @@ 
 /*
  * GCC stack protector support.
  *
- * Stack protector works by putting predefined pattern at the start of
+ * Stack protector works by putting a predefined pattern at the start of
  * the stack frame and verifying that it hasn't been overwritten when
- * returning from the function.  The pattern is called stack canary
- * and unfortunately gcc historically required it to be at a fixed offset
- * from the percpu segment base.  On x86_64, the offset is 40 bytes.
+ * returning from the function.  The pattern is called the stack canary
+ * and is a unique value for each task.
  *
- * The same segment is shared by percpu area and stack canary.  On
- * x86_64, percpu symbols are zero based and %gs (64-bit) points to the
- * base of percpu area.  The first occupant of the percpu area is always
- * fixed_percpu_data which contains stack_canary at the appropriate
- * offset.  On x86_32, the stack canary is just a regular percpu
- * variable.
- *
- * Putting percpu data in %fs on 32-bit is a minor optimization compared to
- * using %gs.  Since 32-bit userspace normally has %fs == 0, we are likely
- * to load 0 into %fs on exit to usermode, whereas with percpu data in
- * %gs, we are likely to load a non-null %gs on return to user mode.
- *
- * Once we are willing to require GCC 8.1 or better for 64-bit stackprotector
- * support, we can remove some of this complexity.
+ * GCC is configured to read the stack canary value from the __stack_chk_guard
+ * per-cpu variable, which is changed on task switch.
  */
 
 #ifndef _ASM_STACKPROTECTOR_H
@@ -36,6 +23,8 @@ 
 
 #include <linux/sched.h>
 
+DECLARE_PER_CPU(unsigned long, __stack_chk_guard);
+
 /*
  * Initialize the stackprotector canary value.
  *
@@ -51,25 +40,13 @@  static __always_inline void boot_init_stack_canary(void)
 {
 	unsigned long canary = get_random_canary();
 
-#ifdef CONFIG_X86_64
-	BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40);
-#endif
-
 	current->stack_canary = canary;
-#ifdef CONFIG_X86_64
-	this_cpu_write(fixed_percpu_data.stack_canary, canary);
-#else
 	this_cpu_write(__stack_chk_guard, canary);
-#endif
 }
 
 static inline void cpu_init_stack_canary(int cpu, struct task_struct *idle)
 {
-#ifdef CONFIG_X86_64
-	per_cpu(fixed_percpu_data.stack_canary, cpu) = idle->stack_canary;
-#else
 	per_cpu(__stack_chk_guard, cpu) = idle->stack_canary;
-#endif
 }
 
 #else	/* STACKPROTECTOR */
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index bb65371ea9df..590b6cd0eac0 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -54,11 +54,5 @@  int main(void)
 	BLANK();
 #undef ENTRY
 
-	BLANK();
-
-#ifdef CONFIG_STACKPROTECTOR
-	OFFSET(FIXED_stack_canary, fixed_percpu_data, stack_canary);
-	BLANK();
-#endif
 	return 0;
 }
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 9058da9ae011..fb8f0371ffc3 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -2104,15 +2104,13 @@  void syscall_init(void)
 	       X86_EFLAGS_AC|X86_EFLAGS_ID);
 }
 
-#else	/* CONFIG_X86_64 */
+#endif	/* CONFIG_X86_64 */
 
 #ifdef CONFIG_STACKPROTECTOR
 DEFINE_PER_CPU(unsigned long, __stack_chk_guard);
 EXPORT_PER_CPU_SYMBOL(__stack_chk_guard);
 #endif
 
-#endif	/* CONFIG_X86_64 */
-
 /*
  * Clear all 6 debug registers:
  */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 3dcabbc49149..0d94d2a091fe 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -345,8 +345,7 @@  SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 
 	/* Set up %gs.
 	 *
-	 * The base of %gs always points to fixed_percpu_data. If the
-	 * stack protector canary is enabled, it is located at %gs:40.
+	 * The base of %gs always points to fixed_percpu_data.
 	 * Note that, on SMP, the boot cpu uses init data section until
 	 * the per cpu areas are set up.
 	 */
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index a0ea285878db..30f27e757354 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -53,8 +53,7 @@  SYM_CODE_START(startup_xen)
 
 	/* Set up %gs.
 	 *
-	 * The base of %gs always points to fixed_percpu_data.  If the
-	 * stack protector canary is enabled, it is located at %gs:40.
+	 * The base of %gs always points to fixed_percpu_data.
 	 * Note that, on SMP, the boot cpu uses init data section until
 	 * the per cpu areas are set up.
 	 */