[RFC,31/43] x86/modules: Adapt module loading for PIE support

Message ID c79285bfa4450fd5ad160ddd4919ac9ad826de04.1682673543.git.houwenlong.hwl@antgroup.com
State New
Headers
Series x86/pie: Make kernel image's virtual address flexible |

Commit Message

Hou Wenlong April 28, 2023, 9:51 a.m. UTC
  Adapt module loading to support PIE relocations. No GOT is generared for
module, all the GOT entry of got references in module should exist in
kernel GOT.  Currently, there is only one usable got reference for
__fentry__().

Signed-off-by: Hou Wenlong <houwenlong.hwl@antgroup.com>
Cc: Thomas Garnier <thgarnie@chromium.org>
Cc: Lai Jiangshan <jiangshan.ljs@antgroup.com>
Cc: Kees Cook <keescook@chromium.org>
---
 arch/x86/include/asm/sections.h |  5 +++++
 arch/x86/kernel/module.c        | 27 +++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)
  

Comments

Ard Biesheuvel April 28, 2023, 7:29 p.m. UTC | #1
On Fri, 28 Apr 2023 at 10:53, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
>
> Adapt module loading to support PIE relocations. No GOT is generared for
> module, all the GOT entry of got references in module should exist in
> kernel GOT.  Currently, there is only one usable got reference for
> __fentry__().
>

I don't think this is the right approach. We should permit GOTPCREL
relocations properly, which means making them point to a location in
memory that carries the absolute address of the symbol. There are
several ways to go about that, but perhaps the simplest way is to make
the symbol address in ksymtab a 64-bit absolute value (but retain the
PC32 references for the symbol name and the symbol namespace name).
That way, you can always resolve such GOTPCREL relocations by pointing
it to the ksymtab entry. Another option would be to take inspiration
from the PLT code we have on ARM and arm64 (and other architectures,
surely) and to count the GOT based relocations, allocate some extra
r/o module space for each, and allocate slots and populate them with
the right value as you fix up the relocations.

Then, many such relocations can be relaxed at module load time if the
symbol is in range. IIUC, the module and kernel will still be inside
the same 2G window even after widening the KASLR range to 512G, so
most GOT loads can be converted into RIP relative LEA instructions.

Note that this will also permit you to do things like

#define PV_VCPU_PREEMPTED_ASM \
 "leaq __per_cpu_offset(%rip), %rax \n\t" \
 "movq (%rax,%rdi,8), %rax \n\t" \
 "addq steal_time@GOTPCREL(%rip), %rax \n\t" \
 "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "(%rax) \n\t" \
 "setne %al\n\t"

or

+#ifdef CONFIG_X86_PIE
+ " pushq arch_rethook_trampoline@GOTPCREL(%rip)\n"
+#else
" pushq $arch_rethook_trampoline\n"
+#endif

instead of having these kludgy push/pop sequences to free up temp registers.

(FYI I have looked into this PIE linking just a few weeks ago [0] so
this is all rather fresh in my memory)




[0] https://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git/log/?h=x86-pie


> Signed-off-by: Hou Wenlong <houwenlong.hwl@antgroup.com>
> Cc: Thomas Garnier <thgarnie@chromium.org>
> Cc: Lai Jiangshan <jiangshan.ljs@antgroup.com>
> Cc: Kees Cook <keescook@chromium.org>
> ---
>  arch/x86/include/asm/sections.h |  5 +++++
>  arch/x86/kernel/module.c        | 27 +++++++++++++++++++++++++++
>  2 files changed, 32 insertions(+)
>
> diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
> index a6e8373a5170..dc1c2b08ec48 100644
> --- a/arch/x86/include/asm/sections.h
> +++ b/arch/x86/include/asm/sections.h
> @@ -12,6 +12,11 @@ extern char __end_rodata_aligned[];
>
>  #if defined(CONFIG_X86_64)
>  extern char __end_rodata_hpage_align[];
> +
> +#ifdef CONFIG_X86_PIE
> +extern char __start_got[], __end_got[];
> +#endif
> +
>  #endif
>
>  extern char __end_of_kernel_reserve[];
> diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
> index 84ad0e61ba6e..051f88e6884e 100644
> --- a/arch/x86/kernel/module.c
> +++ b/arch/x86/kernel/module.c
> @@ -129,6 +129,18 @@ int apply_relocate(Elf32_Shdr *sechdrs,
>         return 0;
>  }
>  #else /*X86_64*/
> +#ifdef CONFIG_X86_PIE
> +static u64 find_got_kernel_entry(Elf64_Sym *sym, const Elf64_Rela *rela)
> +{
> +       u64 *pos;
> +
> +       for (pos = (u64 *)__start_got; pos < (u64 *)__end_got; pos++)
> +               if (*pos == sym->st_value)
> +                       return (u64)pos + rela->r_addend;
> +       return 0;
> +}
> +#endif
> +
>  static int __write_relocate_add(Elf64_Shdr *sechdrs,
>                    const char *strtab,
>                    unsigned int symindex,
> @@ -171,6 +183,7 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
>                 case R_X86_64_64:
>                         size = 8;
>                         break;
> +#ifndef CONFIG_X86_PIE
>                 case R_X86_64_32:
>                         if (val != *(u32 *)&val)
>                                 goto overflow;
> @@ -181,6 +194,13 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
>                                 goto overflow;
>                         size = 4;
>                         break;
> +#else
> +               case R_X86_64_GOTPCREL:
> +                       val = find_got_kernel_entry(sym, rel);
> +                       if (!val)
> +                               goto unexpected_got_reference;
> +                       fallthrough;
> +#endif
>                 case R_X86_64_PC32:
>                 case R_X86_64_PLT32:
>                         val -= (u64)loc;
> @@ -214,11 +234,18 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
>         }
>         return 0;
>
> +#ifdef CONFIG_X86_PIE
> +unexpected_got_reference:
> +       pr_err("Target got entry doesn't exist in kernel got, loc %p\n", loc);
> +       return -ENOEXEC;
> +#else
>  overflow:
>         pr_err("overflow in relocation type %d val %Lx\n",
>                (int)ELF64_R_TYPE(rel[i].r_info), val);
>         pr_err("`%s' likely not compiled with -mcmodel=kernel\n",
>                me->name);
> +#endif
> +
>         return -ENOEXEC;
>  }
>
> --
> 2.31.1
>
  
Hou Wenlong May 8, 2023, 8:32 a.m. UTC | #2
On Sat, Apr 29, 2023 at 03:29:32AM +0800, Ard Biesheuvel wrote:
> On Fri, 28 Apr 2023 at 10:53, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
> >
> > Adapt module loading to support PIE relocations. No GOT is generared for
> > module, all the GOT entry of got references in module should exist in
> > kernel GOT.  Currently, there is only one usable got reference for
> > __fentry__().
> >
> 
> I don't think this is the right approach. We should permit GOTPCREL
> relocations properly, which means making them point to a location in
> memory that carries the absolute address of the symbol. There are
> several ways to go about that, but perhaps the simplest way is to make
> the symbol address in ksymtab a 64-bit absolute value (but retain the
> PC32 references for the symbol name and the symbol namespace name).
> That way, you can always resolve such GOTPCREL relocations by pointing
> it to the ksymtab entry. Another option would be to take inspiration
> from the PLT code we have on ARM and arm64 (and other architectures,
> surely) and to count the GOT based relocations, allocate some extra
> r/o module space for each, and allocate slots and populate them with
> the right value as you fix up the relocations.
> 
> Then, many such relocations can be relaxed at module load time if the
> symbol is in range. IIUC, the module and kernel will still be inside
> the same 2G window even after widening the KASLR range to 512G, so
> most GOT loads can be converted into RIP relative LEA instructions.
> 
> Note that this will also permit you to do things like
> 
> #define PV_VCPU_PREEMPTED_ASM \
>  "leaq __per_cpu_offset(%rip), %rax \n\t" \
>  "movq (%rax,%rdi,8), %rax \n\t" \
>  "addq steal_time@GOTPCREL(%rip), %rax \n\t" \
>  "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "(%rax) \n\t" \
>  "setne %al\n\t"
> 
> or
> 
> +#ifdef CONFIG_X86_PIE
> + " pushq arch_rethook_trampoline@GOTPCREL(%rip)\n"
> +#else
> " pushq $arch_rethook_trampoline\n"
> +#endif
> 
> instead of having these kludgy push/pop sequences to free up temp registers.
> 
> (FYI I have looked into this PIE linking just a few weeks ago [0] so
> this is all rather fresh in my memory)
> 
> 
> 
> 
> [0] https://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git/log/?h=x86-pie
> 
>
Hi Ard,
Thanks for providing the link, it has been very helpful for me as I am
new to the topic of compilers. One key difference I noticed is that you
linked the kernel with "-pie" instead of "--emit-reloc". I also noticed
that Thomas' initial patchset[0] used "-pie", but in RFC v3 [1], it
switched to "--emit-reloc" in order to reduce dynamic relocation space
on mapped memory.

The another issue is that it requires the addition of the
"-mrelax-relocations=no" option to support older compilers and linkers.
R_X86_64_GOTPCRELX and R_X86_64_REX_GOTPCRELX relocations are supported
in binutils 2.26 and later, but the mini version required for the kernel
is 2.25. This option disables relocation relaxation, which makes GOT not
empty. I also noticed this option in arch/x86/boot/compressed/Makefile
with the reason given in [2]. Without relocation relaxation, GOT
references would increase the size of GOT. Therefore, I do not want to
use GOT reference in assembly directly.  However, I realized that the
compiler could still generate GOT references in some cases such as
"fentry" calls and stack canary references.

Regarding module loading, I agree that we should support GOT reference
for the module itself. I will refactor it according to your suggestion.

Thanks.

[0] https://yhbt.net/lore/all/20170718223333.110371-20-thgarnie@google.com
[1] https://yhbt.net/lore/all/20171004212003.28296-1-thgarnie@google.com
[2] https://lore.kernel.org/all/20200903203053.3411268-2-samitolvanen@google.com/

> > Signed-off-by: Hou Wenlong <houwenlong.hwl@antgroup.com>
> > Cc: Thomas Garnier <thgarnie@chromium.org>
> > Cc: Lai Jiangshan <jiangshan.ljs@antgroup.com>
> > Cc: Kees Cook <keescook@chromium.org>
> > ---
> >  arch/x86/include/asm/sections.h |  5 +++++
> >  arch/x86/kernel/module.c        | 27 +++++++++++++++++++++++++++
> >  2 files changed, 32 insertions(+)
> >
> > diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
> > index a6e8373a5170..dc1c2b08ec48 100644
> > --- a/arch/x86/include/asm/sections.h
> > +++ b/arch/x86/include/asm/sections.h
> > @@ -12,6 +12,11 @@ extern char __end_rodata_aligned[];
> >
> >  #if defined(CONFIG_X86_64)
> >  extern char __end_rodata_hpage_align[];
> > +
> > +#ifdef CONFIG_X86_PIE
> > +extern char __start_got[], __end_got[];
> > +#endif
> > +
> >  #endif
> >
> >  extern char __end_of_kernel_reserve[];
> > diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
> > index 84ad0e61ba6e..051f88e6884e 100644
> > --- a/arch/x86/kernel/module.c
> > +++ b/arch/x86/kernel/module.c
> > @@ -129,6 +129,18 @@ int apply_relocate(Elf32_Shdr *sechdrs,
> >         return 0;
> >  }
> >  #else /*X86_64*/
> > +#ifdef CONFIG_X86_PIE
> > +static u64 find_got_kernel_entry(Elf64_Sym *sym, const Elf64_Rela *rela)
> > +{
> > +       u64 *pos;
> > +
> > +       for (pos = (u64 *)__start_got; pos < (u64 *)__end_got; pos++)
> > +               if (*pos == sym->st_value)
> > +                       return (u64)pos + rela->r_addend;
> > +       return 0;
> > +}
> > +#endif
> > +
> >  static int __write_relocate_add(Elf64_Shdr *sechdrs,
> >                    const char *strtab,
> >                    unsigned int symindex,
> > @@ -171,6 +183,7 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
> >                 case R_X86_64_64:
> >                         size = 8;
> >                         break;
> > +#ifndef CONFIG_X86_PIE
> >                 case R_X86_64_32:
> >                         if (val != *(u32 *)&val)
> >                                 goto overflow;
> > @@ -181,6 +194,13 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
> >                                 goto overflow;
> >                         size = 4;
> >                         break;
> > +#else
> > +               case R_X86_64_GOTPCREL:
> > +                       val = find_got_kernel_entry(sym, rel);
> > +                       if (!val)
> > +                               goto unexpected_got_reference;
> > +                       fallthrough;
> > +#endif
> >                 case R_X86_64_PC32:
> >                 case R_X86_64_PLT32:
> >                         val -= (u64)loc;
> > @@ -214,11 +234,18 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
> >         }
> >         return 0;
> >
> > +#ifdef CONFIG_X86_PIE
> > +unexpected_got_reference:
> > +       pr_err("Target got entry doesn't exist in kernel got, loc %p\n", loc);
> > +       return -ENOEXEC;
> > +#else
> >  overflow:
> >         pr_err("overflow in relocation type %d val %Lx\n",
> >                (int)ELF64_R_TYPE(rel[i].r_info), val);
> >         pr_err("`%s' likely not compiled with -mcmodel=kernel\n",
> >                me->name);
> > +#endif
> > +
> >         return -ENOEXEC;
> >  }
> >
> > --
> > 2.31.1
> >
  
Ard Biesheuvel May 8, 2023, 9:16 a.m. UTC | #3
On Mon, 8 May 2023 at 10:38, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
>
> On Sat, Apr 29, 2023 at 03:29:32AM +0800, Ard Biesheuvel wrote:
> > On Fri, 28 Apr 2023 at 10:53, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
> > >
> > > Adapt module loading to support PIE relocations. No GOT is generared for
> > > module, all the GOT entry of got references in module should exist in
> > > kernel GOT.  Currently, there is only one usable got reference for
> > > __fentry__().
> > >
> >
> > I don't think this is the right approach. We should permit GOTPCREL
> > relocations properly, which means making them point to a location in
> > memory that carries the absolute address of the symbol. There are
> > several ways to go about that, but perhaps the simplest way is to make
> > the symbol address in ksymtab a 64-bit absolute value (but retain the
> > PC32 references for the symbol name and the symbol namespace name).
> > That way, you can always resolve such GOTPCREL relocations by pointing
> > it to the ksymtab entry. Another option would be to take inspiration
> > from the PLT code we have on ARM and arm64 (and other architectures,
> > surely) and to count the GOT based relocations, allocate some extra
> > r/o module space for each, and allocate slots and populate them with
> > the right value as you fix up the relocations.
> >
> > Then, many such relocations can be relaxed at module load time if the
> > symbol is in range. IIUC, the module and kernel will still be inside
> > the same 2G window even after widening the KASLR range to 512G, so
> > most GOT loads can be converted into RIP relative LEA instructions.
> >
> > Note that this will also permit you to do things like
> >
> > #define PV_VCPU_PREEMPTED_ASM \
> >  "leaq __per_cpu_offset(%rip), %rax \n\t" \
> >  "movq (%rax,%rdi,8), %rax \n\t" \
> >  "addq steal_time@GOTPCREL(%rip), %rax \n\t" \
> >  "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "(%rax) \n\t" \
> >  "setne %al\n\t"
> >
> > or
> >
> > +#ifdef CONFIG_X86_PIE
> > + " pushq arch_rethook_trampoline@GOTPCREL(%rip)\n"
> > +#else
> > " pushq $arch_rethook_trampoline\n"
> > +#endif
> >
> > instead of having these kludgy push/pop sequences to free up temp registers.
> >
> > (FYI I have looked into this PIE linking just a few weeks ago [0] so
> > this is all rather fresh in my memory)
> >
> >
> >
> >
> > [0] https://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git/log/?h=x86-pie
> >
> >
> Hi Ard,
> Thanks for providing the link, it has been very helpful for me as I am
> new to the topic of compilers.

Happy to hear that.

> One key difference I noticed is that you
> linked the kernel with "-pie" instead of "--emit-reloc". I also noticed
> that Thomas' initial patchset[0] used "-pie", but in RFC v3 [1], it
> switched to "--emit-reloc" in order to reduce dynamic relocation space
> on mapped memory.
>

The problem with --emit-relocs is that the relocations emitted into
the binary may get out of sync with the actual code after the linker
has applied relocations.

$ cat /tmp/a.s
foo:movq foo@GOTPCREL(%rip), %rax

$ x86_64-linux-gnu-gcc -c -o /tmp/a.o /tmp/a.s
ard@gambale:~/linux$ x86_64-linux-gnu-objdump -dr /tmp/a.o

/tmp/a.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <foo>:
   0: 48 8b 05 00 00 00 00 mov    0x0(%rip),%rax        # 7 <foo+0x7>
3: R_X86_64_REX_GOTPCRELX foo-0x4

$ x86_64-linux-gnu-gcc -c -o /tmp/a.o /tmp/a.s
$ x86_64-linux-gnu-objdump -dr /tmp/a.o
0000000000000000 <foo>:
   0: 48 8b 05 00 00 00 00 mov    0x0(%rip),%rax        # 7 <foo+0x7>
3: R_X86_64_REX_GOTPCRELX foo-0x4

$ x86_64-linux-gnu-gcc -o /tmp/a.elf -nostartfiles
-Wl,-no-pie,-q,--defsym,_start=0x0 /tmp/a.s
$ x86_64-linux-gnu-objdump -dr /tmp/a.elf
0000000000401000 <foo>:
  401000: 48 c7 c0 00 10 40 00 mov    $0x401000,%rax
401003: R_X86_64_32S foo

$ x86_64-linux-gnu-gcc -o /tmp/a.elf -nostartfiles
-Wl,-q,--defsym,_start=0x0 /tmp/a.s
$ x86_64-linux-gnu-objdump -dr /tmp/a.elf
0000000000001000 <foo>:
    1000: 48 8d 05 f9 ff ff ff lea    -0x7(%rip),%rax        # 1000 <foo>
1003: R_X86_64_PC32 foo-0x4

This all looks as expected. However, when using Clang, we end up with

$ clang -target x86_64-linux-gnu -o /tmp/a.elf -nostartfiles
-fuse-ld=lld -Wl,--relax,-q,--defsym,_start=0x0 /tmp/a.s
$ x86_64-linux-gnu-objdump -dr /tmp/a.elf
00000000000012c0 <foo>:
    12c0: 48 8d 05 f9 ff ff ff lea    -0x7(%rip),%rax        # 12c0 <foo>
12c3: R_X86_64_REX_GOTPCRELX foo-0x4

So in this case, what --emit-relocs gives us is not what is actually
in the binary. We cannot just ignore these either, given that they are
treated differently depending on whether the symbol is a per-CPU
symbol or not - in the former case, we need to perform a fixup if the
relaxed reference is RIP relative, and in the latter case, if the
relaxed reference is absolute.

On top of that, --emit-relocs does not cover the GOT, so we'd still
need to process that from the code explicitly.

In general, relying on --emit-relocs is kind of dodgy, and I think
combining PIE linking with --emit-relocs is a bad idea.

> The another issue is that it requires the addition of the
> "-mrelax-relocations=no" option to support older compilers and linkers.

Why? The decompressor is now linked in PIE mode so we should be able
to drop that. Or do you need to add is somewhere else?

> R_X86_64_GOTPCRELX and R_X86_64_REX_GOTPCRELX relocations are supported
> in binutils 2.26 and later, but the mini version required for the kernel
> is 2.25. This option disables relocation relaxation, which makes GOT not
> empty. I also noticed this option in arch/x86/boot/compressed/Makefile
> with the reason given in [2]. Without relocation relaxation, GOT
> references would increase the size of GOT. Therefore, I do not want to
> use GOT reference in assembly directly.  However, I realized that the
> compiler could still generate GOT references in some cases such as
> "fentry" calls and stack canary references.
>

The stack canary references are under discussion here [3]. I have also
sent a patch for kallsyms symbol references [4]. Beyond that, there
should be very few cases where GOT entries are emitted, so I don't
think this is fundamentally a problem.

I haven't run into the __fentry__ issue myself: do you think we should
fix this in the compiler?

> Regarding module loading, I agree that we should support GOT reference
> for the module itself. I will refactor it according to your suggestion.
>

Excellent, good luck with that.

However, you will still need to make a convincing case for why this is
all worth the trouble. Especially given that you disable the depth
tracking code, which I don't think should be mutually exclusive.

I am aware that this a rather tricky, and involves rewriting
RIP-relative per-CPU variable accesses, but it would be good to get a
discussion started on that topic, and figure out whether there is a
way forward there. Ignoring it is not going to help.


>
> [0] https://yhbt.net/lore/all/20170718223333.110371-20-thgarnie@google.com
> [1] https://yhbt.net/lore/all/20171004212003.28296-1-thgarnie@google.com
> [2] https://lore.kernel.org/all/20200903203053.3411268-2-samitolvanen@google.com/
>

[3] https://github.com/llvm/llvm-project/issues/60116
[4] 20230504174320.3930345-1-ardb@kernel.org

> > > Signed-off-by: Hou Wenlong <houwenlong.hwl@antgroup.com>
> > > Cc: Thomas Garnier <thgarnie@chromium.org>
> > > Cc: Lai Jiangshan <jiangshan.ljs@antgroup.com>
> > > Cc: Kees Cook <keescook@chromium.org>
> > > ---
> > >  arch/x86/include/asm/sections.h |  5 +++++
> > >  arch/x86/kernel/module.c        | 27 +++++++++++++++++++++++++++
> > >  2 files changed, 32 insertions(+)
> > >
> > > diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
> > > index a6e8373a5170..dc1c2b08ec48 100644
> > > --- a/arch/x86/include/asm/sections.h
> > > +++ b/arch/x86/include/asm/sections.h
> > > @@ -12,6 +12,11 @@ extern char __end_rodata_aligned[];
> > >
> > >  #if defined(CONFIG_X86_64)
> > >  extern char __end_rodata_hpage_align[];
> > > +
> > > +#ifdef CONFIG_X86_PIE
> > > +extern char __start_got[], __end_got[];
> > > +#endif
> > > +
> > >  #endif
> > >
> > >  extern char __end_of_kernel_reserve[];
> > > diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
> > > index 84ad0e61ba6e..051f88e6884e 100644
> > > --- a/arch/x86/kernel/module.c
> > > +++ b/arch/x86/kernel/module.c
> > > @@ -129,6 +129,18 @@ int apply_relocate(Elf32_Shdr *sechdrs,
> > >         return 0;
> > >  }
> > >  #else /*X86_64*/
> > > +#ifdef CONFIG_X86_PIE
> > > +static u64 find_got_kernel_entry(Elf64_Sym *sym, const Elf64_Rela *rela)
> > > +{
> > > +       u64 *pos;
> > > +
> > > +       for (pos = (u64 *)__start_got; pos < (u64 *)__end_got; pos++)
> > > +               if (*pos == sym->st_value)
> > > +                       return (u64)pos + rela->r_addend;
> > > +       return 0;
> > > +}
> > > +#endif
> > > +
> > >  static int __write_relocate_add(Elf64_Shdr *sechdrs,
> > >                    const char *strtab,
> > >                    unsigned int symindex,
> > > @@ -171,6 +183,7 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
> > >                 case R_X86_64_64:
> > >                         size = 8;
> > >                         break;
> > > +#ifndef CONFIG_X86_PIE
> > >                 case R_X86_64_32:
> > >                         if (val != *(u32 *)&val)
> > >                                 goto overflow;
> > > @@ -181,6 +194,13 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
> > >                                 goto overflow;
> > >                         size = 4;
> > >                         break;
> > > +#else
> > > +               case R_X86_64_GOTPCREL:
> > > +                       val = find_got_kernel_entry(sym, rel);
> > > +                       if (!val)
> > > +                               goto unexpected_got_reference;
> > > +                       fallthrough;
> > > +#endif
> > >                 case R_X86_64_PC32:
> > >                 case R_X86_64_PLT32:
> > >                         val -= (u64)loc;
> > > @@ -214,11 +234,18 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
> > >         }
> > >         return 0;
> > >
> > > +#ifdef CONFIG_X86_PIE
> > > +unexpected_got_reference:
> > > +       pr_err("Target got entry doesn't exist in kernel got, loc %p\n", loc);
> > > +       return -ENOEXEC;
> > > +#else
> > >  overflow:
> > >         pr_err("overflow in relocation type %d val %Lx\n",
> > >                (int)ELF64_R_TYPE(rel[i].r_info), val);
> > >         pr_err("`%s' likely not compiled with -mcmodel=kernel\n",
> > >                me->name);
> > > +#endif
> > > +
> > >         return -ENOEXEC;
> > >  }
> > >
> > > --
> > > 2.31.1
> > >
  
Hou Wenlong May 8, 2023, 11:40 a.m. UTC | #4
On Mon, May 08, 2023 at 05:16:34PM +0800, Ard Biesheuvel wrote:
> On Mon, 8 May 2023 at 10:38, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
> >
> > On Sat, Apr 29, 2023 at 03:29:32AM +0800, Ard Biesheuvel wrote:
> > > On Fri, 28 Apr 2023 at 10:53, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
> > > >
> > > > Adapt module loading to support PIE relocations. No GOT is generared for
> > > > module, all the GOT entry of got references in module should exist in
> > > > kernel GOT.  Currently, there is only one usable got reference for
> > > > __fentry__().
> > > >
> > >
> > > I don't think this is the right approach. We should permit GOTPCREL
> > > relocations properly, which means making them point to a location in
> > > memory that carries the absolute address of the symbol. There are
> > > several ways to go about that, but perhaps the simplest way is to make
> > > the symbol address in ksymtab a 64-bit absolute value (but retain the
> > > PC32 references for the symbol name and the symbol namespace name).
> > > That way, you can always resolve such GOTPCREL relocations by pointing
> > > it to the ksymtab entry. Another option would be to take inspiration
> > > from the PLT code we have on ARM and arm64 (and other architectures,
> > > surely) and to count the GOT based relocations, allocate some extra
> > > r/o module space for each, and allocate slots and populate them with
> > > the right value as you fix up the relocations.
> > >
> > > Then, many such relocations can be relaxed at module load time if the
> > > symbol is in range. IIUC, the module and kernel will still be inside
> > > the same 2G window even after widening the KASLR range to 512G, so
> > > most GOT loads can be converted into RIP relative LEA instructions.
> > >
> > > Note that this will also permit you to do things like
> > >
> > > #define PV_VCPU_PREEMPTED_ASM \
> > >  "leaq __per_cpu_offset(%rip), %rax \n\t" \
> > >  "movq (%rax,%rdi,8), %rax \n\t" \
> > >  "addq steal_time@GOTPCREL(%rip), %rax \n\t" \
> > >  "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "(%rax) \n\t" \
> > >  "setne %al\n\t"
> > >
> > > or
> > >
> > > +#ifdef CONFIG_X86_PIE
> > > + " pushq arch_rethook_trampoline@GOTPCREL(%rip)\n"
> > > +#else
> > > " pushq $arch_rethook_trampoline\n"
> > > +#endif
> > >
> > > instead of having these kludgy push/pop sequences to free up temp registers.
> > >
> > > (FYI I have looked into this PIE linking just a few weeks ago [0] so
> > > this is all rather fresh in my memory)
> > >
> > >
> > >
> > >
> > > [0] https://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git/log/?h=x86-pie
> > >
> > >
> > Hi Ard,
> > Thanks for providing the link, it has been very helpful for me as I am
> > new to the topic of compilers.
> 
> Happy to hear that.
>
 
Thanks for your prompt reply.

> > One key difference I noticed is that you
> > linked the kernel with "-pie" instead of "--emit-reloc". I also noticed
> > that Thomas' initial patchset[0] used "-pie", but in RFC v3 [1], it
> > switched to "--emit-reloc" in order to reduce dynamic relocation space
> > on mapped memory.
> >
> 
> The problem with --emit-relocs is that the relocations emitted into
> the binary may get out of sync with the actual code after the linker
> has applied relocations.
> 
> $ cat /tmp/a.s
> foo:movq foo@GOTPCREL(%rip), %rax
> 
> $ x86_64-linux-gnu-gcc -c -o /tmp/a.o /tmp/a.s
> ard@gambale:~/linux$ x86_64-linux-gnu-objdump -dr /tmp/a.o
> 
> /tmp/a.o:     file format elf64-x86-64
> 
> 
> Disassembly of section .text:
> 
> 0000000000000000 <foo>:
>    0: 48 8b 05 00 00 00 00 mov    0x0(%rip),%rax        # 7 <foo+0x7>
> 3: R_X86_64_REX_GOTPCRELX foo-0x4
> 
> $ x86_64-linux-gnu-gcc -c -o /tmp/a.o /tmp/a.s
> $ x86_64-linux-gnu-objdump -dr /tmp/a.o
> 0000000000000000 <foo>:
>    0: 48 8b 05 00 00 00 00 mov    0x0(%rip),%rax        # 7 <foo+0x7>
> 3: R_X86_64_REX_GOTPCRELX foo-0x4
> 
> $ x86_64-linux-gnu-gcc -o /tmp/a.elf -nostartfiles
> -Wl,-no-pie,-q,--defsym,_start=0x0 /tmp/a.s
> $ x86_64-linux-gnu-objdump -dr /tmp/a.elf
> 0000000000401000 <foo>:
>   401000: 48 c7 c0 00 10 40 00 mov    $0x401000,%rax
> 401003: R_X86_64_32S foo
> 
> $ x86_64-linux-gnu-gcc -o /tmp/a.elf -nostartfiles
> -Wl,-q,--defsym,_start=0x0 /tmp/a.s
> $ x86_64-linux-gnu-objdump -dr /tmp/a.elf
> 0000000000001000 <foo>:
>     1000: 48 8d 05 f9 ff ff ff lea    -0x7(%rip),%rax        # 1000 <foo>
> 1003: R_X86_64_PC32 foo-0x4
> 
> This all looks as expected. However, when using Clang, we end up with
> 
> $ clang -target x86_64-linux-gnu -o /tmp/a.elf -nostartfiles
> -fuse-ld=lld -Wl,--relax,-q,--defsym,_start=0x0 /tmp/a.s
> $ x86_64-linux-gnu-objdump -dr /tmp/a.elf
> 00000000000012c0 <foo>:
>     12c0: 48 8d 05 f9 ff ff ff lea    -0x7(%rip),%rax        # 12c0 <foo>
> 12c3: R_X86_64_REX_GOTPCRELX foo-0x4
> 
> So in this case, what --emit-relocs gives us is not what is actually
> in the binary. We cannot just ignore these either, given that they are
> treated differently depending on whether the symbol is a per-CPU
> symbol or not - in the former case, we need to perform a fixup if the
> relaxed reference is RIP relative, and in the latter case, if the
> relaxed reference is absolute.
>
With symbols hidden and the compile-time address of the kernel image
kept in the top 2G, is it possible for the relaxed reference to be
absolute, even if I keep the percpu section zero-mapping for SMP?  I
didn't see absoulte relaxed reference after dropping
"-mrelax-relocations=no" option.

> On top of that, --emit-relocs does not cover the GOT, so we'd still
> need to process that from the code explicitly.
>
Yes, so the relocs tool would process GOT, and generate
R_X86_64_GLOB_DAT relocation for GOT entries in patch 27:
https://lore.kernel.org/lkml/d25c7644249355785365914398bdba1ed2c52468.1682673543.git.houwenlong.hwl@antgroup.com

> In general, relying on --emit-relocs is kind of dodgy, and I think
> combining PIE linking with --emit-relocs is a bad idea.
> 
> > The another issue is that it requires the addition of the
> > "-mrelax-relocations=no" option to support older compilers and linkers.
> 
> Why? The decompressor is now linked in PIE mode so we should be able
> to drop that. Or do you need to add is somewhere else?
>
I tried to use binutils 2.25 (mini version), it couldn't recognize
R_X86_64_GOTPCRELX and R_X86_64_REX_GOTPCRELX.

> > R_X86_64_GOTPCRELX and R_X86_64_REX_GOTPCRELX relocations are supported
> > in binutils 2.26 and later, but the mini version required for the kernel
> > is 2.25. This option disables relocation relaxation, which makes GOT not
> > empty. I also noticed this option in arch/x86/boot/compressed/Makefile
> > with the reason given in [2]. Without relocation relaxation, GOT
> > references would increase the size of GOT. Therefore, I do not want to
> > use GOT reference in assembly directly.  However, I realized that the
> > compiler could still generate GOT references in some cases such as
> > "fentry" calls and stack canary references.
> >
> 
> The stack canary references are under discussion here [3]. I have also
> sent a patch for kallsyms symbol references [4]. Beyond that, there
> should be very few cases where GOT entries are emitted, so I don't
> think this is fundamentally a problem.
> 
> I haven't run into the __fentry__ issue myself: do you think we should
> fix this in the compiler?
>
The issue about __fentry__ is that the compiler would generate 6-bytes
indirect call through GOT with "-fPIE" option. However, the original
ftrace nop patching assumes it is a 5-bytes direct call. And
"-mnop-mcount" option is not compatiable with "-fPIE" option, so the
complier woudn't patch it as nop.

So we should patch it with one 5-bytes nop followed by one 1-byte nop,
This way, ftrace can handle the previous 5-bytes as before. Also I have
built PIE kernel with relocation relaxation on GCC, and the linker would
relax it as following:
ffffffff810018f0 <do_one_initcall>:
ffffffff810018f0:       f3 0f 1e fa             endbr64
ffffffff810018f4:       67 e8 a6 d6 05 00       addr32 call ffffffff8105efa0 <__fentry__>
			ffffffff810018f6: R_X86_64_PC32 __fentry__-0x4

It still requires a different nop patching for ftrace. I notice
"Optimize GOTPCRELX Relocations" chapter in x86-64 psABI, which suggests
that the GOT indirect call can be relaxed as "call fentry nop" or "nop
call fentry", it appears that the latter is chosen. If the linker could
generate the former, then no fixup would be necessary for ftrace with
PIE.

> > Regarding module loading, I agree that we should support GOT reference
> > for the module itself. I will refactor it according to your suggestion.
> >
> 
> Excellent, good luck with that.
> 
> However, you will still need to make a convincing case for why this is
> all worth the trouble. Especially given that you disable the depth
> tracking code, which I don't think should be mutually exclusive.
>
Actually, I could do relocation for it when apply patching for the
depth tracking code. I'm not sure such case is common or not.

> I am aware that this a rather tricky, and involves rewriting
> RIP-relative per-CPU variable accesses, but it would be good to get a
> discussion started on that topic, and figure out whether there is a
> way forward there. Ignoring it is not going to help.
> 
>
I see that your PIE linking chose to put the per-cpu section in high
kernel image address, I still keep it as zero-mapping. However, both are
in the RIP-relative addressing range.

> >
> > [0] https://yhbt.net/lore/all/20170718223333.110371-20-thgarnie@google.com
> > [1] https://yhbt.net/lore/all/20171004212003.28296-1-thgarnie@google.com
> > [2] https://lore.kernel.org/all/20200903203053.3411268-2-samitolvanen@google.com/
> >
> 
> [3] https://github.com/llvm/llvm-project/issues/60116
> [4] 20230504174320.3930345-1-ardb@kernel.org
> 
> > > > Signed-off-by: Hou Wenlong <houwenlong.hwl@antgroup.com>
> > > > Cc: Thomas Garnier <thgarnie@chromium.org>
> > > > Cc: Lai Jiangshan <jiangshan.ljs@antgroup.com>
> > > > Cc: Kees Cook <keescook@chromium.org>
> > > > ---
> > > >  arch/x86/include/asm/sections.h |  5 +++++
> > > >  arch/x86/kernel/module.c        | 27 +++++++++++++++++++++++++++
> > > >  2 files changed, 32 insertions(+)
> > > >
> > > > diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
> > > > index a6e8373a5170..dc1c2b08ec48 100644
> > > > --- a/arch/x86/include/asm/sections.h
> > > > +++ b/arch/x86/include/asm/sections.h
> > > > @@ -12,6 +12,11 @@ extern char __end_rodata_aligned[];
> > > >
> > > >  #if defined(CONFIG_X86_64)
> > > >  extern char __end_rodata_hpage_align[];
> > > > +
> > > > +#ifdef CONFIG_X86_PIE
> > > > +extern char __start_got[], __end_got[];
> > > > +#endif
> > > > +
> > > >  #endif
> > > >
> > > >  extern char __end_of_kernel_reserve[];
> > > > diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
> > > > index 84ad0e61ba6e..051f88e6884e 100644
> > > > --- a/arch/x86/kernel/module.c
> > > > +++ b/arch/x86/kernel/module.c
> > > > @@ -129,6 +129,18 @@ int apply_relocate(Elf32_Shdr *sechdrs,
> > > >         return 0;
> > > >  }
> > > >  #else /*X86_64*/
> > > > +#ifdef CONFIG_X86_PIE
> > > > +static u64 find_got_kernel_entry(Elf64_Sym *sym, const Elf64_Rela *rela)
> > > > +{
> > > > +       u64 *pos;
> > > > +
> > > > +       for (pos = (u64 *)__start_got; pos < (u64 *)__end_got; pos++)
> > > > +               if (*pos == sym->st_value)
> > > > +                       return (u64)pos + rela->r_addend;
> > > > +       return 0;
> > > > +}
> > > > +#endif
> > > > +
> > > >  static int __write_relocate_add(Elf64_Shdr *sechdrs,
> > > >                    const char *strtab,
> > > >                    unsigned int symindex,
> > > > @@ -171,6 +183,7 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
> > > >                 case R_X86_64_64:
> > > >                         size = 8;
> > > >                         break;
> > > > +#ifndef CONFIG_X86_PIE
> > > >                 case R_X86_64_32:
> > > >                         if (val != *(u32 *)&val)
> > > >                                 goto overflow;
> > > > @@ -181,6 +194,13 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
> > > >                                 goto overflow;
> > > >                         size = 4;
> > > >                         break;
> > > > +#else
> > > > +               case R_X86_64_GOTPCREL:
> > > > +                       val = find_got_kernel_entry(sym, rel);
> > > > +                       if (!val)
> > > > +                               goto unexpected_got_reference;
> > > > +                       fallthrough;
> > > > +#endif
> > > >                 case R_X86_64_PC32:
> > > >                 case R_X86_64_PLT32:
> > > >                         val -= (u64)loc;
> > > > @@ -214,11 +234,18 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
> > > >         }
> > > >         return 0;
> > > >
> > > > +#ifdef CONFIG_X86_PIE
> > > > +unexpected_got_reference:
> > > > +       pr_err("Target got entry doesn't exist in kernel got, loc %p\n", loc);
> > > > +       return -ENOEXEC;
> > > > +#else
> > > >  overflow:
> > > >         pr_err("overflow in relocation type %d val %Lx\n",
> > > >                (int)ELF64_R_TYPE(rel[i].r_info), val);
> > > >         pr_err("`%s' likely not compiled with -mcmodel=kernel\n",
> > > >                me->name);
> > > > +#endif
> > > > +
> > > >         return -ENOEXEC;
> > > >  }
> > > >
> > > > --
> > > > 2.31.1
> > > >
  
Ard Biesheuvel May 8, 2023, 5:47 p.m. UTC | #5
On Mon, 8 May 2023 at 13:45, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
>
> On Mon, May 08, 2023 at 05:16:34PM +0800, Ard Biesheuvel wrote:
> > On Mon, 8 May 2023 at 10:38, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
> > >
> > > On Sat, Apr 29, 2023 at 03:29:32AM +0800, Ard Biesheuvel wrote:
> > > > On Fri, 28 Apr 2023 at 10:53, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
> > > > >
> > > > > Adapt module loading to support PIE relocations. No GOT is generared for
> > > > > module, all the GOT entry of got references in module should exist in
> > > > > kernel GOT.  Currently, there is only one usable got reference for
> > > > > __fentry__().
> > > > >
> > > >
> > > > I don't think this is the right approach. We should permit GOTPCREL
> > > > relocations properly, which means making them point to a location in
> > > > memory that carries the absolute address of the symbol. There are
> > > > several ways to go about that, but perhaps the simplest way is to make
> > > > the symbol address in ksymtab a 64-bit absolute value (but retain the
> > > > PC32 references for the symbol name and the symbol namespace name).
> > > > That way, you can always resolve such GOTPCREL relocations by pointing
> > > > it to the ksymtab entry. Another option would be to take inspiration
> > > > from the PLT code we have on ARM and arm64 (and other architectures,
> > > > surely) and to count the GOT based relocations, allocate some extra
> > > > r/o module space for each, and allocate slots and populate them with
> > > > the right value as you fix up the relocations.
> > > >
> > > > Then, many such relocations can be relaxed at module load time if the
> > > > symbol is in range. IIUC, the module and kernel will still be inside
> > > > the same 2G window even after widening the KASLR range to 512G, so
> > > > most GOT loads can be converted into RIP relative LEA instructions.
> > > >
> > > > Note that this will also permit you to do things like
> > > >
> > > > #define PV_VCPU_PREEMPTED_ASM \
> > > >  "leaq __per_cpu_offset(%rip), %rax \n\t" \
> > > >  "movq (%rax,%rdi,8), %rax \n\t" \
> > > >  "addq steal_time@GOTPCREL(%rip), %rax \n\t" \
> > > >  "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "(%rax) \n\t" \
> > > >  "setne %al\n\t"
> > > >
> > > > or
> > > >
> > > > +#ifdef CONFIG_X86_PIE
> > > > + " pushq arch_rethook_trampoline@GOTPCREL(%rip)\n"
> > > > +#else
> > > > " pushq $arch_rethook_trampoline\n"
> > > > +#endif
> > > >
> > > > instead of having these kludgy push/pop sequences to free up temp registers.
> > > >
> > > > (FYI I have looked into this PIE linking just a few weeks ago [0] so
> > > > this is all rather fresh in my memory)
> > > >
> > > >
> > > >
> > > >
> > > > [0] https://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git/log/?h=x86-pie
> > > >
> > > >
> > > Hi Ard,
> > > Thanks for providing the link, it has been very helpful for me as I am
> > > new to the topic of compilers.
> >
> > Happy to hear that.
> >
>
> Thanks for your prompt reply.
>
> > > One key difference I noticed is that you
> > > linked the kernel with "-pie" instead of "--emit-reloc". I also noticed
> > > that Thomas' initial patchset[0] used "-pie", but in RFC v3 [1], it
> > > switched to "--emit-reloc" in order to reduce dynamic relocation space
> > > on mapped memory.
> > >
> >
> > The problem with --emit-relocs is that the relocations emitted into
> > the binary may get out of sync with the actual code after the linker
> > has applied relocations.
> >
> > $ cat /tmp/a.s
> > foo:movq foo@GOTPCREL(%rip), %rax
> >
> > $ x86_64-linux-gnu-gcc -c -o /tmp/a.o /tmp/a.s
> > ard@gambale:~/linux$ x86_64-linux-gnu-objdump -dr /tmp/a.o
> >
> > /tmp/a.o:     file format elf64-x86-64
> >
> >
> > Disassembly of section .text:
> >
> > 0000000000000000 <foo>:
> >    0: 48 8b 05 00 00 00 00 mov    0x0(%rip),%rax        # 7 <foo+0x7>
> > 3: R_X86_64_REX_GOTPCRELX foo-0x4
> >
> > $ x86_64-linux-gnu-gcc -c -o /tmp/a.o /tmp/a.s
> > $ x86_64-linux-gnu-objdump -dr /tmp/a.o
> > 0000000000000000 <foo>:
> >    0: 48 8b 05 00 00 00 00 mov    0x0(%rip),%rax        # 7 <foo+0x7>
> > 3: R_X86_64_REX_GOTPCRELX foo-0x4
> >
> > $ x86_64-linux-gnu-gcc -o /tmp/a.elf -nostartfiles
> > -Wl,-no-pie,-q,--defsym,_start=0x0 /tmp/a.s
> > $ x86_64-linux-gnu-objdump -dr /tmp/a.elf
> > 0000000000401000 <foo>:
> >   401000: 48 c7 c0 00 10 40 00 mov    $0x401000,%rax
> > 401003: R_X86_64_32S foo
> >
> > $ x86_64-linux-gnu-gcc -o /tmp/a.elf -nostartfiles
> > -Wl,-q,--defsym,_start=0x0 /tmp/a.s
> > $ x86_64-linux-gnu-objdump -dr /tmp/a.elf
> > 0000000000001000 <foo>:
> >     1000: 48 8d 05 f9 ff ff ff lea    -0x7(%rip),%rax        # 1000 <foo>
> > 1003: R_X86_64_PC32 foo-0x4
> >
> > This all looks as expected. However, when using Clang, we end up with
> >
> > $ clang -target x86_64-linux-gnu -o /tmp/a.elf -nostartfiles
> > -fuse-ld=lld -Wl,--relax,-q,--defsym,_start=0x0 /tmp/a.s
> > $ x86_64-linux-gnu-objdump -dr /tmp/a.elf
> > 00000000000012c0 <foo>:
> >     12c0: 48 8d 05 f9 ff ff ff lea    -0x7(%rip),%rax        # 12c0 <foo>
> > 12c3: R_X86_64_REX_GOTPCRELX foo-0x4
> >
> > So in this case, what --emit-relocs gives us is not what is actually
> > in the binary. We cannot just ignore these either, given that they are
> > treated differently depending on whether the symbol is a per-CPU
> > symbol or not - in the former case, we need to perform a fixup if the
> > relaxed reference is RIP relative, and in the latter case, if the
> > relaxed reference is absolute.
> >
> With symbols hidden and the compile-time address of the kernel image
> kept in the top 2G, is it possible for the relaxed reference to be
> absolute, even if I keep the percpu section zero-mapping for SMP?  I
> didn't see absoulte relaxed reference after dropping
> "-mrelax-relocations=no" option.
>

If you link in PIE mode, you should never see absolute references
after relaxation.

> > On top of that, --emit-relocs does not cover the GOT, so we'd still
> > need to process that from the code explicitly.
> >
> Yes, so the relocs tool would process GOT, and generate
> R_X86_64_GLOB_DAT relocation for GOT entries in patch 27:
> https://lore.kernel.org/lkml/d25c7644249355785365914398bdba1ed2c52468.1682673543.git.houwenlong.hwl@antgroup.com
>

Yes, something like that is needed. I'd lean towards generating the
reloc data directly instead of creating an artifiical RELA section
with GLOB_DAT relocations, but that is a minor detail.

> > In general, relying on --emit-relocs is kind of dodgy, and I think
> > combining PIE linking with --emit-relocs is a bad idea.
> >
> > > The another issue is that it requires the addition of the
> > > "-mrelax-relocations=no" option to support older compilers and linkers.
> >
> > Why? The decompressor is now linked in PIE mode so we should be able
> > to drop that. Or do you need to add is somewhere else?
> >
> I tried to use binutils 2.25 (mini version), it couldn't recognize
> R_X86_64_GOTPCRELX and R_X86_64_REX_GOTPCRELX.
>

I'm not sure that matters. If the assembler accepts @GOTPCREL
notation, it should generate the relocations that the linker can
understand. If the toolchain is not internally consistent in this
regard, I don't think it is our problem.

This might mean that we end up with more residual GOT entries than
with a more recent toolchain, but I don't think that is a big deal.

> > > R_X86_64_GOTPCRELX and R_X86_64_REX_GOTPCRELX relocations are supported
> > > in binutils 2.26 and later, but the mini version required for the kernel
> > > is 2.25. This option disables relocation relaxation, which makes GOT not
> > > empty. I also noticed this option in arch/x86/boot/compressed/Makefile
> > > with the reason given in [2]. Without relocation relaxation, GOT
> > > references would increase the size of GOT. Therefore, I do not want to
> > > use GOT reference in assembly directly.  However, I realized that the
> > > compiler could still generate GOT references in some cases such as
> > > "fentry" calls and stack canary references.
> > >
> >
> > The stack canary references are under discussion here [3]. I have also
> > sent a patch for kallsyms symbol references [4]. Beyond that, there
> > should be very few cases where GOT entries are emitted, so I don't
> > think this is fundamentally a problem.
> >
> > I haven't run into the __fentry__ issue myself: do you think we should
> > fix this in the compiler?
> >
> The issue about __fentry__ is that the compiler would generate 6-bytes
> indirect call through GOT with "-fPIE" option. However, the original
> ftrace nop patching assumes it is a 5-bytes direct call. And
> "-mnop-mcount" option is not compatiable with "-fPIE" option, so the
> complier woudn't patch it as nop.
>
> So we should patch it with one 5-bytes nop followed by one 1-byte nop,
> This way, ftrace can handle the previous 5-bytes as before. Also I have
> built PIE kernel with relocation relaxation on GCC, and the linker would
> relax it as following:
> ffffffff810018f0 <do_one_initcall>:
> ffffffff810018f0:       f3 0f 1e fa             endbr64
> ffffffff810018f4:       67 e8 a6 d6 05 00       addr32 call ffffffff8105efa0 <__fentry__>
>                         ffffffff810018f6: R_X86_64_PC32 __fentry__-0x4
>

But if fentry is a function symbol, I would not expect the codegen to
be different at all. Are you using -fno-plt?

> It still requires a different nop patching for ftrace. I notice
> "Optimize GOTPCRELX Relocations" chapter in x86-64 psABI, which suggests
> that the GOT indirect call can be relaxed as "call fentry nop" or "nop
> call fentry", it appears that the latter is chosen. If the linker could
> generate the former, then no fixup would be necessary for ftrace with
> PIE.
>

Right. I think this may be a result of __fentry__ not being subject to
the same rules wrt visibility etc, similar to __stack_chk_guard. These
are arguably compiler issues that could qualify as bugs, given that
these symbol references don't behave like ordinary symbol references.

> > > Regarding module loading, I agree that we should support GOT reference
> > > for the module itself. I will refactor it according to your suggestion.
> > >
> >
> > Excellent, good luck with that.
> >
> > However, you will still need to make a convincing case for why this is
> > all worth the trouble. Especially given that you disable the depth
> > tracking code, which I don't think should be mutually exclusive.
> >
> Actually, I could do relocation for it when apply patching for the
> depth tracking code. I'm not sure such case is common or not.
>

I think that alternatives patching in general would need to support
RIP relative references in the alternatives. The depth tracking
template is a bit different in this regard, and could be fixed more
easily, I think.

> > I am aware that this a rather tricky, and involves rewriting
> > RIP-relative per-CPU variable accesses, but it would be good to get a
> > discussion started on that topic, and figure out whether there is a
> > way forward there. Ignoring it is not going to help.
> >
> >
> I see that your PIE linking chose to put the per-cpu section in high
> kernel image address, I still keep it as zero-mapping. However, both are
> in the RIP-relative addressing range.
>

Pure PIE linking cannot support the zero mapping - it can only work
with --emit-relocs, which I was trying to avoid.
  
Hou Wenlong May 9, 2023, 9:42 a.m. UTC | #6
On Tue, May 09, 2023 at 01:47:53AM +0800, Ard Biesheuvel wrote:
> On Mon, 8 May 2023 at 13:45, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
> >
> > On Mon, May 08, 2023 at 05:16:34PM +0800, Ard Biesheuvel wrote:
> > > On Mon, 8 May 2023 at 10:38, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
> > > >
> > > > On Sat, Apr 29, 2023 at 03:29:32AM +0800, Ard Biesheuvel wrote:
> > > > > On Fri, 28 Apr 2023 at 10:53, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
> > > > > >
> > > > > > Adapt module loading to support PIE relocations. No GOT is generared for
> > > > > > module, all the GOT entry of got references in module should exist in
> > > > > > kernel GOT.  Currently, there is only one usable got reference for
> > > > > > __fentry__().
> > > > > >
> > > > >
> > > > > I don't think this is the right approach. We should permit GOTPCREL
> > > > > relocations properly, which means making them point to a location in
> > > > > memory that carries the absolute address of the symbol. There are
> > > > > several ways to go about that, but perhaps the simplest way is to make
> > > > > the symbol address in ksymtab a 64-bit absolute value (but retain the
> > > > > PC32 references for the symbol name and the symbol namespace name).
> > > > > That way, you can always resolve such GOTPCREL relocations by pointing
> > > > > it to the ksymtab entry. Another option would be to take inspiration
> > > > > from the PLT code we have on ARM and arm64 (and other architectures,
> > > > > surely) and to count the GOT based relocations, allocate some extra
> > > > > r/o module space for each, and allocate slots and populate them with
> > > > > the right value as you fix up the relocations.
> > > > >
> > > > > Then, many such relocations can be relaxed at module load time if the
> > > > > symbol is in range. IIUC, the module and kernel will still be inside
> > > > > the same 2G window even after widening the KASLR range to 512G, so
> > > > > most GOT loads can be converted into RIP relative LEA instructions.
> > > > >
> > > > > Note that this will also permit you to do things like
> > > > >
> > > > > #define PV_VCPU_PREEMPTED_ASM \
> > > > >  "leaq __per_cpu_offset(%rip), %rax \n\t" \
> > > > >  "movq (%rax,%rdi,8), %rax \n\t" \
> > > > >  "addq steal_time@GOTPCREL(%rip), %rax \n\t" \
> > > > >  "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "(%rax) \n\t" \
> > > > >  "setne %al\n\t"
> > > > >
> > > > > or
> > > > >
> > > > > +#ifdef CONFIG_X86_PIE
> > > > > + " pushq arch_rethook_trampoline@GOTPCREL(%rip)\n"
> > > > > +#else
> > > > > " pushq $arch_rethook_trampoline\n"
> > > > > +#endif
> > > > >
> > > > > instead of having these kludgy push/pop sequences to free up temp registers.
> > > > >
> > > > > (FYI I have looked into this PIE linking just a few weeks ago [0] so
> > > > > this is all rather fresh in my memory)
> > > > >
> > > > >
> > > > >
> > > > >
> > > > > [0] https://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git/log/?h=x86-pie
> > > > >
> > > > >
> > > > Hi Ard,
> > > > Thanks for providing the link, it has been very helpful for me as I am
> > > > new to the topic of compilers.
> > >
> > > Happy to hear that.
> > >
> >
> > Thanks for your prompt reply.
> >
> > > > One key difference I noticed is that you
> > > > linked the kernel with "-pie" instead of "--emit-reloc". I also noticed
> > > > that Thomas' initial patchset[0] used "-pie", but in RFC v3 [1], it
> > > > switched to "--emit-reloc" in order to reduce dynamic relocation space
> > > > on mapped memory.
> > > >
> > >
> > > The problem with --emit-relocs is that the relocations emitted into
> > > the binary may get out of sync with the actual code after the linker
> > > has applied relocations.
> > >
> > > $ cat /tmp/a.s
> > > foo:movq foo@GOTPCREL(%rip), %rax
> > >
> > > $ x86_64-linux-gnu-gcc -c -o /tmp/a.o /tmp/a.s
> > > ard@gambale:~/linux$ x86_64-linux-gnu-objdump -dr /tmp/a.o
> > >
> > > /tmp/a.o:     file format elf64-x86-64
> > >
> > >
> > > Disassembly of section .text:
> > >
> > > 0000000000000000 <foo>:
> > >    0: 48 8b 05 00 00 00 00 mov    0x0(%rip),%rax        # 7 <foo+0x7>
> > > 3: R_X86_64_REX_GOTPCRELX foo-0x4
> > >
> > > $ x86_64-linux-gnu-gcc -c -o /tmp/a.o /tmp/a.s
> > > $ x86_64-linux-gnu-objdump -dr /tmp/a.o
> > > 0000000000000000 <foo>:
> > >    0: 48 8b 05 00 00 00 00 mov    0x0(%rip),%rax        # 7 <foo+0x7>
> > > 3: R_X86_64_REX_GOTPCRELX foo-0x4
> > >
> > > $ x86_64-linux-gnu-gcc -o /tmp/a.elf -nostartfiles
> > > -Wl,-no-pie,-q,--defsym,_start=0x0 /tmp/a.s
> > > $ x86_64-linux-gnu-objdump -dr /tmp/a.elf
> > > 0000000000401000 <foo>:
> > >   401000: 48 c7 c0 00 10 40 00 mov    $0x401000,%rax
> > > 401003: R_X86_64_32S foo
> > >
> > > $ x86_64-linux-gnu-gcc -o /tmp/a.elf -nostartfiles
> > > -Wl,-q,--defsym,_start=0x0 /tmp/a.s
> > > $ x86_64-linux-gnu-objdump -dr /tmp/a.elf
> > > 0000000000001000 <foo>:
> > >     1000: 48 8d 05 f9 ff ff ff lea    -0x7(%rip),%rax        # 1000 <foo>
> > > 1003: R_X86_64_PC32 foo-0x4
> > >
> > > This all looks as expected. However, when using Clang, we end up with
> > >
> > > $ clang -target x86_64-linux-gnu -o /tmp/a.elf -nostartfiles
> > > -fuse-ld=lld -Wl,--relax,-q,--defsym,_start=0x0 /tmp/a.s
> > > $ x86_64-linux-gnu-objdump -dr /tmp/a.elf
> > > 00000000000012c0 <foo>:
> > >     12c0: 48 8d 05 f9 ff ff ff lea    -0x7(%rip),%rax        # 12c0 <foo>
> > > 12c3: R_X86_64_REX_GOTPCRELX foo-0x4
> > >
> > > So in this case, what --emit-relocs gives us is not what is actually
> > > in the binary. We cannot just ignore these either, given that they are
> > > treated differently depending on whether the symbol is a per-CPU
> > > symbol or not - in the former case, we need to perform a fixup if the
> > > relaxed reference is RIP relative, and in the latter case, if the
> > > relaxed reference is absolute.
> > >
> > With symbols hidden and the compile-time address of the kernel image
> > kept in the top 2G, is it possible for the relaxed reference to be
> > absolute, even if I keep the percpu section zero-mapping for SMP?  I
> > didn't see absoulte relaxed reference after dropping
> > "-mrelax-relocations=no" option.
> >
> 
> If you link in PIE mode, you should never see absolute references
> after relaxation.
> 
> > > On top of that, --emit-relocs does not cover the GOT, so we'd still
> > > need to process that from the code explicitly.
> > >
> > Yes, so the relocs tool would process GOT, and generate
> > R_X86_64_GLOB_DAT relocation for GOT entries in patch 27:
> > https://lore.kernel.org/lkml/d25c7644249355785365914398bdba1ed2c52468.1682673543.git.houwenlong.hwl@antgroup.com
> >
> 
> Yes, something like that is needed. I'd lean towards generating the
> reloc data directly instead of creating an artifiical RELA section
> with GLOB_DAT relocations, but that is a minor detail.
> 
> > > In general, relying on --emit-relocs is kind of dodgy, and I think
> > > combining PIE linking with --emit-relocs is a bad idea.
> > >
> > > > The another issue is that it requires the addition of the
> > > > "-mrelax-relocations=no" option to support older compilers and linkers.
> > >
> > > Why? The decompressor is now linked in PIE mode so we should be able
> > > to drop that. Or do you need to add is somewhere else?
> > >
> > I tried to use binutils 2.25 (mini version), it couldn't recognize
> > R_X86_64_GOTPCRELX and R_X86_64_REX_GOTPCRELX.
> >
> 
> I'm not sure that matters. If the assembler accepts @GOTPCREL
> notation, it should generate the relocations that the linker can
> understand. If the toolchain is not internally consistent in this
> regard, I don't think it is our problem.
>
I get it. Thanks.
 
> This might mean that we end up with more residual GOT entries than
> with a more recent toolchain, but I don't think that is a big deal.
> 
> > > > R_X86_64_GOTPCRELX and R_X86_64_REX_GOTPCRELX relocations are supported
> > > > in binutils 2.26 and later, but the mini version required for the kernel
> > > > is 2.25. This option disables relocation relaxation, which makes GOT not
> > > > empty. I also noticed this option in arch/x86/boot/compressed/Makefile
> > > > with the reason given in [2]. Without relocation relaxation, GOT
> > > > references would increase the size of GOT. Therefore, I do not want to
> > > > use GOT reference in assembly directly.  However, I realized that the
> > > > compiler could still generate GOT references in some cases such as
> > > > "fentry" calls and stack canary references.
> > > >
> > >
> > > The stack canary references are under discussion here [3]. I have also
> > > sent a patch for kallsyms symbol references [4]. Beyond that, there
> > > should be very few cases where GOT entries are emitted, so I don't
> > > think this is fundamentally a problem.
> > >
> > > I haven't run into the __fentry__ issue myself: do you think we should
> > > fix this in the compiler?
> > >
> > The issue about __fentry__ is that the compiler would generate 6-bytes
> > indirect call through GOT with "-fPIE" option. However, the original
> > ftrace nop patching assumes it is a 5-bytes direct call. And
> > "-mnop-mcount" option is not compatiable with "-fPIE" option, so the
> > complier woudn't patch it as nop.
> >
> > So we should patch it with one 5-bytes nop followed by one 1-byte nop,
> > This way, ftrace can handle the previous 5-bytes as before. Also I have
> > built PIE kernel with relocation relaxation on GCC, and the linker would
> > relax it as following:
> > ffffffff810018f0 <do_one_initcall>:
> > ffffffff810018f0:       f3 0f 1e fa             endbr64
> > ffffffff810018f4:       67 e8 a6 d6 05 00       addr32 call ffffffff8105efa0 <__fentry__>
> >                         ffffffff810018f6: R_X86_64_PC32 __fentry__-0x4
> >
> 
> But if fentry is a function symbol, I would not expect the codegen to
> be different at all. Are you using -fno-plt?
>
No, even with -fno-plt added, the compiler still generates a GOT
reference for fentry. Therefore, the problem may be visibility, as you
said.

> > It still requires a different nop patching for ftrace. I notice
> > "Optimize GOTPCRELX Relocations" chapter in x86-64 psABI, which suggests
> > that the GOT indirect call can be relaxed as "call fentry nop" or "nop
> > call fentry", it appears that the latter is chosen. If the linker could
> > generate the former, then no fixup would be necessary for ftrace with
> > PIE.
> >
> 
> Right. I think this may be a result of __fentry__ not being subject to
> the same rules wrt visibility etc, similar to __stack_chk_guard. These
> are arguably compiler issues that could qualify as bugs, given that
> these symbol references don't behave like ordinary symbol references.
> 
> > > > Regarding module loading, I agree that we should support GOT reference
> > > > for the module itself. I will refactor it according to your suggestion.
> > > >
> > >
> > > Excellent, good luck with that.
> > >
> > > However, you will still need to make a convincing case for why this is
> > > all worth the trouble. Especially given that you disable the depth
> > > tracking code, which I don't think should be mutually exclusive.
> > >
> > Actually, I could do relocation for it when apply patching for the
> > depth tracking code. I'm not sure such case is common or not.
> >
> 
> I think that alternatives patching in general would need to support
> RIP relative references in the alternatives. The depth tracking
> template is a bit different in this regard, and could be fixed more
> easily, I think.
> 
> > > I am aware that this a rather tricky, and involves rewriting
> > > RIP-relative per-CPU variable accesses, but it would be good to get a
> > > discussion started on that topic, and figure out whether there is a
> > > way forward there. Ignoring it is not going to help.
> > >
> > >
> > I see that your PIE linking chose to put the per-cpu section in high
> > kernel image address, I still keep it as zero-mapping. However, both are
> > in the RIP-relative addressing range.
> >
> 
> Pure PIE linking cannot support the zero mapping - it can only work
> with --emit-relocs, which I was trying to avoid.
Sorry, why doesn't PIE linking support zero mapping? I noticed in the
commit message for your PIE linking that it stated, "if we randomize the
kernel's VA by increasing it by X bytes, every RIP-relative per-CPU
reference needs to be decreased by the same amount in order for the
produced offset to remain correct." As a result, I decided to decrease
the GS base and not relocate the RIP-relative per-CPU reference in the
relocs. Consequently, all RIP-relative references, regardless of whether
they are per-CPU variables or not, do not require relocation.

Furthermore, all symbols are hidden, which implies that all per-CPU
references will not generate a GOT reference and will be relaxed as
absolute reference due to zero mapping. However, the __stack_chk_guard
on CLANG always generates a GOT reference, but I didn't see it being
relaxed as absolute reference on LLVM.

Thanks!
  
Ard Biesheuvel May 9, 2023, 9:52 a.m. UTC | #7
On Tue, 9 May 2023 at 11:42, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
>
> On Tue, May 09, 2023 at 01:47:53AM +0800, Ard Biesheuvel wrote:
> > On Mon, 8 May 2023 at 13:45, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
> > >
> > > On Mon, May 08, 2023 at 05:16:34PM +0800, Ard Biesheuvel wrote:
> > > > On Mon, 8 May 2023 at 10:38, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
> > > > >
> > > > > On Sat, Apr 29, 2023 at 03:29:32AM +0800, Ard Biesheuvel wrote:
> > > > > > On Fri, 28 Apr 2023 at 10:53, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
...
> > > > > R_X86_64_GOTPCRELX and R_X86_64_REX_GOTPCRELX relocations are supported
> > > > > in binutils 2.26 and later, but the mini version required for the kernel
> > > > > is 2.25. This option disables relocation relaxation, which makes GOT not
> > > > > empty. I also noticed this option in arch/x86/boot/compressed/Makefile
> > > > > with the reason given in [2]. Without relocation relaxation, GOT
> > > > > references would increase the size of GOT. Therefore, I do not want to
> > > > > use GOT reference in assembly directly.  However, I realized that the
> > > > > compiler could still generate GOT references in some cases such as
> > > > > "fentry" calls and stack canary references.
> > > > >
> > > >
> > > > The stack canary references are under discussion here [3]. I have also
> > > > sent a patch for kallsyms symbol references [4]. Beyond that, there
> > > > should be very few cases where GOT entries are emitted, so I don't
> > > > think this is fundamentally a problem.
> > > >
> > > > I haven't run into the __fentry__ issue myself: do you think we should
> > > > fix this in the compiler?
> > > >
> > > The issue about __fentry__ is that the compiler would generate 6-bytes
> > > indirect call through GOT with "-fPIE" option. However, the original
> > > ftrace nop patching assumes it is a 5-bytes direct call. And
> > > "-mnop-mcount" option is not compatiable with "-fPIE" option, so the
> > > complier woudn't patch it as nop.
> > >
> > > So we should patch it with one 5-bytes nop followed by one 1-byte nop,
> > > This way, ftrace can handle the previous 5-bytes as before. Also I have
> > > built PIE kernel with relocation relaxation on GCC, and the linker would
> > > relax it as following:
> > > ffffffff810018f0 <do_one_initcall>:
> > > ffffffff810018f0:       f3 0f 1e fa             endbr64
> > > ffffffff810018f4:       67 e8 a6 d6 05 00       addr32 call ffffffff8105efa0 <__fentry__>
> > >                         ffffffff810018f6: R_X86_64_PC32 __fentry__-0x4
> > >
> >
> > But if fentry is a function symbol, I would not expect the codegen to
> > be different at all. Are you using -fno-plt?
> >
> No, even with -fno-plt added, the compiler still generates a GOT
> reference for fentry. Therefore, the problem may be visibility, as you
> said.
>

Yeah, I spotted this issue in GCC - I just sent them a patch this morning.

> > > It still requires a different nop patching for ftrace. I notice
> > > "Optimize GOTPCRELX Relocations" chapter in x86-64 psABI, which suggests
> > > that the GOT indirect call can be relaxed as "call fentry nop" or "nop
> > > call fentry", it appears that the latter is chosen. If the linker could
> > > generate the former, then no fixup would be necessary for ftrace with
> > > PIE.
> > >
> >
> > Right. I think this may be a result of __fentry__ not being subject to
> > the same rules wrt visibility etc, similar to __stack_chk_guard. These
> > are arguably compiler issues that could qualify as bugs, given that
> > these symbol references don't behave like ordinary symbol references.
> >
> > > > > Regarding module loading, I agree that we should support GOT reference
> > > > > for the module itself. I will refactor it according to your suggestion.
> > > > >
> > > >
> > > > Excellent, good luck with that.
> > > >
> > > > However, you will still need to make a convincing case for why this is
> > > > all worth the trouble. Especially given that you disable the depth
> > > > tracking code, which I don't think should be mutually exclusive.
> > > >
> > > Actually, I could do relocation for it when apply patching for the
> > > depth tracking code. I'm not sure such case is common or not.
> > >
> >
> > I think that alternatives patching in general would need to support
> > RIP relative references in the alternatives. The depth tracking
> > template is a bit different in this regard, and could be fixed more
> > easily, I think.
> >
> > > > I am aware that this a rather tricky, and involves rewriting
> > > > RIP-relative per-CPU variable accesses, but it would be good to get a
> > > > discussion started on that topic, and figure out whether there is a
> > > > way forward there. Ignoring it is not going to help.
> > > >
> > > >
> > > I see that your PIE linking chose to put the per-cpu section in high
> > > kernel image address, I still keep it as zero-mapping. However, both are
> > > in the RIP-relative addressing range.
> > >
> >
> > Pure PIE linking cannot support the zero mapping - it can only work
> > with --emit-relocs, which I was trying to avoid.
> Sorry, why doesn't PIE linking support zero mapping? I noticed in the
> commit message for your PIE linking that it stated, "if we randomize the
> kernel's VA by increasing it by X bytes, every RIP-relative per-CPU
> reference needs to be decreased by the same amount in order for the
> produced offset to remain correct." As a result, I decided to decrease
> the GS base and not relocate the RIP-relative per-CPU reference in the
> relocs. Consequently, all RIP-relative references, regardless of whether
> they are per-CPU variables or not, do not require relocation.
>

Interesting. Does that work as expected with dynamically allocated
per-CPU variables?

> Furthermore, all symbols are hidden, which implies that all per-CPU
> references will not generate a GOT reference and will be relaxed as
> absolute reference due to zero mapping. However, the __stack_chk_guard
> on CLANG always generates a GOT reference, but I didn't see it being
> relaxed as absolute reference on LLVM.
>

Yeah, we should fix that.
  
Hou Wenlong May 9, 2023, 12:35 p.m. UTC | #8
On Tue, May 09, 2023 at 05:52:31PM +0800, Ard Biesheuvel wrote:
> On Tue, 9 May 2023 at 11:42, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
> >
> > On Tue, May 09, 2023 at 01:47:53AM +0800, Ard Biesheuvel wrote:
> > > On Mon, 8 May 2023 at 13:45, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
> > > >
> > > > On Mon, May 08, 2023 at 05:16:34PM +0800, Ard Biesheuvel wrote:
> > > > > On Mon, 8 May 2023 at 10:38, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
> > > > > >
> > > > > > On Sat, Apr 29, 2023 at 03:29:32AM +0800, Ard Biesheuvel wrote:
> > > > > > > On Fri, 28 Apr 2023 at 10:53, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
> ...
> > > > > > R_X86_64_GOTPCRELX and R_X86_64_REX_GOTPCRELX relocations are supported
> > > > > > in binutils 2.26 and later, but the mini version required for the kernel
> > > > > > is 2.25. This option disables relocation relaxation, which makes GOT not
> > > > > > empty. I also noticed this option in arch/x86/boot/compressed/Makefile
> > > > > > with the reason given in [2]. Without relocation relaxation, GOT
> > > > > > references would increase the size of GOT. Therefore, I do not want to
> > > > > > use GOT reference in assembly directly.  However, I realized that the
> > > > > > compiler could still generate GOT references in some cases such as
> > > > > > "fentry" calls and stack canary references.
> > > > > >
> > > > >
> > > > > The stack canary references are under discussion here [3]. I have also
> > > > > sent a patch for kallsyms symbol references [4]. Beyond that, there
> > > > > should be very few cases where GOT entries are emitted, so I don't
> > > > > think this is fundamentally a problem.
> > > > >
> > > > > I haven't run into the __fentry__ issue myself: do you think we should
> > > > > fix this in the compiler?
> > > > >
> > > > The issue about __fentry__ is that the compiler would generate 6-bytes
> > > > indirect call through GOT with "-fPIE" option. However, the original
> > > > ftrace nop patching assumes it is a 5-bytes direct call. And
> > > > "-mnop-mcount" option is not compatiable with "-fPIE" option, so the
> > > > complier woudn't patch it as nop.
> > > >
> > > > So we should patch it with one 5-bytes nop followed by one 1-byte nop,
> > > > This way, ftrace can handle the previous 5-bytes as before. Also I have
> > > > built PIE kernel with relocation relaxation on GCC, and the linker would
> > > > relax it as following:
> > > > ffffffff810018f0 <do_one_initcall>:
> > > > ffffffff810018f0:       f3 0f 1e fa             endbr64
> > > > ffffffff810018f4:       67 e8 a6 d6 05 00       addr32 call ffffffff8105efa0 <__fentry__>
> > > >                         ffffffff810018f6: R_X86_64_PC32 __fentry__-0x4
> > > >
> > >
> > > But if fentry is a function symbol, I would not expect the codegen to
> > > be different at all. Are you using -fno-plt?
> > >
> > No, even with -fno-plt added, the compiler still generates a GOT
> > reference for fentry. Therefore, the problem may be visibility, as you
> > said.
> >
> 
> Yeah, I spotted this issue in GCC - I just sent them a patch this morning.
> 
> > > > It still requires a different nop patching for ftrace. I notice
> > > > "Optimize GOTPCRELX Relocations" chapter in x86-64 psABI, which suggests
> > > > that the GOT indirect call can be relaxed as "call fentry nop" or "nop
> > > > call fentry", it appears that the latter is chosen. If the linker could
> > > > generate the former, then no fixup would be necessary for ftrace with
> > > > PIE.
> > > >
> > >
> > > Right. I think this may be a result of __fentry__ not being subject to
> > > the same rules wrt visibility etc, similar to __stack_chk_guard. These
> > > are arguably compiler issues that could qualify as bugs, given that
> > > these symbol references don't behave like ordinary symbol references.
> > >
> > > > > > Regarding module loading, I agree that we should support GOT reference
> > > > > > for the module itself. I will refactor it according to your suggestion.
> > > > > >
> > > > >
> > > > > Excellent, good luck with that.
> > > > >
> > > > > However, you will still need to make a convincing case for why this is
> > > > > all worth the trouble. Especially given that you disable the depth
> > > > > tracking code, which I don't think should be mutually exclusive.
> > > > >
> > > > Actually, I could do relocation for it when apply patching for the
> > > > depth tracking code. I'm not sure such case is common or not.
> > > >
> > >
> > > I think that alternatives patching in general would need to support
> > > RIP relative references in the alternatives. The depth tracking
> > > template is a bit different in this regard, and could be fixed more
> > > easily, I think.
> > >
> > > > > I am aware that this a rather tricky, and involves rewriting
> > > > > RIP-relative per-CPU variable accesses, but it would be good to get a
> > > > > discussion started on that topic, and figure out whether there is a
> > > > > way forward there. Ignoring it is not going to help.
> > > > >
> > > > >
> > > > I see that your PIE linking chose to put the per-cpu section in high
> > > > kernel image address, I still keep it as zero-mapping. However, both are
> > > > in the RIP-relative addressing range.
> > > >
> > >
> > > Pure PIE linking cannot support the zero mapping - it can only work
> > > with --emit-relocs, which I was trying to avoid.
> > Sorry, why doesn't PIE linking support zero mapping? I noticed in the
> > commit message for your PIE linking that it stated, "if we randomize the
> > kernel's VA by increasing it by X bytes, every RIP-relative per-CPU
> > reference needs to be decreased by the same amount in order for the
> > produced offset to remain correct." As a result, I decided to decrease
> > the GS base and not relocate the RIP-relative per-CPU reference in the
> > relocs. Consequently, all RIP-relative references, regardless of whether
> > they are per-CPU variables or not, do not require relocation.
> >
> 
> Interesting. Does that work as expected with dynamically allocated
> per-CPU variables?
>
I didn't encounter any issues with the dynamically allocated per-CPU
variables. Since the per_cpu_ptr macro uses the __per_cpu_offset array
directly, it should work. In any case, I have tested loading the kvm
module, which uses dynamically allocated per-CPU variables, and
successfully booted a guest.

The related patch is:
https://lore.kernel.org/lkml/62d7e9e73467b711351a84ebce99372d3dccaa73.1682673543.git.houwenlong.hwl@antgroup.com
 
Thanks!
> > Furthermore, all symbols are hidden, which implies that all per-CPU
> > references will not generate a GOT reference and will be relaxed as
> > absolute reference due to zero mapping. However, the __stack_chk_guard
> > on CLANG always generates a GOT reference, but I didn't see it being
> > relaxed as absolute reference on LLVM.
> >
> 
> Yeah, we should fix that.
  
Hou Wenlong May 10, 2023, 7:09 a.m. UTC | #9
On Mon, May 08, 2023 at 05:16:34PM +0800, Ard Biesheuvel wrote:
> On Mon, 8 May 2023 at 10:38, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
> >
> > On Sat, Apr 29, 2023 at 03:29:32AM +0800, Ard Biesheuvel wrote:
> > > On Fri, 28 Apr 2023 at 10:53, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
> > > >
> > > > Adapt module loading to support PIE relocations. No GOT is generared for
> > > > module, all the GOT entry of got references in module should exist in
> > > > kernel GOT.  Currently, there is only one usable got reference for
> > > > __fentry__().
> > > >
> > >
> > > I don't think this is the right approach. We should permit GOTPCREL
> > > relocations properly, which means making them point to a location in
> > > memory that carries the absolute address of the symbol. There are
> > > several ways to go about that, but perhaps the simplest way is to make
> > > the symbol address in ksymtab a 64-bit absolute value (but retain the
> > > PC32 references for the symbol name and the symbol namespace name).
> > > That way, you can always resolve such GOTPCREL relocations by pointing
> > > it to the ksymtab entry. Another option would be to take inspiration
> > > from the PLT code we have on ARM and arm64 (and other architectures,
> > > surely) and to count the GOT based relocations, allocate some extra
> > > r/o module space for each, and allocate slots and populate them with
> > > the right value as you fix up the relocations.
> > >
> > > Then, many such relocations can be relaxed at module load time if the
> > > symbol is in range. IIUC, the module and kernel will still be inside
> > > the same 2G window even after widening the KASLR range to 512G, so
> > > most GOT loads can be converted into RIP relative LEA instructions.
> > >
> > > Note that this will also permit you to do things like
> > >
> > > #define PV_VCPU_PREEMPTED_ASM \
> > >  "leaq __per_cpu_offset(%rip), %rax \n\t" \
> > >  "movq (%rax,%rdi,8), %rax \n\t" \
> > >  "addq steal_time@GOTPCREL(%rip), %rax \n\t" \
> > >  "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "(%rax) \n\t" \
> > >  "setne %al\n\t"
> > >
> > > or
> > >
> > > +#ifdef CONFIG_X86_PIE
> > > + " pushq arch_rethook_trampoline@GOTPCREL(%rip)\n"
> > > +#else
> > > " pushq $arch_rethook_trampoline\n"
> > > +#endif
> > >
> > > instead of having these kludgy push/pop sequences to free up temp registers.
> > >
> > > (FYI I have looked into this PIE linking just a few weeks ago [0] so
> > > this is all rather fresh in my memory)
> > >
> > >
> > >
> > >
> > > [0] https://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git/log/?h=x86-pie
> > >
> > >
> > Hi Ard,
> > Thanks for providing the link, it has been very helpful for me as I am
> > new to the topic of compilers.
> 
> Happy to hear that.
> 
> > One key difference I noticed is that you
> > linked the kernel with "-pie" instead of "--emit-reloc". I also noticed
> > that Thomas' initial patchset[0] used "-pie", but in RFC v3 [1], it
> > switched to "--emit-reloc" in order to reduce dynamic relocation space
> > on mapped memory.
> >
> 
> The problem with --emit-relocs is that the relocations emitted into
> the binary may get out of sync with the actual code after the linker
> has applied relocations.
> 
> $ cat /tmp/a.s
> foo:movq foo@GOTPCREL(%rip), %rax
> 
> $ x86_64-linux-gnu-gcc -c -o /tmp/a.o /tmp/a.s
> ard@gambale:~/linux$ x86_64-linux-gnu-objdump -dr /tmp/a.o
> 
> /tmp/a.o:     file format elf64-x86-64
> 
> 
> Disassembly of section .text:
> 
> 0000000000000000 <foo>:
>    0: 48 8b 05 00 00 00 00 mov    0x0(%rip),%rax        # 7 <foo+0x7>
> 3: R_X86_64_REX_GOTPCRELX foo-0x4
> 
> $ x86_64-linux-gnu-gcc -c -o /tmp/a.o /tmp/a.s
> $ x86_64-linux-gnu-objdump -dr /tmp/a.o
> 0000000000000000 <foo>:
>    0: 48 8b 05 00 00 00 00 mov    0x0(%rip),%rax        # 7 <foo+0x7>
> 3: R_X86_64_REX_GOTPCRELX foo-0x4
> 
> $ x86_64-linux-gnu-gcc -o /tmp/a.elf -nostartfiles
> -Wl,-no-pie,-q,--defsym,_start=0x0 /tmp/a.s
> $ x86_64-linux-gnu-objdump -dr /tmp/a.elf
> 0000000000401000 <foo>:
>   401000: 48 c7 c0 00 10 40 00 mov    $0x401000,%rax
> 401003: R_X86_64_32S foo
> 
> $ x86_64-linux-gnu-gcc -o /tmp/a.elf -nostartfiles
> -Wl,-q,--defsym,_start=0x0 /tmp/a.s
> $ x86_64-linux-gnu-objdump -dr /tmp/a.elf
> 0000000000001000 <foo>:
>     1000: 48 8d 05 f9 ff ff ff lea    -0x7(%rip),%rax        # 1000 <foo>
> 1003: R_X86_64_PC32 foo-0x4
> 
> This all looks as expected. However, when using Clang, we end up with
> 
> $ clang -target x86_64-linux-gnu -o /tmp/a.elf -nostartfiles
> -fuse-ld=lld -Wl,--relax,-q,--defsym,_start=0x0 /tmp/a.s
> $ x86_64-linux-gnu-objdump -dr /tmp/a.elf
> 00000000000012c0 <foo>:
>     12c0: 48 8d 05 f9 ff ff ff lea    -0x7(%rip),%rax        # 12c0 <foo>
> 12c3: R_X86_64_REX_GOTPCRELX foo-0x4
> 
> So in this case, what --emit-relocs gives us is not what is actually
> in the binary. We cannot just ignore these either, given that they are
> treated differently depending on whether the symbol is a per-CPU
> symbol or not - in the former case, we need to perform a fixup if the
> relaxed reference is RIP relative, and in the latter case, if the
> relaxed reference is absolute.
> 
> On top of that, --emit-relocs does not cover the GOT, so we'd still
> need to process that from the code explicitly.
> 
> In general, relying on --emit-relocs is kind of dodgy, and I think
> combining PIE linking with --emit-relocs is a bad idea.
> 
> > The another issue is that it requires the addition of the
> > "-mrelax-relocations=no" option to support older compilers and linkers.
> 
> Why? The decompressor is now linked in PIE mode so we should be able
> to drop that. Or do you need to add is somewhere else?
>
Hi Ard,

After removing the "-mrelax-relocations=no" option, I noticed that the
linker was relaxing GOT references as absolute references for mov
instructions, even if the symbol was in a high address, as long as I
kept the compile-time base address of the kernel image in the top 2G. I
consulted the "Optimize GOTPCRELX Relocations" chapter in x86-64 psABI,
which stated that "When position-independent code is disabled and foo is
defined locally in the lower 32-bit address space, memory operand in mov
can be converted into immediate operand". However, it seemed that if the
symbol was in the higher 32-bit address space, the memory operand in mov
would also be converted into an immediate operand. If I decreased the
compile-time base address of the kernel image, it would be relaxed as
lea. Therefore, I believe that using "-mrelax-relocations=no" without
"-pie" option is necessary. Is there a way to force the linker to relax
it as lea without using the "-pie" option when linking?

Since all GOT references cannot be omitted, perhaps I should try linking
the kernel with the "-pie" option.

Thanks!

> > R_X86_64_GOTPCRELX and R_X86_64_REX_GOTPCRELX relocations are supported
> > in binutils 2.26 and later, but the mini version required for the kernel
> > is 2.25. This option disables relocation relaxation, which makes GOT not
> > empty. I also noticed this option in arch/x86/boot/compressed/Makefile
> > with the reason given in [2]. Without relocation relaxation, GOT
> > references would increase the size of GOT. Therefore, I do not want to
> > use GOT reference in assembly directly.  However, I realized that the
> > compiler could still generate GOT references in some cases such as
> > "fentry" calls and stack canary references.
> >
> 
> The stack canary references are under discussion here [3]. I have also
> sent a patch for kallsyms symbol references [4]. Beyond that, there
> should be very few cases where GOT entries are emitted, so I don't
> think this is fundamentally a problem.
> 
> I haven't run into the __fentry__ issue myself: do you think we should
> fix this in the compiler?
> 
> > Regarding module loading, I agree that we should support GOT reference
> > for the module itself. I will refactor it according to your suggestion.
> >
> 
> Excellent, good luck with that.
> 
> However, you will still need to make a convincing case for why this is
> all worth the trouble. Especially given that you disable the depth
> tracking code, which I don't think should be mutually exclusive.
> 
> I am aware that this a rather tricky, and involves rewriting
> RIP-relative per-CPU variable accesses, but it would be good to get a
> discussion started on that topic, and figure out whether there is a
> way forward there. Ignoring it is not going to help.
> 
> 
> >
> > [0] https://yhbt.net/lore/all/20170718223333.110371-20-thgarnie@google.com
> > [1] https://yhbt.net/lore/all/20171004212003.28296-1-thgarnie@google.com
> > [2] https://lore.kernel.org/all/20200903203053.3411268-2-samitolvanen@google.com/
> >
> 
> [3] https://github.com/llvm/llvm-project/issues/60116
> [4] 20230504174320.3930345-1-ardb@kernel.org
> 
> > > > Signed-off-by: Hou Wenlong <houwenlong.hwl@antgroup.com>
> > > > Cc: Thomas Garnier <thgarnie@chromium.org>
> > > > Cc: Lai Jiangshan <jiangshan.ljs@antgroup.com>
> > > > Cc: Kees Cook <keescook@chromium.org>
> > > > ---
> > > >  arch/x86/include/asm/sections.h |  5 +++++
> > > >  arch/x86/kernel/module.c        | 27 +++++++++++++++++++++++++++
> > > >  2 files changed, 32 insertions(+)
> > > >
> > > > diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
> > > > index a6e8373a5170..dc1c2b08ec48 100644
> > > > --- a/arch/x86/include/asm/sections.h
> > > > +++ b/arch/x86/include/asm/sections.h
> > > > @@ -12,6 +12,11 @@ extern char __end_rodata_aligned[];
> > > >
> > > >  #if defined(CONFIG_X86_64)
> > > >  extern char __end_rodata_hpage_align[];
> > > > +
> > > > +#ifdef CONFIG_X86_PIE
> > > > +extern char __start_got[], __end_got[];
> > > > +#endif
> > > > +
> > > >  #endif
> > > >
> > > >  extern char __end_of_kernel_reserve[];
> > > > diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
> > > > index 84ad0e61ba6e..051f88e6884e 100644
> > > > --- a/arch/x86/kernel/module.c
> > > > +++ b/arch/x86/kernel/module.c
> > > > @@ -129,6 +129,18 @@ int apply_relocate(Elf32_Shdr *sechdrs,
> > > >         return 0;
> > > >  }
> > > >  #else /*X86_64*/
> > > > +#ifdef CONFIG_X86_PIE
> > > > +static u64 find_got_kernel_entry(Elf64_Sym *sym, const Elf64_Rela *rela)
> > > > +{
> > > > +       u64 *pos;
> > > > +
> > > > +       for (pos = (u64 *)__start_got; pos < (u64 *)__end_got; pos++)
> > > > +               if (*pos == sym->st_value)
> > > > +                       return (u64)pos + rela->r_addend;
> > > > +       return 0;
> > > > +}
> > > > +#endif
> > > > +
> > > >  static int __write_relocate_add(Elf64_Shdr *sechdrs,
> > > >                    const char *strtab,
> > > >                    unsigned int symindex,
> > > > @@ -171,6 +183,7 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
> > > >                 case R_X86_64_64:
> > > >                         size = 8;
> > > >                         break;
> > > > +#ifndef CONFIG_X86_PIE
> > > >                 case R_X86_64_32:
> > > >                         if (val != *(u32 *)&val)
> > > >                                 goto overflow;
> > > > @@ -181,6 +194,13 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
> > > >                                 goto overflow;
> > > >                         size = 4;
> > > >                         break;
> > > > +#else
> > > > +               case R_X86_64_GOTPCREL:
> > > > +                       val = find_got_kernel_entry(sym, rel);
> > > > +                       if (!val)
> > > > +                               goto unexpected_got_reference;
> > > > +                       fallthrough;
> > > > +#endif
> > > >                 case R_X86_64_PC32:
> > > >                 case R_X86_64_PLT32:
> > > >                         val -= (u64)loc;
> > > > @@ -214,11 +234,18 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
> > > >         }
> > > >         return 0;
> > > >
> > > > +#ifdef CONFIG_X86_PIE
> > > > +unexpected_got_reference:
> > > > +       pr_err("Target got entry doesn't exist in kernel got, loc %p\n", loc);
> > > > +       return -ENOEXEC;
> > > > +#else
> > > >  overflow:
> > > >         pr_err("overflow in relocation type %d val %Lx\n",
> > > >                (int)ELF64_R_TYPE(rel[i].r_info), val);
> > > >         pr_err("`%s' likely not compiled with -mcmodel=kernel\n",
> > > >                me->name);
> > > > +#endif
> > > > +
> > > >         return -ENOEXEC;
> > > >  }
> > > >
> > > > --
> > > > 2.31.1
> > > >
  
Ard Biesheuvel May 10, 2023, 8:15 a.m. UTC | #10
On Wed, 10 May 2023 at 09:15, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
>
> On Mon, May 08, 2023 at 05:16:34PM +0800, Ard Biesheuvel wrote:
> > On Mon, 8 May 2023 at 10:38, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
> > >
> > > On Sat, Apr 29, 2023 at 03:29:32AM +0800, Ard Biesheuvel wrote:
> > > > On Fri, 28 Apr 2023 at 10:53, Hou Wenlong <houwenlong.hwl@antgroup.com> wrote:
> > > > >
> > > > > Adapt module loading to support PIE relocations. No GOT is generared for
> > > > > module, all the GOT entry of got references in module should exist in
> > > > > kernel GOT.  Currently, there is only one usable got reference for
> > > > > __fentry__().
> > > > >
> > > >
> > > > I don't think this is the right approach. We should permit GOTPCREL
> > > > relocations properly, which means making them point to a location in
> > > > memory that carries the absolute address of the symbol. There are
> > > > several ways to go about that, but perhaps the simplest way is to make
> > > > the symbol address in ksymtab a 64-bit absolute value (but retain the
> > > > PC32 references for the symbol name and the symbol namespace name).
> > > > That way, you can always resolve such GOTPCREL relocations by pointing
> > > > it to the ksymtab entry. Another option would be to take inspiration
> > > > from the PLT code we have on ARM and arm64 (and other architectures,
> > > > surely) and to count the GOT based relocations, allocate some extra
> > > > r/o module space for each, and allocate slots and populate them with
> > > > the right value as you fix up the relocations.
> > > >
> > > > Then, many such relocations can be relaxed at module load time if the
> > > > symbol is in range. IIUC, the module and kernel will still be inside
> > > > the same 2G window even after widening the KASLR range to 512G, so
> > > > most GOT loads can be converted into RIP relative LEA instructions.
> > > >
> > > > Note that this will also permit you to do things like
> > > >
> > > > #define PV_VCPU_PREEMPTED_ASM \
> > > >  "leaq __per_cpu_offset(%rip), %rax \n\t" \
> > > >  "movq (%rax,%rdi,8), %rax \n\t" \
> > > >  "addq steal_time@GOTPCREL(%rip), %rax \n\t" \
> > > >  "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "(%rax) \n\t" \
> > > >  "setne %al\n\t"
> > > >
> > > > or
> > > >
> > > > +#ifdef CONFIG_X86_PIE
> > > > + " pushq arch_rethook_trampoline@GOTPCREL(%rip)\n"
> > > > +#else
> > > > " pushq $arch_rethook_trampoline\n"
> > > > +#endif
> > > >
> > > > instead of having these kludgy push/pop sequences to free up temp registers.
> > > >
> > > > (FYI I have looked into this PIE linking just a few weeks ago [0] so
> > > > this is all rather fresh in my memory)
> > > >
> > > >
> > > >
> > > >
> > > > [0] https://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git/log/?h=x86-pie
> > > >
> > > >
> > > Hi Ard,
> > > Thanks for providing the link, it has been very helpful for me as I am
> > > new to the topic of compilers.
> >
> > Happy to hear that.
> >
> > > One key difference I noticed is that you
> > > linked the kernel with "-pie" instead of "--emit-reloc". I also noticed
> > > that Thomas' initial patchset[0] used "-pie", but in RFC v3 [1], it
> > > switched to "--emit-reloc" in order to reduce dynamic relocation space
> > > on mapped memory.
> > >
> >
> > The problem with --emit-relocs is that the relocations emitted into
> > the binary may get out of sync with the actual code after the linker
> > has applied relocations.
> >
> > $ cat /tmp/a.s
> > foo:movq foo@GOTPCREL(%rip), %rax
> >
> > $ x86_64-linux-gnu-gcc -c -o /tmp/a.o /tmp/a.s
> > ard@gambale:~/linux$ x86_64-linux-gnu-objdump -dr /tmp/a.o
> >
> > /tmp/a.o:     file format elf64-x86-64
> >
> >
> > Disassembly of section .text:
> >
> > 0000000000000000 <foo>:
> >    0: 48 8b 05 00 00 00 00 mov    0x0(%rip),%rax        # 7 <foo+0x7>
> > 3: R_X86_64_REX_GOTPCRELX foo-0x4
> >
> > $ x86_64-linux-gnu-gcc -c -o /tmp/a.o /tmp/a.s
> > $ x86_64-linux-gnu-objdump -dr /tmp/a.o
> > 0000000000000000 <foo>:
> >    0: 48 8b 05 00 00 00 00 mov    0x0(%rip),%rax        # 7 <foo+0x7>
> > 3: R_X86_64_REX_GOTPCRELX foo-0x4
> >
> > $ x86_64-linux-gnu-gcc -o /tmp/a.elf -nostartfiles
> > -Wl,-no-pie,-q,--defsym,_start=0x0 /tmp/a.s
> > $ x86_64-linux-gnu-objdump -dr /tmp/a.elf
> > 0000000000401000 <foo>:
> >   401000: 48 c7 c0 00 10 40 00 mov    $0x401000,%rax
> > 401003: R_X86_64_32S foo
> >
> > $ x86_64-linux-gnu-gcc -o /tmp/a.elf -nostartfiles
> > -Wl,-q,--defsym,_start=0x0 /tmp/a.s
> > $ x86_64-linux-gnu-objdump -dr /tmp/a.elf
> > 0000000000001000 <foo>:
> >     1000: 48 8d 05 f9 ff ff ff lea    -0x7(%rip),%rax        # 1000 <foo>
> > 1003: R_X86_64_PC32 foo-0x4
> >
> > This all looks as expected. However, when using Clang, we end up with
> >
> > $ clang -target x86_64-linux-gnu -o /tmp/a.elf -nostartfiles
> > -fuse-ld=lld -Wl,--relax,-q,--defsym,_start=0x0 /tmp/a.s
> > $ x86_64-linux-gnu-objdump -dr /tmp/a.elf
> > 00000000000012c0 <foo>:
> >     12c0: 48 8d 05 f9 ff ff ff lea    -0x7(%rip),%rax        # 12c0 <foo>
> > 12c3: R_X86_64_REX_GOTPCRELX foo-0x4
> >
> > So in this case, what --emit-relocs gives us is not what is actually
> > in the binary. We cannot just ignore these either, given that they are
> > treated differently depending on whether the symbol is a per-CPU
> > symbol or not - in the former case, we need to perform a fixup if the
> > relaxed reference is RIP relative, and in the latter case, if the
> > relaxed reference is absolute.
> >
> > On top of that, --emit-relocs does not cover the GOT, so we'd still
> > need to process that from the code explicitly.
> >
> > In general, relying on --emit-relocs is kind of dodgy, and I think
> > combining PIE linking with --emit-relocs is a bad idea.
> >
> > > The another issue is that it requires the addition of the
> > > "-mrelax-relocations=no" option to support older compilers and linkers.
> >
> > Why? The decompressor is now linked in PIE mode so we should be able
> > to drop that. Or do you need to add is somewhere else?
> >
> Hi Ard,
>
> After removing the "-mrelax-relocations=no" option, I noticed that the
> linker was relaxing GOT references as absolute references for mov
> instructions, even if the symbol was in a high address, as long as I
> kept the compile-time base address of the kernel image in the top 2G. I
> consulted the "Optimize GOTPCRELX Relocations" chapter in x86-64 psABI,
> which stated that "When position-independent code is disabled and foo is
> defined locally in the lower 32-bit address space, memory operand in mov
> can be converted into immediate operand". However, it seemed that if the
> symbol was in the higher 32-bit address space, the memory operand in mov
> would also be converted into an immediate operand. If I decreased the
> compile-time base address of the kernel image, it would be relaxed as
> lea. Therefore, I believe that using "-mrelax-relocations=no" without
> "-pie" option is necessary.

Indeed. As you noted, the linker assumes that non-PIE linked binaries
will always appear at their link time address, and relaxations will
try to take advantage of that.

Currently, we use -pie linking only for the decompressor, and we
should be able to drop -mrelax-relocations=no from its LDFLAGS. But
position dependent linking should not use relaxations at all.

> Is there a way to force the linker to relax
> it as lea without using the "-pie" option when linking?
>

Not that I am aware of.

> Since all GOT references cannot be omitted, perhaps I should try linking
> the kernel with the "-pie" option.
>

That way, we will end up with two sets of relocations, the static ones
from --emit-relocs and the dynamic ones from -pie. This should be
manageable, given that the difference between those sets should
exactly cover the GOT.

However, relying on --emit-relocs and -pie at the same time seems
clumsy to me. I'd prefer to only depend on -pie at /some/ point.
  

Patch

diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index a6e8373a5170..dc1c2b08ec48 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -12,6 +12,11 @@  extern char __end_rodata_aligned[];
 
 #if defined(CONFIG_X86_64)
 extern char __end_rodata_hpage_align[];
+
+#ifdef CONFIG_X86_PIE
+extern char __start_got[], __end_got[];
+#endif
+
 #endif
 
 extern char __end_of_kernel_reserve[];
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 84ad0e61ba6e..051f88e6884e 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -129,6 +129,18 @@  int apply_relocate(Elf32_Shdr *sechdrs,
 	return 0;
 }
 #else /*X86_64*/
+#ifdef CONFIG_X86_PIE
+static u64 find_got_kernel_entry(Elf64_Sym *sym, const Elf64_Rela *rela)
+{
+	u64 *pos;
+
+	for (pos = (u64 *)__start_got; pos < (u64 *)__end_got; pos++)
+		if (*pos == sym->st_value)
+			return (u64)pos + rela->r_addend;
+	return 0;
+}
+#endif
+
 static int __write_relocate_add(Elf64_Shdr *sechdrs,
 		   const char *strtab,
 		   unsigned int symindex,
@@ -171,6 +183,7 @@  static int __write_relocate_add(Elf64_Shdr *sechdrs,
 		case R_X86_64_64:
 			size = 8;
 			break;
+#ifndef CONFIG_X86_PIE
 		case R_X86_64_32:
 			if (val != *(u32 *)&val)
 				goto overflow;
@@ -181,6 +194,13 @@  static int __write_relocate_add(Elf64_Shdr *sechdrs,
 				goto overflow;
 			size = 4;
 			break;
+#else
+		case R_X86_64_GOTPCREL:
+			val = find_got_kernel_entry(sym, rel);
+			if (!val)
+				goto unexpected_got_reference;
+			fallthrough;
+#endif
 		case R_X86_64_PC32:
 		case R_X86_64_PLT32:
 			val -= (u64)loc;
@@ -214,11 +234,18 @@  static int __write_relocate_add(Elf64_Shdr *sechdrs,
 	}
 	return 0;
 
+#ifdef CONFIG_X86_PIE
+unexpected_got_reference:
+	pr_err("Target got entry doesn't exist in kernel got, loc %p\n", loc);
+	return -ENOEXEC;
+#else
 overflow:
 	pr_err("overflow in relocation type %d val %Lx\n",
 	       (int)ELF64_R_TYPE(rel[i].r_info), val);
 	pr_err("`%s' likely not compiled with -mcmodel=kernel\n",
 	       me->name);
+#endif
+
 	return -ENOEXEC;
 }