[v3,08/19] x86/head64: Replace pointer fixups with PIE codegen
Commit Message
From: Ard Biesheuvel <ardb@kernel.org>
Some of the C code in head64.c may be called from a different virtual
address than it was linked at. Currently, we deal with this by using
ordinary, position dependent codegen, and fixing up all symbol
references on the fly. This is fragile and tricky to maintain. It is
also unnecessary: we can use position independent codegen (with hidden
visibility) to ensure that all compiler generated symbol references are
RIP-relative, removing the need for fixups entirely.
It does mean we need explicit references to kernel virtual addresses to
be generated by hand, so generate those using a movabs instruction in
inline asm in the handful of places where we actually need this.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
arch/x86/Makefile | 8 ++
arch/x86/boot/compressed/Makefile | 2 +-
arch/x86/include/asm/desc.h | 3 +-
arch/x86/include/asm/setup.h | 4 +-
arch/x86/kernel/Makefile | 5 ++
arch/x86/kernel/head64.c | 88 +++++++-------------
arch/x86/kernel/head_64.S | 5 +-
7 files changed, 51 insertions(+), 64 deletions(-)
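For illustration, the "references to kernel virtual addresses generated by hand" mentioned above come down to a small helper like the __va_symbol() macro that the head64.c hunk below introduces. A commented sketch of the same idea (the macro body mirrors the patch; the load_delta line is only an illustrative use, and __stringify() comes from <linux/stringify.h>):

/*
 * Materialize the absolute (link-time, i.e. kernel virtual) address of a
 * symbol. Under -fpie every compiler-generated reference is RIP-relative
 * and therefore yields the address the code is currently running at, so
 * the link-time address has to be produced explicitly via an immediate.
 */
#define __va_symbol(sym) ({					\
	unsigned long __v;					\
	asm("movq $" __stringify(sym) ", %0" : "=r"(__v));	\
	__v; })

/* e.g. the delta between where the kernel runs and where it was linked: */
load_delta = (unsigned long)_text - (__va_symbol(_text) - __START_KERNEL_map);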
Comments
On Mon, Jan 29, 2024 at 07:05:11PM +0100, Ard Biesheuvel wrote:
> From: Ard Biesheuvel <ardb@kernel.org>
>
> Some of the C code in head64.c may be called from a different virtual
> address than it was linked at. Currently, we deal with this by using
Yeah, make passive pls: "Currently, this is done by using... "
> ordinary, position dependent codegen, and fixing up all symbol
> references on the fly. This is fragile and tricky to maintain. It is
> also unnecessary: we can use position independent codegen (with hidden
^^^
Ditto: "use ..."
In the comments below too, pls, where it says "we".
> visibility) to ensure that all compiler generated symbol references are
> RIP-relative, removing the need for fixups entirely.
>
> It does mean we need explicit references to kernel virtual addresses to
> be generated by hand, so generate those using a movabs instruction in
> inline asm in the handful of places where we actually need this.
>
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> ---
> arch/x86/Makefile | 8 ++
> arch/x86/boot/compressed/Makefile | 2 +-
> arch/x86/include/asm/desc.h | 3 +-
> arch/x86/include/asm/setup.h | 4 +-
> arch/x86/kernel/Makefile | 5 ++
> arch/x86/kernel/head64.c | 88 +++++++-------------
> arch/x86/kernel/head_64.S | 5 +-
> 7 files changed, 51 insertions(+), 64 deletions(-)
>
> diff --git a/arch/x86/Makefile b/arch/x86/Makefile
> index 1a068de12a56..2b5954e75318 100644
> --- a/arch/x86/Makefile
> +++ b/arch/x86/Makefile
> @@ -168,6 +168,14 @@ else
> KBUILD_CFLAGS += -mcmodel=kernel
> KBUILD_RUSTFLAGS += -Cno-redzone=y
> KBUILD_RUSTFLAGS += -Ccode-model=kernel
> +
> + PIE_CFLAGS-$(CONFIG_STACKPROTECTOR) += -fno-stack-protector
Main Makefile has
KBUILD_CFLAGS += -fno-PIE
and this ends up being:
gcc -Wp,-MMD,arch/x86/kernel/.head64.s.d -nostdinc ... -fno-PIE ... -fpie ... -fverbose-asm -S -o arch/x86/kernel/head64.s arch/x86/kernel/head64.c
Can you pls remove -fno-PIE from those TUs which use PIE_CFLAGS so that
there's no confusion when staring at V=1 output?
> + PIE_CFLAGS-$(CONFIG_LTO) += -fno-lto
> +
> + PIE_CFLAGS := -fpie -mcmodel=small $(PIE_CFLAGS-y) \
> + -include $(srctree)/include/linux/hidden.h
> +
> + export PIE_CFLAGS
> endif
>
> #
Other than that, that code becomes much more readable, cool!
Thx.
On Mon, 12 Feb 2024 at 11:29, Borislav Petkov <bp@alien8.de> wrote:
>
> On Mon, Jan 29, 2024 at 07:05:11PM +0100, Ard Biesheuvel wrote:
> > From: Ard Biesheuvel <ardb@kernel.org>
> >
> > Some of the C code in head64.c may be called from a different virtual
> > address than it was linked at. Currently, we deal with this by using
>
> Yeah, make passive pls: "Currently, this is done by using... "
>
> > ordinary, position dependent codegen, and fixing up all symbol
> > references on the fly. This is fragile and tricky to maintain. It is
> > also unnecessary: we can use position independent codegen (with hidden
> ^^^
> Ditto: "use ..."
>
> In the comments below too, pls, where it says "we".
>
Ack.
> > visibility) to ensure that all compiler generated symbol references are
> > RIP-relative, removing the need for fixups entirely.
> >
> > It does mean we need explicit references to kernel virtual addresses to
> > be generated by hand, so generate those using a movabs instruction in
> > inline asm in the handful of places where we actually need this.
> >
> > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> > ---
> > arch/x86/Makefile | 8 ++
> > arch/x86/boot/compressed/Makefile | 2 +-
> > arch/x86/include/asm/desc.h | 3 +-
> > arch/x86/include/asm/setup.h | 4 +-
> > arch/x86/kernel/Makefile | 5 ++
> > arch/x86/kernel/head64.c | 88 +++++++-------------
> > arch/x86/kernel/head_64.S | 5 +-
> > 7 files changed, 51 insertions(+), 64 deletions(-)
> >
> > diff --git a/arch/x86/Makefile b/arch/x86/Makefile
> > index 1a068de12a56..2b5954e75318 100644
> > --- a/arch/x86/Makefile
> > +++ b/arch/x86/Makefile
> > @@ -168,6 +168,14 @@ else
> > KBUILD_CFLAGS += -mcmodel=kernel
> > KBUILD_RUSTFLAGS += -Cno-redzone=y
> > KBUILD_RUSTFLAGS += -Ccode-model=kernel
> > +
> > + PIE_CFLAGS-$(CONFIG_STACKPROTECTOR) += -fno-stack-protector
>
> Main Makefile has
>
> KBUILD_CFLAGS += -fno-PIE
>
> and this ends up being:
>
> gcc -Wp,-MMD,arch/x86/kernel/.head64.s.d -nostdinc ... -fno-PIE ... -fpie ... -fverbose-asm -S -o arch/x86/kernel/head64.s arch/x86/kernel/head64.c
>
> Can you pls remove -fno-PIE from those TUs which use PIE_CFLAGS so that
> there's no confusion when staring at V=1 output?
>
Yeah. That would mean adding PIE_CFLAGS_REMOVE alongside PIE_CFLAGS
and applying both in every place it is used, but we are only dealing
with a handful of object files here.
> > + PIE_CFLAGS-$(CONFIG_LTO) += -fno-lto
> > +
> > + PIE_CFLAGS := -fpie -mcmodel=small $(PIE_CFLAGS-y) \
> > + -include $(srctree)/include/linux/hidden.h
> > +
> > + export PIE_CFLAGS
> > endif
> >
> > #
>
> Other than that, that code becomes much more readable, cool!
>
Thanks. But now that we have RIP_REL_REF(), I might split the cleanup
from the actual switch to -fpie, which I am still a bit on the fence
about, given different compiler versions, LTO, etc.
RIP_REL_REF(foo) just turns into 'foo' when compiling with -fpie and
we could drop those piecemeal once we are confident that -fpie does
not cause any regressions.
Note that I have some reservations now about .pi.text as well: it is a
bit intrusive, and on x86, we might just as well move everything that
executes from the 1:1 mapping into .head.text, and teach objtool that
those sections should not contain any ELF relocations involving
absolute addresses. But this is another thing that I want to spend a
bit more time on before I respin it, so I will just do the cleanup in
the next revision, and add the rigid correctness checks the next
cycle.
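(For reference, RIP_REL_REF() boils down to an accessor that forces a RIP-relative reference to a global from code that is otherwise built position dependent; a sketch of the idea, not necessarily the exact kernel definition:)

/* Sketch of the RIP_REL_REF() idea; the argument must be a global symbol */
static __always_inline void *rip_rel_ptr(void *p)
{
	/* emit a RIP-relative lea for the symbol's address */
	asm("leaq %c1(%%rip), %0" : "=r"(p) : "i"(p));
	return p;
}
#define RIP_REL_REF(var)	(*(typeof(&(var)))rip_rel_ptr(&(var)))

Under -fpie the compiler emits such RIP-relative references by itself, which is why RIP_REL_REF(foo) can later be reduced to plain 'foo', as noted above.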
On Mon, Feb 12, 2024 at 12:52:01PM +0100, Ard Biesheuvel wrote:
> Yeah. That would mean adding PIE_CFLAGS_REMOVE alongside PIE_CFLAGS
> and applying both in every place it is used, but we are only dealing
> with a handful of object files here.
Right.
And we already have such a thing with PURGATORY_CFLAGS_REMOVE.
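For illustration, a rough, untested sketch of what that could look like (PIE_CFLAGS_REMOVE is a hypothetical name here, modelled on the PURGATORY_CFLAGS_REMOVE pattern):

# arch/x86/Makefile: flags from KBUILD_CFLAGS that conflict with PIE
# codegen and should be filtered out again for the PIE objects
PIE_CFLAGS_REMOVE := -fno-PIE
export PIE_CFLAGS_REMOVE

# arch/x86/kernel/Makefile: apply both to each affected object file
CFLAGS_REMOVE_head64.o	+= $(PIE_CFLAGS_REMOVE)
CFLAGS_head64.o		+= $(PIE_CFLAGS)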
> Thanks. But now that we have RIP_REL_REF(), I might split the cleanup
> from the actual switch to -fpie, which I am still a bit on the fence
> about, given different compiler versions, LTO, etc.
Tell me about it. Considering how much jumping through hoops we had to
do in recent years to accommodate building the source with the different
compilers, I'm all for being very conservative here.
> RIP_REL_REF(foo) just turns into 'foo' when compiling with -fpie and
> we could drop those piecemeal once we are confident that -fpie does
> not cause any regressions.
Ack.
> Note that I have some reservations now about .pi.text as well: it is a
> bit intrusive, and on x86, we might just as well move everything that
> executes from the 1:1 mapping into .head.text, and teach objtool that
> those sections should not contain any ELF relocations involving
> absolute addresses. But this is another thing that I want to spend a
> bit more time on before I respin it, so I will just do the cleanup in
> the next revision, and add the rigid correctness checks the next
> cycle.
I am fully on board with being conservative and doing things in small
steps considering how many bugs tend to fall out when the stuff hits
upstream. So going slowly and making sure our sanity is intact is a very
good idea!
Thx.
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -168,6 +168,14 @@ else
KBUILD_CFLAGS += -mcmodel=kernel
KBUILD_RUSTFLAGS += -Cno-redzone=y
KBUILD_RUSTFLAGS += -Ccode-model=kernel
+
+ PIE_CFLAGS-$(CONFIG_STACKPROTECTOR) += -fno-stack-protector
+ PIE_CFLAGS-$(CONFIG_LTO) += -fno-lto
+
+ PIE_CFLAGS := -fpie -mcmodel=small $(PIE_CFLAGS-y) \
+ -include $(srctree)/include/linux/hidden.h
+
+ export PIE_CFLAGS
endif
#
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -84,7 +84,7 @@ LDFLAGS_vmlinux += -T
hostprogs := mkpiggy
HOST_EXTRACFLAGS += -I$(srctree)/tools/include
-sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p'
+sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABbCDGRSTtVW] \(_text\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p'
quiet_cmd_voffset = VOFFSET $@
cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -134,7 +134,8 @@ static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt))
-static inline void native_write_idt_entry(gate_desc *idt, int entry, const gate_desc *gate)
+static __always_inline void
+native_write_idt_entry(gate_desc *idt, int entry, const gate_desc *gate)
{
memcpy(&idt[entry], gate, sizeof(*gate));
}
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -47,8 +47,8 @@ extern unsigned long saved_video_mode;
extern void reserve_standard_io_resources(void);
extern void i386_reserve_resources(void);
-extern unsigned long __startup_64(unsigned long physaddr, struct boot_params *bp);
-extern void startup_64_setup_env(unsigned long physbase);
+extern unsigned long __startup_64(struct boot_params *bp);
+extern void startup_64_setup_env(void);
extern void early_setup_idt(void);
extern void __init do_early_exception(struct pt_regs *regs, int trapnr);
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -21,6 +21,11 @@ CFLAGS_REMOVE_sev.o = -pg
CFLAGS_REMOVE_rethook.o = -pg
endif
+# head64.c contains C code that may execute from a different virtual address
+# than it was linked at, so we always build it using PIE codegen
+CFLAGS_head64.o += $(PIE_CFLAGS)
+UBSAN_SANITIZE_head64.o := n
+
KASAN_SANITIZE_head$(BITS).o := n
KASAN_SANITIZE_dumpstack.o := n
KASAN_SANITIZE_dumpstack_$(BITS).o := n
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -74,15 +74,10 @@ static struct desc_ptr startup_gdt_descr __initdata = {
.address = 0,
};
-static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
-{
- return ptr - (void *)_text + (void *)physaddr;
-}
-
-static unsigned long __head *fixup_long(void *ptr, unsigned long physaddr)
-{
- return fixup_pointer(ptr, physaddr);
-}
+#define __va_symbol(sym) ({ \
+ unsigned long __v; \
+ asm("movq $" __stringify(sym) ", %0":"=r"(__v)); \
+ __v; })
static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd)
{
@@ -99,8 +94,8 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv
* attribute.
*/
if (sme_get_me_mask()) {
- vaddr = (unsigned long)__start_bss_decrypted;
- vaddr_end = (unsigned long)__end_bss_decrypted;
+ vaddr = __va_symbol(__start_bss_decrypted);
+ vaddr_end = __va_symbol(__end_bss_decrypted);
for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
/*
@@ -127,25 +122,17 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv
return sme_get_me_mask();
}
-/* Code in __startup_64() can be relocated during execution, but the compiler
- * doesn't have to generate PC-relative relocations when accessing globals from
- * that function. Clang actually does not generate them, which leads to
- * boot-time crashes. To work around this problem, every global pointer must
- * be adjusted using fixup_pointer().
- */
-unsigned long __head __startup_64(unsigned long physaddr,
- struct boot_params *bp)
+unsigned long __head __startup_64(struct boot_params *bp)
{
+ unsigned long physaddr = (unsigned long)_text;
unsigned long load_delta, *p;
unsigned long pgtable_flags;
pgdval_t *pgd;
p4dval_t *p4d;
pudval_t *pud;
pmdval_t *pmd, pmd_entry;
- pteval_t *mask_ptr;
bool la57;
int i;
- unsigned int *next_pgt_ptr;
la57 = pgtable_l5_enabled();
@@ -157,7 +144,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
* Compute the delta between the address I am compiled to run at
* and the address I am actually running at.
*/
- load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
+ load_delta = physaddr - (__va_symbol(_text) - __START_KERNEL_map);
/* Is the address not 2M aligned? */
if (load_delta & ~PMD_MASK)
@@ -168,26 +155,24 @@ unsigned long __head __startup_64(unsigned long physaddr,
/* Fixup the physical addresses in the page table */
- pgd = fixup_pointer(early_top_pgt, physaddr);
+ pgd = (pgdval_t *)early_top_pgt;
p = pgd + pgd_index(__START_KERNEL_map);
if (la57)
*p = (unsigned long)level4_kernel_pgt;
else
*p = (unsigned long)level3_kernel_pgt;
- *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta;
+ *p += _PAGE_TABLE_NOENC + sme_get_me_mask();
if (la57) {
- p4d = fixup_pointer(level4_kernel_pgt, physaddr);
+ p4d = (p4dval_t *)level4_kernel_pgt;
p4d[511] += load_delta;
}
- pud = fixup_pointer(level3_kernel_pgt, physaddr);
- pud[510] += load_delta;
- pud[511] += load_delta;
+ level3_kernel_pgt[510].pud += load_delta;
+ level3_kernel_pgt[511].pud += load_delta;
- pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
- pmd[i] += load_delta;
+ level2_fixmap_pgt[i].pmd += load_delta;
/*
* Set up the identity mapping for the switchover. These
@@ -196,15 +181,13 @@ unsigned long __head __startup_64(unsigned long physaddr,
* it avoids problems around wraparound.
*/
- next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr);
- pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
- pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
+ pud = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
+ pmd = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
if (la57) {
- p4d = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++],
- physaddr);
+ p4d = (p4dval_t *)early_dynamic_pgts[next_early_pgt++];
i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
@@ -225,8 +208,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
/* Filter out unsupported __PAGE_KERNEL_* bits: */
- mask_ptr = fixup_pointer(&__supported_pte_mask, physaddr);
- pmd_entry &= *mask_ptr;
+ pmd_entry &= __supported_pte_mask;
pmd_entry += sme_get_me_mask();
pmd_entry += physaddr;
@@ -252,14 +234,14 @@ unsigned long __head __startup_64(unsigned long physaddr,
* error, causing the BIOS to halt the system.
*/
- pmd = fixup_pointer(level2_kernel_pgt, physaddr);
+ pmd = (pmdval_t *)level2_kernel_pgt;
/* invalidate pages before the kernel image */
- for (i = 0; i < pmd_index((unsigned long)_text); i++)
+ for (i = 0; i < pmd_index(__va_symbol(_text)); i++)
pmd[i] &= ~_PAGE_PRESENT;
/* fixup pages that are part of the kernel image */
- for (; i <= pmd_index((unsigned long)_end); i++)
+ for (; i <= pmd_index(__va_symbol(_end)); i++)
if (pmd[i] & _PAGE_PRESENT)
pmd[i] += load_delta;
@@ -271,7 +253,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
* Fixup phys_base - remove the memory encryption mask to obtain
* the true physical address.
*/
- *fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask();
+ phys_base += load_delta - sme_get_me_mask();
return sme_postprocess_startup(bp, pmd);
}
@@ -553,22 +535,16 @@ static void set_bringup_idt_handler(gate_desc *idt, int n, void *handler)
}
/* This runs while still in the direct mapping */
-static void __head startup_64_load_idt(unsigned long physbase)
+static void __head startup_64_load_idt(void)
{
- struct desc_ptr *desc = fixup_pointer(&bringup_idt_descr, physbase);
- gate_desc *idt = fixup_pointer(bringup_idt_table, physbase);
-
-
- if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
- void *handler;
+ gate_desc *idt = bringup_idt_table;
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
/* VMM Communication Exception */
- handler = fixup_pointer(vc_no_ghcb, physbase);
- set_bringup_idt_handler(idt, X86_TRAP_VC, handler);
- }
+ set_bringup_idt_handler(idt, X86_TRAP_VC, vc_no_ghcb);
- desc->address = (unsigned long)idt;
- native_load_idt(desc);
+ bringup_idt_descr.address = (unsigned long)idt;
+ native_load_idt(&bringup_idt_descr);
}
/* This is used when running on kernel addresses */
@@ -587,10 +563,10 @@ void early_setup_idt(void)
/*
* Setup boot CPU state needed before kernel switches to virtual addresses.
*/
-void __head startup_64_setup_env(unsigned long physbase)
+void __head startup_64_setup_env(void)
{
/* Load GDT */
- startup_gdt_descr.address = (unsigned long)fixup_pointer(startup_gdt, physbase);
+ startup_gdt_descr.address = (unsigned long)startup_gdt;
native_load_gdt(&startup_gdt_descr);
/* New GDT is live - reload data segment registers */
@@ -598,5 +574,5 @@ void __head startup_64_setup_env(unsigned long physbase)
"movl %%eax, %%ss\n"
"movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory");
- startup_64_load_idt(physbase);
+ startup_64_load_idt();
}
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -67,8 +67,6 @@ SYM_CODE_START_NOALIGN(startup_64)
/* Set up the stack for verify_cpu() */
leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp
- leaq _text(%rip), %rdi
-
/* Setup GSBASE to allow stack canary access for C code */
movl $MSR_GS_BASE, %ecx
leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx
@@ -107,8 +105,7 @@ SYM_CODE_START_NOALIGN(startup_64)
* is active) to be added to the initial pgdir entry that will be
* programmed into CR3.
*/
- leaq _text(%rip), %rdi
- movq %r15, %rsi
+ movq %r15, %rdi
call __startup_64
/* Form the CR3 value being sure to include the CR3 modifier */