[4/6] x86: efistub: Perform 4/5 level paging switch from the stub

Message ID 20230424165726.2245548-5-ardb@kernel.org
State New
Headers
Series efi/x86: Avoid legacy decompressor during EFI boot |

Commit Message

Ard Biesheuvel April 24, 2023, 4:57 p.m. UTC
In preparation for updating the EFI stub boot flow to avoid the bare
metal decompressor code altogether, implement the support code for
switching between 4 and 5 levels of paging before jumping to the kernel
proper.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 drivers/firmware/efi/libstub/efi-stub-helper.c |   4 +
 drivers/firmware/efi/libstub/x86-stub.c        | 145 ++++++++++++++++++++
 2 files changed, 149 insertions(+)
  

Comments

Kirill A. Shutemov April 26, 2023, 10:42 a.m. UTC | #1
On Mon, Apr 24, 2023 at 06:57:24PM +0200, Ard Biesheuvel wrote:
> In preparation for updating the EFI stub boot flow to avoid the bare
> metal decompressor code altogether, implement the support code for
> switching between 4 and 5 levels of paging before jumping to the kernel
> proper.

I must admit it is neat. I like it a lot.

Any chance we can share the code with the traditional decompressor?
There's not much that is EFI-specific here. It should be possible to isolate
it from the rest, no?


> @@ -792,6 +925,14 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
>  				(get_efi_config_table(ACPI_20_TABLE_GUID) ?:
>  				 get_efi_config_table(ACPI_TABLE_GUID));
>  
> +#ifdef CONFIG_X86_64
> +	status = efi_setup_5level_paging();
> +	if (status != EFI_SUCCESS) {
> +		efi_err("efi_setup_5level_paging() failed!\n");
> +		goto fail;
> +	}
> +#endif
> +
>  	/*
>  	 * If the kernel isn't already loaded at a suitable address,
>  	 * relocate it.
> @@ -910,6 +1051,10 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
>  		goto fail;
>  	}
>  
> +#ifdef CONFIG_X86_64
> +	efi_5level_switch();
> +#endif
> +
>  	return bzimage_addr;
>  fail:
>  	efi_err("efi_main() failed!\n");

Maybe use IS_ENABLED() + dummy efi_setup_5level_paging()/efi_5level_switch()
instead of #ifdefs?
  
Ard Biesheuvel April 26, 2023, 9:29 p.m. UTC | #2
On Wed, 26 Apr 2023 at 11:42, Kirill A . Shutemov
<kirill.shutemov@linux.intel.com> wrote:
>
> On Mon, Apr 24, 2023 at 06:57:24PM +0200, Ard Biesheuvel wrote:
> > In preparation for updating the EFI stub boot flow to avoid the bare
> > metal decompressor code altogether, implement the support code for
> > switching between 4 and 5 levels of paging before jumping to the kernel
> > proper.
>
> I must admit it is neat. I like it a lot.
>

Thanks!

> Any chance we can share the code with the traditional decompressor?
> There's not much that is EFI-specific here. It should be possible to isolate
> it from the rest, no?
>

I agree. The EFI boot code should still avoid the bare metal
trampoline allocation/deallocation, but the actual payload could be
the same - it's just an indirect call with the GDT and page table
pointers as arguments.

>
> > @@ -792,6 +925,14 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
> >                               (get_efi_config_table(ACPI_20_TABLE_GUID) ?:
> >                                get_efi_config_table(ACPI_TABLE_GUID));
> >
> > +#ifdef CONFIG_X86_64
> > +     status = efi_setup_5level_paging();
> > +     if (status != EFI_SUCCESS) {
> > +             efi_err("efi_setup_5level_paging() failed!\n");
> > +             goto fail;
> > +     }
> > +#endif
> > +
> >       /*
> >        * If the kernel isn't already loaded at a suitable address,
> >        * relocate it.
> > @@ -910,6 +1051,10 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
> >               goto fail;
> >       }
> >
> > +#ifdef CONFIG_X86_64
> > +     efi_5level_switch();
> > +#endif
> > +
> >       return bzimage_addr;
> >  fail:
> >       efi_err("efi_main() failed!\n");
>
> Maybe use IS_ENABLED() + dummy efi_setup_5level_paging()/efi_5level_switch()
> instead of #ifdefs?
>

These are functions returning void so I can just move the #ifdef into
the function implementation. We do need #ifdefs at some level, as i386
does not provide a definition for __KERNEL32_CS.
  

Patch

diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c
index 1e0203d74691ffcc..fc5f3b4c45e91401 100644
--- a/drivers/firmware/efi/libstub/efi-stub-helper.c
+++ b/drivers/firmware/efi/libstub/efi-stub-helper.c
@@ -16,6 +16,8 @@ 
 
 #include "efistub.h"
 
+extern bool efi_no5lvl;
+
 bool efi_nochunk;
 bool efi_nokaslr = !IS_ENABLED(CONFIG_RANDOMIZE_BASE);
 bool efi_novamap;
@@ -73,6 +75,8 @@  efi_status_t efi_parse_options(char const *cmdline)
 			efi_loglevel = CONSOLE_LOGLEVEL_QUIET;
 		} else if (!strcmp(param, "noinitrd")) {
 			efi_noinitrd = true;
+		} else if (IS_ENABLED(CONFIG_X86_64) && !strcmp(param, "no5lvl")) {
+			efi_no5lvl = true;
 		} else if (!strcmp(param, "efi") && val) {
 			efi_nochunk = parse_option_str(val, "nochunk");
 			efi_novamap |= parse_option_str(val, "novamap");
diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c
index e136c94037dda8d3..7b8717cbb96a1246 100644
--- a/drivers/firmware/efi/libstub/x86-stub.c
+++ b/drivers/firmware/efi/libstub/x86-stub.c
@@ -760,6 +760,139 @@  static efi_status_t exit_boot(struct boot_params *boot_params, void *handle)
 	return EFI_SUCCESS;
 }
 
+#ifdef CONFIG_X86_64
+bool efi_no5lvl;
+
+static const struct desc_struct gdt[] = {
+	[GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
+	[GDT_ENTRY_KERNEL_CS]   = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
+	[GDT_ENTRY_KERNEL_DS]   = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
+};
+
+static void (*la57_toggle)(void *cr3, void *gdt);
+
+static void __naked tmpl_toggle(void *cr3, void *gdt)
+{
+	/*
+	 * This is template code that will be copied into a 32-bit addressable
+	 * buffer, allowing us to drop to 32-bit mode with paging disabled,
+	 * which is required to be able to toggle the CR4.LA57 bit.
+	 *
+	 * The first MOVB instruction is only there to capture the size of the
+	 * sequence, and implicitly, the offset to the LJMP's immediate, which
+	 * will be populated with the correct absolute address after copying.
+	 */
+	asm("0:	movb	$(4f - .), %%al		\n\t"
+	    "	lgdt	(%%rsi)			\n\t"
+	    "	movw	%[ds], %%ax		\n\t"
+	    "	movw	%%ax, %%ds		\n\t"
+	    "	movw	%%ax, %%ss		\n\t"
+	    "	leaq	2f(%%rip), %%rax	\n\t"
+	    "	pushq	%[cs32]			\n\t"
+	    "	pushq	%%rax			\n\t"
+	    "	lretq				\n\t"
+	    "1:	retq				\n\t"
+	    "	.code32				\n\t"
+	    "2: movl	%%cr0, %%eax		\n\t"
+	    "	btrl	%[pg], %%eax		\n\t"
+	    "	movl	%%eax, %%cr0		\n\t"
+	    "	jmp	3f			\n\t"
+	    "3: movl	%%cr4, %%ecx		\n\t"
+	    "	btcl	%[la57], %%ecx		\n\t"
+	    "	movl	%%ecx, %%cr4		\n\t"
+	    "	movl	%%edi, %%cr3		\n\t"
+	    "	btsl	%[pg], %%eax		\n\t"
+	    "	movl	%%eax, %%cr0		\n\t"
+	    "	ljmpl	%[cs], $(1b - 0b)	\n\t"
+	    "4:	.code64"
+	    :
+	    : [cs32]	"i"(__KERNEL32_CS),
+	      [cs]	"i"(__KERNEL_CS),
+	      [ds]	"i"(__KERNEL_DS),
+	      [pg]	"i"(X86_CR0_PG_BIT),
+	      [la57]	"i"(X86_CR4_LA57_BIT));
+}
+
+/*
+ * Enabling (or disabling) 5 level paging is tricky, because it can only be
+ * done from 32-bit mode with paging disabled. This means not only that the
+ * code itself must be running from 32-bit addressable physical memory, but
+ * also that the root page table must be 32-bit addressable, as we cannot
+ * program a 64-bit value into CR3 when running in 32-bit mode.
+ */
+static efi_status_t efi_setup_5level_paging(void)
+{
+	const u8 tmpl_size = ((u8 *)tmpl_toggle)[1];
+	efi_status_t status;
+	u8 *la57_code;
+
+	if (!efi_is_64bit())
+		return EFI_SUCCESS;
+
+	/* check for 5 level paging support */
+	if (native_cpuid_eax(0) < 7 ||
+	    !(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
+		return EFI_SUCCESS;
+
+	/* allocate some 32-bit addressable memory for code and a page table */
+	status = efi_allocate_pages(2 * PAGE_SIZE, (unsigned long *)&la57_code,
+				    U32_MAX);
+	if (status != EFI_SUCCESS)
+		return status;
+
+	la57_toggle = memcpy(la57_code, tmpl_toggle, tmpl_size);
+	memset(la57_code + tmpl_size, 0x90, PAGE_SIZE - tmpl_size);
+
+	/*
+	 * To avoid having to allocate a 32-bit addressable stack, we use a
+	 * ljmp to switch back to long mode. However, this takes an absolute
+	 * address, so we have to poke it in at runtime. The dummy MOVB
+	 * instruction at the beginning can be used to locate the immediate.
+	 */
+	*(u32 *)&la57_code[tmpl_size - 6] += (unsigned long)la57_code;
+
+	adjust_memory_range_protection((unsigned long)la57_code, PAGE_SIZE);
+
+	return EFI_SUCCESS;
+}
+
+static void efi_5level_switch(void)
+{
+	bool want_la57 = IS_ENABLED(CONFIG_X86_5LEVEL) && !efi_no5lvl;
+	bool have_la57 = native_read_cr4() & X86_CR4_LA57;
+	bool need_toggle = want_la57 ^ have_la57;
+	u64 *pgt = (void *)la57_toggle + PAGE_SIZE;
+	u64 *cr3 = (u64 *)__native_read_cr3();
+	struct desc_ptr desc;
+	u64 *new_cr3;
+
+	if (!la57_toggle || !need_toggle)
+		return;
+
+	if (!have_la57) {
+		/*
+		 * We are going to enable 5 level paging, so we need to
+		 * allocate a root level page from the 32-bit addressable
+		 * physical region, and plug the existing hierarchy into it.
+		 */
+		new_cr3 = memset(pgt, 0, PAGE_SIZE);
+		new_cr3[0] = (u64)cr3 | _PAGE_TABLE_NOENC;
+	} else {
+		// take the new root table pointer from the current entry #0
+		new_cr3 = (u64 *)(cr3[0] & PAGE_MASK);
+
+		// copy the new root level table if it is not 32-bit addressable
+		if ((u64)new_cr3 > U32_MAX)
+			new_cr3 = memcpy(pgt, new_cr3, PAGE_SIZE);
+	}
+
+	desc.size       = sizeof(gdt) - 1;
+	desc.address    = (u64)gdt;
+
+	la57_toggle(new_cr3, &desc);
+}
+#endif
+
 /*
  * On success, we return the address of startup_32, which has potentially been
  * relocated by efi_relocate_kernel.
@@ -792,6 +925,14 @@  asmlinkage unsigned long efi_main(efi_handle_t handle,
 				(get_efi_config_table(ACPI_20_TABLE_GUID) ?:
 				 get_efi_config_table(ACPI_TABLE_GUID));
 
+#ifdef CONFIG_X86_64
+	status = efi_setup_5level_paging();
+	if (status != EFI_SUCCESS) {
+		efi_err("efi_setup_5level_paging() failed!\n");
+		goto fail;
+	}
+#endif
+
 	/*
 	 * If the kernel isn't already loaded at a suitable address,
 	 * relocate it.
@@ -910,6 +1051,10 @@  asmlinkage unsigned long efi_main(efi_handle_t handle,
 		goto fail;
 	}
 
+#ifdef CONFIG_X86_64
+	efi_5level_switch();
+#endif
+
 	return bzimage_addr;
 fail:
 	efi_err("efi_main() failed!\n");