[4/6] x86: efistub: Perform 4/5 level paging switch from the stub
Commit Message
In preparation for updating the EFI stub boot flow to avoid the bare
metal decompressor code altogether, implement the support code for
switching between 4 and 5 levels of paging before jumping to the kernel
proper.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
drivers/firmware/efi/libstub/efi-stub-helper.c | 4 +
drivers/firmware/efi/libstub/x86-stub.c | 145 ++++++++++++++++++++
2 files changed, 149 insertions(+)
Comments
On Mon, Apr 24, 2023 at 06:57:24PM +0200, Ard Biesheuvel wrote:
> In preparation for updating the EFI stub boot flow to avoid the bare
> metal decompressor code altogether, implement the support code for
> switching between 4 and 5 levels of paging before jumping to the kernel
> proper.
I must admit it is neat. I like it a lot.
Any chance we can share the code with the traditional decompressor?
There's not much that is EFI specific here. It should be possible to isolate
it from the rest, no?
> @@ -792,6 +925,14 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
> (get_efi_config_table(ACPI_20_TABLE_GUID) ?:
> get_efi_config_table(ACPI_TABLE_GUID));
>
> +#ifdef CONFIG_X86_64
> + status = efi_setup_5level_paging();
> + if (status != EFI_SUCCESS) {
> + efi_err("efi_setup_5level_paging() failed!\n");
> + goto fail;
> + }
> +#endif
> +
> /*
> * If the kernel isn't already loaded at a suitable address,
> * relocate it.
> @@ -910,6 +1051,10 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
> goto fail;
> }
>
> +#ifdef CONFIG_X86_64
> + efi_5level_switch();
> +#endif
> +
> return bzimage_addr;
> fail:
> efi_err("efi_main() failed!\n");
Maybe use IS_ENABLED() + dummy efi_setup_5level_paging()/efi_5level_switch()
instead of #ifdefs?
On Wed, 26 Apr 2023 at 11:42, Kirill A . Shutemov
<kirill.shutemov@linux.intel.com> wrote:
>
> On Mon, Apr 24, 2023 at 06:57:24PM +0200, Ard Biesheuvel wrote:
> > In preparation for updating the EFI stub boot flow to avoid the bare
> > metal decompressor code altogether, implement the support code for
> > switching between 4 and 5 levels of paging before jumping to the kernel
> > proper.
>
> I must admit it is neat. I like it a lot.
>
Thanks!
> Any chance we can share the code with the traditional decompressor?
> There's not much that EFI specific here. It should be possible to isolate
> it from the rest, no?
>
I agree. The EFI boot code should still avoid the bare metal
trampoline allocation/deallocation, but the actual payload could be
the same - it's just an indirect call with the GDT and page table
pointers as arguments.
>
> > @@ -792,6 +925,14 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
> > (get_efi_config_table(ACPI_20_TABLE_GUID) ?:
> > get_efi_config_table(ACPI_TABLE_GUID));
> >
> > +#ifdef CONFIG_X86_64
> > + status = efi_setup_5level_paging();
> > + if (status != EFI_SUCCESS) {
> > + efi_err("efi_setup_5level_paging() failed!\n");
> > + goto fail;
> > + }
> > +#endif
> > +
> > /*
> > * If the kernel isn't already loaded at a suitable address,
> > * relocate it.
> > @@ -910,6 +1051,10 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
> > goto fail;
> > }
> >
> > +#ifdef CONFIG_X86_64
> > + efi_5level_switch();
> > +#endif
> > +
> > return bzimage_addr;
> > fail:
> > efi_err("efi_main() failed!\n");
>
> Maybe use IS_ENABLED() + dummy efi_setup_5level_paging()/efi_5level_switch()
> instead of #ifdefs?
>
These are functions returning void so I can just move the #ifdef into
the function implementation. We do need #ifdefs at some level, as i386
does not provide a definition for __KERNEL32_CS.
@@ -16,6 +16,8 @@
#include "efistub.h"
+extern bool efi_no5lvl;
+
bool efi_nochunk;
bool efi_nokaslr = !IS_ENABLED(CONFIG_RANDOMIZE_BASE);
bool efi_novamap;
@@ -73,6 +75,8 @@ efi_status_t efi_parse_options(char const *cmdline)
efi_loglevel = CONSOLE_LOGLEVEL_QUIET;
} else if (!strcmp(param, "noinitrd")) {
efi_noinitrd = true;
+ } else if (IS_ENABLED(CONFIG_X86_64) && !strcmp(param, "no5lvl")) {
+ efi_no5lvl = true;
} else if (!strcmp(param, "efi") && val) {
efi_nochunk = parse_option_str(val, "nochunk");
efi_novamap |= parse_option_str(val, "novamap");
@@ -760,6 +760,139 @@ static efi_status_t exit_boot(struct boot_params *boot_params, void *handle)
return EFI_SUCCESS;
}
+#ifdef CONFIG_X86_64
+bool efi_no5lvl; /* set when "no5lvl" is passed on the command line */
+
+static const struct desc_struct gdt[] = { /* GDT handed to la57_toggle() by efi_5level_switch() */
+ [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), /* code segment for the .code32 part of tmpl_toggle */
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), /* code segment for the LJMP back to 64-bit code */
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), /* data segment loaded into %ds and %ss */
+};
+
+static void (*la57_toggle)(void *cr3, void *gdt); /* 32-bit addressable copy of tmpl_toggle, or NULL if not set up */
+
+static void __naked tmpl_toggle(void *cr3, void *gdt)
+{
+ /*
+ * Template code that gets copied into a 32-bit addressable buffer so we
+ * can drop to 32-bit mode with paging disabled, as required to toggle
+ * CR4.LA57. It loads the GDT from (%rsi) and writes %edi to CR3.
+ *
+ * The first MOVB instruction is only there to capture the size of the
+ * sequence, and implicitly, the offset to the LJMP's immediate, which
+ * will be populated with the correct absolute address after copying.
+ */
+ asm("0: movb $(4f - .), %%al \n\t" /* immediate byte == size of the template (0: to 4:) */
+ " lgdt (%%rsi) \n\t"
+ " movw %[ds], %%ax \n\t"
+ " movw %%ax, %%ds \n\t"
+ " movw %%ax, %%ss \n\t"
+ " leaq 2f(%%rip), %%rax \n\t"
+ " pushq %[cs32] \n\t"
+ " pushq %%rax \n\t"
+ " lretq \n\t" /* far return to 2: using the 32-bit code segment */
+ "1: retq \n\t" /* LJMP target: back in 64-bit code, return to the caller */
+ " .code32 \n\t"
+ "2: movl %%cr0, %%eax \n\t"
+ " btrl %[pg], %%eax \n\t" /* clear CR0.PG */
+ " movl %%eax, %%cr0 \n\t"
+ " jmp 3f \n\t"
+ "3: movl %%cr4, %%ecx \n\t"
+ " btcl %[la57], %%ecx \n\t" /* flip CR4.LA57 */
+ " movl %%ecx, %%cr4 \n\t"
+ " movl %%edi, %%cr3 \n\t" /* install the new root page table */
+ " btsl %[pg], %%eax \n\t" /* set CR0.PG again */
+ " movl %%eax, %%cr0 \n\t"
+ " ljmpl %[cs], $(1b - 0b) \n\t" /* offset of 1: in the template; made absolute after copying */
+ "4: .code64"
+ :
+ : [cs32] "i"(__KERNEL32_CS),
+ [cs] "i"(__KERNEL_CS),
+ [ds] "i"(__KERNEL_DS),
+ [pg] "i"(X86_CR0_PG_BIT),
+ [la57] "i"(X86_CR4_LA57_BIT));
+}
+
+/*
+ * Enabling (or disabling) 5 level paging is tricky, because it can only be
+ * done from 32-bit mode with paging disabled. So both the code doing it and
+ * the root page table must be 32-bit addressable, as we cannot program a
+ * 64-bit value into CR3 in 32-bit mode. This allocates that memory and copies
+ * the code; returns EFI_SUCCESS early if !efi_is_64bit() or CPUID lacks LA57.
+ */
+static efi_status_t efi_setup_5level_paging(void)
+{
+ const u8 tmpl_size = ((u8 *)tmpl_toggle)[1]; /* immediate byte of the leading MOVB */
+ efi_status_t status;
+ u8 *la57_code;
+
+ if (!efi_is_64bit())
+ return EFI_SUCCESS;
+
+ /* check for 5 level paging support */
+ if (native_cpuid_eax(0) < 7 ||
+ !(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
+ return EFI_SUCCESS;
+
+ /* allocate two 32-bit addressable pages: one for code, one for a page table */
+ status = efi_allocate_pages(2 * PAGE_SIZE, (unsigned long *)&la57_code,
+ U32_MAX);
+ if (status != EFI_SUCCESS)
+ return status;
+
+ la57_toggle = memcpy(la57_code, tmpl_toggle, tmpl_size);
+ memset(la57_code + tmpl_size, 0x90, PAGE_SIZE - tmpl_size); /* fill the rest of the code page with 0x90 (NOP) */
+
+ /*
+ * To avoid having to allocate a 32-bit addressable stack, we use a ljmp
+ * to switch back to long mode. However, this takes an absolute address,
+ * so we have to poke it in at runtime: the size taken from the dummy MOVB
+ * locates the ljmp's 32-bit immediate, 6 bytes before the template's end.
+ */
+ *(u32 *)&la57_code[tmpl_size - 6] += (unsigned long)la57_code;
+
+ adjust_memory_range_protection((unsigned long)la57_code, PAGE_SIZE); /* NOTE(review): presumably makes the code page executable - confirm */
+
+ return EFI_SUCCESS;
+}
+
+static void efi_5level_switch(void)
+{
+ bool want_la57 = IS_ENABLED(CONFIG_X86_5LEVEL) && !efi_no5lvl; /* built in and not disabled via efi_no5lvl */
+ bool have_la57 = native_read_cr4() & X86_CR4_LA57; /* current state of CR4.LA57 */
+ bool need_toggle = want_la57 ^ have_la57;
+ u64 *pgt = (void *)la57_toggle + PAGE_SIZE; /* page following the copied toggle code */
+ u64 *cr3 = (u64 *)__native_read_cr3();
+ struct desc_ptr desc;
+ u64 *new_cr3;
+
+ if (!la57_toggle || !need_toggle) /* toggle code was never set up, or nothing to change */
+ return;
+
+ if (!have_la57) {
+ /*
+ * We are going to enable 5 level paging, so we need to
+ * allocate a root level page from the 32-bit addressable
+ * physical region, and plug the existing hierarchy into it.
+ */
+ new_cr3 = memset(pgt, 0, PAGE_SIZE);
+ new_cr3[0] = (u64)cr3 | _PAGE_TABLE_NOENC; /* current root becomes entry #0 of the new root */
+ } else {
+ // disabling: the table referenced by entry #0 of the current root becomes the new root
+ new_cr3 = (u64 *)(cr3[0] & PAGE_MASK);
+
+ // copy it into pgt if it is not 32-bit addressable, as only 32 bits get written to CR3
+ if ((u64)new_cr3 > U32_MAX)
+ new_cr3 = memcpy(pgt, new_cr3, PAGE_SIZE);
+ }
+
+ desc.size = sizeof(gdt) - 1;
+ desc.address = (u64)gdt;
+
+ la57_toggle(new_cr3, &desc); /* runs the copied tmpl_toggle: flips CR4.LA57 and loads new_cr3 */
+}
+#endif
+
/*
* On success, we return the address of startup_32, which has potentially been
* relocated by efi_relocate_kernel.
@@ -792,6 +925,14 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
(get_efi_config_table(ACPI_20_TABLE_GUID) ?:
get_efi_config_table(ACPI_TABLE_GUID));
+#ifdef CONFIG_X86_64
+ status = efi_setup_5level_paging();
+ if (status != EFI_SUCCESS) {
+ efi_err("efi_setup_5level_paging() failed!\n");
+ goto fail;
+ }
+#endif
+
/*
* If the kernel isn't already loaded at a suitable address,
* relocate it.
@@ -910,6 +1051,10 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
goto fail;
}
+#ifdef CONFIG_X86_64
+ efi_5level_switch();
+#endif
+
return bzimage_addr;
fail:
efi_err("efi_main() failed!\n");