[v5,13/20] x86/efistub: Perform 4/5 level paging switch from the stub

Message ID 20230607072342.4054036-14-ardb@kernel.org
State New
Headers
Series efi/x86: Avoid bare metal decompressor during EFI boot |

Commit Message

Ard Biesheuvel June 7, 2023, 7:23 a.m. UTC
  In preparation for updating the EFI stub boot flow to avoid the bare
metal decompressor code altogether, implement the support code for
switching between 4 and 5 levels of paging before jumping to the kernel
proper.

This reuses the newly refactored trampoline that the bare metal
decompressor uses, but relies on EFI APIs to allocate 32-bit addressable
memory and remap it with the appropriate permissions. Given that the
bare metal decompressor will no longer call into the trampoline if the
number of paging levels is already set correctly, it is no longer
necessary to remove NX restrictions from the memory range where this
trampoline may end up.

Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 drivers/firmware/efi/libstub/Makefile          |  1 +
 drivers/firmware/efi/libstub/efi-stub-helper.c |  2 +
 drivers/firmware/efi/libstub/efistub.h         |  1 +
 drivers/firmware/efi/libstub/x86-5lvl.c        | 95 ++++++++++++++++++++
 drivers/firmware/efi/libstub/x86-stub.c        | 40 +++------
 drivers/firmware/efi/libstub/x86-stub.h        | 17 ++++
 6 files changed, 130 insertions(+), 26 deletions(-)
  

Comments

Yunhong Jiang June 7, 2023, 8:19 p.m. UTC | #1
On Wed, Jun 07, 2023 at 09:23:35AM +0200, Ard Biesheuvel wrote:
> In preparation for updating the EFI stub boot flow to avoid the bare
> metal decompressor code altogether, implement the support code for
> switching between 4 and 5 levels of paging before jumping to the kernel
> proper.
> 
> This reuses the newly refactored trampoline that the bare metal
> decompressor uses, but relies on EFI APIs to allocate 32-bit addressable
> memory and remap it with the appropriate permissions. Given that the
> bare metal decompressor will no longer call into the trampoline if the
> number of paging levels is already set correctly, it is no longer needed
> to remove NX restrictions from the memory range where this trampoline
> may end up.
> 
> Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> ---
>  drivers/firmware/efi/libstub/Makefile          |  1 +
>  drivers/firmware/efi/libstub/efi-stub-helper.c |  2 +
>  drivers/firmware/efi/libstub/efistub.h         |  1 +
>  drivers/firmware/efi/libstub/x86-5lvl.c        | 95 ++++++++++++++++++++
>  drivers/firmware/efi/libstub/x86-stub.c        | 40 +++------
>  drivers/firmware/efi/libstub/x86-stub.h        | 17 ++++
>  6 files changed, 130 insertions(+), 26 deletions(-)
> 
> diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
> index 16d64a34d1e19465..ae8874401a9f1490 100644
> --- a/drivers/firmware/efi/libstub/Makefile
> +++ b/drivers/firmware/efi/libstub/Makefile
> @@ -88,6 +88,7 @@ lib-$(CONFIG_EFI_GENERIC_STUB)	+= efi-stub.o string.o intrinsics.o systable.o \
>  lib-$(CONFIG_ARM)		+= arm32-stub.o
>  lib-$(CONFIG_ARM64)		+= arm64.o arm64-stub.o smbios.o
>  lib-$(CONFIG_X86)		+= x86-stub.o
> +lib-$(CONFIG_X86_64)		+= x86-5lvl.o
>  lib-$(CONFIG_RISCV)		+= riscv.o riscv-stub.o
>  lib-$(CONFIG_LOONGARCH)		+= loongarch.o loongarch-stub.o
>  
> diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c
> index 1e0203d74691ffcc..51779279fbff21b5 100644
> --- a/drivers/firmware/efi/libstub/efi-stub-helper.c
> +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c
> @@ -73,6 +73,8 @@ efi_status_t efi_parse_options(char const *cmdline)
>  			efi_loglevel = CONSOLE_LOGLEVEL_QUIET;
>  		} else if (!strcmp(param, "noinitrd")) {
>  			efi_noinitrd = true;
> +		} else if (IS_ENABLED(CONFIG_X86_64) && !strcmp(param, "no5lvl")) {
> +			efi_no5lvl = true;
>  		} else if (!strcmp(param, "efi") && val) {
>  			efi_nochunk = parse_option_str(val, "nochunk");
>  			efi_novamap |= parse_option_str(val, "novamap");
> diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h
> index 6aa38a1bf1265d83..06b7abc92ced9e18 100644
> --- a/drivers/firmware/efi/libstub/efistub.h
> +++ b/drivers/firmware/efi/libstub/efistub.h
> @@ -33,6 +33,7 @@
>  #define EFI_ALLOC_LIMIT		ULONG_MAX
>  #endif
>  
> +extern bool efi_no5lvl;
>  extern bool efi_nochunk;
>  extern bool efi_nokaslr;
>  extern int efi_loglevel;
> diff --git a/drivers/firmware/efi/libstub/x86-5lvl.c b/drivers/firmware/efi/libstub/x86-5lvl.c
> new file mode 100644
> index 0000000000000000..2428578a3ae08be7
> --- /dev/null
> +++ b/drivers/firmware/efi/libstub/x86-5lvl.c
> @@ -0,0 +1,95 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +#include <linux/efi.h>
> +
> +#include <asm/boot.h>
> +#include <asm/desc.h>
> +#include <asm/efi.h>
> +
> +#include "efistub.h"
> +#include "x86-stub.h"
> +
> +bool efi_no5lvl;
> +
> +static void (*la57_toggle)(void *trampoline, bool enable_5lvl);

As an ack to my comments on another patch, would it make more sense to rename
the trampoline parameter to newcr3 and pass the address of the new page table,
instead of the trampoline start address?

> +
> +static const struct desc_struct gdt[] = {
> +	[GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
> +	[GDT_ENTRY_KERNEL_CS]   = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
> +};
> +
> +/*
> + * Enabling (or disabling) 5 level paging is tricky, because it can only be
> + * done from 32-bit mode with paging disabled. This means not only that the
> + * code itself must be running from 32-bit addressable physical memory, but
> + * also that the root page table must be 32-bit addressable, as programming
> + * a 64-bit value into CR3 when running in 32-bit mode is not supported.
> + */
> +efi_status_t efi_setup_5level_paging(void)
> +{
> +	u8 tmpl_size = (u8 *)&trampoline_ljmp_imm_offset - (u8 *)&trampoline_32bit_src;
> +	efi_status_t status;
> +	u8 *la57_code;
> +
> +	if (!efi_is_64bit())
> +		return EFI_SUCCESS;
> +
> +	/* check for 5 level paging support */
> +	if (native_cpuid_eax(0) < 7 ||
> +	    !(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
> +		return EFI_SUCCESS;
> +
Do we need to check need_toggle here, rather than in efi_5level_switch(), and
skip the whole setup if there is no need to switch the paging level? Sorry if
I missed something.

> +	/* allocate some 32-bit addressable memory for code and a page table */
> +	status = efi_allocate_pages(2 * PAGE_SIZE, (unsigned long *)&la57_code,
> +				    U32_MAX);
> +	if (status != EFI_SUCCESS)
> +		return status;
> +
> +	la57_toggle = memcpy(la57_code, trampoline_32bit_src, tmpl_size);
> +	memset(la57_code + tmpl_size, 0x90, PAGE_SIZE - tmpl_size);
> +
> +	/*
> +	 * To avoid the need to allocate a 32-bit addressable stack, the
> +	 * trampoline uses a LJMP instruction to switch back to long mode.
> +	 * LJMP takes an absolute destination address, which needs to be
> +	 * fixed up at runtime.
> +	 */
> +	*(u32 *)&la57_code[trampoline_ljmp_imm_offset] += (unsigned long)la57_code;
> +
> +	efi_adjust_memory_range_protection((unsigned long)la57_toggle, PAGE_SIZE);
> +
> +	return EFI_SUCCESS;
> +}
> +
> +void efi_5level_switch(void)
> +{
> +	bool want_la57 = IS_ENABLED(CONFIG_X86_5LEVEL) && !efi_no5lvl;
> +	bool have_la57 = native_read_cr4() & X86_CR4_LA57;
> +	bool need_toggle = want_la57 ^ have_la57;
> +	u64 *pgt = (void *)la57_toggle + PAGE_SIZE;

Not sure if we can decouple this address assumption between pgt and la57_toggle,
and keep pgt as a variable, like la57_toggle, set up by
efi_setup_5level_paging() too.
Asking because with the Intel X86-S
(https://cdrdv2-public.intel.com/776648/x86s-EAS-v1-4-17-23-1.pdf), no
trampoline code is needed since the 4/5 level paging switch does not require
paging to be disabled. Of course, it's ok to keep this as is, and we can change
it later when we begin working on X86-S support.
  
Ard Biesheuvel June 7, 2023, 8:31 p.m. UTC | #2
On Wed, 7 Jun 2023 at 22:19, Yunhong Jiang
<yunhong.jiang@linux.intel.com> wrote:
>
> On Wed, Jun 07, 2023 at 09:23:35AM +0200, Ard Biesheuvel wrote:
> > In preparation for updating the EFI stub boot flow to avoid the bare
> > metal decompressor code altogether, implement the support code for
> > switching between 4 and 5 levels of paging before jumping to the kernel
> > proper.
> >
> > This reuses the newly refactored trampoline that the bare metal
> > decompressor uses, but relies on EFI APIs to allocate 32-bit addressable
> > memory and remap it with the appropriate permissions. Given that the
> > bare metal decompressor will no longer call into the trampoline if the
> > number of paging levels is already set correctly, it is no longer needed
> > to remove NX restrictions from the memory range where this trampoline
> > may end up.
> >
> > Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> > ---
> >  drivers/firmware/efi/libstub/Makefile          |  1 +
> >  drivers/firmware/efi/libstub/efi-stub-helper.c |  2 +
> >  drivers/firmware/efi/libstub/efistub.h         |  1 +
> >  drivers/firmware/efi/libstub/x86-5lvl.c        | 95 ++++++++++++++++++++
> >  drivers/firmware/efi/libstub/x86-stub.c        | 40 +++------
> >  drivers/firmware/efi/libstub/x86-stub.h        | 17 ++++
> >  6 files changed, 130 insertions(+), 26 deletions(-)
> >
> > diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
> > index 16d64a34d1e19465..ae8874401a9f1490 100644
> > --- a/drivers/firmware/efi/libstub/Makefile
> > +++ b/drivers/firmware/efi/libstub/Makefile
> > @@ -88,6 +88,7 @@ lib-$(CONFIG_EFI_GENERIC_STUB)      += efi-stub.o string.o intrinsics.o systable.o \
> >  lib-$(CONFIG_ARM)            += arm32-stub.o
> >  lib-$(CONFIG_ARM64)          += arm64.o arm64-stub.o smbios.o
> >  lib-$(CONFIG_X86)            += x86-stub.o
> > +lib-$(CONFIG_X86_64)         += x86-5lvl.o
> >  lib-$(CONFIG_RISCV)          += riscv.o riscv-stub.o
> >  lib-$(CONFIG_LOONGARCH)              += loongarch.o loongarch-stub.o
> >
> > diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c
> > index 1e0203d74691ffcc..51779279fbff21b5 100644
> > --- a/drivers/firmware/efi/libstub/efi-stub-helper.c
> > +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c
> > @@ -73,6 +73,8 @@ efi_status_t efi_parse_options(char const *cmdline)
> >                       efi_loglevel = CONSOLE_LOGLEVEL_QUIET;
> >               } else if (!strcmp(param, "noinitrd")) {
> >                       efi_noinitrd = true;
> > +             } else if (IS_ENABLED(CONFIG_X86_64) && !strcmp(param, "no5lvl")) {
> > +                     efi_no5lvl = true;
> >               } else if (!strcmp(param, "efi") && val) {
> >                       efi_nochunk = parse_option_str(val, "nochunk");
> >                       efi_novamap |= parse_option_str(val, "novamap");
> > diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h
> > index 6aa38a1bf1265d83..06b7abc92ced9e18 100644
> > --- a/drivers/firmware/efi/libstub/efistub.h
> > +++ b/drivers/firmware/efi/libstub/efistub.h
> > @@ -33,6 +33,7 @@
> >  #define EFI_ALLOC_LIMIT              ULONG_MAX
> >  #endif
> >
> > +extern bool efi_no5lvl;
> >  extern bool efi_nochunk;
> >  extern bool efi_nokaslr;
> >  extern int efi_loglevel;
> > diff --git a/drivers/firmware/efi/libstub/x86-5lvl.c b/drivers/firmware/efi/libstub/x86-5lvl.c
> > new file mode 100644
> > index 0000000000000000..2428578a3ae08be7
> > --- /dev/null
> > +++ b/drivers/firmware/efi/libstub/x86-5lvl.c
> > @@ -0,0 +1,95 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +#include <linux/efi.h>
> > +
> > +#include <asm/boot.h>
> > +#include <asm/desc.h>
> > +#include <asm/efi.h>
> > +
> > +#include "efistub.h"
> > +#include "x86-stub.h"
> > +
> > +bool efi_no5lvl;
> > +
> > +static void (*la57_toggle)(void *trampoline, bool enable_5lvl);
>
> As an ack to my comments to another patch, would it makes more sense to rename
> the trampoline parameter to newcr3 and pass the address of the new page table,
> instead of the trampoline start address?
>

Perhaps, but please realise that my goal here was not to invent an API
from scratch. There was existing code that I made minimal changes to
in order to be able to reuse it.

If this needs further changes, you can always send follow-up patches.

> > +
> > +static const struct desc_struct gdt[] = {
> > +     [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
> > +     [GDT_ENTRY_KERNEL_CS]   = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
> > +};
> > +
> > +/*
> > + * Enabling (or disabling) 5 level paging is tricky, because it can only be
> > + * done from 32-bit mode with paging disabled. This means not only that the
> > + * code itself must be running from 32-bit addressable physical memory, but
> > + * also that the root page table must be 32-bit addressable, as programming
> > + * a 64-bit value into CR3 when running in 32-bit mode is not supported.
> > + */
> > +efi_status_t efi_setup_5level_paging(void)
> > +{
> > +     u8 tmpl_size = (u8 *)&trampoline_ljmp_imm_offset - (u8 *)&trampoline_32bit_src;
> > +     efi_status_t status;
> > +     u8 *la57_code;
> > +
> > +     if (!efi_is_64bit())
> > +             return EFI_SUCCESS;
> > +
> > +     /* check for 5 level paging support */
> > +     if (native_cpuid_eax(0) < 7 ||
> > +         !(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
> > +             return EFI_SUCCESS;
> > +
> Do we need to check the need_toggle here instead of at efi_5level_switch and
> skip the whole setup if no need to switch the paging level? Sorry if I missed
> any point.
>

No. There are reasons why firmware might run with 5 levels, and switch
to 4 levels at ExitBootServices() time.

> > +     /* allocate some 32-bit addressable memory for code and a page table */
> > +     status = efi_allocate_pages(2 * PAGE_SIZE, (unsigned long *)&la57_code,
> > +                                 U32_MAX);
> > +     if (status != EFI_SUCCESS)
> > +             return status;
> > +
> > +     la57_toggle = memcpy(la57_code, trampoline_32bit_src, tmpl_size);
> > +     memset(la57_code + tmpl_size, 0x90, PAGE_SIZE - tmpl_size);
> > +
> > +     /*
> > +      * To avoid the need to allocate a 32-bit addressable stack, the
> > +      * trampoline uses a LJMP instruction to switch back to long mode.
> > +      * LJMP takes an absolute destination address, which needs to be
> > +      * fixed up at runtime.
> > +      */
> > +     *(u32 *)&la57_code[trampoline_ljmp_imm_offset] += (unsigned long)la57_code;
> > +
> > +     efi_adjust_memory_range_protection((unsigned long)la57_toggle, PAGE_SIZE);
> > +
> > +     return EFI_SUCCESS;
> > +}
> > +
> > +void efi_5level_switch(void)
> > +{
> > +     bool want_la57 = IS_ENABLED(CONFIG_X86_5LEVEL) && !efi_no5lvl;
> > +     bool have_la57 = native_read_cr4() & X86_CR4_LA57;
> > +     bool need_toggle = want_la57 ^ have_la57;
> > +     u64 *pgt = (void *)la57_toggle + PAGE_SIZE;
>
> Not sure if we can decouple this address assumption of the pgt and la57_toggle,
> and keep the pgt as a variable, like la57_toggle, setup by
> efi_setup_5level_paging() too.
> Asking because with the Intel X86-S
> (https://cdrdv2-public.intel.com/776648/x86s-EAS-v1-4-17-23-1.pdf), no
> tramopline code is needed since the 4/5 level paging switch does not require
> paging disabling. Of course, it's ok to keep this as is, and we can change
> late when we begin working on X86-S support.

We can make further changes as needed. The current interface is based
on the existing code.
  
Yunhong Jiang June 8, 2023, 12:43 a.m. UTC | #3
On Wed, Jun 07, 2023 at 10:31:43PM +0200, Ard Biesheuvel wrote:
> On Wed, 7 Jun 2023 at 22:19, Yunhong Jiang
> <yunhong.jiang@linux.intel.com> wrote:
> >
> > On Wed, Jun 07, 2023 at 09:23:35AM +0200, Ard Biesheuvel wrote:
> > > In preparation for updating the EFI stub boot flow to avoid the bare
> > > metal decompressor code altogether, implement the support code for
> > > switching between 4 and 5 levels of paging before jumping to the kernel
> > > proper.
> > >
> > > This reuses the newly refactored trampoline that the bare metal
> > > decompressor uses, but relies on EFI APIs to allocate 32-bit addressable
> > > memory and remap it with the appropriate permissions. Given that the
> > > bare metal decompressor will no longer call into the trampoline if the
> > > number of paging levels is already set correctly, it is no longer needed
> > > to remove NX restrictions from the memory range where this trampoline
> > > may end up.
> > >
> > > Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> > > ---
> > >  drivers/firmware/efi/libstub/Makefile          |  1 +
> > >  drivers/firmware/efi/libstub/efi-stub-helper.c |  2 +
> > >  drivers/firmware/efi/libstub/efistub.h         |  1 +
> > >  drivers/firmware/efi/libstub/x86-5lvl.c        | 95 ++++++++++++++++++++
> > >  drivers/firmware/efi/libstub/x86-stub.c        | 40 +++------
> > >  drivers/firmware/efi/libstub/x86-stub.h        | 17 ++++
> > >  6 files changed, 130 insertions(+), 26 deletions(-)
> > >
> > > diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
> > > index 16d64a34d1e19465..ae8874401a9f1490 100644
> > > --- a/drivers/firmware/efi/libstub/Makefile
> > > +++ b/drivers/firmware/efi/libstub/Makefile
> > > @@ -88,6 +88,7 @@ lib-$(CONFIG_EFI_GENERIC_STUB)      += efi-stub.o string.o intrinsics.o systable.o \
> > >  lib-$(CONFIG_ARM)            += arm32-stub.o
> > >  lib-$(CONFIG_ARM64)          += arm64.o arm64-stub.o smbios.o
> > >  lib-$(CONFIG_X86)            += x86-stub.o
> > > +lib-$(CONFIG_X86_64)         += x86-5lvl.o
> > >  lib-$(CONFIG_RISCV)          += riscv.o riscv-stub.o
> > >  lib-$(CONFIG_LOONGARCH)              += loongarch.o loongarch-stub.o
> > >
> > > diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c
> > > index 1e0203d74691ffcc..51779279fbff21b5 100644
> > > --- a/drivers/firmware/efi/libstub/efi-stub-helper.c
> > > +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c
> > > @@ -73,6 +73,8 @@ efi_status_t efi_parse_options(char const *cmdline)
> > >                       efi_loglevel = CONSOLE_LOGLEVEL_QUIET;
> > >               } else if (!strcmp(param, "noinitrd")) {
> > >                       efi_noinitrd = true;
> > > +             } else if (IS_ENABLED(CONFIG_X86_64) && !strcmp(param, "no5lvl")) {
> > > +                     efi_no5lvl = true;
> > >               } else if (!strcmp(param, "efi") && val) {
> > >                       efi_nochunk = parse_option_str(val, "nochunk");
> > >                       efi_novamap |= parse_option_str(val, "novamap");
> > > diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h
> > > index 6aa38a1bf1265d83..06b7abc92ced9e18 100644
> > > --- a/drivers/firmware/efi/libstub/efistub.h
> > > +++ b/drivers/firmware/efi/libstub/efistub.h
> > > @@ -33,6 +33,7 @@
> > >  #define EFI_ALLOC_LIMIT              ULONG_MAX
> > >  #endif
> > >
> > > +extern bool efi_no5lvl;
> > >  extern bool efi_nochunk;
> > >  extern bool efi_nokaslr;
> > >  extern int efi_loglevel;
> > > diff --git a/drivers/firmware/efi/libstub/x86-5lvl.c b/drivers/firmware/efi/libstub/x86-5lvl.c
> > > new file mode 100644
> > > index 0000000000000000..2428578a3ae08be7
> > > --- /dev/null
> > > +++ b/drivers/firmware/efi/libstub/x86-5lvl.c
> > > @@ -0,0 +1,95 @@
> > > +// SPDX-License-Identifier: GPL-2.0-only
> > > +#include <linux/efi.h>
> > > +
> > > +#include <asm/boot.h>
> > > +#include <asm/desc.h>
> > > +#include <asm/efi.h>
> > > +
> > > +#include "efistub.h"
> > > +#include "x86-stub.h"
> > > +
> > > +bool efi_no5lvl;
> > > +
> > > +static void (*la57_toggle)(void *trampoline, bool enable_5lvl);
> >
> > As an ack to my comments to another patch, would it makes more sense to rename
> > the trampoline parameter to newcr3 and pass the address of the new page table,
> > instead of the trampoline start address?
> >
> 
> Perhaps, but please realise that my goal here was not to invent an API
> from scratch. There was existing code that I made minimal changes to
> in order to be able to reuse it.
> 
> If this needs further changes, you can always send follow-up patches.

Sure, will do that as follow up patches.
> 
> > > +
> > > +static const struct desc_struct gdt[] = {
> > > +     [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
> > > +     [GDT_ENTRY_KERNEL_CS]   = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
> > > +};
> > > +
> > > +/*
> > > + * Enabling (or disabling) 5 level paging is tricky, because it can only be
> > > + * done from 32-bit mode with paging disabled. This means not only that the
> > > + * code itself must be running from 32-bit addressable physical memory, but
> > > + * also that the root page table must be 32-bit addressable, as programming
> > > + * a 64-bit value into CR3 when running in 32-bit mode is not supported.
> > > + */
> > > +efi_status_t efi_setup_5level_paging(void)
> > > +{
> > > +     u8 tmpl_size = (u8 *)&trampoline_ljmp_imm_offset - (u8 *)&trampoline_32bit_src;
> > > +     efi_status_t status;
> > > +     u8 *la57_code;
> > > +
> > > +     if (!efi_is_64bit())
> > > +             return EFI_SUCCESS;
> > > +
> > > +     /* check for 5 level paging support */
> > > +     if (native_cpuid_eax(0) < 7 ||
> > > +         !(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
> > > +             return EFI_SUCCESS;
> > > +
> > Do we need to check the need_toggle here instead of at efi_5level_switch and
> > skip the whole setup if no need to switch the paging level? Sorry if I missed
> > any point.
> >
> 
> No. There are reasons why firmware might run with 5 levels, and switch
> to 4 levels at ExitBootServices() time.

The need_toggle check in efi_5level_switch(), "need_toggle = want_la57 ^
have_la57", should cover this scenario, right? If we check need_toggle in
efi_setup_5level_paging() and it's false, then we don't need the setup in
efi_setup_5level_paging(), right? I don't see la57_toggle() called in any
other place.

Or did I misunderstand your response?

> 
> > > +     /* allocate some 32-bit addressable memory for code and a page table */
> > > +     status = efi_allocate_pages(2 * PAGE_SIZE, (unsigned long *)&la57_code,
> > > +                                 U32_MAX);
> > > +     if (status != EFI_SUCCESS)
> > > +             return status;
> > > +
> > > +     la57_toggle = memcpy(la57_code, trampoline_32bit_src, tmpl_size);
> > > +     memset(la57_code + tmpl_size, 0x90, PAGE_SIZE - tmpl_size);
> > > +
> > > +     /*
> > > +      * To avoid the need to allocate a 32-bit addressable stack, the
> > > +      * trampoline uses a LJMP instruction to switch back to long mode.
> > > +      * LJMP takes an absolute destination address, which needs to be
> > > +      * fixed up at runtime.
> > > +      */
> > > +     *(u32 *)&la57_code[trampoline_ljmp_imm_offset] += (unsigned long)la57_code;
> > > +
> > > +     efi_adjust_memory_range_protection((unsigned long)la57_toggle, PAGE_SIZE);
> > > +
> > > +     return EFI_SUCCESS;
> > > +}
> > > +
> > > +void efi_5level_switch(void)
> > > +{
> > > +     bool want_la57 = IS_ENABLED(CONFIG_X86_5LEVEL) && !efi_no5lvl;
> > > +     bool have_la57 = native_read_cr4() & X86_CR4_LA57;
> > > +     bool need_toggle = want_la57 ^ have_la57;
> > > +     u64 *pgt = (void *)la57_toggle + PAGE_SIZE;
> >
> > Not sure if we can decouple this address assumption of the pgt and la57_toggle,
> > and keep the pgt as a variable, like la57_toggle, setup by
> > efi_setup_5level_paging() too.
> > Asking because with the Intel X86-S
> > (https://cdrdv2-public.intel.com/776648/x86s-EAS-v1-4-17-23-1.pdf), no
> > tramopline code is needed since the 4/5 level paging switch does not require
> > paging disabling. Of course, it's ok to keep this as is, and we can change
> > late when we begin working on X86-S support.
> 
> We can make further changes as needed. The current interface is based
> on the existing code.

Sure. Will do the change in future.
  
Ard Biesheuvel June 8, 2023, 6:34 a.m. UTC | #4
On Thu, 8 Jun 2023 at 02:43, Yunhong Jiang
<yunhong.jiang@linux.intel.com> wrote:
>
> On Wed, Jun 07, 2023 at 10:31:43PM +0200, Ard Biesheuvel wrote:
> > On Wed, 7 Jun 2023 at 22:19, Yunhong Jiang
> > <yunhong.jiang@linux.intel.com> wrote:
> > >
> > > On Wed, Jun 07, 2023 at 09:23:35AM +0200, Ard Biesheuvel wrote:
...
> > > > +efi_status_t efi_setup_5level_paging(void)
> > > > +{
> > > > +     u8 tmpl_size = (u8 *)&trampoline_ljmp_imm_offset - (u8 *)&trampoline_32bit_src;
> > > > +     efi_status_t status;
> > > > +     u8 *la57_code;
> > > > +
> > > > +     if (!efi_is_64bit())
> > > > +             return EFI_SUCCESS;
> > > > +
> > > > +     /* check for 5 level paging support */
> > > > +     if (native_cpuid_eax(0) < 7 ||
> > > > +         !(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
> > > > +             return EFI_SUCCESS;
> > > > +
> > > Do we need to check the need_toggle here instead of at efi_5level_switch and
> > > skip the whole setup if no need to switch the paging level? Sorry if I missed
> > > any point.
> > >
> >
> > No. There are reasons why firmware might run with 5 levels, and switch
> > to 4 levels at ExitBootServices() time.
>
> The need_toggle check at efi_5level_switch(), "need_toggle = want_la57 ^
> have_la57", should cover this scenario, right? If we check need_toggle on
> efi_setup_5level_paging() and it's false, then we don't need the setup in
> efi_setup_5level_paging(), right? I don't see the  la57_toggle() called on other
> places.
>
> Or I misunderstand your response?
>

The actual, current number of paging levels could be 5 during
efi_setup_5level_paging() and 4 during efi_5level_switch(). So whether
we need to toggle can only be decided at switch time, at which point
we can no longer allocate memory. So the allocation logic in
efi_setup_5level_paging() cannot depend on the actual number of
levels, only on whether or not 5 level paging is supported at all (if
it is not supported, a switch is never needed).
  
Yunhong Jiang June 8, 2023, 4:10 p.m. UTC | #5
On Thu, Jun 08, 2023 at 08:34:38AM +0200, Ard Biesheuvel wrote:
> On Thu, 8 Jun 2023 at 02:43, Yunhong Jiang
> <yunhong.jiang@linux.intel.com> wrote:
> >
> > On Wed, Jun 07, 2023 at 10:31:43PM +0200, Ard Biesheuvel wrote:
> > > On Wed, 7 Jun 2023 at 22:19, Yunhong Jiang
> > > <yunhong.jiang@linux.intel.com> wrote:
> > > >
> > > > On Wed, Jun 07, 2023 at 09:23:35AM +0200, Ard Biesheuvel wrote:
> ...
> > > > > +efi_status_t efi_setup_5level_paging(void)
> > > > > +{
> > > > > +     u8 tmpl_size = (u8 *)&trampoline_ljmp_imm_offset - (u8 *)&trampoline_32bit_src;
> > > > > +     efi_status_t status;
> > > > > +     u8 *la57_code;
> > > > > +
> > > > > +     if (!efi_is_64bit())
> > > > > +             return EFI_SUCCESS;
> > > > > +
> > > > > +     /* check for 5 level paging support */
> > > > > +     if (native_cpuid_eax(0) < 7 ||
> > > > > +         !(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
> > > > > +             return EFI_SUCCESS;
> > > > > +
> > > > Do we need to check the need_toggle here instead of at efi_5level_switch and
> > > > skip the whole setup if no need to switch the paging level? Sorry if I missed
> > > > any point.
> > > >
> > >
> > > No. There are reasons why firmware might run with 5 levels, and switch
> > > to 4 levels at ExitBootServices() time.
> >
> > The need_toggle check at efi_5level_switch(), "need_toggle = want_la57 ^
> > have_la57", should cover this scenario, right? If we check need_toggle on
> > efi_setup_5level_paging() and it's false, then we don't need the setup in
> > efi_setup_5level_paging(), right? I don't see the  la57_toggle() called on other
> > places.
> >
> > Or I misunderstand your response?
> >
> 
> The actual, current number of paging levels could be 5 during
> efi_setup_5level_paging() and 4 during efi_5level_switch(). So whether
> we need to toggle can only be decided at switch time, at which point
> we can no longer allocate memory. So the allocation logic in
> efi_setup_5level_paging() cannot depend on the actual number of
> levels, only on whether or not 5 level paging is supported at all (in
> which case a switch is never needed)

Oh, I didn't realize that. Thank you for the clarification.
  

Patch

diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index 16d64a34d1e19465..ae8874401a9f1490 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -88,6 +88,7 @@  lib-$(CONFIG_EFI_GENERIC_STUB)	+= efi-stub.o string.o intrinsics.o systable.o \
 lib-$(CONFIG_ARM)		+= arm32-stub.o
 lib-$(CONFIG_ARM64)		+= arm64.o arm64-stub.o smbios.o
 lib-$(CONFIG_X86)		+= x86-stub.o
+lib-$(CONFIG_X86_64)		+= x86-5lvl.o
 lib-$(CONFIG_RISCV)		+= riscv.o riscv-stub.o
 lib-$(CONFIG_LOONGARCH)		+= loongarch.o loongarch-stub.o
 
diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c
index 1e0203d74691ffcc..51779279fbff21b5 100644
--- a/drivers/firmware/efi/libstub/efi-stub-helper.c
+++ b/drivers/firmware/efi/libstub/efi-stub-helper.c
@@ -73,6 +73,8 @@  efi_status_t efi_parse_options(char const *cmdline)
 			efi_loglevel = CONSOLE_LOGLEVEL_QUIET;
 		} else if (!strcmp(param, "noinitrd")) {
 			efi_noinitrd = true;
+		} else if (IS_ENABLED(CONFIG_X86_64) && !strcmp(param, "no5lvl")) {
+			efi_no5lvl = true;
 		} else if (!strcmp(param, "efi") && val) {
 			efi_nochunk = parse_option_str(val, "nochunk");
 			efi_novamap |= parse_option_str(val, "novamap");
diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h
index 6aa38a1bf1265d83..06b7abc92ced9e18 100644
--- a/drivers/firmware/efi/libstub/efistub.h
+++ b/drivers/firmware/efi/libstub/efistub.h
@@ -33,6 +33,7 @@ 
 #define EFI_ALLOC_LIMIT		ULONG_MAX
 #endif
 
+extern bool efi_no5lvl;
 extern bool efi_nochunk;
 extern bool efi_nokaslr;
 extern int efi_loglevel;
diff --git a/drivers/firmware/efi/libstub/x86-5lvl.c b/drivers/firmware/efi/libstub/x86-5lvl.c
new file mode 100644
index 0000000000000000..2428578a3ae08be7
--- /dev/null
+++ b/drivers/firmware/efi/libstub/x86-5lvl.c
@@ -0,0 +1,95 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/efi.h>
+
+#include <asm/boot.h>
+#include <asm/desc.h>
+#include <asm/efi.h>
+
+#include "efistub.h"
+#include "x86-stub.h"
+
+bool efi_no5lvl;
+
+static void (*la57_toggle)(void *trampoline, bool enable_5lvl);
+
+static const struct desc_struct gdt[] = {
+	[GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
+	[GDT_ENTRY_KERNEL_CS]   = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
+};
+
+/*
+ * Stage a 32-bit addressable copy of the paging trampoline, plus a spare page
+ * for a root page table, when the CPU reports LA57; only a failed allocation
+ * is returned as an error. Toggling 5 level paging can only be done from
+ * 32-bit mode with paging disabled, so both the code and the root page table
+ * must be 32-bit addressable: CR3 cannot take a 64-bit value in 32-bit mode.
+ */
+efi_status_t efi_setup_5level_paging(void)
+{
+	u8 tmpl_size = (u8 *)&trampoline_ljmp_imm_offset - (u8 *)&trampoline_32bit_src;
+	efi_status_t status;
+	u8 *la57_code;
+
+	if (!efi_is_64bit())
+		return EFI_SUCCESS;
+
+	/* nothing to do unless CPUID leaf 7 exists and its ECX has the LA57 bit */
+	if (native_cpuid_eax(0) < 7 ||
+	    !(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
+		return EFI_SUCCESS;
+
+	/* two 32-bit addressable pages: trampoline code, then a page table */
+	status = efi_allocate_pages(2 * PAGE_SIZE, (unsigned long *)&la57_code,
+				    U32_MAX);
+	if (status != EFI_SUCCESS)
+		return status;
+
+	la57_toggle = memcpy(la57_code, trampoline_32bit_src, tmpl_size);
+	memset(la57_code + tmpl_size, 0x90, PAGE_SIZE - tmpl_size);
+
+	/*
+	 * To avoid the need to allocate a 32-bit addressable stack, the
+	 * trampoline uses a LJMP instruction to switch back to long mode.
+	 * LJMP takes an absolute destination address, which needs to be
+	 * fixed up at runtime.
+	 */
+	*(u32 *)&la57_code[trampoline_ljmp_imm_offset] += (unsigned long)la57_code;
+
+	efi_adjust_memory_range_protection((unsigned long)la57_toggle, PAGE_SIZE);
+
+	return EFI_SUCCESS;
+}
+
+void efi_5level_switch(void)
+{
+	bool want_la57 = IS_ENABLED(CONFIG_X86_5LEVEL) && !efi_no5lvl;
+	bool have_la57 = native_read_cr4() & X86_CR4_LA57;
+	bool need_toggle = want_la57 ^ have_la57;
+	u64 *pgt = (void *)la57_toggle + PAGE_SIZE;
+	u64 *cr3 = (u64 *)__native_read_cr3();
+	u64 *new_cr3;
+
+	if (!la57_toggle || !need_toggle)
+		return;
+
+	if (!have_la57) {
+		/*
+		 * 5 level paging will be enabled, so the 32-bit addressable
+		 * page following the trampoline becomes the new root table,
+		 * with its first entry referring to the existing hierarchy.
+		 */
+		new_cr3 = memset(pgt, 0, PAGE_SIZE);
+		new_cr3[0] = (u64)cr3 | _PAGE_TABLE_NOENC;
+	} else {
+		/* LA57 off: entry #0 of the current root locates the new one */
+		new_cr3 = (u64 *)(cr3[0] & PAGE_MASK);
+
+		/* copy it to the spare page if it is not 32-bit addressable */
+		if ((u64)new_cr3 > U32_MAX)
+			new_cr3 = memcpy(pgt, new_cr3, PAGE_SIZE);
+	}
+
+	native_load_gdt(&(struct desc_ptr){ sizeof(gdt) - 1, (u64)gdt });
+
+	la57_toggle(new_cr3, want_la57);
+}
diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c
index 517cd68ea86cb7f4..f48a2e795d885af8 100644
--- a/drivers/firmware/efi/libstub/x86-stub.c
+++ b/drivers/firmware/efi/libstub/x86-stub.c
@@ -17,6 +17,7 @@ 
 #include <asm/boot.h>
 
 #include "efistub.h"
+#include "x86-stub.h"
 
 /* Maximum physical address for 64-bit kernel with 4-level paging */
 #define MAXMEM_X86_64_4LEVEL (1ull << 46)
@@ -223,8 +224,8 @@  static void retrieve_apple_device_properties(struct boot_params *boot_params)
 	}
 }
 
-static void
-adjust_memory_range_protection(unsigned long start, unsigned long size)
+void efi_adjust_memory_range_protection(unsigned long start,
+					unsigned long size)
 {
 	efi_status_t status;
 	efi_gcd_memory_space_desc_t desc;
@@ -278,35 +279,14 @@  adjust_memory_range_protection(unsigned long start, unsigned long size)
 	}
 }
 
-/*
- * Trampoline takes 2 pages and can be loaded in first megabyte of memory
- * with its end placed between 128k and 640k where BIOS might start.
- * (see arch/x86/boot/compressed/pgtable_64.c)
- *
- * We cannot find exact trampoline placement since memory map
- * can be modified by UEFI, and it can alter the computed address.
- */
-
-#define TRAMPOLINE_PLACEMENT_BASE ((128 - 8)*1024)
-#define TRAMPOLINE_PLACEMENT_SIZE (640*1024 - (128 - 8)*1024)
-
 void startup_32(struct boot_params *boot_params);
 
 static void
 setup_memory_protection(unsigned long image_base, unsigned long image_size)
 {
-	/*
-	 * Allow execution of possible trampoline used
-	 * for switching between 4- and 5-level page tables
-	 * and relocated kernel image.
-	 */
-
-	adjust_memory_range_protection(TRAMPOLINE_PLACEMENT_BASE,
-				       TRAMPOLINE_PLACEMENT_SIZE);
-
 #ifdef CONFIG_64BIT
 	if (image_base != (unsigned long)startup_32)
-		adjust_memory_range_protection(image_base, image_size);
+		efi_adjust_memory_range_protection(image_base, image_size);
 #else
 	/*
 	 * Clear protection flags on a whole range of possible
@@ -316,8 +296,8 @@  setup_memory_protection(unsigned long image_base, unsigned long image_size)
 	 * need to remove possible protection on relocated image
 	 * itself disregarding further relocations.
 	 */
-	adjust_memory_range_protection(LOAD_PHYSICAL_ADDR,
-				       KERNEL_IMAGE_SIZE - LOAD_PHYSICAL_ADDR);
+	efi_adjust_memory_range_protection(LOAD_PHYSICAL_ADDR,
+					   KERNEL_IMAGE_SIZE - LOAD_PHYSICAL_ADDR);
 #endif
 }
 
@@ -839,6 +819,12 @@  void __noreturn efi_stub_entry(efi_handle_t handle,
 		efi_dxe_table = NULL;
 	}
 
+	status = efi_setup_5level_paging();
+	if (status != EFI_SUCCESS) {
+		efi_err("efi_setup_5level_paging() failed!\n");
+		goto fail;
+	}
+
 	/*
 	 * If the kernel isn't already loaded at a suitable address,
 	 * relocate it.
@@ -959,6 +945,8 @@  void __noreturn efi_stub_entry(efi_handle_t handle,
 		goto fail;
 	}
 
+	efi_5level_switch();
+
 	if (IS_ENABLED(CONFIG_X86_64))
 		/* add offset of startup_64() */
 		bzimage_addr += 0x200;
diff --git a/drivers/firmware/efi/libstub/x86-stub.h b/drivers/firmware/efi/libstub/x86-stub.h
new file mode 100644
index 0000000000000000..37c5a36b9d8cf9b2
--- /dev/null
+++ b/drivers/firmware/efi/libstub/x86-stub.h
@@ -0,0 +1,17 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <linux/efi.h>
+
+extern void trampoline_32bit_src(void *, bool);
+extern const u16 trampoline_ljmp_imm_offset;
+
+void efi_adjust_memory_range_protection(unsigned long start,
+					unsigned long size);
+
+#ifdef CONFIG_X86_64
+efi_status_t efi_setup_5level_paging(void);
+void efi_5level_switch(void);
+#else
+static inline efi_status_t efi_setup_5level_paging(void) { return EFI_SUCCESS; }
+static inline void efi_5level_switch(void) {}
+#endif