[v8,21/33] x86/fred: FRED initialization code
Commit Message
From: "H. Peter Anvin (Intel)" <hpa@zytor.com>
The code to initialize FRED when it's available and _not_ disabled.
cpu_init_fred_exceptions() is the core function to initialize FRED,
which
1. Sets up FRED entrypoints for events happening in ring 0 and 3.
2. Sets up a default stack for event handling.
3. Sets up dedicated event stacks for DB/NMI/MC/DF, equivalent to
the IDT IST stacks.
4. Forces 32-bit system calls to use "int $0x80" only.
5. Enables FRED and invalidtes IDT.
When the FRED is used, cpu_init_exception_handling() initializes FRED
through calling cpu_init_fred_exceptions(), otherwise it sets up TSS
IST and loads IDT.
As FRED uses the ring 3 FRED entrypoint for SYSCALL and SYSENTER,
it skips setting up SYSCALL/SYSENTER related MSRs, e.g., MSR_LSTAR.
Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Co-developed-by: Xin Li <xin3.li@intel.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
Changes since v5:
* Add a comment for FRED stack level settings (Lai Jiangshan).
* Define #DB/NMI/#MC/#DF stack levels using macros.
---
arch/x86/include/asm/fred.h | 27 +++++++++++++
arch/x86/include/asm/traps.h | 2 +
arch/x86/kernel/Makefile | 1 +
arch/x86/kernel/cpu/common.c | 74 +++++++++++++++++++++-------------
arch/x86/kernel/fred.c | 78 ++++++++++++++++++++++++++++++++++++
arch/x86/kernel/irqinit.c | 7 +++-
arch/x86/kernel/traps.c | 16 +++++++-
7 files changed, 175 insertions(+), 30 deletions(-)
create mode 100644 arch/x86/kernel/fred.c
Comments
On Mon, Apr 10 2023 at 01:14, Xin Li wrote:
> From: "H. Peter Anvin (Intel)" <hpa@zytor.com>
>
> The code to initialize FRED when it's available and _not_ disabled.
>
> cpu_init_fred_exceptions() is the core function to initialize FRED,
> which
> 1. Sets up FRED entrypoints for events happening in ring 0 and 3.
> 2. Sets up a default stack for event handling.
> 3. Sets up dedicated event stacks for DB/NMI/MC/DF, equivalent to
> the IDT IST stacks.
> 4. Forces 32-bit system calls to use "int $0x80" only.
> 5. Enables FRED and invalidtes IDT.
>
> When the FRED is used, cpu_init_exception_handling() initializes FRED
> through calling cpu_init_fred_exceptions(), otherwise it sets up TSS
> IST and loads IDT.
>
> As FRED uses the ring 3 FRED entrypoint for SYSCALL and SYSENTER,
> it skips setting up SYSCALL/SYSENTER related MSRs, e.g., MSR_LSTAR.
So how is this supposed to work? FRED is enabled in Kconfig, the feature
is detected and FRED is initialized _before_ the rest of the required
changes is in place.
Documentation/process/* is not just there because people have nothing
better to do than writing pointless documents.
Thanks,
tglx
On Mon, Apr 10 2023 at 01:14, Xin Li wrote:
>
> +/*
> + * The actual assembly entry and exit points
> + */
> +extern __visible void fred_entrypoint_user(void);
Why is this defined in this patch and not at the point where the
function is introduced?
> +/*
> + * Initialization
> + */
> +void cpu_init_fred_exceptions(void);
> +void fred_setup_apic(void);
> +
> #endif /* __ASSEMBLY__ */
>
> +#else
> +#define cpu_init_fred_exceptions() BUG()
> +#define fred_setup_apic() BUG()
static inline stubs please.
> @@ -2054,28 +2055,6 @@ static void wrmsrl_cstar(unsigned long val)
> /* May not be marked __init: used by software suspend */
> void syscall_init(void)
> {
> - wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
> - wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
> -
> -#ifdef CONFIG_IA32_EMULATION
> - wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
> - /*
> - * This only works on Intel CPUs.
> - * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
> - * This does not cause SYSENTER to jump to the wrong location, because
> - * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
> - */
> - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
> - wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
> - (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
> - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
> -#else
> - wrmsrl_cstar((unsigned long)ignore_sysret);
> - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
> - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
> - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
> -#endif
> -
> /*
> * Flags to clear on syscall; clear as much as possible
> * to minimize user space-kernel interference.
> @@ -2086,6 +2065,41 @@ void syscall_init(void)
> X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF|
> X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_RF|
> X86_EFLAGS_AC|X86_EFLAGS_ID);
> +
> + /*
> + * The default user and kernel segments
> + */
> + wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
> +
> + if (cpu_feature_enabled(X86_FEATURE_FRED)) {
> + /* Both sysexit and sysret cause #UD when FRED is enabled */
> + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
> + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
> + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
> + } else {
> + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
> +
> +#ifdef CONFIG_IA32_EMULATION
> + wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
> + /*
> + * This only works on Intel CPUs.
> + * On AMD CPUs these MSRs are 32-bit, CPU truncates
> + * MSR_IA32_SYSENTER_EIP.
> + * This does not cause SYSENTER to jump to the wrong
> + * location, because AMD doesn't allow SYSENTER in
> + * long mode (either 32- or 64-bit).
> + */
> + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
> + wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
> + (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
> + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
> +#else
> + wrmsrl_cstar((unsigned long)ignore_sysret);
> + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
> + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
> + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
> +#endif
> + }
> }
Sigh. Can you please split this into
static void idt_syscall_init(void)
{
All the existing gunk
}
void syscall_init(void)
{
/* The default user and kernel segments */
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
idt_syscall_init();
}
in a first step and then in the next patch add the FRED muck?
> +/*
> + * Initialize FRED on this CPU. This cannot be __init as it is called
> + * during CPU hotplug.
Really no need to repeat this comment vs. __init all over the place.
> + */
> +void cpu_init_fred_exceptions(void)
> +{
> + wrmsrl(MSR_IA32_FRED_CONFIG,
> + FRED_CONFIG_REDZONE | /* Reserve for CALL emulation */
Please don't use tail comments. Nowhere.
> + FRED_CONFIG_INT_STKLVL(0) |
> + FRED_CONFIG_ENTRYPOINT(fred_entrypoint_user));
> +
> +/*
> + * Initialize system vectors from a FRED perspective, so
> + * lapic_assign_system_vectors() can do its job.
> + */
> +void __init fred_setup_apic(void)
> +{
> + int i;
> +
> + for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
> + set_bit(i, system_vectors);
> + /*
> + * Don't set the non assigned system vectors in the
> + * system_vectors bitmap. Otherwise they show up in
> + * /proc/interrupts.
> + */
> +#ifdef CONFIG_SMP
> + set_bit(IRQ_MOVE_CLEANUP_VECTOR, system_vectors);
> +#endif
> +
> + for (i = 0; i < NR_SYSTEM_VECTORS; i++) {
> + if (get_system_interrupt_handler(i) != NULL) {
This _cannot be NULL. The system vector table must be fully populated
with either the real handler or the spurious handler. Otherwise you need
a NULL pointer check in the dispatch path.
> + set_bit(i + FIRST_SYSTEM_VECTOR, system_vectors);
> + }
> + }
> +
> + /* The rest are fair game... */
Can you please refrain from adding useless comments. Commenting the
obvious is a distraction and not helpful in any way. Comment the things
which are not obvious in the first place.
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -1537,6 +1537,14 @@ static system_interrupt_handler system_interrupt_handlers[NR_SYSTEM_VECTORS] = {
>
> #undef SYSV
>
> +system_interrupt_handler get_system_interrupt_handler(unsigned int i)
> +{
> + if (i >= NR_SYSTEM_VECTORS)
> + return NULL;
Seriously?
> + return system_interrupt_handlers[i];
Get rid of this completely confusing and useless function and look the
table up at the only call site. I'm all for defensive programming, but
this is hideous.
Thanks,
tglx
@@ -57,6 +57,19 @@
#define FRED_CSX_ALLOW_SINGLE_STEP _BITUL(25)
#define FRED_CSX_INTERRUPT_SHADOW _BITUL(24)
+/* #DB in the kernel would imply the use of a kernel debugger. */
+#define FRED_DB_STACK_LEVEL 1
+#define FRED_NMI_STACK_LEVEL 2
+#define FRED_MC_STACK_LEVEL 2
+/*
+ * #DF is the highest level because a #DF means "something went wrong
+ * *while delivering an exception*." The number of cases for which that
+ * can happen with FRED is drastically reduced and basically amounts to
+ * "the stack you pointed me to is broken." Thus, always change stacks
+ * on #DF, which means it should be at the highest level.
+ */
+#define FRED_DF_STACK_LEVEL 3
+
#ifndef __ASSEMBLY__
#include <linux/kernel.h>
@@ -104,8 +117,22 @@ DECLARE_FRED_HANDLER(fred_exc_debug);
DECLARE_FRED_HANDLER(fred_exc_page_fault);
DECLARE_FRED_HANDLER(fred_exc_machine_check);
+/*
+ * The actual assembly entry and exit points
+ */
+extern __visible void fred_entrypoint_user(void);
+
+/*
+ * Initialization
+ */
+void cpu_init_fred_exceptions(void);
+void fred_setup_apic(void);
+
#endif /* __ASSEMBLY__ */
+#else
+#define cpu_init_fred_exceptions() BUG()
+#define fred_setup_apic() BUG()
#endif /* CONFIG_X86_FRED */
#endif /* ASM_X86_FRED_H */
@@ -56,4 +56,6 @@ void __noreturn handle_stack_overflow(struct pt_regs *regs,
void f (struct pt_regs *regs)
typedef DECLARE_SYSTEM_INTERRUPT_HANDLER((*system_interrupt_handler));
+system_interrupt_handler get_system_interrupt_handler(unsigned int i);
+
#endif /* _ASM_X86_TRAPS_H */
@@ -47,6 +47,7 @@ obj-y += platform-quirks.o
obj-y += process_$(BITS).o signal.o signal_$(BITS).o
obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o dumpstack.o nmi.o
+obj-$(CONFIG_X86_FRED) += fred.o
obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o
obj-y += setup.o x86_init.o i8259.o irqinit.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
@@ -58,6 +58,7 @@
#include <asm/microcode_intel.h>
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
+#include <asm/fred.h>
#include <asm/uv/uv.h>
#include <asm/sigframe.h>
#include <asm/traps.h>
@@ -2054,28 +2055,6 @@ static void wrmsrl_cstar(unsigned long val)
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
- wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
- wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
-
-#ifdef CONFIG_IA32_EMULATION
- wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
- /*
- * This only works on Intel CPUs.
- * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
- * This does not cause SYSENTER to jump to the wrong location, because
- * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
- */
- wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
- wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
- (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
- wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
-#else
- wrmsrl_cstar((unsigned long)ignore_sysret);
- wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
- wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
-#endif
-
/*
* Flags to clear on syscall; clear as much as possible
* to minimize user space-kernel interference.
@@ -2086,6 +2065,41 @@ void syscall_init(void)
X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF|
X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_RF|
X86_EFLAGS_AC|X86_EFLAGS_ID);
+
+ /*
+ * The default user and kernel segments
+ */
+ wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
+
+ if (cpu_feature_enabled(X86_FEATURE_FRED)) {
+ /* Both sysexit and sysret cause #UD when FRED is enabled */
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
+ } else {
+ wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+
+#ifdef CONFIG_IA32_EMULATION
+ wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
+ /*
+ * This only works on Intel CPUs.
+ * On AMD CPUs these MSRs are 32-bit, CPU truncates
+ * MSR_IA32_SYSENTER_EIP.
+ * This does not cause SYSENTER to jump to the wrong
+ * location, because AMD doesn't allow SYSENTER in
+ * long mode (either 32- or 64-bit).
+ */
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
+ (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
+#else
+ wrmsrl_cstar((unsigned long)ignore_sysret);
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
+#endif
+ }
}
#else /* CONFIG_X86_64 */
@@ -2218,18 +2232,24 @@ void cpu_init_exception_handling(void)
/* paranoid_entry() gets the CPU number from the GDT */
setup_getcpu(cpu);
- /* IST vectors need TSS to be set up. */
- tss_setup_ist(tss);
+ /* Set up the TSS */
tss_setup_io_bitmap(tss);
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
-
load_TR_desc();
/* GHCB needs to be setup to handle #VC. */
setup_ghcb();
- /* Finally load the IDT */
- load_current_idt();
+ if (cpu_feature_enabled(X86_FEATURE_FRED)) {
+ /* Set up FRED exception handling */
+ cpu_init_fred_exceptions();
+ } else {
+ /* IST vectors need TSS to be set up. */
+ tss_setup_ist(tss);
+
+ /* Finally load the IDT */
+ load_current_idt();
+ }
}
/*
new file mode 100644
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/kernel.h>
+#include <asm/desc.h>
+#include <asm/fred.h>
+#include <asm/tlbflush.h> /* For cr4_set_bits() */
+#include <asm/traps.h>
+
+/*
+ * Initialize FRED on this CPU. This cannot be __init as it is called
+ * during CPU hotplug.
+ */
+void cpu_init_fred_exceptions(void)
+{
+ wrmsrl(MSR_IA32_FRED_CONFIG,
+ FRED_CONFIG_REDZONE | /* Reserve for CALL emulation */
+ FRED_CONFIG_INT_STKLVL(0) |
+ FRED_CONFIG_ENTRYPOINT(fred_entrypoint_user));
+
+ /*
+ * The purpose of separate stacks for NMI, #DB and #MC *in the kernel*
+ * (remember that user space faults are always taken on stack level 0)
+ * is to avoid overflowing the kernel stack.
+ */
+ wrmsrl(MSR_IA32_FRED_STKLVLS,
+ FRED_STKLVL(X86_TRAP_DB, FRED_DB_STACK_LEVEL) |
+ FRED_STKLVL(X86_TRAP_NMI, FRED_NMI_STACK_LEVEL) |
+ FRED_STKLVL(X86_TRAP_MC, FRED_MC_STACK_LEVEL) |
+ FRED_STKLVL(X86_TRAP_DF, FRED_DF_STACK_LEVEL));
+
+ /* The FRED equivalents to IST stacks... */
+ wrmsrl(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB));
+ wrmsrl(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI));
+ wrmsrl(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(DF));
+
+ /* Not used with FRED */
+ wrmsrl(MSR_LSTAR, 0ULL);
+ wrmsrl(MSR_CSTAR, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
+
+ /* Enable FRED */
+ cr4_set_bits(X86_CR4_FRED);
+ idt_invalidate(); /* Any further IDT use is a bug */
+
+ /* Use int $0x80 for 32-bit system calls in FRED mode */
+ setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
+}
+
+/*
+ * Initialize system vectors from a FRED perspective, so
+ * lapic_assign_system_vectors() can do its job.
+ */
+void __init fred_setup_apic(void)
+{
+ int i;
+
+ for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
+ set_bit(i, system_vectors);
+
+ /*
+ * Don't set the non assigned system vectors in the
+ * system_vectors bitmap. Otherwise they show up in
+ * /proc/interrupts.
+ */
+#ifdef CONFIG_SMP
+ set_bit(IRQ_MOVE_CLEANUP_VECTOR, system_vectors);
+#endif
+
+ for (i = 0; i < NR_SYSTEM_VECTORS; i++) {
+ if (get_system_interrupt_handler(i) != NULL) {
+ set_bit(i + FIRST_SYSTEM_VECTOR, system_vectors);
+ }
+ }
+
+ /* The rest are fair game... */
+}
@@ -28,6 +28,7 @@
#include <asm/setup.h>
#include <asm/i8259.h>
#include <asm/traps.h>
+#include <asm/fred.h>
#include <asm/prom.h>
/*
@@ -96,7 +97,11 @@ void __init native_init_IRQ(void)
/* Execute any quirks before the call gates are initialised: */
x86_init.irqs.pre_vector_init();
- idt_setup_apic_and_irq_gates();
+ if (cpu_feature_enabled(X86_FEATURE_FRED))
+ fred_setup_apic();
+ else
+ idt_setup_apic_and_irq_gates();
+
lapic_assign_system_vectors();
if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) {
@@ -1537,6 +1537,14 @@ static system_interrupt_handler system_interrupt_handlers[NR_SYSTEM_VECTORS] = {
#undef SYSV
+system_interrupt_handler get_system_interrupt_handler(unsigned int i)
+{
+ if (i >= NR_SYSTEM_VECTORS)
+ return NULL;
+
+ return system_interrupt_handlers[i];
+}
+
/*
* External interrupt dispatch function.
*
@@ -1572,7 +1580,8 @@ void __init install_system_interrupt_handler(unsigned int n, const void *asm_add
#ifdef CONFIG_X86_64
system_interrupt_handlers[n - FIRST_SYSTEM_VECTOR] = (system_interrupt_handler)addr;
#endif
- alloc_intr_gate(n, asm_addr);
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ alloc_intr_gate(n, asm_addr);
}
void __init trap_init(void)
@@ -1585,7 +1594,10 @@ void __init trap_init(void)
/* Initialize TSS before setting up traps so ISTs work */
cpu_init_exception_handling();
+
/* Setup traps as cpu_init() might #GP */
- idt_setup_traps();
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ idt_setup_traps();
+
cpu_init();
}