[RFC,21/32] x86/fred: FRED entry/exit and dispatch code
Commit Message
From: "H. Peter Anvin (Intel)" <hpa@zytor.com>
This patch adds the code that actually handles kernel and event entry/exit
using FRED. It is split into two files:
- entry_64_fred.S contains the actual entrypoints and exit code, and
saves and restores registers.
- entry_fred.c contains the event multi-level dispatch code for FRED.
The first level of dispatch is on the event type, and the second level
is on the event vector.
Some event handlers (#DB, #BP, #DF, #PF, #MC and #UD) start instrumentation
in their own ways. Dave Hansen suggested using an exception bitmap to
check, in the exception dispatch framework, whether instrumentation
should be started there.
Originally-by: Megha Dey <megha.dey@intel.com>
Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Co-developed-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
arch/x86/entry/Makefile | 5 +-
arch/x86/entry/entry_64_fred.S | 55 +++++++
arch/x86/entry/entry_fred.c | 270 ++++++++++++++++++++++++++++++++
arch/x86/include/asm/idtentry.h | 2 +
arch/x86/include/asm/traps.h | 2 +
5 files changed, 333 insertions(+), 1 deletion(-)
create mode 100644 arch/x86/entry/entry_64_fred.S
create mode 100644 arch/x86/entry/entry_fred.c
Comments
On Mon, Dec 19, 2022 at 10:36:47PM -0800, Xin Li wrote:
> +static DEFINE_FRED_HANDLER(fred_exception)
> +{
> + /*
> + * This intentially omits exceptions that cannot happen on FRED h/w:
> + * vectors _NOT_ listed are set to NULL.
> + */
> + static const fred_handler exception_handlers[NUM_EXCEPTION_VECTORS] = {
> + [X86_TRAP_DE] = fred_exc_divide_error,
> + [X86_TRAP_DB] = fred_exc_debug,
> + [X86_TRAP_NMI] = NULL, /* A separate event type, not handled here */
> + [X86_TRAP_BP] = exc_int3,
> + [X86_TRAP_OF] = fred_exc_overflow,
> + [X86_TRAP_BR] = fred_exc_bounds,
> + [X86_TRAP_UD] = exc_invalid_op,
> + [X86_TRAP_NM] = fred_exc_device_not_available,
> + [X86_TRAP_DF] = fred_exc_double_fault,
> + [X86_TRAP_OLD_MF] = NULL, /* 387 only! */
> + [X86_TRAP_TS] = fred_exc_invalid_tss,
> + [X86_TRAP_NP] = fred_exc_segment_not_present,
> + [X86_TRAP_SS] = fred_exc_stack_segment,
> + [X86_TRAP_GP] = fred_exc_general_protection,
> + [X86_TRAP_PF] = fred_exc_page_fault,
> + [X86_TRAP_SPURIOUS] = NULL, /* Interrupts are their own event type */
> + [X86_TRAP_MF] = fred_exc_coprocessor_error,
> + [X86_TRAP_AC] = fred_exc_alignment_check,
> + [X86_TRAP_MC] = fred_exc_machine_check,
> + [X86_TRAP_XF] = fred_exc_simd_coprocessor_error
> + };
> + static const u32 noinstr_mask = BIT(X86_TRAP_DB) | BIT(X86_TRAP_BP) |
> + BIT(X86_TRAP_DF) | BIT(X86_TRAP_PF) |
> + BIT(X86_TRAP_MC) | BIT(X86_TRAP_UD);
> + u8 vector = array_index_nospec((u8)regs->vector, NUM_EXCEPTION_VECTORS);
> + irqentry_state_t state;
> +
> + if (likely(exception_handlers[vector])) {
Can't you get rid of this branch by stuffing the exception_handlers[]
table with fred_bad_event?
> + if (!(BIT(vector) & noinstr_mask)) {
> + state = irqentry_enter(regs);
> + instrumentation_begin();
> + }
> +
> + exception_handlers[vector](regs);
> +
> + if (!(BIT(vector) & noinstr_mask)) {
> + instrumentation_end();
> + irqentry_exit(regs, state);
> + }
This noinstr mask is daft; why not have DEFINE_FRED_HANDLER and
DEFINE_FRED_HANDLER_RAW or something, have the normal one include the
irqentry bits and use the _RAW one for the 'funny' ones that need to do
it themselves?
> + } else {
> + return fred_bad_event(regs);
> + }
Then all this becomes:
exception_handlers[vector](regs);
no branches, no nothing.
> +}
> > +static DEFINE_FRED_HANDLER(fred_exception)
> > +{
> > + /*
> > + * This intentially omits exceptions that cannot happen on FRED h/w:
> > + * vectors _NOT_ listed are set to NULL.
> > + */
> > + static const fred_handler
> exception_handlers[NUM_EXCEPTION_VECTORS] = {
> > + [X86_TRAP_DE] = fred_exc_divide_error,
> > + [X86_TRAP_DB] = fred_exc_debug,
> > + [X86_TRAP_NMI] = NULL, /* A separate event type, not handled
> here */
> > + [X86_TRAP_BP] = exc_int3,
> > + [X86_TRAP_OF] = fred_exc_overflow,
> > + [X86_TRAP_BR] = fred_exc_bounds,
> > + [X86_TRAP_UD] = exc_invalid_op,
> > + [X86_TRAP_NM] = fred_exc_device_not_available,
> > + [X86_TRAP_DF] = fred_exc_double_fault,
> > + [X86_TRAP_OLD_MF] = NULL, /* 387 only! */
> > + [X86_TRAP_TS] = fred_exc_invalid_tss,
> > + [X86_TRAP_NP] = fred_exc_segment_not_present,
> > + [X86_TRAP_SS] = fred_exc_stack_segment,
> > + [X86_TRAP_GP] = fred_exc_general_protection,
> > + [X86_TRAP_PF] = fred_exc_page_fault,
> > + [X86_TRAP_SPURIOUS] = NULL, /* Interrupts are their own event
> type */
> > + [X86_TRAP_MF] = fred_exc_coprocessor_error,
> > + [X86_TRAP_AC] = fred_exc_alignment_check,
> > + [X86_TRAP_MC] = fred_exc_machine_check,
> > + [X86_TRAP_XF] = fred_exc_simd_coprocessor_error
> > + };
> > + static const u32 noinstr_mask = BIT(X86_TRAP_DB) | BIT(X86_TRAP_BP) |
> > + BIT(X86_TRAP_DF) | BIT(X86_TRAP_PF) |
> > + BIT(X86_TRAP_MC) | BIT(X86_TRAP_UD);
> > + u8 vector = array_index_nospec((u8)regs->vector,
> NUM_EXCEPTION_VECTORS);
> > + irqentry_state_t state;
> > +
> > + if (likely(exception_handlers[vector])) {
>
> Can't you get rid of this branch by stuffing the exception_handlers[] table with
> fred_bad_event?
>
> > + if (!(BIT(vector) & noinstr_mask)) {
> > + state = irqentry_enter(regs);
> > + instrumentation_begin();
> > + }
> > +
> > + exception_handlers[vector](regs);
> > +
> > + if (!(BIT(vector) & noinstr_mask)) {
> > + instrumentation_end();
> > + irqentry_exit(regs, state);
> > + }
>
> This noinstr mask is daft; why not have DEFINE_FRED_HANDLER and
> DEFINE_FRED_HANDLER_RAW or something, have the normal one include the
> irqentry bits and use the _RAW one for the 'funny' ones that need to do it
> themselves?
I wanted to keep "state = irqentry_enter(regs); instrumentation_begin();"
in the dispatch framework, instead of pushing down to the handlers.
Of course, we could do it the other way if that is preferred.
#PF is far more frequent than other events, so maybe we should add an
early out for it in fred_entry_from_user/kernel().
>
> > + } else {
> > + return fred_bad_event(regs);
> > + }
>
> Then all this becomes:
>
> exception_handlers[vector](regs);
>
> no branches, no nothing.
This does look simpler and cleaner at the dispatch framework layer.
Xin
On Wed, Dec 21, 2022 at 02:56:08AM +0000, Li, Xin3 wrote:
> > > + if (!(BIT(vector) & noinstr_mask)) {
> > > + state = irqentry_enter(regs);
> > > + instrumentation_begin();
> > > + }
> > > +
> > > + exception_handlers[vector](regs);
> > > +
> > > + if (!(BIT(vector) & noinstr_mask)) {
> > > + instrumentation_end();
> > > + irqentry_exit(regs, state);
> > > + }
> >
> > This noinstr mask is daft; why not have DEFINE_FRED_HANDLER and
> > DEFINE_FRED_HANDLER_RAW or something, have the normal one include the
> > irqentry bits and use the _RAW one for the 'funny' ones that need to do it
> > themselves?
>
> I wanted to keep "state = irqentry_enter(regs); instrumentation_begin();"
> in the dispatch framework, instead of pushing down to the handlers.
>
> Of course, we could do it the other way if it is more preferred.
Yes, please do as I suggested; it is consistent with the IDTENTRY macros.
@@ -18,6 +18,9 @@ obj-y += vdso/
obj-y += vsyscall/
obj-$(CONFIG_PREEMPTION) += thunk_$(BITS).o
+CFLAGS_entry_fred.o += -fno-stack-protector
+CFLAGS_REMOVE_entry_fred.o += -pg $(CC_FLAGS_FTRACE)
+obj-$(CONFIG_X86_FRED) += entry_64_fred.o entry_fred.o
+
obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o
obj-$(CONFIG_X86_X32_ABI) += syscall_x32.o
-
new file mode 100644
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * arch/x86/entry/entry_64_fred.S
+ *
+ * The actual FRED entry points.
+ */
+#include <linux/linkage.h>
+#include <asm/errno.h>
+#include <asm/asm-offsets.h>
+#include <asm/fred.h>
+
+#include "calling.h"
+
+ .code64
+ .section ".noinstr.text", "ax"
+
+.macro FRED_ENTER /* common prologue shared by both FRED entry points */
+ UNWIND_HINT_EMPTY
+ PUSH_AND_CLEAR_REGS /* from calling.h; presumably saves and clears the GPRs to complete pt_regs - TODO confirm */
+ movq %rsp, %rdi /* %rdi -> pt_regs, the argument of the C entry function called next */
+.endm
+
+.macro FRED_EXIT /* common epilogue, used right before ERETU/ERETS */
+ UNWIND_HINT_REGS
+ POP_REGS /* from calling.h; presumably restores what PUSH_AND_CLEAR_REGS saved - TODO confirm */
+ addq $8,%rsp /* Drop error code */
+.endm
+
+/*
+ * The new RIP value that FRED event delivery establishes is
+ * IA32_FRED_CONFIG & ~FFFH for events that occur in ring 3.
+ * Thus the FRED ring 3 entry point must be 4K page aligned.
+ */
+ .align 4096
+
+SYM_CODE_START_NOALIGN(fred_entrypoint_user)
+ FRED_ENTER
+ call fred_entry_from_user /* C dispatcher; receives the pt_regs pointer set up in %rdi */
+SYM_INNER_LABEL(fred_exit_user, SYM_L_GLOBAL) /* global label at the start of the user exit path */
+ FRED_EXIT
+ ERETU /* return from the event to user mode (ring 3) */
+SYM_CODE_END(fred_entrypoint_user)
+
+/*
+ * The new RIP value that FRED event delivery establishes is
+ * (IA32_FRED_CONFIG & ~FFFH) + 256 for events that occur in
+ * ring 0, i.e., fred_entrypoint_user + 256.
+ */
+ .org fred_entrypoint_user+256 /* place the ring 0 entry exactly 256 bytes after the ring 3 entry */
+SYM_CODE_START_NOALIGN(fred_entrypoint_kernel)
+ FRED_ENTER
+ call fred_entry_from_kernel /* C dispatcher; receives the pt_regs pointer set up in %rdi */
+ FRED_EXIT
+ ERETS /* return from the event to supervisor mode (ring 0) */
+SYM_CODE_END(fred_entrypoint_kernel)
new file mode 100644
@@ -0,0 +1,270 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * arch/x86/entry/entry_fred.c
+ *
+ * This contains the dispatch functions called from the entry point
+ * assembly.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kdebug.h> /* oops_begin/end, ... */
+#include <linux/nospec.h>
+#include <asm/event-type.h>
+#include <asm/fred.h>
+#include <asm/idtentry.h>
+#include <asm/syscall.h>
+#include <asm/trapnr.h>
+#include <asm/traps.h>
+#include <asm/kdebug.h>
+
+/*
+ * Badness: report an invalid or fatal FRED event, then panic or oops.
+ */
+static DEFINE_FRED_HANDLER(fred_bad_event)
+{
+ irqentry_state_t irq_state = irqentry_nmi_enter(regs); /* paired with irqentry_nmi_exit() at the end */
+
+ instrumentation_begin();
+
+ /* Panic on events from a high stack level */
+ if (regs->current_stack_level > 0) {
+ pr_emerg("PANIC: invalid or fatal FRED event; event type %u "
+ "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n",
+ regs->type, regs->vector, regs->orig_ax,
+ fred_event_data(regs), regs->cs, regs->ip);
+ die("invalid or fatal FRED event", regs, regs->orig_ax);
+ panic("invalid or fatal FRED event"); /* reached only if die() returns */
+ } else {
+ unsigned long flags = oops_begin();
+ int sig = SIGKILL; /* signal handed to oops_end() unless __die() returns nonzero */
+
+ pr_alert("BUG: invalid or fatal FRED event; event type %u "
+ "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n",
+ regs->type, regs->vector, regs->orig_ax,
+ fred_event_data(regs), regs->cs, regs->ip);
+
+ if (__die("Invalid or fatal FRED event", regs, regs->orig_ax))
+ sig = 0; /* nonzero __die() result: pass no signal to oops_end() */
+
+ oops_end(flags, regs, sig);
+ }
+
+ instrumentation_end();
+ irqentry_nmi_exit(regs, irq_state);
+}
+
+#define DEFINE_FRED_EXCEPTION_HANDLER(func) \
+static void fred_##func(struct pt_regs *regs) \
+{ \
+ func (regs); \
+} /* generates fred_<func>(regs), a wrapper that just calls <func>(regs) */
+
+DEFINE_FRED_EXCEPTION_HANDLER(exc_divide_error); /* defines fred_exc_divide_error() */
+DEFINE_FRED_EXCEPTION_HANDLER(exc_overflow); /* defines fred_exc_overflow() */
+DEFINE_FRED_EXCEPTION_HANDLER(exc_bounds); /* defines fred_exc_bounds() */
+DEFINE_FRED_EXCEPTION_HANDLER(exc_device_not_available); /* defines fred_exc_device_not_available() */
+DEFINE_FRED_EXCEPTION_HANDLER(exc_coprocessor_error); /* defines fred_exc_coprocessor_error() */
+DEFINE_FRED_EXCEPTION_HANDLER(exc_simd_coprocessor_error); /* defines fred_exc_simd_coprocessor_error() */
+
+#define DEFINE_FRED_EXCEPTION_HANDLER_ERRORCODE(func) \
+static void fred_##func(struct pt_regs *regs) \
+{ \
+ func (regs, regs->orig_ax); \
+} /* generates fred_<func>(regs), which calls <func>() with regs->orig_ax as the error code argument */
+
+DEFINE_FRED_EXCEPTION_HANDLER_ERRORCODE(exc_invalid_tss); /* defines fred_exc_invalid_tss() */
+DEFINE_FRED_EXCEPTION_HANDLER_ERRORCODE(exc_segment_not_present); /* defines fred_exc_segment_not_present() */
+noinstr DEFINE_FRED_EXCEPTION_HANDLER_ERRORCODE(exc_double_fault); /* only noinstr wrapper; presumably because #DF is in fred_exception()'s noinstr_mask */
+DEFINE_FRED_EXCEPTION_HANDLER_ERRORCODE(exc_stack_segment); /* defines fred_exc_stack_segment() */
+DEFINE_FRED_EXCEPTION_HANDLER_ERRORCODE(exc_general_protection); /* defines fred_exc_general_protection() */
+DEFINE_FRED_EXCEPTION_HANDLER_ERRORCODE(exc_alignment_check); /* defines fred_exc_alignment_check() */
+
+/*
+ * Exception entry: second-level dispatch on the exception vector.
+ */
+static DEFINE_FRED_HANDLER(fred_exception)
+{
+ /*
+ * This intentionally omits exceptions that cannot happen on FRED h/w:
+ * vectors _NOT_ listed are set to NULL.
+ */
+ static const fred_handler exception_handlers[NUM_EXCEPTION_VECTORS] = {
+ [X86_TRAP_DE] = fred_exc_divide_error,
+ [X86_TRAP_DB] = fred_exc_debug,
+ [X86_TRAP_NMI] = NULL, /* A separate event type, not handled here */
+ [X86_TRAP_BP] = exc_int3,
+ [X86_TRAP_OF] = fred_exc_overflow,
+ [X86_TRAP_BR] = fred_exc_bounds,
+ [X86_TRAP_UD] = exc_invalid_op,
+ [X86_TRAP_NM] = fred_exc_device_not_available,
+ [X86_TRAP_DF] = fred_exc_double_fault,
+ [X86_TRAP_OLD_MF] = NULL, /* 387 only! */
+ [X86_TRAP_TS] = fred_exc_invalid_tss,
+ [X86_TRAP_NP] = fred_exc_segment_not_present,
+ [X86_TRAP_SS] = fred_exc_stack_segment,
+ [X86_TRAP_GP] = fred_exc_general_protection,
+ [X86_TRAP_PF] = fred_exc_page_fault,
+ [X86_TRAP_SPURIOUS] = NULL, /* Interrupts are their own event type */
+ [X86_TRAP_MF] = fred_exc_coprocessor_error,
+ [X86_TRAP_AC] = fred_exc_alignment_check,
+ [X86_TRAP_MC] = fred_exc_machine_check,
+ [X86_TRAP_XF] = fred_exc_simd_coprocessor_error
+ };
+ static const u32 noinstr_mask = BIT(X86_TRAP_DB) | BIT(X86_TRAP_BP) |
+ BIT(X86_TRAP_DF) | BIT(X86_TRAP_PF) |
+ BIT(X86_TRAP_MC) | BIT(X86_TRAP_UD); /* vectors whose handlers start instrumentation themselves */
+ u8 vector = array_index_nospec((u8)regs->vector, NUM_EXCEPTION_VECTORS); /* NOTE(review): no bounds check precedes this; array_index_nospec() yields index 0 when out of range - confirm vector is always < NUM_EXCEPTION_VECTORS */
+ irqentry_state_t state; /* only written and read when the vector is not in noinstr_mask */
+
+ if (likely(exception_handlers[vector])) { /* NULL entry: vector has no handler here */
+ if (!(BIT(vector) & noinstr_mask)) {
+ state = irqentry_enter(regs);
+ instrumentation_begin();
+ }
+
+ exception_handlers[vector](regs);
+
+ if (!(BIT(vector) & noinstr_mask)) { /* same test as above, so enter/exit stay paired */
+ instrumentation_end();
+ irqentry_exit(regs, state);
+ }
+ } else {
+ return fred_bad_event(regs); /* unlisted vector: treat as an invalid event */
+ }
+}
+
+static __always_inline void fred_emulate_trap(struct pt_regs *regs) /* re-dispatch the event as a software exception */
+{
+ regs->type = EVENT_TYPE_SWFAULT;
+ regs->orig_ax = 0; /* orig_ax is what the error-code wrappers pass on as the error code; zero it */
+ fred_exception(regs); /* regs->vector, set or kept by the caller, selects the handler */
+}
+
+static __always_inline void fred_emulate_fault(struct pt_regs *regs) /* like fred_emulate_trap(), but with IP rewound first */
+{
+ regs->ip -= regs->instr_len; /* move IP back by the instruction length, presumably to the start of the triggering instruction */
+ fred_emulate_trap(regs);
+}
+
+/*
+ * Emulate SYSENTER if applicable. SYSENTER is not the preferred system
+ * call instruction in 32-bit mode under FRED; int $0x80 is preferred and
+ * exported in the vdso. SYSCALL proper has a hard-coded early out in
+ * fred_entry_from_user(). Anything else is turned into an emulated #UD.
+ */
+static DEFINE_FRED_HANDLER(fred_syscall_slow)
+{
+ if (IS_ENABLED(CONFIG_IA32_EMULATION) &&
+ likely(regs->vector == FRED_SYSENTER)) {
+ /* Convert frame to a syscall frame */
+ regs->orig_ax = regs->ax;
+ regs->ax = -ENOSYS;
+ do_fast_syscall_32(regs);
+ } else {
+ regs->vector = X86_TRAP_UD; /* other vectors, or no IA32 emulation: report #UD */
+ fred_emulate_fault(regs);
+ }
+}
+
+/*
+ * Some software exceptions can also be triggered as int instructions,
+ * for historical reasons. Implement those here. int $0x80 (the 32-bit
+ * system call) gets an early out, but only with CONFIG_IA32_EMULATION.
+ */
+static DEFINE_FRED_HANDLER(fred_sw_interrupt_user)
+{
+ if (IS_ENABLED(CONFIG_IA32_EMULATION) && likely(regs->vector == IA32_SYSCALL_VECTOR)) {
+ /* Convert frame to a syscall frame */
+ regs->orig_ax = regs->ax;
+ regs->ax = -ENOSYS;
+ return do_int80_syscall_32(regs);
+ }
+
+ switch (regs->vector) {
+ case X86_TRAP_BP:
+ case X86_TRAP_OF:
+ fred_emulate_trap(regs); /* delivered as a trap: regs->ip is left untouched */
+ break;
+ default:
+ regs->vector = X86_TRAP_GP; /* every other vector (0x80 too without IA32 emulation) is reported as #GP */
+ fred_emulate_fault(regs);
+ break;
+ }
+}
+
+static DEFINE_FRED_HANDLER(fred_hw_interrupt) /* first-level handler for EVENT_TYPE_HWINT, from user and kernel */
+{
+ irqentry_state_t state = irqentry_enter(regs); /* paired with irqentry_exit() below */
+
+ instrumentation_begin();
+ external_interrupt(regs, regs->vector); /* dispatch by vector; the int return value is ignored */
+ instrumentation_end();
+ irqentry_exit(regs, state);
+}
+
+__visible noinstr void fred_entry_from_user(struct pt_regs *regs) /* called from fred_entrypoint_user with regs -> pt_regs */
+{
+ static const fred_handler user_handlers[FRED_EVENT_TYPE_COUNT] =
+ {
+ [EVENT_TYPE_HWINT] = fred_hw_interrupt,
+ [EVENT_TYPE_RESERVED] = fred_bad_event,
+ [EVENT_TYPE_NMI] = fred_exc_nmi,
+ [EVENT_TYPE_SWINT] = fred_sw_interrupt_user,
+ [EVENT_TYPE_HWFAULT] = fred_exception,
+ [EVENT_TYPE_SWFAULT] = fred_exception,
+ [EVENT_TYPE_PRIVSW] = fred_exception,
+ [EVENT_TYPE_OTHER] = fred_syscall_slow /* only reached when the vector is not FRED_SYSCALL */
+ };
+
+ /*
+ * FRED employs a two-level event dispatch mechanism, with
+ * the first-level on the type of an event and the second-level
+ * on its vector. Thus a dispatch typically induces 2 calls.
+ * We optimize it by using early outs for the most frequent
+ * events, and syscalls are the first. We may also need early
+ * outs for page faults.
+ */
+ if (likely(regs->type == EVENT_TYPE_OTHER &&
+ regs->vector == FRED_SYSCALL)) {
+ /* Convert frame to a syscall frame: orig_ax takes ax, ax becomes -ENOSYS */
+ regs->orig_ax = regs->ax;
+ regs->ax = -ENOSYS;
+ do_syscall_64(regs, regs->orig_ax); /* second argument is the ax value saved just above */
+ } else {
+ /* Not a system call */
+ u8 type = array_index_nospec((u8)regs->type, FRED_EVENT_TYPE_COUNT); /* index bounded for the table lookup below */
+
+ user_handlers[type](regs);
+ }
+}
+
+static DEFINE_FRED_HANDLER(fred_sw_interrupt_kernel) /* first-level handler for EVENT_TYPE_SWINT from ring 0 */
+{
+ switch (regs->vector) {
+ case X86_TRAP_NMI: /* a software interrupt with the NMI vector goes to the NMI handler */
+ fred_exc_nmi(regs);
+ break;
+ default: /* any other software interrupt from the kernel is treated as a bad event */
+ fred_bad_event(regs);
+ break;
+ }
+}
+
+__visible noinstr void fred_entry_from_kernel(struct pt_regs *regs) /* called from fred_entrypoint_kernel with regs -> pt_regs */
+{
+ static const fred_handler kernel_handlers[FRED_EVENT_TYPE_COUNT] =
+ {
+ [EVENT_TYPE_HWINT] = fred_hw_interrupt,
+ [EVENT_TYPE_RESERVED] = fred_bad_event,
+ [EVENT_TYPE_NMI] = fred_exc_nmi,
+ [EVENT_TYPE_SWINT] = fred_sw_interrupt_kernel,
+ [EVENT_TYPE_HWFAULT] = fred_exception,
+ [EVENT_TYPE_SWFAULT] = fred_exception,
+ [EVENT_TYPE_PRIVSW] = fred_exception,
+ [EVENT_TYPE_OTHER] = fred_bad_event /* unlike the user table, this type is rejected from ring 0 */
+ };
+ u8 type = array_index_nospec((u8)regs->type, FRED_EVENT_TYPE_COUNT); /* index bounded for the table lookup below */
+
+ /* The pt_regs frame on entry here is an exception frame */
+ kernel_handlers[type](regs);
+}
@@ -616,6 +616,8 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC, exc_machine_check);
#ifdef CONFIG_XEN_PV
DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check);
#endif
+#else
+#define fred_exc_machine_check (NULL)
#endif
/* NMI */
@@ -58,4 +58,6 @@ typedef DECLARE_SYSTEM_INTERRUPT_HANDLER((*system_interrupt_handler));
int exc_raise_irq(struct pt_regs *regs, u32 vector);
+int external_interrupt(struct pt_regs *regs, unsigned int vector);
+
#endif /* _ASM_X86_TRAPS_H */