@@ -21,3 +21,6 @@ obj-$(CONFIG_PREEMPTION) += thunk_$(BITS).o
obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o
obj-$(CONFIG_X86_X32_ABI) += syscall_x32.o
+ifeq ($(CONFIG_X86_64),y)
+ obj-y += entry_64_switcher.o
+endif
@@ -142,6 +142,10 @@ For 32-bit we have the following conventions - kernel is built with
.endif
.endm
+.macro SET_NOFLUSH_BIT reg:req
+ bts $X86_CR3_PCID_NOFLUSH_BIT, \reg
+.endm
+
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
@@ -154,10 +158,6 @@ For 32-bit we have the following conventions - kernel is built with
#define PTI_USER_PCID_MASK (1 << PTI_USER_PCID_BIT)
#define PTI_USER_PGTABLE_AND_PCID_MASK (PTI_USER_PCID_MASK | PTI_USER_PGTABLE_MASK)
-.macro SET_NOFLUSH_BIT reg:req
- bts $X86_CR3_PCID_NOFLUSH_BIT, \reg
-.endm
-
.macro ADJUST_KERNEL_CR3 reg:req
ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
/* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
@@ -284,6 +284,45 @@ For 32-bit we have the following conventions - kernel is built with
#endif
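+
+/*
+ * TSS_extra() resolves a field of struct tss_extra (embedded in struct
+ * tss_struct) to its per-CPU address via cpu_tss_rw, using the TSS_EX_*
+ * offsets generated in asm-offsets.
+ */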
+#define TSS_extra(field) PER_CPU_VAR(cpu_tss_rw+TSS_EX_##field)
+
+/*
+ * The switcher is disabled when KPTI is enabled.
+ *
+ * Ideally, the switcher would switch to the host CR3 in the IST entry
+ * before GSBASE is fixed, in which case it would use the offset from the
+ * IST stack top to the TSS in the CPU entry area to get the pointer to
+ * the TSS. But an SEV guest modifies TSS.IST on the fly, which makes such
+ * code unworkable in an SEV guest even when the switcher is not used.
+ *
+ * So the switcher is marked disabled when KPTI is enabled rather than
+ * when running in an SEV guest.
+ *
+ * To enable the switcher together with KPTI, something like integrated
+ * entry code with atomic IST entry has to be introduced beforehand.
+ *
+ * Consequently, the current SWITCHER_SAVE_AND_SWITCH_TO_HOST_CR3 is
+ * called only after GSBASE has been fixed.
+ */
+.macro SWITCHER_SAVE_AND_SWITCH_TO_HOST_CR3 scratch_reg:req save_reg:req
+ ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_PTI
+ cmpq $0, TSS_extra(host_rsp)
+ jz .Lend_\@
+ movq %cr3, \save_reg
+ movq TSS_extra(host_cr3), \scratch_reg
+ movq \scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+.macro SWITCHER_RESTORE_CR3 scratch_reg:req save_reg:req
+ ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_PTI
+ cmpq $0, TSS_extra(host_rsp)
+ jz .Lend_\@
+ ALTERNATIVE "", "SET_NOFLUSH_BIT \save_reg", X86_FEATURE_PCID
+ movq \save_reg, %cr3
+.Lend_\@:
+.endm
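+
+/*
+ * These two macros are paired in paranoid_entry/paranoid_exit, mirroring
+ * the KPTI SAVE_AND_SWITCH_TO_KERNEL_CR3/RESTORE_CR3 pair.
+ */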
+
/*
* IBRS kernel mitigation for Spectre_v2.
*
@@ -278,10 +278,11 @@ SYM_CODE_END(xen_error_entry)
/**
* idtentry_body - Macro to emit code calling the C function
+ * @vector: Vector number
* @cfunc: C function to be called
* @has_error_code: Hardware pushed error code on stack
*/
-.macro idtentry_body cfunc has_error_code:req
+.macro idtentry_body vector cfunc has_error_code:req
/*
* Call error_entry() and switch to the task stack if from userspace.
@@ -297,6 +298,10 @@ SYM_CODE_END(xen_error_entry)
ENCODE_FRAME_POINTER
UNWIND_HINT_REGS
+ cmpq $0, TSS_extra(host_rsp)
+ jne .Lpvm_idtentry_body_\@
+.L_host_idtentry_\@:
+
movq %rsp, %rdi /* pt_regs pointer into 1st argument*/
.if \has_error_code == 1
@@ -310,6 +315,25 @@ SYM_CODE_END(xen_error_entry)
REACHABLE
jmp error_return
+
+.Lpvm_idtentry_body_\@:
+ testb $3, CS(%rsp)
+ /* A host exception nested in an IST handler while the switcher is active */
+ jz .L_host_idtentry_\@
+
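+ /*
+ * Store the vector in the upper 32 bits of pt_regs->orig_ax; it is
+ * handed back to the caller of switcher_enter_guest() as the exit
+ * reason.
+ */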
+ .if \vector < 256
+ movl $\vector, ORIG_RAX+4(%rsp)
+ .else // X86_TRAP_OTHER
+ /*
+ * These are the entries declared with the placeholder vector
+ * X86_TRAP_OTHER: common_interrupt(), spurious_interrupt() and the XENPV
+ * entries. The XENPV entries can't reach this point, while
+ * common_interrupt() and spurious_interrupt() have the real vector at
+ * ORIG_RAX.
+ */
+ movl ORIG_RAX(%rsp), %eax
+ movl %eax, ORIG_RAX+4(%rsp)
+ .endif
+ jmp switcher_return_from_guest
.endm
/**
@@ -354,7 +378,7 @@ SYM_CODE_START(\asmsym)
.Lfrom_usermode_no_gap_\@:
.endif
- idtentry_body \cfunc \has_error_code
+ idtentry_body \vector \cfunc \has_error_code
_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
@@ -427,7 +451,7 @@ SYM_CODE_START(\asmsym)
/* Switch to the regular task stack and use the noist entry point */
.Lfrom_usermode_switch_stack_\@:
- idtentry_body noist_\cfunc, has_error_code=0
+ idtentry_body \vector, noist_\cfunc, has_error_code=0
_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
@@ -507,7 +531,7 @@ SYM_CODE_START(\asmsym)
/* Switch to the regular task stack */
.Lfrom_usermode_switch_stack_\@:
- idtentry_body user_\cfunc, has_error_code=1
+ idtentry_body \vector, user_\cfunc, has_error_code=1
_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
@@ -919,6 +943,16 @@ SYM_CODE_START(paranoid_entry)
FENCE_SWAPGS_KERNEL_ENTRY
.Lparanoid_gsbase_done:
+ /*
+ * Switch back to the host CR3 when the switcher is active.
+ * The switcher can't be used when KPTI is enabled so far, so only one of
+ * SAVE_AND_SWITCH_TO_KERNEL_CR3 and SWITCHER_SAVE_AND_SWITCH_TO_HOST_CR3
+ * takes effect. SWITCHER_SAVE_AND_SWITCH_TO_HOST_CR3 requires the kernel
+ * GSBASE.
+ * See the comments above SWITCHER_SAVE_AND_SWITCH_TO_HOST_CR3.
+ */
+ SWITCHER_SAVE_AND_SWITCH_TO_HOST_CR3 scratch_reg=%rax save_reg=%r14
+
/*
* Once we have CR3 and %GS setup save and set SPEC_CTRL. Just like
* CR3 above, keep the old value in a callee saved register.
@@ -970,6 +1004,15 @@ SYM_CODE_START_LOCAL(paranoid_exit)
*/
RESTORE_CR3 scratch_reg=%rax save_reg=%r14
+ /*
+ * Switch back to the original CR3 when the switcher is active.
+ * The switcher can't be used when KPTI is enabled so far, so only
+ * one of RESTORE_CR3 and SWITCHER_RESTORE_CR3 takes effect.
+ *
+ * See the comments above SWITCHER_SAVE_AND_SWITCH_TO_HOST_CR3.
+ */
+ SWITCHER_RESTORE_CR3 scratch_reg=%rax save_reg=%r14
+
/* Handle the three GSBASE cases */
ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE
@@ -1158,6 +1201,8 @@ SYM_CODE_START(asm_exc_nmi)
FENCE_SWAPGS_USER_ENTRY
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
movq %rsp, %rdx
+ cmpq $0, TSS_extra(host_rsp)
+ jne .Lnmi_from_pvm_guest
movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
UNWIND_HINT_IRET_REGS base=%rdx offset=8
pushq 5*8(%rdx) /* pt_regs->ss */
@@ -1188,6 +1233,21 @@ SYM_CODE_START(asm_exc_nmi)
*/
jmp swapgs_restore_regs_and_return_to_usermode
+.Lnmi_from_pvm_guest:
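+ /*
+ * The NMI arrived while the switcher was active, so the interrupted
+ * context is a PVM guest. Build the pt_regs frame on the sp0 stack and
+ * hand it back to the host via switcher_return_from_guest, with the NMI
+ * vector (2) in the upper half of orig_ax as the exit reason.
+ */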
+ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+ UNWIND_HINT_IRET_REGS base=%rdx offset=8
+ pushq 5*8(%rdx) /* pt_regs->ss */
+ pushq 4*8(%rdx) /* pt_regs->rsp */
+ pushq 3*8(%rdx) /* pt_regs->flags */
+ pushq 2*8(%rdx) /* pt_regs->cs */
+ pushq 1*8(%rdx) /* pt_regs->rip */
+ UNWIND_HINT_IRET_REGS
+ pushq $0 /* pt_regs->orig_ax */
+ movl $2, 4(%rsp) /* NMI vector (2) in the upper half of orig_ax */
+ PUSH_AND_CLEAR_REGS rdx=(%rdx)
+ ENCODE_FRAME_POINTER
+ jmp switcher_return_from_guest
+
.Lnmi_from_kernel:
/*
* Here's what our stack frame will look like:
new file mode 100644
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <linux/export.h>
+#include <asm/segment.h>
+#include <asm/asm-offsets.h>
+#include <asm/msr.h>
+#include <asm/percpu.h>
+#include <asm/asm.h>
+#include <asm/nospec-branch.h>
+#include <asm/switcher.h>
+
+#include "calling.h"
+
+.code64
+.section .entry.text, "ax"
+
+.macro MITIGATION_EXIT
+ /* Same treatment as in the user entry/exit code. */
+ IBRS_EXIT
+.endm
+
+.macro MITIGATION_ENTER
+ /*
+ * IMPORTANT: RSB filling and SPEC_CTRL handling must be done before
+ * the first unbalanced RET after vmexit!
+ *
+ * For retpoline or IBRS, RSB filling is needed to prevent poisoned RSB
+ * entries and (in some cases) RSB underflow.
+ *
+ * eIBRS has its own protection against poisoned RSB, so it doesn't
+ * need the RSB filling sequence. But it does need to be enabled, and a
+ * single call to retire, before the first unbalanced RET.
+ */
+ FILL_RETURN_BUFFER %rcx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT, \
+ X86_FEATURE_RSB_VMEXIT_LITE
+
+ IBRS_ENTER
+.endm
+
+/*
+ * switcher_enter_guest - Do a transition to guest mode
+ *
+ * Called with the guest registers on top of the sp0 stack and the switcher
+ * state in cpu_tss_rw.tss_ex.
+ *
+ * Returns:
+ * a pointer to the pt_regs (on top of the sp0 or IST stack) holding the
+ * guest registers.
+ */
+SYM_FUNC_START(switcher_enter_guest)
+ pushq %rbp
+ pushq %r15
+ pushq %r14
+ pushq %r13
+ pushq %r12
+ pushq %rbx
+
+ /* Save host RSP and mark the switcher active */
+ movq %rsp, TSS_extra(host_rsp)
+
+ /* Switch to host sp0 */
+ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rdi
+ subq $FRAME_SIZE, %rdi
+ movq %rdi, %rsp
+
+ UNWIND_HINT_REGS
+
+ MITIGATION_EXIT
+
+ /* switch to guest cr3 on sp0 stack */
+ movq TSS_extra(enter_cr3), %rax
+ movq %rax, %cr3
+ /* Load the guest registers, then skip orig_ax. */
+ POP_REGS
+ addq $8, %rsp
+
+ /* Switch to guest GSBASE and return to guest */
+ swapgs
+ jmp native_irq_return_iret
+
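+/*
+ * switcher_return_from_guest is reached from the host entry points
+ * (idtentry, NMI, SYSCALL) with a complete guest pt_regs frame already on
+ * the sp0/IST stack and the exit reason in the upper 32 bits of
+ * pt_regs->orig_ax.
+ */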
+SYM_INNER_LABEL(switcher_return_from_guest, SYM_L_GLOBAL)
+ /* switch back to host cr3 when still on sp0/ist stack */
+ movq TSS_extra(host_cr3), %rax
+ movq %rax, %cr3
+
+ MITIGATION_ENTER
+
+ /* Restore the host RSP and mark the switcher inactive */
+ movq %rsp, %rax
+ movq TSS_extra(host_rsp), %rsp
+ movq $0, TSS_extra(host_rsp)
+
+ popq %rbx
+ popq %r12
+ popq %r13
+ popq %r14
+ popq %r15
+ popq %rbp
+ RET
+SYM_FUNC_END(switcher_enter_guest)
+EXPORT_SYMBOL_GPL(switcher_enter_guest)
+
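+/*
+ * entry_SYSCALL_64_switcher - SYSCALL entry used while the switcher is
+ * active (presumably installed in MSR_LSTAR while a guest runs; the MSR
+ * setup is not part of this hunk).
+ *
+ * It builds a guest pt_regs frame on the sp0 stack with
+ * SWITCH_EXIT_REASONS_SYSCALL in the upper half of orig_ax and hands it
+ * back to the host through switcher_return_from_guest.
+ */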
+SYM_CODE_START(entry_SYSCALL_64_switcher)
+ UNWIND_HINT_ENTRY
+ ENDBR
+
+ swapgs
+ /* tss.sp2 is scratch space. */
+ movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
+ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+
+SYM_INNER_LABEL(entry_SYSCALL_64_switcher_safe_stack, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+
+ /* Construct struct pt_regs on stack */
+ pushq $__USER_DS /* pt_regs->ss */
+ pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */
+ pushq %r11 /* pt_regs->flags */
+ pushq $__USER_CS /* pt_regs->cs */
+ pushq %rcx /* pt_regs->ip */
+
+ pushq $0 /* pt_regs->orig_ax */
+ movl $SWITCH_EXIT_REASONS_SYSCALL, 4(%rsp)
+
+ PUSH_AND_CLEAR_REGS
+ jmp switcher_return_from_guest
+SYM_CODE_END(entry_SYSCALL_64_switcher)
+EXPORT_SYMBOL_GPL(entry_SYSCALL_64_switcher)
@@ -29,6 +29,7 @@ struct vm86;
#include <asm/vmxfeatures.h>
#include <asm/vdso/processor.h>
#include <asm/shstk.h>
+#include <asm/switcher.h>
#include <linux/personality.h>
#include <linux/cache.h>
@@ -382,6 +383,10 @@ struct tss_struct {
*/
struct x86_hw_tss x86_tss;
+#ifdef CONFIG_X86_64
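+ /* Extra switcher state; see struct tss_extra in asm/switcher.h. */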
+ struct tss_extra tss_ex;
+#endif
+
struct x86_io_bitmap io_bitmap;
} __aligned(PAGE_SIZE);
@@ -5,6 +5,7 @@
#include <asm/segment.h>
#include <asm/page_types.h>
#include <uapi/asm/ptrace.h>
+#include <asm/switcher.h>
#ifndef __ASSEMBLY__
#ifdef __i386__
@@ -194,6 +195,8 @@ static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs)
ret = ret || (regs->ip >= (unsigned long)entry_SYSRETL_compat_unsafe_stack &&
regs->ip < (unsigned long)entry_SYSRETL_compat_end);
#endif
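+
+ /* Cover the switcher's SYSCALL entry gap as well. */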
+ ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_64_switcher &&
+ regs->ip < (unsigned long)entry_SYSCALL_64_switcher_safe_stack);
return ret;
}
new file mode 100644
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SWITCHER_H
+#define _ASM_X86_SWITCHER_H
+
+#ifdef CONFIG_X86_64
+#include <asm/processor-flags.h>
+
+#define SWITCH_EXIT_REASONS_SYSCALL 1024
+#define SWITCH_EXIT_REASONS_FAILED_VMENTRY 1025
+
+/* Bits allowed to be set in the underlying eflags */
+#define SWITCH_ENTER_EFLAGS_ALLOWED (X86_EFLAGS_FIXED | X86_EFLAGS_IF |\
+ X86_EFLAGS_TF | X86_EFLAGS_RF |\
+ X86_EFLAGS_AC | X86_EFLAGS_OF | \
+ X86_EFLAGS_DF | X86_EFLAGS_SF | \
+ X86_EFLAGS_ZF | X86_EFLAGS_AF | \
+ X86_EFLAGS_PF | X86_EFLAGS_CF | \
+ X86_EFLAGS_ID | X86_EFLAGS_NT)
+
+/* Bits that must be set in the underlying eflags */
+#define SWITCH_ENTER_EFLAGS_FIXED (X86_EFLAGS_FIXED | X86_EFLAGS_IF)
+
+#ifndef __ASSEMBLY__
+#include <linux/cache.h>
+
+struct pt_regs;
+
+/*
+ * Extra per-CPU control structure that lives in struct tss_struct.
+ *
+ * The page-size-aligned struct tss_struct has enough room to accommodate
+ * this extra data without increasing its size.
+ *
+ * The extra data also lives in the first page of struct tss_struct, whose
+ * read-write mapping (the percpu cpu_tss_rw) is present in KPTI's user
+ * pagetable, so it is accessible via cpu_tss_rw even in the entry code.
+ */
+struct tss_extra {
+ /* Saved host CR3 to be loaded after VM exit. */
+ unsigned long host_cr3;
+ /*
+ * Saved host stack to be loaded after VM exit. A non-zero value also
+ * serves as a flag: it tells the switcher that it is entering the guest
+ * world, and tells the host entry code that the CPU has been in the
+ * guest world.
+ */
+ unsigned long host_rsp;
+ /* Prepared guest CR3 to be loaded before VM enter. */
+ unsigned long enter_cr3;
+} ____cacheline_aligned;
+
+extern struct pt_regs *switcher_enter_guest(void);
+extern const char entry_SYSCALL_64_switcher[];
+extern const char entry_SYSCALL_64_switcher_safe_stack[];
+extern const char entry_SYSRETQ_switcher_unsafe_stack[];
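+
+/*
+ * A minimal usage sketch (illustrative assumption, not part of this patch):
+ * the hypervisor side is expected to prepare tss_extra and then call
+ * switcher_enter_guest() with the guest registers already placed on top of
+ * the sp0 stack:
+ *
+ *	struct tss_extra *ex = this_cpu_ptr(&cpu_tss_rw.tss_ex);
+ *	struct pt_regs *regs;
+ *
+ *	ex->host_cr3  = __read_cr3();	// restored on VM exit
+ *	ex->enter_cr3 = guest_cr3;	// hypothetical guest CR3, loaded on VM enter
+ *	// host_rsp is written by switcher_enter_guest() itself and stays
+ *	// non-zero (switcher active) until the guest exits.
+ *	regs = switcher_enter_guest();
+ *	// regs->orig_ax >> 32 now holds the vector or a SWITCH_EXIT_REASONS_* value.
+ */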
+#endif /* __ASSEMBLY__ */
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* _ASM_X86_SWITCHER_H */
@@ -60,5 +60,13 @@ int main(void)
OFFSET(FIXED_stack_canary, fixed_percpu_data, stack_canary);
BLANK();
#endif
+
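+/*
+ * Generate TSS_EX_host_cr3, TSS_EX_host_rsp and TSS_EX_enter_cr3 for the
+ * TSS_extra() accessor used by the entry code and the switcher.
+ */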
+#define ENTRY(entry) OFFSET(TSS_EX_ ## entry, tss_struct, tss_ex.entry)
+ ENTRY(host_cr3);
+ ENTRY(host_rsp);
+ ENTRY(enter_cr3);
+ BLANK();
+#undef ENTRY
+
return 0;
}
@@ -773,6 +773,9 @@ DEFINE_IDTENTRY_RAW(exc_int3)
asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs)
{
struct pt_regs *regs = (struct pt_regs *)this_cpu_read(pcpu_hot.top_of_stack) - 1;
+
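+ /* Keep the registers where they are while the switcher is active. */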
+ if (this_cpu_read(cpu_tss_rw.tss_ex.host_rsp))
+ return eregs;
if (regs != eregs)
*regs = *eregs;
return regs;