[6/6] KVM: VMX: Move VERW closer to VMentry for MDS mitigation

Message ID 20231020-delay-verw-v1-6-cff54096326d@linux.intel.com
State: New
Series: Delay VERW

Commit Message

Pawan Gupta Oct. 20, 2023, 8:45 p.m. UTC
During VMentry, VERW is executed to mitigate MDS. After VERW, any memory
access, such as a register push onto the stack, may put host data in
MDS-affected CPU buffers. A guest can then use MDS to sample host data.

Although the likelihood of secrets surviving in registers at the current
VERW callsite is low, it can't be ruled out. Harden the MDS mitigation
by moving VERW later in the VMentry path.

Note that VERW for the MMIO Stale Data mitigation is unchanged, because
per-guest conditional VERW is not easy to handle that late in asm with no
GPRs available. If the CPU is also affected by MDS, VERW is unconditionally
executed late in asm regardless of the guest having MMIO access.

Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
---
 arch/x86/kvm/vmx/vmenter.S |  9 +++++++++
 arch/x86/kvm/vmx/vmx.c     | 10 +++++++---
 2 files changed, 16 insertions(+), 3 deletions(-)
  

Comments

Sean Christopherson Oct. 20, 2023, 10:55 p.m. UTC | #1
On Fri, Oct 20, 2023, Pawan Gupta wrote:
> [...]
> .Lvmresume:
> +	/* Mitigate CPU data sampling attacks, e.g. MDS */
> +	GUEST_CLEAR_CPU_BUFFERS

I have a very hard time believing that it's worth duplicating the mitigation
for VMRESUME vs. VMLAUNCH just to land it after a Jcc.

 3b1:   48 8b 00                mov    (%rax),%rax
 3b4:   74 18                   je     3ce <__vmx_vcpu_run+0x9e>
 3b6:   eb 0e                   jmp    3c6 <__vmx_vcpu_run+0x96>
 3b8:   0f 00 2d 05 00 00 00    verw   0x5(%rip)        # 3c4 <__vmx_vcpu_run+0x94>
 3bf:   0f 1f 80 00 00 18 00    nopl   0x180000(%rax)
 3c6:   0f 01 c3                vmresume
 3c9:   e9 c9 00 00 00          jmp    497 <vmx_vmexit+0xa7>
 3ce:   eb 0e                   jmp    3de <__vmx_vcpu_run+0xae>
 3d0:   0f 00 2d 05 00 00 00    verw   0x5(%rip)        # 3dc <__vmx_vcpu_run+0xac>
 3d7:   0f 1f 80 00 00 18 00    nopl   0x180000(%rax)
 3de:   0f 01 c2                vmlaunch

Also, would it be better to put the NOP first?  Or even better, out of line?
It'd be quite hilarious if the CPU pulled a stupid and speculated on the operand
of the NOP, i.e. if the user/guest controlled RAX allowed for pulling in data
after the VERW.
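
For context on the dump above: "verw 0x5(%rip)" reads its selector operand
out of the displacement bytes of the NOPL that follows it. A rough
reconstruction of that layout (64-bit only; the local label and open-coded
bytes are inferred from the disassembly, not taken from the series' actual
macro):

	verw	551f(%rip)			/* operand is the word at 551f below */
	/* nopl 0x180000(%rax): 0f 1f 80 + 4 displacement bytes */
	.byte	0x0f, 0x1f, 0x80, 0x00, 0x00	/* opcode, modrm, disp bytes 0-1 */
551:	.word	0x18				/* disp bytes 2-3: __KERNEL_DS, a valid selector */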
  
Pawan Gupta Oct. 21, 2023, 12:46 a.m. UTC | #2
On Fri, Oct 20, 2023 at 03:55:07PM -0700, Sean Christopherson wrote:
> On Fri, Oct 20, 2023, Pawan Gupta wrote:
> > [...]
> > .Lvmresume:
> > +	/* Mitigate CPU data sampling attacks, e.g. MDS */
> > +	GUEST_CLEAR_CPU_BUFFERS
> 
> I have a very hard time believing that it's worth duplicating the mitigation
> for VMRESUME vs. VMLAUNCH just to land it after a Jcc.

VERW modifies the flags, so it either needs to be after the Jcc or we
push/pop flags, which adds 2 extra memory operations. Please let me know
if there is a better option.
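
For reference, the flags-preserving alternative would look something like
the hypothetical sketch below (64-bit form; "verw_operand" is an illustrative
label, not a symbol from the series), with the pushf/popf pair being the two
extra memory operations:

	pushf				/* save EFLAGS, including ZF (memory write) */
	verw	verw_operand(%rip)	/* clear CPU buffers; clobbers only ZF */
	popf				/* restore EFLAGS (memory read) */
	jz	.Lvmlaunch		/* the Jcc again sees the pre-VERW ZF */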

> Also, would it be better to put the NOP first?  Or even better, out of line?
> It'd be quite hilarious if the CPU pulled a stupid and speculated on the operand
> of the NOP, i.e. if the user/guest controlled RAX allowed for pulling in data
> after the VERW.

I did confirm with CPU architects that the NOP operand won't be
dereferenced, even speculatively. But yes, even if it were, moving the NOP
first would take care of it.
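
With the NOP first, the same embedded-operand trick would still work; a
hypothetical variant of the earlier sketch, leaving nothing guest-addressable
after the buffer clear:

	/* NOPL first: its %rax-based operand now precedes the VERW */
	.byte	0x0f, 0x1f, 0x80, 0x00, 0x00	/* nopl opcode, modrm, disp bytes 0-1 */
551:	.word	0x18				/* disp bytes 2-3 double as VERW's operand */
	verw	551b(%rip)			/* the buffer clear is now the last thing executed */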
  
Sean Christopherson Oct. 23, 2023, 2:58 p.m. UTC | #3
On Fri, Oct 20, 2023, Pawan Gupta wrote:
> On Fri, Oct 20, 2023 at 03:55:07PM -0700, Sean Christopherson wrote:
> > On Fri, Oct 20, 2023, Pawan Gupta wrote:
> > > [...]
> > > .Lvmresume:
> > > +	/* Mitigate CPU data sampling attacks, e.g. MDS */
> > > +	GUEST_CLEAR_CPU_BUFFERS
> > 
> > I have a very hard time believing that it's worth duplicating the mitigation
> > for VMRESUME vs. VMLAUNCH just to land it after a Jcc.
> 
> VERW modifies the flags, so it either needs to be after the Jcc or we
> push/pop flags, which adds 2 extra memory operations. Please let me know
> if there is a better option.

Ugh, I assumed that piggybacking VERW overrode the original behavior entirely;
I didn't realize it sacrifices EFLAGS.ZF on the altar of mitigations.

Luckily, this is easy to solve now that VMRESUME vs. VMLAUNCH uses a flag instead
of a dedicated bool.

From: Sean Christopherson <seanjc@google.com>
Date: Mon, 23 Oct 2023 07:44:35 -0700
Subject: [PATCH] KVM: VMX: Use BT+JNC, i.e. EFLAGS.CF to select VMRESUME vs.
 VMLAUNCH

Use EFLAGS.CF instead of EFLAGS.ZF to track whether to use VMRESUME versus
VMLAUNCH.  Freeing up EFLAGS.ZF will allow doing VERW, which clobbers ZF,
for MDS mitigations as late as possible without needing to duplicate VERW
for both paths.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/vmx/run_flags.h | 7 +++++--
 arch/x86/kvm/vmx/vmenter.S   | 6 +++---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h
index edc3f16cc189..6a9bfdfbb6e5 100644
--- a/arch/x86/kvm/vmx/run_flags.h
+++ b/arch/x86/kvm/vmx/run_flags.h
@@ -2,7 +2,10 @@
 #ifndef __KVM_X86_VMX_RUN_FLAGS_H
 #define __KVM_X86_VMX_RUN_FLAGS_H
 
-#define VMX_RUN_VMRESUME	(1 << 0)
-#define VMX_RUN_SAVE_SPEC_CTRL	(1 << 1)
+#define VMX_RUN_VMRESUME_SHIFT		0
+#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT	1
+
+#define VMX_RUN_VMRESUME		BIT(VMX_RUN_VMRESUME_SHIFT)
+#define VMX_RUN_SAVE_SPEC_CTRL		BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT)
 
 #endif /* __KVM_X86_VMX_RUN_FLAGS_H */
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index be275a0410a8..b3b13ec04bac 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -139,7 +139,7 @@ SYM_FUNC_START(__vmx_vcpu_run)
 	mov (%_ASM_SP), %_ASM_AX
 
 	/* Check if vmlaunch or vmresume is needed */
-	test $VMX_RUN_VMRESUME, %ebx
+	bt   $VMX_RUN_VMRESUME_SHIFT, %ebx
 
 	/* Load guest registers.  Don't clobber flags. */
 	mov VCPU_RCX(%_ASM_AX), %_ASM_CX
@@ -161,8 +161,8 @@ SYM_FUNC_START(__vmx_vcpu_run)
 	/* Load guest RAX.  This kills the @regs pointer! */
 	mov VCPU_RAX(%_ASM_AX), %_ASM_AX
 
-	/* Check EFLAGS.ZF from 'test VMX_RUN_VMRESUME' above */
-	jz .Lvmlaunch
+	/* Check EFLAGS.CF from the VMX_RUN_VMRESUME bit test above. */
+	jnc .Lvmlaunch
 
 	/*
 	 * After a successful VMRESUME/VMLAUNCH, control flow "magically"

base-commit: ec2f1daad460c6201338dae606466220ccaa96d5
--
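
The pairing works because of how the two instructions touch EFLAGS (per the
SDM): BT writes only CF and leaves ZF alone, while VERW writes only ZF. A
late VERW can therefore sit anywhere between the BT and the JNC, as in this
sketch ("verw_operand" is again an illustrative label):

	bt	$VMX_RUN_VMRESUME_SHIFT, %ebx	/* CF = resume bit; ZF untouched */
	mov	VCPU_RAX(%_ASM_AX), %_ASM_AX	/* register loads write no flags */
	verw	verw_operand(%rip)		/* clobbers only ZF; CF survives */
	jnc	.Lvmlaunch			/* branches on the CF set by BT */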
  
Pawan Gupta Oct. 23, 2023, 5:05 p.m. UTC | #4
On Mon, Oct 23, 2023 at 07:58:57AM -0700, Sean Christopherson wrote:
> On Fri, Oct 20, 2023, Pawan Gupta wrote:
> > On Fri, Oct 20, 2023 at 03:55:07PM -0700, Sean Christopherson wrote:
> > > On Fri, Oct 20, 2023, Pawan Gupta wrote:
> > > > [...]
> > > > .Lvmresume:
> > > > +	/* Mitigate CPU data sampling attacks, e.g. MDS */
> > > > +	GUEST_CLEAR_CPU_BUFFERS
> > > 
> > > I have a very hard time believing that it's worth duplicating the mitigation
> > > for VMRESUME vs. VMLAUNCH just to land it after a Jcc.
> > 
> > VERW modifies the flags, so it either needs to be after the Jcc or we
> > push/pop flags, which adds 2 extra memory operations. Please let me know
> > if there is a better option.
> 
> Ugh, I assumed that piggybacking VERW overrode the original behavior entirely;
> I didn't realize it sacrifices EFLAGS.ZF on the altar of mitigations.
> 
> Luckily, this is easy to solve now that VMRESUME vs. VMLAUNCH uses a flag instead
> of a dedicated bool.

That's great.

> From: Sean Christopherson <seanjc@google.com>
> Date: Mon, 23 Oct 2023 07:44:35 -0700
> Subject: [PATCH] KVM: VMX: Use BT+JNC, i.e. EFLAGS.CF to select VMRESUME vs.
>  VMLAUNCH
> 
> Use EFLAGS.CF instead of EFLAGS.ZF to track whether to use VMRESUME versus
> VMLAUNCH.  Freeing up EFLAGS.ZF will allow doing VERW, which clobbers ZF,
> for MDS mitigations as late as possible without needing to duplicate VERW
> for both paths.
> 
> Signed-off-by: Sean Christopherson <seanjc@google.com>

Thanks for the patch, I will include it in the next revision.
  
Josh Poimboeuf Oct. 23, 2023, 6:56 p.m. UTC | #5
On Fri, Oct 20, 2023 at 01:45:29PM -0700, Pawan Gupta wrote:
> @@ -31,6 +32,8 @@
>  #define VCPU_R15	__VCPU_REGS_R15 * WORD_SIZE
>  #endif
>  
> +#define GUEST_CLEAR_CPU_BUFFERS		USER_CLEAR_CPU_BUFFERS

I don't think the extra macro buys anything here.
  
Pawan Gupta Oct. 23, 2023, 9:17 p.m. UTC | #6
On Mon, Oct 23, 2023 at 11:56:43AM -0700, Josh Poimboeuf wrote:
> On Fri, Oct 20, 2023 at 01:45:29PM -0700, Pawan Gupta wrote:
> > @@ -31,6 +32,8 @@
> >  #define VCPU_R15	__VCPU_REGS_R15 * WORD_SIZE
> >  #endif
> >  
> > +#define GUEST_CLEAR_CPU_BUFFERS		USER_CLEAR_CPU_BUFFERS
> 
> I don't think the extra macro buys anything here.

Using USER_CLEAR_CPU_BUFFERS in the VMentry path didn't feel right. But
after "USER_" is gone, as per your comment on patch 2/6,
GUEST_CLEAR_CPU_BUFFERS can also go away.
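
If both aliases go away, the VMentry asm would presumably invoke the common
macro directly; hypothetically (the post-rename name is a guess, not from
this series):

.Lvmresume:
	/* Mitigate CPU data sampling attacks, e.g. MDS */
	CLEAR_CPU_BUFFERS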
  

Patch

diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index be275a0410a8..efa716cf4727 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -1,6 +1,7 @@ 
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <linux/linkage.h>
 #include <asm/asm.h>
+#include <asm/segment.h>
 #include <asm/bitsperlong.h>
 #include <asm/kvm_vcpu_regs.h>
 #include <asm/nospec-branch.h>
@@ -31,6 +32,8 @@ 
 #define VCPU_R15	__VCPU_REGS_R15 * WORD_SIZE
 #endif
 
+#define GUEST_CLEAR_CPU_BUFFERS		USER_CLEAR_CPU_BUFFERS
+
 .macro VMX_DO_EVENT_IRQOFF call_insn call_target
 	/*
 	 * Unconditionally create a stack frame, getting the correct RSP on the
@@ -177,10 +180,16 @@  SYM_FUNC_START(__vmx_vcpu_run)
  * the 'vmx_vmexit' label below.
  */
 .Lvmresume:
+	/* Mitigate CPU data sampling attacks, e.g. MDS */
+	GUEST_CLEAR_CPU_BUFFERS
+
 	vmresume
 	jmp .Lvmfail
 
 .Lvmlaunch:
+	/* Mitigate CPU data sampling attacks, e.g. MDS */
+	GUEST_CLEAR_CPU_BUFFERS
+
 	vmlaunch
 	jmp .Lvmfail
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index c16297a49e4d..e3d0eda292c3 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7226,13 +7226,17 @@  static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
 
 	guest_state_enter_irqoff();
 
-	/* L1D Flush includes CPU buffer clear to mitigate MDS */
+	/*
+	 * L1D Flush includes CPU buffer clear to mitigate MDS, but the VERW
+	 * mitigation for MDS is done late in VMentry and is still executed
+	 * in spite of the L1D Flush. This is because an extra VERW should not
+	 * matter much after the big hammer L1D Flush.
+	 */
 	if (static_branch_unlikely(&vmx_l1d_should_flush))
 		vmx_l1d_flush(vcpu);
-	else if (cpu_feature_enabled(X86_FEATURE_USER_CLEAR_CPU_BUF))
-		mds_clear_cpu_buffers();
 	else if (static_branch_unlikely(&mmio_stale_data_clear) &&
 		 kvm_arch_has_assigned_device(vcpu->kvm))
+		/* MMIO mitigation is mutually exclusive with MDS mitigation later in asm */
 		mds_clear_cpu_buffers();
 
 	vmx_disable_fb_clear(vmx);