[5/6] KVM: x86: Virtualize FLUSH_L1D and passthrough MSR_IA32_FLUSH_CMD

Message ID: 20230322011440.2195485-6-seanjc@google.com
State: New
Series: KVM: x86: Unhost the *_CMD MSR mess

Commit Message

Sean Christopherson March 22, 2023, 1:14 a.m. UTC
Virtualize FLUSH_L1D so that the guest can use the performant L1D flush
if one of the many mitigations might require a flush in the guest, e.g.
Linux provides an option to flush the L1D when switching mms.

Passthrough MSR_IA32_FLUSH_CMD for write when it's supported in hardware
and exposed to the guest, i.e. always let the guest write it directly if
FLUSH_L1D is fully supported.

Forward writes to hardware in host context on the off chance that KVM
ends up emulating a WRMSR, or in the really unlikely scenario where
userspace wants to force a flush.  Restrict these forwarded WRMSRs to
the known command out of an abundance of caution.  Passing through the
MSR means the guest can throw any and all values at hardware, but doing
so in host context is arguably a bit more dangerous.

Link: https://lkml.kernel.org/r/CALMp9eTt3xzAEoQ038bJQ9LN0ZOXrSWsN7xnNUD%2B0SS%3DWwF7Pg%40mail.gmail.com
Link: https://lore.kernel.org/all/20230201132905.549148-2-eesposit@redhat.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/cpuid.c      |  2 +-
 arch/x86/kvm/svm/svm.c    |  5 +++++
 arch/x86/kvm/vmx/nested.c |  3 +++
 arch/x86/kvm/vmx/vmx.c    |  5 +++++
 arch/x86/kvm/vmx/vmx.h    |  2 +-
 arch/x86/kvm/x86.c        | 12 ++++++++++++
 6 files changed, 27 insertions(+), 2 deletions(-)
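
For reference, the guest-visible contract being virtualized here is a single
write-only command MSR: once FLUSH_L1D is enumerated (CPUID.(EAX=7,ECX=0):EDX[28]),
the guest flushes its L1 data cache by writing bit 0 of MSR_IA32_FLUSH_CMD.  A
minimal guest-kernel-style sketch, not part of the patch (the helper name is
hypothetical; the MSR and bit definitions come from asm/msr-index.h):

#include <asm/msr.h>
#include <asm/msr-index.h>

/*
 * Flush the L1 data cache, e.g. when switching to an untrusted mm.
 * MSR_IA32_FLUSH_CMD is write-only (reads #GP), and the only defined
 * command bit is L1D_FLUSH (bit 0).
 */
static inline void guest_flush_l1d(void)
{
	wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
}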
  

Comments

Pawan Gupta March 23, 2023, 5:07 a.m. UTC | #1
On Tue, Mar 21, 2023 at 06:14:39PM -0700, Sean Christopherson wrote:
> Virtualize FLUSH_L1D so that the guest can use the performant L1D flush
> if one of the many mitigations might require a flush in the guest, e.g.
> Linux provides an option to flush the L1D when switching mms.
> 
> Passthrough MSR_IA32_FLUSH_CMD for write when it's supported in hardware
> and exposed to the guest, i.e. always let the guest write it directly if
> FLUSH_L1D is fully supported.
> 
> Forward writes to hardware in host context on the off chance that KVM
> ends up emulating a WRMSR, or in the really unlikely scenario where
> userspace wants to force a flush.  Restrict these forwarded WRMSRs to
> the known command out of an abundance of caution.  Passing through the
> MSR means the guest can throw any and all values at hardware, but doing
> so in host context is arguably a bit more dangerous.
> 
> Link: https://lkml.kernel.org/r/CALMp9eTt3xzAEoQ038bJQ9LN0ZOXrSWsN7xnNUD%2B0SS%3DWwF7Pg%40mail.gmail.com
> Link: https://lore.kernel.org/all/20230201132905.549148-2-eesposit@redhat.com
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/x86/kvm/cpuid.c      |  2 +-
>  arch/x86/kvm/svm/svm.c    |  5 +++++
>  arch/x86/kvm/vmx/nested.c |  3 +++
>  arch/x86/kvm/vmx/vmx.c    |  5 +++++
>  arch/x86/kvm/vmx/vmx.h    |  2 +-
>  arch/x86/kvm/x86.c        | 12 ++++++++++++
>  6 files changed, 27 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index 599aebec2d52..9583a110cf5f 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -653,7 +653,7 @@ void kvm_set_cpu_caps(void)
>  		F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
>  		F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
>  		F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) |
> -		F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16)
> +		F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D)
>  	);
>  
>  	/* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 85bb535fc321..b32edaf5a74b 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -95,6 +95,7 @@ static const struct svm_direct_access_msrs {
>  #endif
>  	{ .index = MSR_IA32_SPEC_CTRL,			.always = false },
>  	{ .index = MSR_IA32_PRED_CMD,			.always = false },
> +	{ .index = MSR_IA32_FLUSH_CMD,			.always = false },
>  	{ .index = MSR_IA32_LASTBRANCHFROMIP,		.always = false },
>  	{ .index = MSR_IA32_LASTBRANCHTOIP,		.always = false },
>  	{ .index = MSR_IA32_LASTINTFROMIP,		.always = false },
> @@ -4140,6 +4141,10 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
>  		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0,
>  				     !!guest_has_pred_cmd_msr(vcpu));
>  
> +	if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))

Just curious, will this ever be true on AMD hardware? AFAIK,
X86_FEATURE_FLUSH_L1D is an Intel-defined CPU feature.

> +		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0,
> +				     !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
> +
>  	/* For sev guests, the memory encryption bit is not reserved in CR3.  */
>  	if (sev_guest(vcpu->kvm)) {
>  		best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
  
Sean Christopherson March 23, 2023, 10:17 p.m. UTC | #2
On Wed, Mar 22, 2023, Pawan Gupta wrote:
> On Tue, Mar 21, 2023 at 06:14:39PM -0700, Sean Christopherson wrote:
> > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> > index 85bb535fc321..b32edaf5a74b 100644
> > --- a/arch/x86/kvm/svm/svm.c
> > +++ b/arch/x86/kvm/svm/svm.c
> > @@ -95,6 +95,7 @@ static const struct svm_direct_access_msrs {
> >  #endif
> >  	{ .index = MSR_IA32_SPEC_CTRL,			.always = false },
> >  	{ .index = MSR_IA32_PRED_CMD,			.always = false },
> > +	{ .index = MSR_IA32_FLUSH_CMD,			.always = false },
> >  	{ .index = MSR_IA32_LASTBRANCHFROMIP,		.always = false },
> >  	{ .index = MSR_IA32_LASTBRANCHTOIP,		.always = false },
> >  	{ .index = MSR_IA32_LASTINTFROMIP,		.always = false },
> > @@ -4140,6 +4141,10 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
> >  		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0,
> >  				     !!guest_has_pred_cmd_msr(vcpu));
> >  
> > +	if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
> 
> Just curious, will this ever be true on AMD hardware? AFAIK,
> X86_FEATURE_FLUSH_L1D is an Intel-defined CPU feature.

Don't know myself, but I assume/hope there was actual motivation behind adding
support for AMD.
  
Zhi Wang March 24, 2023, 8:48 a.m. UTC | #3
On Thu, 23 Mar 2023 15:17:50 -0700
Sean Christopherson <seanjc@google.com> wrote:

> On Wed, Mar 22, 2023, Pawan Gupta wrote:
> > On Tue, Mar 21, 2023 at 06:14:39PM -0700, Sean Christopherson wrote:
> > > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> > > index 85bb535fc321..b32edaf5a74b 100644
> > > --- a/arch/x86/kvm/svm/svm.c
> > > +++ b/arch/x86/kvm/svm/svm.c
> > > @@ -95,6 +95,7 @@ static const struct svm_direct_access_msrs {
> > >  #endif
> > >  	{ .index = MSR_IA32_SPEC_CTRL,			.always = false },
> > >  	{ .index = MSR_IA32_PRED_CMD,			.always = false },
> > > +	{ .index = MSR_IA32_FLUSH_CMD,			.always = false },
> > >  	{ .index = MSR_IA32_LASTBRANCHFROMIP,		.always = false },
> > >  	{ .index = MSR_IA32_LASTBRANCHTOIP,		.always = false },
> > >  	{ .index = MSR_IA32_LASTINTFROMIP,		.always = false },
> > > @@ -4140,6 +4141,10 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
> > >  		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0,
> > >  				     !!guest_has_pred_cmd_msr(vcpu));
> > >  
> > > +	if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
> > 
> > Just curious, will this ever be true on AMD hardware? AFAIK,
> > X86_FEATURE_FLUSH_L1D is an Intel-defined CPU feature.
> 
> Don't know myself, but I assume/hope there was actual motivation behind adding
> support for AMD.

Hmm. I took a look at the APM [1] published in Jan 2023, section 3.2.9
"Speculation Control MSRs". It only lists SPEC_CTRL/PRED_CMD so far. Also,
the information here [2] shows this is a mitigation only for Intel CPUs.
It looks like AMD does not require this so far.

[1] https://www.amd.com/system/files/TechDocs/40332.pdf
[2] https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html
  
Xiaoyao Li March 27, 2023, 3:33 a.m. UTC | #4
On 3/22/2023 9:14 AM, Sean Christopherson wrote:
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index c83ec88da043..3c58dbae7b4c 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3628,6 +3628,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>   
>   		wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
>   		break;
> +	case MSR_IA32_FLUSH_CMD:
> +		if (!msr_info->host_initiated &&
> +		    !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
> +			return 1;
> +
> +		if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH))
> +			return 1;
> +		if (!data)
> +			break;
> +
> +		wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
> +		break;

Then KVM provides userspace with the ability to flush the host's L1 data
cache. Can it be exploited to degrade host performance if the userspace
VMM keeps flushing the L1 data cache?
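
For concreteness, the host-initiated path being asked about goes through
KVM_SET_MSRS from the userspace VMM.  A rough sketch of such a forced flush
(vcpu_fd is assumed to be an already-created vCPU fd; the MSR index and
command bit are the architectural values, and the wrapper-struct pattern
mirrors what KVM selftests use):

#include <sys/ioctl.h>
#include <linux/kvm.h>

#define MSR_IA32_FLUSH_CMD	0x0000010b
#define L1D_FLUSH		(1ULL << 0)

/*
 * Host-initiated write: ask KVM to perform a single L1D flush on the
 * vCPU's behalf.  KVM_SET_MSRS returns the number of MSRs processed,
 * so 1 indicates success here.
 */
static int vcpu_force_l1d_flush(int vcpu_fd)
{
	struct {
		struct kvm_msrs hdr;
		struct kvm_msr_entry entry;
	} msrs = {};

	msrs.hdr.nmsrs = 1;
	msrs.entry.index = MSR_IA32_FLUSH_CMD;
	msrs.entry.data = L1D_FLUSH;

	return ioctl(vcpu_fd, KVM_SET_MSRS, &msrs);
}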
  
Jim Mattson March 27, 2023, 3:37 p.m. UTC | #5
On Sun, Mar 26, 2023 at 8:33 PM Xiaoyao Li <xiaoyao.li@intel.com> wrote:
>
> On 3/22/2023 9:14 AM, Sean Christopherson wrote:
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index c83ec88da043..3c58dbae7b4c 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -3628,6 +3628,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> >
> >               wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
> >               break;
> > +     case MSR_IA32_FLUSH_CMD:
> > +             if (!msr_info->host_initiated &&
> > +                 !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
> > +                     return 1;
> > +
> > +             if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH))
> > +                     return 1;
> > +             if (!data)
> > +                     break;
> > +
> > +             wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
> > +             break;
>
> Then KVM provides userspace with the ability to flush the host's L1 data
> cache. Can it be exploited to degrade host performance if the userspace
> VMM keeps flushing the L1 data cache?

The L1D$ isn't very big. A guest could always flush out any previously
cached data simply by referencing its own data. Is the ability to
flush the L1D$ by WRMSR that egregious?
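
To illustrate the point: any guest can already displace the L1D contents simply
by touching a buffer larger than the cache, no MSR required.  A rough sketch
with assumed sizes (a typical L1D is 32-48 KiB with 64-byte lines); note this is
only best-effort eviction, whereas the WRMSR guarantees the whole L1D is flushed:

#include <stddef.h>
#include <stdint.h>

/*
 * Displace prior L1D contents by streaming through a buffer larger than
 * the cache.  Sizes are assumptions; unlike a write of L1D_FLUSH to
 * MSR_IA32_FLUSH_CMD, this only evicts whatever the replacement policy
 * chooses to evict.
 */
static void evict_l1d_by_access(void)
{
	enum { BUF_SIZE = 64 * 1024, LINE_SIZE = 64 };
	static volatile uint8_t buf[BUF_SIZE];
	size_t i;

	for (i = 0; i < BUF_SIZE; i += LINE_SIZE)
		buf[i]++;
}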
  
Sean Christopherson March 27, 2023, 4 p.m. UTC | #6
On Mon, Mar 27, 2023, Jim Mattson wrote:
> On Sun, Mar 26, 2023 at 8:33 PM Xiaoyao Li <xiaoyao.li@intel.com> wrote:
> >
> > On 3/22/2023 9:14 AM, Sean Christopherson wrote:
> > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > index c83ec88da043..3c58dbae7b4c 100644
> > > --- a/arch/x86/kvm/x86.c
> > > +++ b/arch/x86/kvm/x86.c
> > > @@ -3628,6 +3628,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> > >
> > >               wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
> > >               break;
> > > +     case MSR_IA32_FLUSH_CMD:
> > > +             if (!msr_info->host_initiated &&
> > > +                 !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
> > > +                     return 1;
> > > +
> > > +             if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH))
> > > +                     return 1;
> > > +             if (!data)
> > > +                     break;
> > > +
> > > +             wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
> > > +             break;
> >
> > Then KVM provides userspace with the ability to flush the host's L1 data
> > cache. Can it be exploited to degrade host performance if the userspace
> > VMM keeps flushing the L1 data cache?
> 
> The L1D$ isn't very big. A guest could always flush out any previously
> cached data simply by referencing its own data. Is the ability to
> flush the L1D$ by WRMSR that egregious?

Yeah, AFAIK RDT and the like only provide QoS controls for L3, so L1 is fair game.
  

Patch

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 599aebec2d52..9583a110cf5f 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -653,7 +653,7 @@  void kvm_set_cpu_caps(void)
 		F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
 		F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
 		F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) |
-		F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16)
+		F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D)
 	);
 
 	/* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 85bb535fc321..b32edaf5a74b 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -95,6 +95,7 @@  static const struct svm_direct_access_msrs {
 #endif
 	{ .index = MSR_IA32_SPEC_CTRL,			.always = false },
 	{ .index = MSR_IA32_PRED_CMD,			.always = false },
+	{ .index = MSR_IA32_FLUSH_CMD,			.always = false },
 	{ .index = MSR_IA32_LASTBRANCHFROMIP,		.always = false },
 	{ .index = MSR_IA32_LASTBRANCHTOIP,		.always = false },
 	{ .index = MSR_IA32_LASTINTFROMIP,		.always = false },
@@ -4140,6 +4141,10 @@  static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0,
 				     !!guest_has_pred_cmd_msr(vcpu));
 
+	if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
+		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0,
+				     !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
+
 	/* For sev guests, the memory encryption bit is not reserved in CR3.  */
 	if (sev_guest(vcpu->kvm)) {
 		best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 1bc2b80273c9..f63b28f46a71 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -654,6 +654,9 @@  static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
 					 MSR_IA32_PRED_CMD, MSR_TYPE_W);
 
+	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+					 MSR_IA32_FLUSH_CMD, MSR_TYPE_W);
+
 	kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
 
 	vmx->nested.force_msr_bitmap_recalc = false;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 29807be219b9..56e0c7ae961d 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -164,6 +164,7 @@  module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
 static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
 	MSR_IA32_SPEC_CTRL,
 	MSR_IA32_PRED_CMD,
+	MSR_IA32_FLUSH_CMD,
 	MSR_IA32_TSC,
 #ifdef CONFIG_X86_64
 	MSR_FS_BASE,
@@ -7720,6 +7721,10 @@  static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 		vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
 					  !guest_has_pred_cmd_msr(vcpu));
 
+	if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
+		vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
+					  !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
+
 	set_cr4_guest_host_mask(vmx);
 
 	vmx_write_encls_bitmap(vcpu, NULL);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 2acdc54bc34b..cb766f65a3eb 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -369,7 +369,7 @@  struct vcpu_vmx {
 	struct lbr_desc lbr_desc;
 
 	/* Save desired MSR intercept (read: pass-through) state */
-#define MAX_POSSIBLE_PASSTHROUGH_MSRS	15
+#define MAX_POSSIBLE_PASSTHROUGH_MSRS	16
 	struct {
 		DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
 		DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c83ec88da043..3c58dbae7b4c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3628,6 +3628,18 @@  int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 
 		wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
 		break;
+	case MSR_IA32_FLUSH_CMD:
+		if (!msr_info->host_initiated &&
+		    !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
+			return 1;
+
+		if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH))
+			return 1;
+		if (!data)
+			break;
+
+		wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+		break;
 	case MSR_EFER:
 		return set_efer(vcpu, msr_info);
 	case MSR_K7_HWCR: