[5/6] KVM: x86: Virtualize FLUSH_L1D and passthrough MSR_IA32_FLUSH_CMD
Commit Message
Virtualize FLUSH_L1D so that the guest can use the performant L1D flush
if one of the many mitigations might require a flush in the guest, e.g.
Linux provides an option to flush the L1D when switching mms.
Passthrough MSR_IA32_FLUSH_CMD for write when it's supported in hardware
and exposed to the guest, i.e. always let the guest write it directly if
FLUSH_L1D is fully supported.
Forward writes to hardware in host context on the off chance that KVM
ends up emulating a WRMSR, or in the really unlikely scenario where
userspace wants to force a flush. Restrict these forwarded WRMSRs to
the known command out of an abundance of caution. Passing through the
MSR means the guest can throw any and all values at hardware, but doing
so in host context is arguably a bit more dangerous.
Link: https://lkml.kernel.org/r/CALMp9eTt3xzAEoQ038bJQ9LN0ZOXrSWsN7xnNUD%2B0SS%3DWwF7Pg%40mail.gmail.com
Link: https://lore.kernel.org/all/20230201132905.549148-2-eesposit@redhat.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kvm/cpuid.c | 2 +-
arch/x86/kvm/svm/svm.c | 5 +++++
arch/x86/kvm/vmx/nested.c | 3 +++
arch/x86/kvm/vmx/vmx.c | 5 +++++
arch/x86/kvm/vmx/vmx.h | 2 +-
arch/x86/kvm/x86.c | 12 ++++++++++++
6 files changed, 27 insertions(+), 2 deletions(-)
Comments
On Tue, Mar 21, 2023 at 06:14:39PM -0700, Sean Christopherson wrote:
> Virtualize FLUSH_L1D so that the guest can use the performant L1D flush
> if one of the many mitigations might require a flush in the guest, e.g.
> Linux provides an option to flush the L1D when switching mms.
>
> Passthrough MSR_IA32_FLUSH_CMD for write when it's supported in hardware
> and exposed to the guest, i.e. always let the guest write it directly if
> FLUSH_L1D is fully supported.
>
> Forward writes to hardware in host context on the off chance that KVM
> ends up emulating a WRMSR, or in the really unlikely scenario where
> userspace wants to force a flush. Restrict these forwarded WRMSRs to
> the known command out of an abundance of caution. Passing through the
> MSR means the guest can throw any and all values at hardware, but doing
> so in host context is arguably a bit more dangerous.
>
> Link: https://lkml.kernel.org/r/CALMp9eTt3xzAEoQ038bJQ9LN0ZOXrSWsN7xnNUD%2B0SS%3DWwF7Pg%40mail.gmail.com
> Link: https://lore.kernel.org/all/20230201132905.549148-2-eesposit@redhat.com
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
> arch/x86/kvm/cpuid.c | 2 +-
> arch/x86/kvm/svm/svm.c | 5 +++++
> arch/x86/kvm/vmx/nested.c | 3 +++
> arch/x86/kvm/vmx/vmx.c | 5 +++++
> arch/x86/kvm/vmx/vmx.h | 2 +-
> arch/x86/kvm/x86.c | 12 ++++++++++++
> 6 files changed, 27 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index 599aebec2d52..9583a110cf5f 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -653,7 +653,7 @@ void kvm_set_cpu_caps(void)
> F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
> F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
> F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) |
> - F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16)
> + F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D)
> );
>
> /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 85bb535fc321..b32edaf5a74b 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -95,6 +95,7 @@ static const struct svm_direct_access_msrs {
> #endif
> { .index = MSR_IA32_SPEC_CTRL, .always = false },
> { .index = MSR_IA32_PRED_CMD, .always = false },
> + { .index = MSR_IA32_FLUSH_CMD, .always = false },
> { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
> { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
> { .index = MSR_IA32_LASTINTFROMIP, .always = false },
> @@ -4140,6 +4141,10 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
> set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0,
> !!guest_has_pred_cmd_msr(vcpu));
>
> + if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
Just curious, will this ever be true on AMD hardware? AFAIK,
X86_FEATURE_FLUSH_L1D is Intel-defined CPU feature.
> + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0,
> + !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
> +
> /* For sev guests, the memory encryption bit is not reserved in CR3. */
> if (sev_guest(vcpu->kvm)) {
> best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
On Wed, Mar 22, 2023, Pawan Gupta wrote:
> On Tue, Mar 21, 2023 at 06:14:39PM -0700, Sean Christopherson wrote:
> > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> > index 85bb535fc321..b32edaf5a74b 100644
> > --- a/arch/x86/kvm/svm/svm.c
> > +++ b/arch/x86/kvm/svm/svm.c
> > @@ -95,6 +95,7 @@ static const struct svm_direct_access_msrs {
> > #endif
> > { .index = MSR_IA32_SPEC_CTRL, .always = false },
> > { .index = MSR_IA32_PRED_CMD, .always = false },
> > + { .index = MSR_IA32_FLUSH_CMD, .always = false },
> > { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
> > { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
> > { .index = MSR_IA32_LASTINTFROMIP, .always = false },
> > @@ -4140,6 +4141,10 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
> > set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0,
> > !!guest_has_pred_cmd_msr(vcpu));
> >
> > + if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
>
> Just curious, will this ever be true on AMD hardware? AFAIK,
> X86_FEATURE_FLUSH_L1D is Intel-defined CPU feature.
Don't know myself, but I assume/hope there was actual motivation behind adding
support for AMD.
On Thu, 23 Mar 2023 15:17:50 -0700
Sean Christopherson <seanjc@google.com> wrote:
> On Wed, Mar 22, 2023, Pawan Gupta wrote:
> > On Tue, Mar 21, 2023 at 06:14:39PM -0700, Sean Christopherson wrote:
> > > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> > > index 85bb535fc321..b32edaf5a74b 100644
> > > --- a/arch/x86/kvm/svm/svm.c
> > > +++ b/arch/x86/kvm/svm/svm.c
> > > @@ -95,6 +95,7 @@ static const struct svm_direct_access_msrs {
> > > #endif
> > > { .index = MSR_IA32_SPEC_CTRL, .always = false },
> > > { .index = MSR_IA32_PRED_CMD, .always = false },
> > > + { .index = MSR_IA32_FLUSH_CMD, .always = false },
> > > { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
> > > { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
> > > { .index = MSR_IA32_LASTINTFROMIP, .always = false },
> > > @@ -4140,6 +4141,10 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
> > > set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0,
> > > !!guest_has_pred_cmd_msr(vcpu));
> > >
> > > + if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
> >
> > Just curious, will this ever be true on AMD hardware? AFAIK,
> > X86_FEATURE_FLUSH_L1D is Intel-defined CPU feature.
>
> Don't know myself, but I assume/hope there was actual motivation behind adding
> support for AMD.
Hmm. I took a look at the APM [1] published in Jan 2023, section 3.2.9
"Speculation Control MSRs". It only has SPEC_CTRL/PRED_CMD so far. Also, the
information here [2] shows this is a mitigation only for Intel CPUs. Looks like
AMD does not require this so far.
[1] https://www.amd.com/system/files/TechDocs/40332.pdf
[2] https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html
On 3/22/2023 9:14 AM, Sean Christopherson wrote:
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index c83ec88da043..3c58dbae7b4c 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3628,6 +3628,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>
> wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
> break;
> + case MSR_IA32_FLUSH_CMD:
> + if (!msr_info->host_initiated &&
> + !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
> + return 1;
> +
> + if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH))
> + return 1;
> + if (!data)
> + break;
> +
> + wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
> + break;
With this, KVM gives userspace the ability to flush the host's L1 data cache.
Can it be exploited to degrade host performance if the userspace VMM keeps
flushing the L1 data cache?
On Sun, Mar 26, 2023 at 8:33 PM Xiaoyao Li <xiaoyao.li@intel.com> wrote:
>
> On 3/22/2023 9:14 AM, Sean Christopherson wrote:
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index c83ec88da043..3c58dbae7b4c 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -3628,6 +3628,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> >
> > wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
> > break;
> > + case MSR_IA32_FLUSH_CMD:
> > + if (!msr_info->host_initiated &&
> > + !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
> > + return 1;
> > +
> > + if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH))
> > + return 1;
> > + if (!data)
> > + break;
> > +
> > + wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
> > + break;
>
> Then KVM provides the ability to flush the L1 data cache of host to
> userspace. Can it be exploited to degrade the host performance if
> userspace VMM keeps flushing the L1 data cache?
The L1D$ isn't very big. A guest could always flush out any previously
cached data simply by referencing its own data. Is the ability to
flush the L1D$ by WRMSR that egregious?
On Mon, Mar 27, 2023, Jim Mattson wrote:
> On Sun, Mar 26, 2023 at 8:33 PM Xiaoyao Li <xiaoyao.li@intel.com> wrote:
> >
> > On 3/22/2023 9:14 AM, Sean Christopherson wrote:
> > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > index c83ec88da043..3c58dbae7b4c 100644
> > > --- a/arch/x86/kvm/x86.c
> > > +++ b/arch/x86/kvm/x86.c
> > > @@ -3628,6 +3628,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> > >
> > > wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
> > > break;
> > > + case MSR_IA32_FLUSH_CMD:
> > > + if (!msr_info->host_initiated &&
> > > + !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
> > > + return 1;
> > > +
> > > + if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH))
> > > + return 1;
> > > + if (!data)
> > > + break;
> > > +
> > > + wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
> > > + break;
> >
> > Then KVM provides the ability to flush the L1 data cache of host to
> > userspace. Can it be exploited to degrade the host performance if
> > userspace VMM keeps flushing the L1 data cache?
>
> The L1D$ isn't very big. A guest could always flush out any previously
> cached data simply by referencing its own data. Is the ability to
> flush the L1D$ by WRMSR that egregious?
Yeah, AFAIK RDT and the like only provide QoS controls for L3, so L1 is fair game.
@@ -653,7 +653,7 @@ void kvm_set_cpu_caps(void)
F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) |
- F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16)
+ F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D)
);
/* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
@@ -95,6 +95,7 @@ static const struct svm_direct_access_msrs {
#endif
{ .index = MSR_IA32_SPEC_CTRL, .always = false },
{ .index = MSR_IA32_PRED_CMD, .always = false },
+ { .index = MSR_IA32_FLUSH_CMD, .always = false },
{ .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
{ .index = MSR_IA32_LASTINTFROMIP, .always = false },
@@ -4140,6 +4141,10 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0,
!!guest_has_pred_cmd_msr(vcpu));
+ if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0,
+ !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
+
/* For sev guests, the memory encryption bit is not reserved in CR3. */
if (sev_guest(vcpu->kvm)) {
best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
@@ -654,6 +654,9 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
MSR_IA32_PRED_CMD, MSR_TYPE_W);
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_FLUSH_CMD, MSR_TYPE_W);
+
kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
vmx->nested.force_msr_bitmap_recalc = false;
@@ -164,6 +164,7 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
MSR_IA32_SPEC_CTRL,
MSR_IA32_PRED_CMD,
+ MSR_IA32_FLUSH_CMD,
MSR_IA32_TSC,
#ifdef CONFIG_X86_64
MSR_FS_BASE,
@@ -7720,6 +7721,10 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
!guest_has_pred_cmd_msr(vcpu));
+ if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
+ !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
+
set_cr4_guest_host_mask(vmx);
vmx_write_encls_bitmap(vcpu, NULL);
@@ -369,7 +369,7 @@ struct vcpu_vmx {
struct lbr_desc lbr_desc;
/* Save desired MSR intercept (read: pass-through) state */
-#define MAX_POSSIBLE_PASSTHROUGH_MSRS 15
+#define MAX_POSSIBLE_PASSTHROUGH_MSRS 16
struct {
DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
@@ -3628,6 +3628,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
break;
+ case MSR_IA32_FLUSH_CMD:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
+ return 1;
+
+ if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH))
+ return 1;
+ if (!data)
+ break;
+
+ wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ break;
case MSR_EFER:
return set_efer(vcpu, msr_info);
case MSR_K7_HWCR: