[v5,11/19] KVM:VMX: Emulate read and write to CET MSRs

Message ID 20230803042732.88515-12-weijiang.yang@intel.com
State New
Series Enable CET Virtualization

Commit Message

Yang, Weijiang Aug. 3, 2023, 4:27 a.m. UTC
  Add an emulation interface for CET MSR reads and writes.
The emulation code is split into a common part and a vendor-specific
part: the former resides in x86.c so it can benefit different x86 CPU
vendors, and the latter, for VMX, is implemented in this patch.

Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
---
 arch/x86/kvm/vmx/vmx.c |  27 +++++++++++
 arch/x86/kvm/x86.c     | 104 +++++++++++++++++++++++++++++++++++++----
 arch/x86/kvm/x86.h     |  18 +++++++
 3 files changed, 141 insertions(+), 8 deletions(-)
  

Comments

Chao Gao Aug. 4, 2023, 5:14 a.m. UTC | #1
On Thu, Aug 03, 2023 at 12:27:24AM -0400, Yang Weijiang wrote:
>Add emulation interface for CET MSR read and write.
>The emulation code is split into common part and vendor specific
>part, the former resides in x86.c to benefic different x86 CPU
>vendors, the latter for VMX is implemented in this patch.
>
>Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
>---
> arch/x86/kvm/vmx/vmx.c |  27 +++++++++++
> arch/x86/kvm/x86.c     | 104 +++++++++++++++++++++++++++++++++++++----
> arch/x86/kvm/x86.h     |  18 +++++++
> 3 files changed, 141 insertions(+), 8 deletions(-)
>
>diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>index 6aa76124e81e..ccf750e79608 100644
>--- a/arch/x86/kvm/vmx/vmx.c
>+++ b/arch/x86/kvm/vmx/vmx.c
>@@ -2095,6 +2095,18 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> 		else
> 			msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
> 		break;
>+	case MSR_IA32_S_CET:
>+	case MSR_KVM_GUEST_SSP:
>+	case MSR_IA32_INT_SSP_TAB:
>+		if (kvm_get_msr_common(vcpu, msr_info))
>+			return 1;
>+		if (msr_info->index == MSR_KVM_GUEST_SSP)
>+			msr_info->data = vmcs_readl(GUEST_SSP);
>+		else if (msr_info->index == MSR_IA32_S_CET)
>+			msr_info->data = vmcs_readl(GUEST_S_CET);
>+		else if (msr_info->index == MSR_IA32_INT_SSP_TAB)
>+			msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE);

This if-else-if suggests that they are forcibly grouped together just to
share the call of kvm_get_msr_common(). For readability, I think it is better
to handle them separately.

e.g.,
	case MSR_IA32_S_CET:
		if (kvm_get_msr_common(vcpu, msr_info))
			return 1;
		msr_info->data = vmcs_readl(GUEST_S_CET);
		break;

	case MSR_KVM_GUEST_SSP:
		if (kvm_get_msr_common(vcpu, msr_info))
			return 1;
		msr_info->data = vmcs_readl(GUEST_SSP);
		break;

	...


>+		break;
> 	case MSR_IA32_DEBUGCTLMSR:
> 		msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
> 		break;
>@@ -2404,6 +2416,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> 		else
> 			vmx->pt_desc.guest.addr_a[index / 2] = data;
> 		break;
>+	case MSR_IA32_S_CET:
>+	case MSR_KVM_GUEST_SSP:
>+	case MSR_IA32_INT_SSP_TAB:
>+		if (kvm_set_msr_common(vcpu, msr_info))
>+			return 1;
>+		if (msr_index == MSR_KVM_GUEST_SSP)
>+			vmcs_writel(GUEST_SSP, data);
>+		else if (msr_index == MSR_IA32_S_CET)
>+			vmcs_writel(GUEST_S_CET, data);
>+		else if (msr_index == MSR_IA32_INT_SSP_TAB)
>+			vmcs_writel(GUEST_INTR_SSP_TABLE, data);

ditto

>+		break;
> 	case MSR_IA32_PERF_CAPABILITIES:
> 		if (data && !vcpu_to_pmu(vcpu)->version)
> 			return 1;
>@@ -4864,6 +4888,9 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> 		vmcs_write64(GUEST_BNDCFGS, 0);
> 
> 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
>+	vmcs_writel(GUEST_SSP, 0);
>+	vmcs_writel(GUEST_S_CET, 0);
>+	vmcs_writel(GUEST_INTR_SSP_TABLE, 0);

where are MSR_IA32_PL3_SSP and MSR_IA32_U_CET reset?

I thought that guest FPU would be reset in kvm_vcpu_reset(). But it turns out
only MPX states are reset in KVM while other FPU states are unchanged. This
is aligned with "Table 10.1 IA-32 and Intel® 64 Processor States Following
Power-up, Reset, or INIT"

Could you double-confirm the hardware behavior, i.e. that CET states are reset to 0
on INIT? If CET states are reset, we need to handle MSR_IA32_PL3_SSP and
MSR_IA32_U_CET like MPX.
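
If they do need to be zeroed, a rough sketch mirroring the MPX handling in
kvm_vcpu_reset() might be (untested; assumes XFEATURE_CET_USER and
fpstate_clear_xstate_component() from the host CET enabling can be used here):

	if (kvm_cpu_cap_has(X86_FEATURE_SHSTK) ||
	    kvm_cpu_cap_has(X86_FEATURE_IBT)) {
		struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;

		/* Swap out the guest FPU so the in-memory image gets cleared. */
		if (init_event)
			kvm_put_guest_fpu(vcpu);

		/* Zeroes MSR_IA32_U_CET and MSR_IA32_PL3_SSP in the xsave image. */
		fpstate_clear_xstate_component(fpstate, XFEATURE_CET_USER);

		if (init_event)
			kvm_load_guest_fpu(vcpu);
	}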

> 
> 	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
> 
>diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>index 5b63441fd2d2..98f3ff6078e6 100644
>--- a/arch/x86/kvm/x86.c
>+++ b/arch/x86/kvm/x86.c
>@@ -3627,6 +3627,39 @@ static bool kvm_is_msr_to_save(u32 msr_index)
> 	return false;
> }
> 
>+static inline bool is_shadow_stack_msr(u32 msr)
>+{
>+	return msr == MSR_IA32_PL0_SSP ||
>+		msr == MSR_IA32_PL1_SSP ||
>+		msr == MSR_IA32_PL2_SSP ||
>+		msr == MSR_IA32_PL3_SSP ||
>+		msr == MSR_IA32_INT_SSP_TAB ||
>+		msr == MSR_KVM_GUEST_SSP;
>+}
>+
>+static bool kvm_cet_is_msr_accessible(struct kvm_vcpu *vcpu,
>+				      struct msr_data *msr)
>+{
>+	if (is_shadow_stack_msr(msr->index)) {
>+		if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK))
>+			return false;
>+
>+		if (msr->index == MSR_KVM_GUEST_SSP)
>+			return msr->host_initiated;
>+
>+		return msr->host_initiated ||
>+			guest_cpuid_has(vcpu, X86_FEATURE_SHSTK);
>+	}
>+
>+	if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
>+	    !kvm_cpu_cap_has(X86_FEATURE_IBT))
>+		return false;
>+
>+	return msr->host_initiated ||
>+		guest_cpuid_has(vcpu, X86_FEATURE_IBT) ||
>+		guest_cpuid_has(vcpu, X86_FEATURE_SHSTK);
>+}
>+
> int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> {
> 	u32 msr = msr_info->index;
>@@ -3981,6 +4014,45 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> 		vcpu->arch.guest_fpu.xfd_err = data;
> 		break;
> #endif
>+#define CET_EXCLUSIVE_BITS		(CET_SUPPRESS | CET_WAIT_ENDBR)
>+#define CET_CTRL_RESERVED_BITS		GENMASK(9, 6)
>+#define CET_SHSTK_MASK_BITS		GENMASK(1, 0)
>+#define CET_IBT_MASK_BITS		(GENMASK_ULL(5, 2) | \
>+					 GENMASK_ULL(63, 10))
>+#define CET_LEG_BITMAP_BASE(data)	((data) >> 12)
>+	case MSR_IA32_U_CET:
>+	case MSR_IA32_S_CET:
>+		if (!kvm_cet_is_msr_accessible(vcpu, msr_info))
>+			return 1;
>+		if (!!(data & CET_CTRL_RESERVED_BITS))
>+			return 1;
>+		if (!guest_can_use(vcpu, X86_FEATURE_SHSTK) &&
>+		    (data & CET_SHSTK_MASK_BITS))
>+			return 1;
>+		if (!guest_can_use(vcpu, X86_FEATURE_IBT) &&
>+		    (data & CET_IBT_MASK_BITS))
>+			return 1;
>+		if (!IS_ALIGNED(CET_LEG_BITMAP_BASE(data), 4) ||
>+		    (data & CET_EXCLUSIVE_BITS) == CET_EXCLUSIVE_BITS)
>+			return 1;
>+		if (msr == MSR_IA32_U_CET)

can you add a comment before this if() statement like?
		/* MSR_IA32_S_CET is handled by vendor code */

>+	case MSR_KVM_GUEST_SSP:
>+	case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
>+		if (!kvm_cet_is_msr_accessible(vcpu, msr_info))
>+			return 1;
>+		if (is_noncanonical_address(data, vcpu))
>+			return 1;
>+		if (!IS_ALIGNED(data, 4))
>+			return 1;
>+		if (msr == MSR_IA32_PL0_SSP || msr == MSR_IA32_PL1_SSP ||
>+		    msr == MSR_IA32_PL2_SSP) {
>+			vcpu->arch.cet_s_ssp[msr - MSR_IA32_PL0_SSP] = data;
>+		} else if (msr == MSR_IA32_PL3_SSP) {
>+			kvm_set_xsave_msr(msr_info);
>+		}

Braces are not needed.

also add a comment for MSR_KVM_GUEST_SSP.

>+		break;
> 	default:
> 		if (kvm_pmu_is_valid_msr(vcpu, msr))
> 			return kvm_pmu_set_msr(vcpu, msr_info);
>@@ -4051,7 +4123,9 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
> 
> int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> {
>-	switch (msr_info->index) {
>+	u32 msr = msr_info->index;
>+
>+	switch (msr) {
> 	case MSR_IA32_PLATFORM_ID:
> 	case MSR_IA32_EBL_CR_POWERON:
> 	case MSR_IA32_LASTBRANCHFROMIP:
>@@ -4086,7 +4160,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> 	case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
> 	case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
> 	case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
>-		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
>+		if (kvm_pmu_is_valid_msr(vcpu, msr))
> 			return kvm_pmu_get_msr(vcpu, msr_info);
> 		msr_info->data = 0;
> 		break;
>@@ -4137,7 +4211,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> 	case MSR_MTRRcap:
> 	case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
> 	case MSR_MTRRdefType:
>-		return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
>+		return kvm_mtrr_get_msr(vcpu, msr, &msr_info->data);
> 	case 0xcd: /* fsb frequency */
> 		msr_info->data = 3;
> 		break;
>@@ -4159,7 +4233,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> 		msr_info->data = kvm_get_apic_base(vcpu);
> 		break;
> 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
>-		return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
>+		return kvm_x2apic_msr_read(vcpu, msr, &msr_info->data);
> 	case MSR_IA32_TSC_DEADLINE:
> 		msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
> 		break;
>@@ -4253,7 +4327,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> 	case MSR_IA32_MCG_STATUS:
> 	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
> 	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
>-		return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
>+		return get_msr_mce(vcpu, msr, &msr_info->data,
> 				   msr_info->host_initiated);
> 	case MSR_IA32_XSS:
> 		if (!msr_info->host_initiated &&
>@@ -4284,7 +4358,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> 	case HV_X64_MSR_TSC_EMULATION_STATUS:
> 	case HV_X64_MSR_TSC_INVARIANT_CONTROL:
> 		return kvm_hv_get_msr_common(vcpu,
>-					     msr_info->index, &msr_info->data,
>+					     msr, &msr_info->data,
> 					     msr_info->host_initiated);
> 	case MSR_IA32_BBL_CR_CTL3:
> 		/* This legacy MSR exists but isn't fully documented in current
>@@ -4337,8 +4411,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> 		msr_info->data = vcpu->arch.guest_fpu.xfd_err;
> 		break;
> #endif
>+	case MSR_IA32_U_CET:
>+	case MSR_IA32_S_CET:
>+	case MSR_KVM_GUEST_SSP:
>+	case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
>+		if (!kvm_cet_is_msr_accessible(vcpu, msr_info))
>+			return 1;
>+		if (msr == MSR_IA32_PL0_SSP || msr == MSR_IA32_PL1_SSP ||
>+		    msr == MSR_IA32_PL2_SSP) {
>+			msr_info->data =
>+				vcpu->arch.cet_s_ssp[msr - MSR_IA32_PL0_SSP];
>+		} else if (msr == MSR_IA32_U_CET || msr == MSR_IA32_PL3_SSP) {
>+			kvm_get_xsave_msr(msr_info);
>+		}

Again, for readability and clarity, how about:

	case MSR_IA32_U_CET:
	case MSR_IA32_PL3_SSP:
		if (!kvm_cet_is_msr_accessible(vcpu, msr_info))
			return 1;
		kvm_get_xsave_msr(msr_info);
		break;
	case MSR_IA32_PL0_SSP ... MSR_IA32_PL2_SSP:
		if (!kvm_cet_is_msr_accessible(vcpu, msr_info))
			return 1;
		msr_info->data = vcpu->arch.cet_s_ssp[msr - MSR_IA32_PL0_SSP];
		break;
	case MSR_IA32_S_CET:
	case MSR_KVM_GUEST_SSP:
		if (!kvm_cet_is_msr_accessible(vcpu, msr_info))
			return 1;
		/* Further handling in vendor code */
		break;

>+		break;
> 	default:
>-		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
>+		if (kvm_pmu_is_valid_msr(vcpu, msr))
> 			return kvm_pmu_get_msr(vcpu, msr_info);
> 
> 		/*
>@@ -4346,7 +4434,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> 		 * to-be-saved, even if an MSR isn't fully supported.
> 		 */
> 		if (msr_info->host_initiated &&
>-		    kvm_is_msr_to_save(msr_info->index)) {
>+		    kvm_is_msr_to_save(msr)) {
> 			msr_info->data = 0;
> 			break;
> 		}
>diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
>index c69fc027f5ec..3b79d6db2f83 100644
>--- a/arch/x86/kvm/x86.h
>+++ b/arch/x86/kvm/x86.h
>@@ -552,4 +552,22 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
> 			 unsigned int port, void *data,  unsigned int count,
> 			 int in);
> 
>+/*
>+ * Guest xstate MSRs have been loaded in __msr_io(), disable preemption before
>+ * access the MSRs to avoid MSR content corruption.
>+ */

I think it is better to describe what the function does prior to jumping into
details like where guest FPU is loaded.

/*
 * Lock and/or reload guest FPU and access xstate MSRs. For accesses initiated
 * by host, guest FPU is loaded in __msr_io(). For accesses initiated by guest,
 * guest FPU should have been loaded already.
 */
>+static inline void kvm_get_xsave_msr(struct msr_data *msr_info)
>+{
>+	kvm_fpu_get();
>+	rdmsrl(msr_info->index, msr_info->data);
>+	kvm_fpu_put();
>+}
>+
>+static inline void kvm_set_xsave_msr(struct msr_data *msr_info)
>+{
>+	kvm_fpu_get();
>+	wrmsrl(msr_info->index, msr_info->data);
>+	kvm_fpu_put();
>+}

Can you rename functions to kvm_get/set_xstate_msr() to align with the comment
and patch 6? And if there is no user outside x86.c, you can just put these two
functions right after the is_xstate_msr() added in patch 6.

>+
> #endif
>-- 
>2.27.0
>
  
Chao Gao Aug. 4, 2023, 8:28 a.m. UTC | #2
>+	case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
>+		if (!kvm_cet_is_msr_accessible(vcpu, msr_info))
>+			return 1;
>+		if (is_noncanonical_address(data, vcpu))
>+			return 1;
>+		if (!IS_ALIGNED(data, 4))
>+			return 1;

Why should MSR_IA32_INT_SSP_TAB be 4-byte aligned? I don't see
this requirement in the SDM.

IA32_INTERRUPT_SSP_TABLE_ADDR:
Linear address of a table of seven shadow stack pointers that are selected
in IA-32e mode using the IST index (when not 0) from the interrupt gate
descriptor. (R/W)
This MSR is not present on processors that do not support Intel 64
architecture. This field cannot represent a non-canonical address.
  
Sean Christopherson Aug. 4, 2023, 9:27 p.m. UTC | #3
On Fri, Aug 04, 2023, Chao Gao wrote:
> On Thu, Aug 03, 2023 at 12:27:24AM -0400, Yang Weijiang wrote:
> >Add emulation interface for CET MSR read and write.
> >The emulation code is split into common part and vendor specific
> >part, the former resides in x86.c to benefic different x86 CPU
> >vendors, the latter for VMX is implemented in this patch.
> >
> >Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
> >---
> > arch/x86/kvm/vmx/vmx.c |  27 +++++++++++
> > arch/x86/kvm/x86.c     | 104 +++++++++++++++++++++++++++++++++++++----
> > arch/x86/kvm/x86.h     |  18 +++++++
> > 3 files changed, 141 insertions(+), 8 deletions(-)
> >
> >diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> >index 6aa76124e81e..ccf750e79608 100644
> >--- a/arch/x86/kvm/vmx/vmx.c
> >+++ b/arch/x86/kvm/vmx/vmx.c
> >@@ -2095,6 +2095,18 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> > 		else
> > 			msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
> > 		break;
> >+	case MSR_IA32_S_CET:
> >+	case MSR_KVM_GUEST_SSP:
> >+	case MSR_IA32_INT_SSP_TAB:
> >+		if (kvm_get_msr_common(vcpu, msr_info))
> >+			return 1;
> >+		if (msr_info->index == MSR_KVM_GUEST_SSP)
> >+			msr_info->data = vmcs_readl(GUEST_SSP);
> >+		else if (msr_info->index == MSR_IA32_S_CET)
> >+			msr_info->data = vmcs_readl(GUEST_S_CET);
> >+		else if (msr_info->index == MSR_IA32_INT_SSP_TAB)
> >+			msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE);
> 
> This if-else-if suggests that they are focibly grouped together to just
> share the call of kvm_get_msr_common(). For readability, I think it is better
> to handle them separately.
> 
> e.g.,
> 	case MSR_IA32_S_CET:
> 		if (kvm_get_msr_common(vcpu, msr_info))
> 			return 1;
> 		msr_info->data = vmcs_readl(GUEST_S_CET);
> 		break;
> 
> 	case MSR_KVM_GUEST_SSP:
> 		if (kvm_get_msr_common(vcpu, msr_info))
> 			return 1;
> 		msr_info->data = vmcs_readl(GUEST_SSP);
> 		break;

Actually, we can do even better.  We have an existing framework for these types
of prechecks, I just completely forgot about it :-(  (my "look at PAT" was a bad
suggestion).

Handle the checks in __kvm_set_msr() and __kvm_get_msr(), i.e. *before* calling
into vendor code.  Then vendor code doesn't need to make weird callbacks.

> > int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> > {
> > 	u32 msr = msr_info->index;
> >@@ -3981,6 +4014,45 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> > 		vcpu->arch.guest_fpu.xfd_err = data;
> > 		break;
> > #endif
> >+#define CET_EXCLUSIVE_BITS		(CET_SUPPRESS | CET_WAIT_ENDBR)
> >+#define CET_CTRL_RESERVED_BITS		GENMASK(9, 6)

Please use a single namespace for these #defines, e.g. CET_CTRL_* or maybe
CET_US_* for everything.

> >+#define CET_SHSTK_MASK_BITS		GENMASK(1, 0)
> >+#define CET_IBT_MASK_BITS		(GENMASK_ULL(5, 2) | \
> >+					 GENMASK_ULL(63, 10))
> >+#define CET_LEG_BITMAP_BASE(data)	((data) >> 12)

Bah, stupid SDM.  Please spell out "LEGACY", I thought "LEG" was short for "LEGAL"
since this looks a lot like a page shift, i.e. getting a pfn.

> >+static bool kvm_cet_is_msr_accessible(struct kvm_vcpu *vcpu,
> >+				      struct msr_data *msr)
> >+{
> >+	if (is_shadow_stack_msr(msr->index)) {
> >+		if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK))
> >+			return false;
> >+
> >+		if (msr->index == MSR_KVM_GUEST_SSP)
> >+			return msr->host_initiated;
> >+
> >+		return msr->host_initiated ||
> >+			guest_cpuid_has(vcpu, X86_FEATURE_SHSTK);
> >+	}
> >+
> >+	if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
> >+	    !kvm_cpu_cap_has(X86_FEATURE_IBT))
> >+		return false;
> >+
> >+	return msr->host_initiated ||
> >+		guest_cpuid_has(vcpu, X86_FEATURE_IBT) ||
> >+		guest_cpuid_has(vcpu, X86_FEATURE_SHSTK);

Similar to my suggestion for XSS, I think we drop the waiver for host_initiated
accesses, i.e. require the feature to be enabled and exposed to the guest, even
for the host.

> >diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> >index c69fc027f5ec..3b79d6db2f83 100644
> >--- a/arch/x86/kvm/x86.h
> >+++ b/arch/x86/kvm/x86.h
> >@@ -552,4 +552,22 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
> > 			 unsigned int port, void *data,  unsigned int count,
> > 			 int in);
> > 
> >+/*
> >+ * Guest xstate MSRs have been loaded in __msr_io(), disable preemption before
> >+ * access the MSRs to avoid MSR content corruption.
> >+ */
> 
> I think it is better to describe what the function does prior to jumping into
> details like where guest FPU is loaded.
> 
> /*
>  * Lock and/or reload guest FPU and access xstate MSRs. For accesses initiated
>  * by host, guest FPU is loaded in __msr_io(). For accesses initiated by guest,
>  * guest FPU should have been loaded already.
>  */
> >+static inline void kvm_get_xsave_msr(struct msr_data *msr_info)
> >+{
> >+	kvm_fpu_get();
> >+	rdmsrl(msr_info->index, msr_info->data);
> >+	kvm_fpu_put();
> >+}
> >+
> >+static inline void kvm_set_xsave_msr(struct msr_data *msr_info)
> >+{
> >+	kvm_fpu_get();
> >+	wrmsrl(msr_info->index, msr_info->data);
> >+	kvm_fpu_put();
> >+}
> 
> Can you rename functions to kvm_get/set_xstate_msr() to align with the comment
> and patch 6? And if there is no user outside x86.c, you can just put these two
> functions right after the is_xstate_msr() added in patch 6.

+1.  These should also assert that (a) guest FPU state is loaded and (b) the MSR
is passed through to the guest.  I might be ok dropping (b) if both VMX and SVM
passthrough all MSRs if they're exposed to the guest, i.e. not lazily passed
through.

Sans any changes to kvm_{g,s}et_xsave_msr(), I think this?  (completely untested)


---
 arch/x86/kvm/vmx/vmx.c |  34 +++-------
 arch/x86/kvm/x86.c     | 151 +++++++++++++++--------------------------
 2 files changed, 64 insertions(+), 121 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 491039aeb61b..1211eb469d06 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2100,16 +2100,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
 		break;
 	case MSR_IA32_S_CET:
+		msr_info->data = vmcs_readl(GUEST_S_CET);
+		break;
 	case MSR_KVM_GUEST_SSP:
+		msr_info->data = vmcs_readl(GUEST_SSP);
+		break;
 	case MSR_IA32_INT_SSP_TAB:
-		if (kvm_get_msr_common(vcpu, msr_info))
-			return 1;
-		if (msr_info->index == MSR_KVM_GUEST_SSP)
-			msr_info->data = vmcs_readl(GUEST_SSP);
-		else if (msr_info->index == MSR_IA32_S_CET)
-			msr_info->data = vmcs_readl(GUEST_S_CET);
-		else if (msr_info->index == MSR_IA32_INT_SSP_TAB)
-			msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE);
+		msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE);
 		break;
 	case MSR_IA32_DEBUGCTLMSR:
 		msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
@@ -2432,25 +2429,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		else
 			vmx->pt_desc.guest.addr_a[index / 2] = data;
 		break;
-	case MSR_IA32_PL0_SSP ... MSR_IA32_PL2_SSP:
-		if (kvm_set_msr_common(vcpu, msr_info))
-			return 1;
-		if (data) {
-			vmx_disable_write_intercept_sss_msr(vcpu);
-			wrmsrl(msr_index, data);
-		}
-		break;
 	case MSR_IA32_S_CET:
+		vmcs_writel(GUEST_S_CET, data);
+		break;
 	case MSR_KVM_GUEST_SSP:
+		vmcs_writel(GUEST_SSP, data);
+		break;
 	case MSR_IA32_INT_SSP_TAB:
-		if (kvm_set_msr_common(vcpu, msr_info))
-			return 1;
-		if (msr_index == MSR_KVM_GUEST_SSP)
-			vmcs_writel(GUEST_SSP, data);
-		else if (msr_index == MSR_IA32_S_CET)
-			vmcs_writel(GUEST_S_CET, data);
-		else if (msr_index == MSR_IA32_INT_SSP_TAB)
-			vmcs_writel(GUEST_INTR_SSP_TABLE, data);
+		vmcs_writel(GUEST_INTR_SSP_TABLE, data);
 		break;
 	case MSR_IA32_PERF_CAPABILITIES:
 		if (data && !vcpu_to_pmu(vcpu)->version)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7385fc25a987..75e6de7c9268 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1838,6 +1838,11 @@ bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
 }
 EXPORT_SYMBOL_GPL(kvm_msr_allowed);
 
+#define CET_US_RESERVED_BITS		GENMASK(9, 6)
+#define CET_US_SHSTK_MASK_BITS		GENMASK(1, 0)
+#define CET_US_IBT_MASK_BITS		(GENMASK_ULL(5, 2) | GENMASK_ULL(63, 10))
+#define CET_US_LEGACY_BITMAP_BASE(data)	((data) >> 12)
+
 /*
  * Write @data into the MSR specified by @index.  Select MSR specific fault
  * checks are bypassed if @host_initiated is %true.
@@ -1897,6 +1902,35 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
 
 		data = (u32)data;
 		break;
+	case MSR_IA32_U_CET:
+	case MSR_IA32_S_CET:
+		if (!guest_can_use(vcpu, X86_FEATURE_SHSTK) &&
+		    !guest_can_use(vcpu, X86_FEATURE_IBT))
+		    	return 1;
+		if (data & CET_US_RESERVED_BITS)
+			return 1;
+		if (!guest_can_use(vcpu, X86_FEATURE_SHSTK) &&
+		    (data & CET_US_SHSTK_MASK_BITS))
+			return 1;
+		if (!guest_can_use(vcpu, X86_FEATURE_IBT) &&
+		    (data & CET_US_IBT_MASK_BITS))
+			return 1;
+		if (!IS_ALIGNED(CET_US_LEGACY_BITMAP_BASE(data), 4))
+			return 1;
+
+		/* IBT can be suppressed iff the TRACKER isn't WAIT_ENDBR. */
+		if ((data & CET_SUPPRESS) && (data & CET_WAIT_ENDBR))
+			return 1;
+		break;
+	case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
+	case MSR_KVM_GUEST_SSP:
+		if (!guest_can_use(vcpu, X86_FEATURE_SHSTK))
+			return 1;
+		if (is_noncanonical_address(data, vcpu))
+			return 1;
+		if (!IS_ALIGNED(data, 4))
+			return 1;
+		break;
 	}
 
 	msr.data = data;
@@ -1940,6 +1974,17 @@ static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
 		    !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
 			return 1;
 		break;
+	case MSR_IA32_U_CET:
+	case MSR_IA32_S_CET:
+		if (!guest_can_use(vcpu, X86_FEATURE_IBT) &&
+		    !guest_can_use(vcpu, X86_FEATURE_SHSTK))
+			return 1;
+		break;
+	case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
+	case MSR_KVM_GUEST_SSP:
+		if (!guest_can_use(vcpu, X86_FEATURE_SHSTK))
+			return 1;
+		break;
 	}
 
 	msr.index = index;
@@ -3640,47 +3685,6 @@ static bool kvm_is_msr_to_save(u32 msr_index)
 	return false;
 }
 
-static inline bool is_shadow_stack_msr(u32 msr)
-{
-	return msr == MSR_IA32_PL0_SSP ||
-		msr == MSR_IA32_PL1_SSP ||
-		msr == MSR_IA32_PL2_SSP ||
-		msr == MSR_IA32_PL3_SSP ||
-		msr == MSR_IA32_INT_SSP_TAB ||
-		msr == MSR_KVM_GUEST_SSP;
-}
-
-static bool kvm_cet_is_msr_accessible(struct kvm_vcpu *vcpu,
-				      struct msr_data *msr)
-{
-	if (is_shadow_stack_msr(msr->index)) {
-		if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK))
-			return false;
-
-		/*
-		 * This MSR is synthesized mainly for userspace access during
-		 * Live Migration, it also can be accessed in SMM mode by VMM.
-		 * Guest is not allowed to access this MSR.
-		 */
-		if (msr->index == MSR_KVM_GUEST_SSP) {
-			if (IS_ENABLED(CONFIG_X86_64) && is_smm(vcpu))
-				return true;
-
-			return msr->host_initiated;
-		}
-
-		return msr->host_initiated ||
-			guest_cpuid_has(vcpu, X86_FEATURE_SHSTK);
-	}
-
-	if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
-	    !kvm_cpu_cap_has(X86_FEATURE_IBT))
-		return false;
-
-	return msr->host_initiated ||
-		guest_cpuid_has(vcpu, X86_FEATURE_IBT) ||
-		guest_cpuid_has(vcpu, X86_FEATURE_SHSTK);
-}
 
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
@@ -4036,46 +4040,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		vcpu->arch.guest_fpu.xfd_err = data;
 		break;
 #endif
-#define CET_EXCLUSIVE_BITS		(CET_SUPPRESS | CET_WAIT_ENDBR)
-#define CET_CTRL_RESERVED_BITS		GENMASK(9, 6)
-#define CET_SHSTK_MASK_BITS		GENMASK(1, 0)
-#define CET_IBT_MASK_BITS		(GENMASK_ULL(5, 2) | \
-					 GENMASK_ULL(63, 10))
-#define CET_LEG_BITMAP_BASE(data)	((data) >> 12)
 	case MSR_IA32_U_CET:
-	case MSR_IA32_S_CET:
-		if (!kvm_cet_is_msr_accessible(vcpu, msr_info))
-			return 1;
-		if (!!(data & CET_CTRL_RESERVED_BITS))
-			return 1;
-		if (!guest_can_use(vcpu, X86_FEATURE_SHSTK) &&
-		    (data & CET_SHSTK_MASK_BITS))
-			return 1;
-		if (!guest_can_use(vcpu, X86_FEATURE_IBT) &&
-		    (data & CET_IBT_MASK_BITS))
-			return 1;
-		if (!IS_ALIGNED(CET_LEG_BITMAP_BASE(data), 4) ||
-		    (data & CET_EXCLUSIVE_BITS) == CET_EXCLUSIVE_BITS)
-			return 1;
-		if (msr == MSR_IA32_U_CET)
-			kvm_set_xsave_msr(msr_info);
-		break;
-	case MSR_KVM_GUEST_SSP:
-	case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
-		if (!kvm_cet_is_msr_accessible(vcpu, msr_info))
-			return 1;
-		if (is_noncanonical_address(data, vcpu))
-			return 1;
-		if (!IS_ALIGNED(data, 4))
-			return 1;
-		if (msr == MSR_IA32_PL0_SSP || msr == MSR_IA32_PL1_SSP ||
-		    msr == MSR_IA32_PL2_SSP) {
-			vcpu->arch.cet_s_ssp[msr - MSR_IA32_PL0_SSP] = data;
-			if (!vcpu->arch.cet_sss_active && data)
-				vcpu->arch.cet_sss_active = true;
-		} else if (msr == MSR_IA32_PL3_SSP) {
-			kvm_set_xsave_msr(msr_info);
-		}
+	case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
+		kvm_set_xsave_msr(msr_info);
 		break;
 	default:
 		if (kvm_pmu_is_valid_msr(vcpu, msr))
@@ -4436,17 +4403,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		break;
 #endif
 	case MSR_IA32_U_CET:
-	case MSR_IA32_S_CET:
-	case MSR_KVM_GUEST_SSP:
-	case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
-		if (!kvm_cet_is_msr_accessible(vcpu, msr_info))
-			return 1;
-		if (msr == MSR_IA32_PL0_SSP || msr == MSR_IA32_PL1_SSP ||
-		    msr == MSR_IA32_PL2_SSP) {
-			msr_info->data = vcpu->arch.cet_s_ssp[msr - MSR_IA32_PL0_SSP];
-		} else if (msr == MSR_IA32_U_CET || msr == MSR_IA32_PL3_SSP) {
-			kvm_get_xsave_msr(msr_info);
-		}
+	case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
+		kvm_get_xsave_msr(msr_info);
 		break;
 	default:
 		if (kvm_pmu_is_valid_msr(vcpu, msr))
@@ -7330,9 +7288,13 @@ static void kvm_probe_msr_to_save(u32 msr_index)
 		break;
 	case MSR_IA32_U_CET:
 	case MSR_IA32_S_CET:
+		if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
+		    !kvm_cpu_cap_has(X86_FEATURE_IBT))
+			return;
+		break;
 	case MSR_KVM_GUEST_SSP:
 	case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
-		if (!kvm_is_cet_supported())
+		if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK))
 			return;
 		break;
 	default:
@@ -9664,13 +9626,8 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 		kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
 	}
 	if (boot_cpu_has(X86_FEATURE_XSAVES)) {
-		u32 eax, ebx, ecx, edx;
-
-		cpuid_count(0xd, 1, &eax, &ebx, &ecx, &edx);
 		rdmsrl(MSR_IA32_XSS, host_xss);
 		kvm_caps.supported_xss = host_xss & KVM_SUPPORTED_XSS;
-		if (ecx & XFEATURE_MASK_CET_KERNEL)
-			kvm_caps.supported_xss |= XFEATURE_MASK_CET_KERNEL;
 	}
 
 	rdmsrl_safe(MSR_EFER, &host_efer);

base-commit: efb9177acd7a4df5883b844e1ec9c69ef0899c9c
--
  
Paolo Bonzini Aug. 4, 2023, 9:40 p.m. UTC | #4
On 8/3/23 06:27, Yang Weijiang wrote:
> +		if (msr_info->index == MSR_KVM_GUEST_SSP)
> +			msr_info->data = vmcs_readl(GUEST_SSP);

Accesses to MSR_KVM_(GUEST_)SSP must be rejected unless host-initiated.

Paolo
  
Paolo Bonzini Aug. 4, 2023, 9:45 p.m. UTC | #5
On 8/4/23 23:27, Sean Christopherson wrote:
>>> +
>>> +	if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
>>> +	    !kvm_cpu_cap_has(X86_FEATURE_IBT))
>>> +		return false;
>>> +
>>> +	return msr->host_initiated ||
>>> +		guest_cpuid_has(vcpu, X86_FEATURE_IBT) ||
>>> +		guest_cpuid_has(vcpu, X86_FEATURE_SHSTK);
>
> Similar to my suggestsion for XSS, I think we drop the waiver for host_initiated
> accesses, i.e. require the feature to be enabled and exposed to the guest, even
> for the host.

No, please don't.  Allowing host-initiated accesses is what makes it 
possible to take the list of MSR indices and pass it blindly to 
KVM_GET_MSR and KVM_SET_MSR.  This should be documented, will send a patch.

Paolo
  
Sean Christopherson Aug. 4, 2023, 10:21 p.m. UTC | #6
On Fri, Aug 04, 2023, Paolo Bonzini wrote:
> On 8/4/23 23:27, Sean Christopherson wrote:
> > > > +
> > > > +	if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
> > > > +	    !kvm_cpu_cap_has(X86_FEATURE_IBT))
> > > > +		return false;
> > > > +
> > > > +	return msr->host_initiated ||
> > > > +		guest_cpuid_has(vcpu, X86_FEATURE_IBT) ||
> > > > +		guest_cpuid_has(vcpu, X86_FEATURE_SHSTK);
> > 
> > Similar to my suggestsion for XSS, I think we drop the waiver for host_initiated
> > accesses, i.e. require the feature to be enabled and exposed to the guest, even
> > for the host.
> 
> No, please don't.  Allowing host-initiated accesses is what makes it
> possible to take the list of MSR indices and pass it blindly to KVM_GET_MSR
> and KVM_SET_MSR.

I don't see how that can work today.  Oooh, the MSRs that don't exempt host_initiated
are added to the list of MSRs to save/restore, i.e. KVM "silently" supports
MSR_AMD64_OSVW_ID_LENGTH and MSR_AMD64_OSVW_STATUS.

And guest_pv_has() returns true unless userspace has opted in to enforcement.

Sad panda.

That means we need to figure out a solution for KVM stuffing GUEST_SSP on RSM,
which is a "host" write but a guest controlled value.
  
Yang, Weijiang Aug. 6, 2023, 8:44 a.m. UTC | #7
On 8/5/2023 5:27 AM, Sean Christopherson wrote:
> On Fri, Aug 04, 2023, Chao Gao wrote:
>> On Thu, Aug 03, 2023 at 12:27:24AM -0400, Yang Weijiang wrote:
>>> Add emulation interface for CET MSR read and write.
>>> The emulation code is split into common part and vendor specific
>>> part, the former resides in x86.c to benefic different x86 CPU
>>> vendors, the latter for VMX is implemented in this patch.
>>>
>>> Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
>>> ---
>>> arch/x86/kvm/vmx/vmx.c |  27 +++++++++++
>>> arch/x86/kvm/x86.c     | 104 +++++++++++++++++++++++++++++++++++++----
>>> arch/x86/kvm/x86.h     |  18 +++++++
>>> 3 files changed, 141 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>>> index 6aa76124e81e..ccf750e79608 100644
>>> --- a/arch/x86/kvm/vmx/vmx.c
>>> +++ b/arch/x86/kvm/vmx/vmx.c
>>> @@ -2095,6 +2095,18 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>>> 		else
>>> 			msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
>>> 		break;
>>> +	case MSR_IA32_S_CET:
>>> +	case MSR_KVM_GUEST_SSP:
>>> +	case MSR_IA32_INT_SSP_TAB:
>>> +		if (kvm_get_msr_common(vcpu, msr_info))
>>> +			return 1;
>>> +		if (msr_info->index == MSR_KVM_GUEST_SSP)
>>> +			msr_info->data = vmcs_readl(GUEST_SSP);
>>> +		else if (msr_info->index == MSR_IA32_S_CET)
>>> +			msr_info->data = vmcs_readl(GUEST_S_CET);
>>> +		else if (msr_info->index == MSR_IA32_INT_SSP_TAB)
>>> +			msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE);
>> This if-else-if suggests that they are focibly grouped together to just
>> share the call of kvm_get_msr_common(). For readability, I think it is better
>> to handle them separately.
>>
>> e.g.,
>> 	case MSR_IA32_S_CET:
>> 		if (kvm_get_msr_common(vcpu, msr_info))
>> 			return 1;
>> 		msr_info->data = vmcs_readl(GUEST_S_CET);
>> 		break;
>>
>> 	case MSR_KVM_GUEST_SSP:
>> 		if (kvm_get_msr_common(vcpu, msr_info))
>> 			return 1;
>> 		msr_info->data = vmcs_readl(GUEST_SSP);
>> 		break;
> Actually, we can do even better.  We have an existing framework for these types
> of prechecks, I just completely forgot about it :-(  (my "look at PAT" was a bad
> suggestion).
>
> Handle the checks in __kvm_set_msr() and __kvm_get_msr(), i.e. *before* calling
> into vendor code.  Then vendor code doesn't need to make weird callbacks.
I see, will change it, thank you!
>>> int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>>> {
>>> 	u32 msr = msr_info->index;
>>> @@ -3981,6 +4014,45 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>>> 		vcpu->arch.guest_fpu.xfd_err = data;
>>> 		break;
>>> #endif
>>> +#define CET_EXCLUSIVE_BITS		(CET_SUPPRESS | CET_WAIT_ENDBR)
>>> +#define CET_CTRL_RESERVED_BITS		GENMASK(9, 6)
> Please use a single namespace for these #defines, e.g. CET_CTRL_* or maybe
> CET_US_* for everything.
OK.
>>> +#define CET_SHSTK_MASK_BITS		GENMASK(1, 0)
>>> +#define CET_IBT_MASK_BITS		(GENMASK_ULL(5, 2) | \
>>> +					 GENMASK_ULL(63, 10))
>>> +#define CET_LEG_BITMAP_BASE(data)	((data) >> 12)
> Bah, stupid SDM.  Please spell out "LEGACY", I though "LEG" was short for "LEGAL"
> since this looks a lot like a page shift, i.e. getting a pfn.
Sure :-)
>>> +static bool kvm_cet_is_msr_accessible(struct kvm_vcpu *vcpu,
>>> +				      struct msr_data *msr)
>>> +{
>>> +	if (is_shadow_stack_msr(msr->index)) {
>>> +		if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK))
>>> +			return false;
>>> +
>>> +		if (msr->index == MSR_KVM_GUEST_SSP)
>>> +			return msr->host_initiated;
>>> +
>>> +		return msr->host_initiated ||
>>> +			guest_cpuid_has(vcpu, X86_FEATURE_SHSTK);
>>> +	}
>>> +
>>> +	if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
>>> +	    !kvm_cpu_cap_has(X86_FEATURE_IBT))
>>> +		return false;
>>> +
>>> +	return msr->host_initiated ||
>>> +		guest_cpuid_has(vcpu, X86_FEATURE_IBT) ||
>>> +		guest_cpuid_has(vcpu, X86_FEATURE_SHSTK);
> Similar to my suggestsion for XSS, I think we drop the waiver for host_initiated
> accesses, i.e. require the feature to be enabled and exposed to the guest, even
> for the host.
I saw Paolo shared a different opinion on this, so I'll hold off for a while...
>>> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
>>> index c69fc027f5ec..3b79d6db2f83 100644
>>> --- a/arch/x86/kvm/x86.h
>>> +++ b/arch/x86/kvm/x86.h
>>> @@ -552,4 +552,22 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
>>> 			 unsigned int port, void *data,  unsigned int count,
>>> 			 int in);
>>>
>>> +/*
>>> + * Guest xstate MSRs have been loaded in __msr_io(), disable preemption before
>>> + * access the MSRs to avoid MSR content corruption.
>>> + */
>> I think it is better to describe what the function does prior to jumping into
>> details like where guest FPU is loaded.
OK, will do it, thanks!
>> /*
>>   * Lock and/or reload guest FPU and access xstate MSRs. For accesses initiated
>>   * by host, guest FPU is loaded in __msr_io(). For accesses initiated by guest,
>>   * guest FPU should have been loaded already.
>>   */
>>> +static inline void kvm_get_xsave_msr(struct msr_data *msr_info)
>>> +{
>>> +	kvm_fpu_get();
>>> +	rdmsrl(msr_info->index, msr_info->data);
>>> +	kvm_fpu_put();
>>> +}
>>> +
>>> +static inline void kvm_set_xsave_msr(struct msr_data *msr_info)
>>> +{
>>> +	kvm_fpu_get();
>>> +	wrmsrl(msr_info->index, msr_info->data);
>>> +	kvm_fpu_put();
>>> +}
>> Can you rename functions to kvm_get/set_xstate_msr() to align with the comment
>> and patch 6? And if there is no user outside x86.c, you can just put these two
>> functions right after the is_xstate_msr() added in patch 6.
OK, maybe I added the helpers in this patch due to the compilation error "function is defined but not used".
> +1.  These should also assert that (a) guest FPU state is loaded and
Do you mean something like this:
WARN_ON_ONCE(!vcpu->arch.guest_fpu->in_use) or  KVM_BUG_ON()
added in the helpers?
> (b) the MSR
> is passed through to the guest.  I might be ok dropping (b) if both VMX and SVM
> passthrough all MSRs if they're exposed to the guest, i.e. not lazily passed
> through.
I'm OK with adding the assert if eventually all the CET MSRs are passed through directly.
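
Something like below, perhaps (untested; the helpers would need to take vcpu for
the assert, and using fpstate->in_use as the "guest FPU is loaded" indicator is
my assumption):

static inline void kvm_get_xstate_msr(struct kvm_vcpu *vcpu,
				      struct msr_data *msr_info)
{
	KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm);
	kvm_fpu_get();
	rdmsrl(msr_info->index, msr_info->data);
	kvm_fpu_put();
}

static inline void kvm_set_xstate_msr(struct kvm_vcpu *vcpu,
				      struct msr_data *msr_info)
{
	KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm);
	kvm_fpu_get();
	wrmsrl(msr_info->index, msr_info->data);
	kvm_fpu_put();
}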
> Sans any changes to kvm_{g,s}et_xsave_msr(), I think this?  (completely untested)
>
>
> ---
>   arch/x86/kvm/vmx/vmx.c |  34 +++-------
>   arch/x86/kvm/x86.c     | 151 +++++++++++++++--------------------------
>   2 files changed, 64 insertions(+), 121 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 491039aeb61b..1211eb469d06 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -2100,16 +2100,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>   			msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
>   		break;
>   	case MSR_IA32_S_CET:
> +		msr_info->data = vmcs_readl(GUEST_S_CET);
> +		break;
>   	case MSR_KVM_GUEST_SSP:
> +		msr_info->data = vmcs_readl(GUEST_SSP);
> +		break;
>   	case MSR_IA32_INT_SSP_TAB:
> -		if (kvm_get_msr_common(vcpu, msr_info))
> -			return 1;
> -		if (msr_info->index == MSR_KVM_GUEST_SSP)
> -			msr_info->data = vmcs_readl(GUEST_SSP);
> -		else if (msr_info->index == MSR_IA32_S_CET)
> -			msr_info->data = vmcs_readl(GUEST_S_CET);
> -		else if (msr_info->index == MSR_IA32_INT_SSP_TAB)
> -			msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE);
> +		msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE);
>   		break;
>   	case MSR_IA32_DEBUGCTLMSR:
>   		msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
> @@ -2432,25 +2429,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>   		else
>   			vmx->pt_desc.guest.addr_a[index / 2] = data;
>   		break;
> -	case MSR_IA32_PL0_SSP ... MSR_IA32_PL2_SSP:
> -		if (kvm_set_msr_common(vcpu, msr_info))
> -			return 1;
> -		if (data) {
> -			vmx_disable_write_intercept_sss_msr(vcpu);
> -			wrmsrl(msr_index, data);
> -		}
> -		break;
>   	case MSR_IA32_S_CET:
> +		vmcs_writel(GUEST_S_CET, data);
> +		break;
>   	case MSR_KVM_GUEST_SSP:
> +		vmcs_writel(GUEST_SSP, data);
> +		break;
>   	case MSR_IA32_INT_SSP_TAB:
> -		if (kvm_set_msr_common(vcpu, msr_info))
> -			return 1;
> -		if (msr_index == MSR_KVM_GUEST_SSP)
> -			vmcs_writel(GUEST_SSP, data);
> -		else if (msr_index == MSR_IA32_S_CET)
> -			vmcs_writel(GUEST_S_CET, data);
> -		else if (msr_index == MSR_IA32_INT_SSP_TAB)
> -			vmcs_writel(GUEST_INTR_SSP_TABLE, data);
> +		vmcs_writel(GUEST_INTR_SSP_TABLE, data);
>   		break;
>   	case MSR_IA32_PERF_CAPABILITIES:
>   		if (data && !vcpu_to_pmu(vcpu)->version)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 7385fc25a987..75e6de7c9268 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1838,6 +1838,11 @@ bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
>   }
>   EXPORT_SYMBOL_GPL(kvm_msr_allowed);
>   
> +#define CET_US_RESERVED_BITS		GENMASK(9, 6)
> +#define CET_US_SHSTK_MASK_BITS		GENMASK(1, 0)
> +#define CET_US_IBT_MASK_BITS		(GENMASK_ULL(5, 2) | GENMASK_ULL(63, 10))
> +#define CET_US_LEGACY_BITMAP_BASE(data)	((data) >> 12)
> +
>   /*
>    * Write @data into the MSR specified by @index.  Select MSR specific fault
>    * checks are bypassed if @host_initiated is %true.
> @@ -1897,6 +1902,35 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
>   
>   		data = (u32)data;
>   		break;
> +	case MSR_IA32_U_CET:
> +	case MSR_IA32_S_CET:
> +		if (!guest_can_use(vcpu, X86_FEATURE_SHSTK) &&
> +		    !guest_can_use(vcpu, X86_FEATURE_IBT))
> +		    	return 1;
> +		if (data & CET_US_RESERVED_BITS)
> +			return 1;
> +		if (!guest_can_use(vcpu, X86_FEATURE_SHSTK) &&
> +		    (data & CET_US_SHSTK_MASK_BITS))
> +			return 1;
> +		if (!guest_can_use(vcpu, X86_FEATURE_IBT) &&
> +		    (data & CET_US_IBT_MASK_BITS))
> +			return 1;
> +		if (!IS_ALIGNED(CET_US_LEGACY_BITMAP_BASE(data), 4))
> +			return 1;
> +
> +		/* IBT can be suppressed iff the TRACKER isn't WAIT_ENDBR. */
> +		if ((data & CET_SUPPRESS) && (data & CET_WAIT_ENDBR))
> +			return 1;
> +		break;
> +	case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
> +	case MSR_KVM_GUEST_SSP:
> +		if (!guest_can_use(vcpu, X86_FEATURE_SHSTK))
> +			return 1;
> +		if (is_noncanonical_address(data, vcpu))
> +			return 1;
> +		if (!IS_ALIGNED(data, 4))
> +			return 1;
> +		break;
>   	}
>   
>   	msr.data = data;
> @@ -1940,6 +1974,17 @@ static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
>   		    !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
>   			return 1;
>   		break;
> +	case MSR_IA32_U_CET:
> +	case MSR_IA32_S_CET:
> +		if (!guest_can_use(vcpu, X86_FEATURE_IBT) &&
> +		    !guest_can_use(vcpu, X86_FEATURE_SHSTK))
> +			return 1;
> +		break;
> +	case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
> +	case MSR_KVM_GUEST_SSP:
> +		if (!guest_can_use(vcpu, X86_FEATURE_SHSTK))
> +			return 1;
> +		break;
>   	}
>   
>   	msr.index = index;
> @@ -3640,47 +3685,6 @@ static bool kvm_is_msr_to_save(u32 msr_index)
>   	return false;
>   }
>   
> -static inline bool is_shadow_stack_msr(u32 msr)
> -{
> -	return msr == MSR_IA32_PL0_SSP ||
> -		msr == MSR_IA32_PL1_SSP ||
> -		msr == MSR_IA32_PL2_SSP ||
> -		msr == MSR_IA32_PL3_SSP ||
> -		msr == MSR_IA32_INT_SSP_TAB ||
> -		msr == MSR_KVM_GUEST_SSP;
> -}
> -
> -static bool kvm_cet_is_msr_accessible(struct kvm_vcpu *vcpu,
> -				      struct msr_data *msr)
> -{
> -	if (is_shadow_stack_msr(msr->index)) {
> -		if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK))
> -			return false;
> -
> -		/*
> -		 * This MSR is synthesized mainly for userspace access during
> -		 * Live Migration, it also can be accessed in SMM mode by VMM.
> -		 * Guest is not allowed to access this MSR.
> -		 */
> -		if (msr->index == MSR_KVM_GUEST_SSP) {
> -			if (IS_ENABLED(CONFIG_X86_64) && is_smm(vcpu))
> -				return true;
> -
> -			return msr->host_initiated;
> -		}
> -
> -		return msr->host_initiated ||
> -			guest_cpuid_has(vcpu, X86_FEATURE_SHSTK);
> -	}
> -
> -	if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
> -	    !kvm_cpu_cap_has(X86_FEATURE_IBT))
> -		return false;
> -
> -	return msr->host_initiated ||
> -		guest_cpuid_has(vcpu, X86_FEATURE_IBT) ||
> -		guest_cpuid_has(vcpu, X86_FEATURE_SHSTK);
> -}
>   
>   int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>   {
> @@ -4036,46 +4040,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>   		vcpu->arch.guest_fpu.xfd_err = data;
>   		break;
>   #endif
> -#define CET_EXCLUSIVE_BITS		(CET_SUPPRESS | CET_WAIT_ENDBR)
> -#define CET_CTRL_RESERVED_BITS		GENMASK(9, 6)
> -#define CET_SHSTK_MASK_BITS		GENMASK(1, 0)
> -#define CET_IBT_MASK_BITS		(GENMASK_ULL(5, 2) | \
> -					 GENMASK_ULL(63, 10))
> -#define CET_LEG_BITMAP_BASE(data)	((data) >> 12)
>   	case MSR_IA32_U_CET:
> -	case MSR_IA32_S_CET:
> -		if (!kvm_cet_is_msr_accessible(vcpu, msr_info))
> -			return 1;
> -		if (!!(data & CET_CTRL_RESERVED_BITS))
> -			return 1;
> -		if (!guest_can_use(vcpu, X86_FEATURE_SHSTK) &&
> -		    (data & CET_SHSTK_MASK_BITS))
> -			return 1;
> -		if (!guest_can_use(vcpu, X86_FEATURE_IBT) &&
> -		    (data & CET_IBT_MASK_BITS))
> -			return 1;
> -		if (!IS_ALIGNED(CET_LEG_BITMAP_BASE(data), 4) ||
> -		    (data & CET_EXCLUSIVE_BITS) == CET_EXCLUSIVE_BITS)
> -			return 1;
> -		if (msr == MSR_IA32_U_CET)
> -			kvm_set_xsave_msr(msr_info);
> -		break;
> -	case MSR_KVM_GUEST_SSP:
> -	case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
> -		if (!kvm_cet_is_msr_accessible(vcpu, msr_info))
> -			return 1;
> -		if (is_noncanonical_address(data, vcpu))
> -			return 1;
> -		if (!IS_ALIGNED(data, 4))
> -			return 1;
> -		if (msr == MSR_IA32_PL0_SSP || msr == MSR_IA32_PL1_SSP ||
> -		    msr == MSR_IA32_PL2_SSP) {
> -			vcpu->arch.cet_s_ssp[msr - MSR_IA32_PL0_SSP] = data;
> -			if (!vcpu->arch.cet_sss_active && data)
> -				vcpu->arch.cet_sss_active = true;
> -		} else if (msr == MSR_IA32_PL3_SSP) {
> -			kvm_set_xsave_msr(msr_info);
> -		}
> +	case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
> +		kvm_set_xsave_msr(msr_info);
>   		break;
>   	default:
>   		if (kvm_pmu_is_valid_msr(vcpu, msr))
> @@ -4436,17 +4403,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>   		break;
>   #endif
>   	case MSR_IA32_U_CET:
> -	case MSR_IA32_S_CET:
> -	case MSR_KVM_GUEST_SSP:
> -	case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
> -		if (!kvm_cet_is_msr_accessible(vcpu, msr_info))
> -			return 1;
> -		if (msr == MSR_IA32_PL0_SSP || msr == MSR_IA32_PL1_SSP ||
> -		    msr == MSR_IA32_PL2_SSP) {
> -			msr_info->data = vcpu->arch.cet_s_ssp[msr - MSR_IA32_PL0_SSP];
> -		} else if (msr == MSR_IA32_U_CET || msr == MSR_IA32_PL3_SSP) {
> -			kvm_get_xsave_msr(msr_info);
> -		}
> +	case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
> +		kvm_get_xsave_msr(msr_info);
>   		break;
>   	default:
>   		if (kvm_pmu_is_valid_msr(vcpu, msr))
> @@ -7330,9 +7288,13 @@ static void kvm_probe_msr_to_save(u32 msr_index)
>   		break;
>   	case MSR_IA32_U_CET:
>   	case MSR_IA32_S_CET:
> +		if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
> +		    !kvm_cpu_cap_has(X86_FEATURE_IBT))
> +			return;
> +		break;
>   	case MSR_KVM_GUEST_SSP:
>   	case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
> -		if (!kvm_is_cet_supported())
> +		if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK))
>   			return;
>   		break;
>   	default:
> @@ -9664,13 +9626,8 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
>   		kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
>   	}
>   	if (boot_cpu_has(X86_FEATURE_XSAVES)) {
> -		u32 eax, ebx, ecx, edx;
> -
> -		cpuid_count(0xd, 1, &eax, &ebx, &ecx, &edx);
>   		rdmsrl(MSR_IA32_XSS, host_xss);
>   		kvm_caps.supported_xss = host_xss & KVM_SUPPORTED_XSS;
> -		if (ecx & XFEATURE_MASK_CET_KERNEL)
> -			kvm_caps.supported_xss |= XFEATURE_MASK_CET_KERNEL;
>   	}
>   
>   	rdmsrl_safe(MSR_EFER, &host_efer);
>
> base-commit: efb9177acd7a4df5883b844e1ec9c69ef0899c9c
The code looks good to me except for the handling of MSR_KVM_GUEST_SSP:
non-host-initiated reads/writes should be prevented.
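
e.g., on top of the __kvm_set_msr() prechecks, something like this (untested;
__kvm_get_msr() would need the same host_initiated check, minus the
canonicality part):

	case MSR_KVM_GUEST_SSP:
		/*
		 * Synthesized MSR, intended for userspace (e.g. live
		 * migration); reject direct guest accesses.
		 */
		if (!host_initiated)
			return 1;
		fallthrough;
	case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
		if (!guest_can_use(vcpu, X86_FEATURE_SHSTK))
			return 1;
		if (is_noncanonical_address(data, vcpu))
			return 1;
		break;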
  
Paolo Bonzini Aug. 7, 2023, 7 a.m. UTC | #8
On 8/6/23 10:44, Yang, Weijiang wrote:
>> Similar to my suggestsion for XSS, I think we drop the waiver for 
>> host_initiated
>> accesses, i.e. require the feature to be enabled and exposed to the 
>> guest, even
>> for the host.
>
> I saw Paolo shares different opinion on this, so would hold on for a 
> while...

It's not *so* different: the host initiated access should be allowed, 
but it should only allow writing zero.  So, something like:

> +static bool kvm_cet_is_msr_accessible(struct kvm_vcpu *vcpu,
> +                      struct msr_data *msr)
> +{

bool host_msr_reset =
	msr->host_initiated && msr->data == 0;

and then below you use host_msr_reset instead of msr->host_initiated.

> +        if (msr->index == MSR_KVM_GUEST_SSP)
> +            return msr->host_initiated;
> +
> +        return msr->host_initiated ||
> +            guest_cpuid_has(vcpu, X86_FEATURE_SHSTK);

This can be unified like this:

return
	(host_msr_reset || guest_cpuid_has(vcpu, X86_FEATURE_SHSTK)) &&
	(msr->index != MSR_KVM_GUEST_SSP || msr->host_initiated);

> +    }
> +
> +    if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
> +        !kvm_cpu_cap_has(X86_FEATURE_IBT))
> +        return false;
> +
> +    return msr->host_initiated ||
> +        guest_cpuid_has(vcpu, X86_FEATURE_IBT) ||
> +        guest_cpuid_has(vcpu, X86_FEATURE_SHSTK); 

while this can simply use host_msr_reset.
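
Stitched together, the helper would look roughly like this (untested, only to
illustrate the suggestion):

static bool kvm_cet_is_msr_accessible(struct kvm_vcpu *vcpu,
				      struct msr_data *msr)
{
	bool host_msr_reset = msr->host_initiated && msr->data == 0;

	if (is_shadow_stack_msr(msr->index)) {
		if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK))
			return false;

		return (host_msr_reset ||
			guest_cpuid_has(vcpu, X86_FEATURE_SHSTK)) &&
		       (msr->index != MSR_KVM_GUEST_SSP ||
			msr->host_initiated);
	}

	if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
	    !kvm_cpu_cap_has(X86_FEATURE_IBT))
		return false;

	return host_msr_reset ||
		guest_cpuid_has(vcpu, X86_FEATURE_IBT) ||
		guest_cpuid_has(vcpu, X86_FEATURE_SHSTK);
}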

Paolo
  
Paolo Bonzini Aug. 7, 2023, 7:03 a.m. UTC | #9
On 8/5/23 00:21, Sean Christopherson wrote:
> Oooh, the MSRs that don't exempt host_initiated are added to the list

(are *not* added)

> of MSRs to save/restore, i.e. KVM "silently" supports 
> MSR_AMD64_OSVW_ID_LENGTH and MSR_AMD64_OSVW_STATUS.
> 
> And guest_pv_has() returns true unless userspace has opted in to
> enforcement.

Two different ways of having the same bug.  The latter was introduced in 
the implementation of KVM_CAP_ENFORCE_PV_FEATURE_CPUID; it would become 
a problem if some selftests started using it.

Paolo
  
Yang, Weijiang Aug. 9, 2023, 3:05 a.m. UTC | #10
On 8/5/2023 5:40 AM, Paolo Bonzini wrote:
> On 8/3/23 06:27, Yang Weijiang wrote:
>> +        if (msr_info->index == MSR_KVM_GUEST_SSP)
>> +            msr_info->data = vmcs_readl(GUEST_SSP);
>
> Accesses to MSR_KVM_(GUEST_)SSP must be rejected unless host-initiated.
Yes, it's kept, in v5 it's folded in:

+static bool kvm_cet_is_msr_accessible(struct kvm_vcpu *vcpu,
+				      struct msr_data *msr)
+{
+	if (is_shadow_stack_msr(msr->index)) {
+		if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK))
+			return false;
+
+		if (msr->index == MSR_KVM_GUEST_SSP)
+			return msr->host_initiated;
+
+		return msr->host_initiated ||
+			guest_cpuid_has(vcpu, X86_FEATURE_SHSTK);
+	}
+
+	if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
+	    !kvm_cpu_cap_has(X86_FEATURE_IBT))
+		return false;
+
+	return msr->host_initiated ||
+		guest_cpuid_has(vcpu, X86_FEATURE_IBT) ||
+		guest_cpuid_has(vcpu, X86_FEATURE_SHSTK);
+}
+

> Paolo
>
  
Yang, Weijiang Aug. 9, 2023, 7:12 a.m. UTC | #11
On 8/4/2023 4:28 PM, Chao Gao wrote:
>> +	case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
>> +		if (!kvm_cet_is_msr_accessible(vcpu, msr_info))
>> +			return 1;
>> +		if (is_noncanonical_address(data, vcpu))
>> +			return 1;
>> +		if (!IS_ALIGNED(data, 4))
>> +			return 1;
> Why should MSR_IA32_INT_SSP_TAB be 4-byte aligned? I don't see
> this requirement in SDM.
It must be something wrong with my memory, thanks for catching it!
> IA32_INTERRUPT_SSP_TABLE_ADDR:
>
> Linear address of a table of seven shadow
> stack pointers that are selected in IA-32e
> mode using the IST index (when not 0) from
> the interrupt gate descriptor. (R/W)
> This MSR is not present on processors that
> do not support Intel 64 architecture. This
> field cannot represent a non-canonical
> address.
  

Patch

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 6aa76124e81e..ccf750e79608 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2095,6 +2095,18 @@  static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		else
 			msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
 		break;
+	case MSR_IA32_S_CET:
+	case MSR_KVM_GUEST_SSP:
+	case MSR_IA32_INT_SSP_TAB:
+		if (kvm_get_msr_common(vcpu, msr_info))
+			return 1;
+		if (msr_info->index == MSR_KVM_GUEST_SSP)
+			msr_info->data = vmcs_readl(GUEST_SSP);
+		else if (msr_info->index == MSR_IA32_S_CET)
+			msr_info->data = vmcs_readl(GUEST_S_CET);
+		else if (msr_info->index == MSR_IA32_INT_SSP_TAB)
+			msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE);
+		break;
 	case MSR_IA32_DEBUGCTLMSR:
 		msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
 		break;
@@ -2404,6 +2416,18 @@  static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		else
 			vmx->pt_desc.guest.addr_a[index / 2] = data;
 		break;
+	case MSR_IA32_S_CET:
+	case MSR_KVM_GUEST_SSP:
+	case MSR_IA32_INT_SSP_TAB:
+		if (kvm_set_msr_common(vcpu, msr_info))
+			return 1;
+		if (msr_index == MSR_KVM_GUEST_SSP)
+			vmcs_writel(GUEST_SSP, data);
+		else if (msr_index == MSR_IA32_S_CET)
+			vmcs_writel(GUEST_S_CET, data);
+		else if (msr_index == MSR_IA32_INT_SSP_TAB)
+			vmcs_writel(GUEST_INTR_SSP_TABLE, data);
+		break;
 	case MSR_IA32_PERF_CAPABILITIES:
 		if (data && !vcpu_to_pmu(vcpu)->version)
 			return 1;
@@ -4864,6 +4888,9 @@  static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 		vmcs_write64(GUEST_BNDCFGS, 0);
 
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
+	vmcs_writel(GUEST_SSP, 0);
+	vmcs_writel(GUEST_S_CET, 0);
+	vmcs_writel(GUEST_INTR_SSP_TABLE, 0);
 
 	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5b63441fd2d2..98f3ff6078e6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3627,6 +3627,39 @@  static bool kvm_is_msr_to_save(u32 msr_index)
 	return false;
 }
 
+static inline bool is_shadow_stack_msr(u32 msr)
+{
+	return msr == MSR_IA32_PL0_SSP ||
+		msr == MSR_IA32_PL1_SSP ||
+		msr == MSR_IA32_PL2_SSP ||
+		msr == MSR_IA32_PL3_SSP ||
+		msr == MSR_IA32_INT_SSP_TAB ||
+		msr == MSR_KVM_GUEST_SSP;
+}
+
+static bool kvm_cet_is_msr_accessible(struct kvm_vcpu *vcpu,
+				      struct msr_data *msr)
+{
+	if (is_shadow_stack_msr(msr->index)) {
+		if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK))
+			return false;
+
+		if (msr->index == MSR_KVM_GUEST_SSP)
+			return msr->host_initiated;
+
+		return msr->host_initiated ||
+			guest_cpuid_has(vcpu, X86_FEATURE_SHSTK);
+	}
+
+	if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
+	    !kvm_cpu_cap_has(X86_FEATURE_IBT))
+		return false;
+
+	return msr->host_initiated ||
+		guest_cpuid_has(vcpu, X86_FEATURE_IBT) ||
+		guest_cpuid_has(vcpu, X86_FEATURE_SHSTK);
+}
+
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
 	u32 msr = msr_info->index;
@@ -3981,6 +4014,45 @@  int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		vcpu->arch.guest_fpu.xfd_err = data;
 		break;
 #endif
+#define CET_EXCLUSIVE_BITS		(CET_SUPPRESS | CET_WAIT_ENDBR)
+#define CET_CTRL_RESERVED_BITS		GENMASK(9, 6)
+#define CET_SHSTK_MASK_BITS		GENMASK(1, 0)
+#define CET_IBT_MASK_BITS		(GENMASK_ULL(5, 2) | \
+					 GENMASK_ULL(63, 10))
+#define CET_LEG_BITMAP_BASE(data)	((data) >> 12)
+	case MSR_IA32_U_CET:
+	case MSR_IA32_S_CET:
+		if (!kvm_cet_is_msr_accessible(vcpu, msr_info))
+			return 1;
+		if (!!(data & CET_CTRL_RESERVED_BITS))
+			return 1;
+		if (!guest_can_use(vcpu, X86_FEATURE_SHSTK) &&
+		    (data & CET_SHSTK_MASK_BITS))
+			return 1;
+		if (!guest_can_use(vcpu, X86_FEATURE_IBT) &&
+		    (data & CET_IBT_MASK_BITS))
+			return 1;
+		if (!IS_ALIGNED(CET_LEG_BITMAP_BASE(data), 4) ||
+		    (data & CET_EXCLUSIVE_BITS) == CET_EXCLUSIVE_BITS)
+			return 1;
+		if (msr == MSR_IA32_U_CET)
+			kvm_set_xsave_msr(msr_info);
+		break;
+	case MSR_KVM_GUEST_SSP:
+	case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
+		if (!kvm_cet_is_msr_accessible(vcpu, msr_info))
+			return 1;
+		if (is_noncanonical_address(data, vcpu))
+			return 1;
+		if (!IS_ALIGNED(data, 4))
+			return 1;
+		if (msr == MSR_IA32_PL0_SSP || msr == MSR_IA32_PL1_SSP ||
+		    msr == MSR_IA32_PL2_SSP) {
+			vcpu->arch.cet_s_ssp[msr - MSR_IA32_PL0_SSP] = data;
+		} else if (msr == MSR_IA32_PL3_SSP) {
+			kvm_set_xsave_msr(msr_info);
+		}
+		break;
 	default:
 		if (kvm_pmu_is_valid_msr(vcpu, msr))
 			return kvm_pmu_set_msr(vcpu, msr_info);
@@ -4051,7 +4123,9 @@  static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
 
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
-	switch (msr_info->index) {
+	u32 msr = msr_info->index;
+
+	switch (msr) {
 	case MSR_IA32_PLATFORM_ID:
 	case MSR_IA32_EBL_CR_POWERON:
 	case MSR_IA32_LASTBRANCHFROMIP:
@@ -4086,7 +4160,7 @@  int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
 	case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
 	case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
-		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+		if (kvm_pmu_is_valid_msr(vcpu, msr))
 			return kvm_pmu_get_msr(vcpu, msr_info);
 		msr_info->data = 0;
 		break;
@@ -4137,7 +4211,7 @@  int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_MTRRcap:
 	case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
 	case MSR_MTRRdefType:
-		return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
+		return kvm_mtrr_get_msr(vcpu, msr, &msr_info->data);
 	case 0xcd: /* fsb frequency */
 		msr_info->data = 3;
 		break;
@@ -4159,7 +4233,7 @@  int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		msr_info->data = kvm_get_apic_base(vcpu);
 		break;
 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
-		return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
+		return kvm_x2apic_msr_read(vcpu, msr, &msr_info->data);
 	case MSR_IA32_TSC_DEADLINE:
 		msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
 		break;
@@ -4253,7 +4327,7 @@  int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_MCG_STATUS:
 	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
 	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
-		return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
+		return get_msr_mce(vcpu, msr, &msr_info->data,
 				   msr_info->host_initiated);
 	case MSR_IA32_XSS:
 		if (!msr_info->host_initiated &&
@@ -4284,7 +4358,7 @@  int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case HV_X64_MSR_TSC_EMULATION_STATUS:
 	case HV_X64_MSR_TSC_INVARIANT_CONTROL:
 		return kvm_hv_get_msr_common(vcpu,
-					     msr_info->index, &msr_info->data,
+					     msr, &msr_info->data,
 					     msr_info->host_initiated);
 	case MSR_IA32_BBL_CR_CTL3:
 		/* This legacy MSR exists but isn't fully documented in current
@@ -4337,8 +4411,22 @@  int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		msr_info->data = vcpu->arch.guest_fpu.xfd_err;
 		break;
 #endif
+	case MSR_IA32_U_CET:
+	case MSR_IA32_S_CET:
+	case MSR_KVM_GUEST_SSP:
+	case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
+		if (!kvm_cet_is_msr_accessible(vcpu, msr_info))
+			return 1;
+		if (msr == MSR_IA32_PL0_SSP || msr == MSR_IA32_PL1_SSP ||
+		    msr == MSR_IA32_PL2_SSP) {
+			msr_info->data =
+				vcpu->arch.cet_s_ssp[msr - MSR_IA32_PL0_SSP];
+		} else if (msr == MSR_IA32_U_CET || msr == MSR_IA32_PL3_SSP) {
+			kvm_get_xsave_msr(msr_info);
+		}
+		break;
 	default:
-		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+		if (kvm_pmu_is_valid_msr(vcpu, msr))
 			return kvm_pmu_get_msr(vcpu, msr_info);
 
 		/*
@@ -4346,7 +4434,7 @@  int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		 * to-be-saved, even if an MSR isn't fully supported.
 		 */
 		if (msr_info->host_initiated &&
-		    kvm_is_msr_to_save(msr_info->index)) {
+		    kvm_is_msr_to_save(msr)) {
 			msr_info->data = 0;
 			break;
 		}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index c69fc027f5ec..3b79d6db2f83 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -552,4 +552,22 @@  int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
 			 unsigned int port, void *data,  unsigned int count,
 			 int in);
 
+/*
+ * Guest xstate MSRs have been loaded in __msr_io(), disable preemption before
+ * access the MSRs to avoid MSR content corruption.
+ */
+static inline void kvm_get_xsave_msr(struct msr_data *msr_info)
+{
+	kvm_fpu_get();
+	rdmsrl(msr_info->index, msr_info->data);
+	kvm_fpu_put();
+}
+
+static inline void kvm_set_xsave_msr(struct msr_data *msr_info)
+{
+	kvm_fpu_get();
+	wrmsrl(msr_info->index, msr_info->data);
+	kvm_fpu_put();
+}
+
 #endif