[RFC,v2,02/11] KVM: x86: Advertise CPUID.7.2.EDX and RRSBA_CTRL support

Message ID 20230414062545.270178-3-chao.gao@intel.com
State New
Headers
Series Intel IA32_SPEC_CTRL Virtualization |

Commit Message

Chao Gao April 14, 2023, 6:25 a.m. UTC
  From: Zhang Chen <chen.zhang@intel.com>

Add a kvm-only CPUID feature leaf for CPUID.7.2.EDX and RRSBA_CTRL
as the first feature in the leaf.

RRSBA_CTRL is enumerated by CPUID.7.2.EDX[2]. If supported, RRSBA_DIS_U
(bit 5) and RRSBA_DIS_S (bit 6) of IA32_SPEC_CTRL MSR can be used to
disable RRSBA behavior for CPL3 and CPL0/1/2 respectively.

Note that KVM does not intercept guests' IA32_SPEC_CTRL MSR accesses
after a non-zero is written to the MSR. Therefore, guests can already
toggle the two bits if the host supports RRSBA_CTRL, and no extra code
is needed to allow guests to toggle the two bits.

Signed-off-by: Zhang Chen <chen.zhang@intel.com>
Signed-off-by: Chao Gao <chao.gao@intel.com>
Tested-by: Jiaan Lu <jiaan.lu@intel.com>
---
 arch/x86/kvm/cpuid.c         | 22 +++++++++++++++++++---
 arch/x86/kvm/reverse_cpuid.h |  7 +++++++
 2 files changed, 26 insertions(+), 3 deletions(-)
  

Comments

Binbin Wu April 16, 2023, 7:04 a.m. UTC | #1
On 4/14/2023 2:25 PM, Chao Gao wrote:
> From: Zhang Chen <chen.zhang@intel.com>
>
> Add a kvm-only CPUID feature leaf for CPUID.7.2.EDX and RRSBA_CTRL
> as the first feature in the leaf.
>
> RRSBA_CTRL is enumerated by CPUID.7.2.EDX[2]. If supported, RRSBA_DIS_U
> (bit 5) and RRSBA_DIS_S (bit 6) of IA32_SPEC_CTRL MSR can be used to
> disable RRSBA behavior for CPL3 and CPL0/1/2 respectively.
>
> Note that KVM does not intercept guests' IA32_SPEC_CTRL MSR accesses
> after a non-zero is written to the MSR. Therefore, guests can already
> toggle the two bits if the host supports RRSBA_CTRL, and no extra code
> is needed to allow guests to toggle the two bits.
>
> Signed-off-by: Zhang Chen <chen.zhang@intel.com>
> Signed-off-by: Chao Gao <chao.gao@intel.com>
> Tested-by: Jiaan Lu <jiaan.lu@intel.com>
> ---
>   arch/x86/kvm/cpuid.c         | 22 +++++++++++++++++++---
>   arch/x86/kvm/reverse_cpuid.h |  7 +++++++
>   2 files changed, 26 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index 9583a110cf5f..f024c3ac2203 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -685,6 +685,10 @@ void kvm_set_cpu_caps(void)
>   		SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA)
>   	);
>   
> +	kvm_cpu_cap_init_kvm_defined(CPUID_7_2_EDX,
> +		SF(RRSBA_CTRL)
> +	);
> +

Is it slightly better to put the new added one after

     kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, ...

to make it in order?


>   	kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
>   		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
>   		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
> @@ -949,13 +953,14 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
>   		break;
>   	/* function 7 has additional index. */
>   	case 7:
> -		entry->eax = min(entry->eax, 1u);
> +		entry->eax = min(entry->eax, 2u);
>   		cpuid_entry_override(entry, CPUID_7_0_EBX);
>   		cpuid_entry_override(entry, CPUID_7_ECX);
>   		cpuid_entry_override(entry, CPUID_7_EDX);
>   
> -		/* KVM only supports 0x7.0 and 0x7.1, capped above via min(). */
> -		if (entry->eax == 1) {
> +		max_idx = entry->eax;
> +
> +		if (max_idx >= 1) {
>   			entry = do_host_cpuid(array, function, 1);
>   			if (!entry)
>   				goto out;
> @@ -965,6 +970,17 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
>   			entry->ebx = 0;
>   			entry->ecx = 0;
>   		}
> +
> +		if (max_idx >= 2) {
> +			entry = do_host_cpuid(array, function, 2);
> +			if (!entry)
> +				goto out;
> +
> +			cpuid_entry_override(entry, CPUID_7_2_EDX);
> +			entry->eax = 0;
> +			entry->ebx = 0;
> +			entry->ecx = 0;
> +		}
>   		break;
>   	case 0xa: { /* Architectural Performance Monitoring */
>   		union cpuid10_eax eax;
> diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
> index a5717282bb9c..72bad8314a9c 100644
> --- a/arch/x86/kvm/reverse_cpuid.h
> +++ b/arch/x86/kvm/reverse_cpuid.h
> @@ -15,6 +15,7 @@ enum kvm_only_cpuid_leafs {
>   	CPUID_12_EAX	 = NCAPINTS,
>   	CPUID_7_1_EDX,
>   	CPUID_8000_0007_EDX,
> +	CPUID_7_2_EDX,
>   	NR_KVM_CPU_CAPS,
>   
>   	NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
> @@ -47,6 +48,9 @@ enum kvm_only_cpuid_leafs {
>   /* CPUID level 0x80000007 (EDX). */
>   #define KVM_X86_FEATURE_CONSTANT_TSC	KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8)
>   
> +/* Intel-defined sub-features, CPUID level 0x00000007:2 (EDX) */
> +#define KVM_X86_FEATURE_RRSBA_CTRL	KVM_X86_FEATURE(CPUID_7_2_EDX, 2)
> +
>   struct cpuid_reg {
>   	u32 function;
>   	u32 index;
> @@ -69,6 +73,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
>   	[CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
>   	[CPUID_7_EDX]         = {         7, 0, CPUID_EDX},
>   	[CPUID_7_1_EAX]       = {         7, 1, CPUID_EAX},
> +	[CPUID_7_2_EDX]       = {         7, 2, CPUID_EDX},
>   	[CPUID_12_EAX]        = {0x00000012, 0, CPUID_EAX},
>   	[CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX},
>   	[CPUID_7_1_EDX]       = {         7, 1, CPUID_EDX},
> @@ -108,6 +113,8 @@ static __always_inline u32 __feature_translate(int x86_feature)
>   		return KVM_X86_FEATURE_SGX_EDECCSSA;
>   	else if (x86_feature == X86_FEATURE_CONSTANT_TSC)
>   		return KVM_X86_FEATURE_CONSTANT_TSC;
> +	else if (x86_feature == X86_FEATURE_RRSBA_CTRL)
> +		return KVM_X86_FEATURE_RRSBA_CTRL;
>   
>   	return x86_feature;
>   }
  
Chao Gao April 16, 2023, 1:25 p.m. UTC | #2
On Sun, Apr 16, 2023 at 03:04:59PM +0800, Binbin Wu wrote:
>
>On 4/14/2023 2:25 PM, Chao Gao wrote:
>> From: Zhang Chen <chen.zhang@intel.com>
>> 
>> Add a kvm-only CPUID feature leaf for CPUID.7.2.EDX and RRSBA_CTRL
>> as the first feature in the leaf.
>> 
>> RRSBA_CTRL is enumerated by CPUID.7.2.EDX[2]. If supported, RRSBA_DIS_U
>> (bit 5) and RRSBA_DIS_S (bit 6) of IA32_SPEC_CTRL MSR can be used to
>> disable RRSBA behavior for CPL3 and CPL0/1/2 respectively.
>> 
>> Note that KVM does not intercept guests' IA32_SPEC_CTRL MSR accesses
>> after a non-zero is written to the MSR. Therefore, guests can already
>> toggle the two bits if the host supports RRSBA_CTRL, and no extra code
>> is needed to allow guests to toggle the two bits.
>> 
>> Signed-off-by: Zhang Chen <chen.zhang@intel.com>
>> Signed-off-by: Chao Gao <chao.gao@intel.com>
>> Tested-by: Jiaan Lu <jiaan.lu@intel.com>
>> ---
>>   arch/x86/kvm/cpuid.c         | 22 +++++++++++++++++++---
>>   arch/x86/kvm/reverse_cpuid.h |  7 +++++++
>>   2 files changed, 26 insertions(+), 3 deletions(-)
>> 
>> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
>> index 9583a110cf5f..f024c3ac2203 100644
>> --- a/arch/x86/kvm/cpuid.c
>> +++ b/arch/x86/kvm/cpuid.c
>> @@ -685,6 +685,10 @@ void kvm_set_cpu_caps(void)
>>   		SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA)
>>   	);
>> +	kvm_cpu_cap_init_kvm_defined(CPUID_7_2_EDX,
>> +		SF(RRSBA_CTRL)
>> +	);
>> +
>
>Is it slightly better to put the new added one after
>
>    kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, ...
>
>to make it in order?

Yes. Will do.
  
Xiaoyao Li May 15, 2023, 6:53 a.m. UTC | #3
On 4/14/2023 2:25 PM, Chao Gao wrote:
> From: Zhang Chen <chen.zhang@intel.com>
> 
> Add a kvm-only CPUID feature leaf for CPUID.7.2.EDX and RRSBA_CTRL
> as the first feature in the leaf.
> 
> RRSBA_CTRL is enumerated by CPUID.7.2.EDX[2]. If supported, RRSBA_DIS_U
> (bit 5) and RRSBA_DIS_S (bit 6) of IA32_SPEC_CTRL MSR can be used to
> disable RRSBA behavior for CPL3 and CPL0/1/2 respectively.
> 
> Note that KVM does not intercept guests' IA32_SPEC_CTRL MSR accesses
> after a non-zero is written to the MSR. Therefore, guests can already
> toggle the two bits if the host supports RRSBA_CTRL, and no extra code
> is needed to allow guests to toggle the two bits.

This is a bug that also matters with other bits in MSR_IA32_SPEC_CTRL 
which has a dedicated enumeration CPUID bit and no support in KVM yet.

I think we need to fix this bug at first.

> Signed-off-by: Zhang Chen <chen.zhang@intel.com>
> Signed-off-by: Chao Gao <chao.gao@intel.com>
> Tested-by: Jiaan Lu <jiaan.lu@intel.com>
> ---
>   arch/x86/kvm/cpuid.c         | 22 +++++++++++++++++++---
>   arch/x86/kvm/reverse_cpuid.h |  7 +++++++
>   2 files changed, 26 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index 9583a110cf5f..f024c3ac2203 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -685,6 +685,10 @@ void kvm_set_cpu_caps(void)
>   		SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA)
>   	);
>   
> +	kvm_cpu_cap_init_kvm_defined(CPUID_7_2_EDX,
> +		SF(RRSBA_CTRL)
> +	);
> +

Please move this hook up to right follow the leaf CPUID_7_1_EAX.

>   	kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
>   		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
>   		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
> @@ -949,13 +953,14 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
>   		break;
>   	/* function 7 has additional index. */
>   	case 7:
> -		entry->eax = min(entry->eax, 1u);
> +		entry->eax = min(entry->eax, 2u);
>   		cpuid_entry_override(entry, CPUID_7_0_EBX);
>   		cpuid_entry_override(entry, CPUID_7_ECX);
>   		cpuid_entry_override(entry, CPUID_7_EDX);
>   
> -		/* KVM only supports 0x7.0 and 0x7.1, capped above via min(). */
> -		if (entry->eax == 1) {
> +		max_idx = entry->eax;
> +
> +		if (max_idx >= 1) {
>   			entry = do_host_cpuid(array, function, 1);
>   			if (!entry)
>   				goto out;
> @@ -965,6 +970,17 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
>   			entry->ebx = 0;
>   			entry->ecx = 0;
>   		}
> +
> +		if (max_idx >= 2) {
> +			entry = do_host_cpuid(array, function, 2);
> +			if (!entry)
> +				goto out;
> +
> +			cpuid_entry_override(entry, CPUID_7_2_EDX);
> +			entry->eax = 0;
> +			entry->ebx = 0;
> +			entry->ecx = 0;
> +		}
>   		break;
>   	case 0xa: { /* Architectural Performance Monitoring */
>   		union cpuid10_eax eax;
> diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
> index a5717282bb9c..72bad8314a9c 100644
> --- a/arch/x86/kvm/reverse_cpuid.h
> +++ b/arch/x86/kvm/reverse_cpuid.h
> @@ -15,6 +15,7 @@ enum kvm_only_cpuid_leafs {
>   	CPUID_12_EAX	 = NCAPINTS,
>   	CPUID_7_1_EDX,
>   	CPUID_8000_0007_EDX,
> +	CPUID_7_2_EDX,
>   	NR_KVM_CPU_CAPS,
>   
>   	NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
> @@ -47,6 +48,9 @@ enum kvm_only_cpuid_leafs {
>   /* CPUID level 0x80000007 (EDX). */
>   #define KVM_X86_FEATURE_CONSTANT_TSC	KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8)
>   
> +/* Intel-defined sub-features, CPUID level 0x00000007:2 (EDX) */
> +#define KVM_X86_FEATURE_RRSBA_CTRL	KVM_X86_FEATURE(CPUID_7_2_EDX, 2)
> +
>   struct cpuid_reg {
>   	u32 function;
>   	u32 index;
> @@ -69,6 +73,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
>   	[CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
>   	[CPUID_7_EDX]         = {         7, 0, CPUID_EDX},
>   	[CPUID_7_1_EAX]       = {         7, 1, CPUID_EAX},
> +	[CPUID_7_2_EDX]       = {         7, 2, CPUID_EDX},
>   	[CPUID_12_EAX]        = {0x00000012, 0, CPUID_EAX},
>   	[CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX},
>   	[CPUID_7_1_EDX]       = {         7, 1, CPUID_EDX},
> @@ -108,6 +113,8 @@ static __always_inline u32 __feature_translate(int x86_feature)
>   		return KVM_X86_FEATURE_SGX_EDECCSSA;
>   	else if (x86_feature == X86_FEATURE_CONSTANT_TSC)
>   		return KVM_X86_FEATURE_CONSTANT_TSC;
> +	else if (x86_feature == X86_FEATURE_RRSBA_CTRL)
> +		return KVM_X86_FEATURE_RRSBA_CTRL;
>   
>   	return x86_feature;
>   }
  
Chao Gao May 16, 2023, 2:04 a.m. UTC | #4
On Mon, May 15, 2023 at 02:53:07PM +0800, Xiaoyao Li wrote:
>On 4/14/2023 2:25 PM, Chao Gao wrote:
>> From: Zhang Chen <chen.zhang@intel.com>
>> 
>> Add a kvm-only CPUID feature leaf for CPUID.7.2.EDX and RRSBA_CTRL
>> as the first feature in the leaf.
>> 
>> RRSBA_CTRL is enumerated by CPUID.7.2.EDX[2]. If supported, RRSBA_DIS_U
>> (bit 5) and RRSBA_DIS_S (bit 6) of IA32_SPEC_CTRL MSR can be used to
>> disable RRSBA behavior for CPL3 and CPL0/1/2 respectively.
>> 
>> Note that KVM does not intercept guests' IA32_SPEC_CTRL MSR accesses
>> after a non-zero is written to the MSR. Therefore, guests can already
>> toggle the two bits if the host supports RRSBA_CTRL, and no extra code
>> is needed to allow guests to toggle the two bits.
>
>This is a bug that also matters with other bits in MSR_IA32_SPEC_CTRL which
>has a dedicated enumeration CPUID bit and no support in KVM yet.

Do you mean passing through the MSR is a bug? guest can write any hardware
supported value to the MSR if the MSR isn't intercepted.

I guess this is intentional and a trade-off for performance (note that
context-switch may cause writes to the MSR). And see

commit 841c2be09fe4 ("kvm: x86: replace kvm_spec_ctrl_test_value with runtime test on the host")

it appears that this behavior is widely recognized.

>
>I think we need to fix this bug at first.

I have no idea how to fix the "bug" without intercepting the MSR. The
performance penalty makes me think intercepting the MSR is not a viable
solution.

>
>> Signed-off-by: Zhang Chen <chen.zhang@intel.com>
>> Signed-off-by: Chao Gao <chao.gao@intel.com>
>> Tested-by: Jiaan Lu <jiaan.lu@intel.com>
>> ---
>>   arch/x86/kvm/cpuid.c         | 22 +++++++++++++++++++---
>>   arch/x86/kvm/reverse_cpuid.h |  7 +++++++
>>   2 files changed, 26 insertions(+), 3 deletions(-)
>> 
>> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
>> index 9583a110cf5f..f024c3ac2203 100644
>> --- a/arch/x86/kvm/cpuid.c
>> +++ b/arch/x86/kvm/cpuid.c
>> @@ -685,6 +685,10 @@ void kvm_set_cpu_caps(void)
>>   		SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA)
>>   	);
>> +	kvm_cpu_cap_init_kvm_defined(CPUID_7_2_EDX,
>> +		SF(RRSBA_CTRL)
>> +	);
>> +
>
>Please move this hook up to right follow the leaf CPUID_7_1_EAX.

sure. will do.
  
Xiaoyao Li May 16, 2023, 2:22 a.m. UTC | #5
On 5/16/2023 10:04 AM, Chao Gao wrote:
> On Mon, May 15, 2023 at 02:53:07PM +0800, Xiaoyao Li wrote:
>> On 4/14/2023 2:25 PM, Chao Gao wrote:
>>> From: Zhang Chen <chen.zhang@intel.com>
>>>
>>> Add a kvm-only CPUID feature leaf for CPUID.7.2.EDX and RRSBA_CTRL
>>> as the first feature in the leaf.
>>>
>>> RRSBA_CTRL is enumerated by CPUID.7.2.EDX[2]. If supported, RRSBA_DIS_U
>>> (bit 5) and RRSBA_DIS_S (bit 6) of IA32_SPEC_CTRL MSR can be used to
>>> disable RRSBA behavior for CPL3 and CPL0/1/2 respectively.
>>>
>>> Note that KVM does not intercept guests' IA32_SPEC_CTRL MSR accesses
>>> after a non-zero is written to the MSR. Therefore, guests can already
>>> toggle the two bits if the host supports RRSBA_CTRL, and no extra code
>>> is needed to allow guests to toggle the two bits.
>>
>> This is a bug that also matters with other bits in MSR_IA32_SPEC_CTRL which
>> has a dedicated enumeration CPUID bit and no support in KVM yet.
> 
> Do you mean passing through the MSR is a bug? guest can write any hardware
> supported value to the MSR if the MSR isn't intercepted.
> 
> I guess this is intentional and a trade-off for performance (note that
> context-switch may cause writes to the MSR). And see
> 
> commit 841c2be09fe4 ("kvm: x86: replace kvm_spec_ctrl_test_value with runtime test on the host")
> 
> it appears that this behavior is widely recognized.
> 
>>
>> I think we need to fix this bug at first.
> 
> I have no idea how to fix the "bug" without intercepting the MSR. The
> performance penalty makes me think intercepting the MSR is not a viable
> solution.

I thought correctness always takes higher priority over performance.

>>
>>> Signed-off-by: Zhang Chen <chen.zhang@intel.com>
>>> Signed-off-by: Chao Gao <chao.gao@intel.com>
>>> Tested-by: Jiaan Lu <jiaan.lu@intel.com>
>>> ---
>>>    arch/x86/kvm/cpuid.c         | 22 +++++++++++++++++++---
>>>    arch/x86/kvm/reverse_cpuid.h |  7 +++++++
>>>    2 files changed, 26 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
>>> index 9583a110cf5f..f024c3ac2203 100644
>>> --- a/arch/x86/kvm/cpuid.c
>>> +++ b/arch/x86/kvm/cpuid.c
>>> @@ -685,6 +685,10 @@ void kvm_set_cpu_caps(void)
>>>    		SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA)
>>>    	);
>>> +	kvm_cpu_cap_init_kvm_defined(CPUID_7_2_EDX,
>>> +		SF(RRSBA_CTRL)
>>> +	);
>>> +
>>
>> Please move this hook up to right follow the leaf CPUID_7_1_EAX.
> 
> sure. will do.
  
Chao Gao May 16, 2023, 3:01 a.m. UTC | #6
On Tue, May 16, 2023 at 10:22:22AM +0800, Xiaoyao Li wrote:
>> > I think we need to fix this bug at first.
>> 
>> I have no idea how to fix the "bug" without intercepting the MSR. The
>> performance penalty makes me think intercepting the MSR is not a viable
>> solution.
>
>I thought correctness always takes higher priority over performance.

It is generally true. however, there are situations where we should make
trade-offs between correctness and other factors (like performance):

E.g., instructions without control bits, to be 100% compliant with CPU
spec, in theory, VMMs can trap/decode every instruction and inject #UD
if a guest tries to use some instructions it shouldn't.
  
Xiaoyao Li May 16, 2023, 7:03 a.m. UTC | #7
On 5/16/2023 11:01 AM, Chao Gao wrote:
> On Tue, May 16, 2023 at 10:22:22AM +0800, Xiaoyao Li wrote:
>>>> I think we need to fix this bug at first.
>>>
>>> I have no idea how to fix the "bug" without intercepting the MSR. The
>>> performance penalty makes me think intercepting the MSR is not a viable
>>> solution.
>>
>> I thought correctness always takes higher priority over performance.
> 
> It is generally true. however, there are situations where we should make
> trade-offs between correctness and other factors (like performance):
> 
> E.g., instructions without control bits, to be 100% compliant with CPU
> spec, in theory, VMMs can trap/decode every instruction and inject #UD
> if a guest tries to use some instructions it shouldn't.

This is the virtualization hole. IMHO, they are different things.

Pass through MSR_IA32_SPEC_CTRL was introduced in commit d28b387fb74d 
("KVM/VMX: Allow direct access to MSR_IA32_SPEC_CTRL"). At that time 
there was only a few bits defined, and the changelog called out that

   No attempt is made to handle STIBP here, intentionally. Filtering
   STIBP may be added in a future patch, which may require trapping all
   writes if we don't want to pass it through directly to the guest.

Per my undesrstanding, it implied that we need to re-visit it when more 
bits added instead of following the pass-through design siliently.
  
Chao Gao May 16, 2023, 9:09 a.m. UTC | #8
On Tue, May 16, 2023 at 03:03:15PM +0800, Xiaoyao Li wrote:
>On 5/16/2023 11:01 AM, Chao Gao wrote:
>> On Tue, May 16, 2023 at 10:22:22AM +0800, Xiaoyao Li wrote:
>> > > > I think we need to fix this bug at first.
>> > > 
>> > > I have no idea how to fix the "bug" without intercepting the MSR. The
>> > > performance penalty makes me think intercepting the MSR is not a viable
>> > > solution.
>> > 
>> > I thought correctness always takes higher priority over performance.
>> 
>> It is generally true. however, there are situations where we should make
>> trade-offs between correctness and other factors (like performance):
>> 
>> E.g., instructions without control bits, to be 100% compliant with CPU
>> spec, in theory, VMMs can trap/decode every instruction and inject #UD
>> if a guest tries to use some instructions it shouldn't.
>
>This is the virtualization hole. IMHO, they are different things.

what are the differences between?
1. Executing some unsupported instructions should cause #UD. But this is allowed
   in a KVM guest.
2. Setting some reserved bits in SPEC_CTRL MSR should cause #GP. But this is
   allowed in a KVM guest.

>
>Pass through MSR_IA32_SPEC_CTRL was introduced in commit d28b387fb74d
>("KVM/VMX: Allow direct access to MSR_IA32_SPEC_CTRL"). At that time there
>was only a few bits defined, and the changelog called out that
>
>  No attempt is made to handle STIBP here, intentionally. Filtering
>  STIBP may be added in a future patch, which may require trapping all
>  writes if we don't want to pass it through directly to the guest.
>
>Per my undesrstanding, it implied that we need to re-visit it when more bits
>added instead of following the pass-through design siliently.

I don't object to re-visiting the design. My point is that to prevent guests from
setting RRSBA_CTRL/BHI_CTRL when they are not advertised isn't a strong
justfication for intercepting the MSR. STIBP and other bits (except IBRS) have
the same problem. And the gain of fixing this is too small.

If passing through the SPEC_CTRL MSR to guests might cause security issues, I
would agree to intercept accesses to the MSR.
  
Xiaoyao Li May 18, 2023, 9:50 a.m. UTC | #9
On 5/16/2023 5:09 PM, Chao Gao wrote:
> On Tue, May 16, 2023 at 03:03:15PM +0800, Xiaoyao Li wrote:
>> On 5/16/2023 11:01 AM, Chao Gao wrote:
>>> On Tue, May 16, 2023 at 10:22:22AM +0800, Xiaoyao Li wrote:
>>>>>> I think we need to fix this bug at first.
>>>>>
>>>>> I have no idea how to fix the "bug" without intercepting the MSR. The
>>>>> performance penalty makes me think intercepting the MSR is not a viable
>>>>> solution.
>>>>
>>>> I thought correctness always takes higher priority over performance.
>>>
>>> It is generally true. however, there are situations where we should make
>>> trade-offs between correctness and other factors (like performance):
>>>
>>> E.g., instructions without control bits, to be 100% compliant with CPU
>>> spec, in theory, VMMs can trap/decode every instruction and inject #UD
>>> if a guest tries to use some instructions it shouldn't.
>>
>> This is the virtualization hole. IMHO, they are different things.
> 
> what are the differences between?
> 1. Executing some unsupported instructions should cause #UD. But this is allowed
>     in a KVM guest.
> 2. Setting some reserved bits in SPEC_CTRL MSR should cause #GP. But this is
>     allowed in a KVM guest.

The difference is that for virtualization hole, there is no way but 
intercept and decode every instruction if we want the correctness. It's 
a disaster.

But for MSR virtualization, we do have an option and we don't need to 
trap every instruction. MSR interception is the designated mechanism to 
correctly and elegantly virtualize the MSR.

>>
>> Pass through MSR_IA32_SPEC_CTRL was introduced in commit d28b387fb74d
>> ("KVM/VMX: Allow direct access to MSR_IA32_SPEC_CTRL"). At that time there
>> was only a few bits defined, and the changelog called out that
>>
>>   No attempt is made to handle STIBP here, intentionally. Filtering
>>   STIBP may be added in a future patch, which may require trapping all
>>   writes if we don't want to pass it through directly to the guest.
>>
>> Per my undesrstanding, it implied that we need to re-visit it when more bits
>> added instead of following the pass-through design siliently.
> 
> I don't object to re-visiting the design. My point is that to prevent guests from
> setting RRSBA_CTRL/BHI_CTRL when they are not advertised isn't a strong
> justfication for intercepting the MSR. STIBP and other bits (except IBRS) have
> the same problem. And the gain of fixing this is too small.
> 
> If passing through the SPEC_CTRL MSR to guests might cause security issues, I
> would agree to intercept accesses to the MSR.

I never buy it. How to interpret the security? If the user wants to hide 
one feature from guest but KVM allows it when KVM does have a reasonable 
way to hide it. Does it violate the security?
  
Chao Gao May 19, 2023, 9:43 a.m. UTC | #10
On Thu, May 18, 2023 at 05:50:17PM +0800, Xiaoyao Li wrote:
>On 5/16/2023 5:09 PM, Chao Gao wrote:
>> On Tue, May 16, 2023 at 03:03:15PM +0800, Xiaoyao Li wrote:
>> > On 5/16/2023 11:01 AM, Chao Gao wrote:
>> > > On Tue, May 16, 2023 at 10:22:22AM +0800, Xiaoyao Li wrote:
>> > > > > > I think we need to fix this bug at first.
>> > > > > 
>> > > > > I have no idea how to fix the "bug" without intercepting the MSR. The
>> > > > > performance penalty makes me think intercepting the MSR is not a viable
>> > > > > solution.
>> > > > 
>> > > > I thought correctness always takes higher priority over performance.
>> > > 
>> > > It is generally true. however, there are situations where we should make
>> > > trade-offs between correctness and other factors (like performance):
>> > > 
>> > > E.g., instructions without control bits, to be 100% compliant with CPU
>> > > spec, in theory, VMMs can trap/decode every instruction and inject #UD
>> > > if a guest tries to use some instructions it shouldn't.
>> > 
>> > This is the virtualization hole. IMHO, they are different things.
>> 
>> what are the differences between?
>> 1. Executing some unsupported instructions should cause #UD. But this is allowed
>>     in a KVM guest.
>> 2. Setting some reserved bits in SPEC_CTRL MSR should cause #GP. But this is
>>     allowed in a KVM guest.
>
>The difference is that for virtualization hole, there is no way but intercept
>and decode every instruction if we want the correctness. It's a disaster.
>
>But for MSR virtualization, we do have an option and we don't need to trap
>every instruction. MSR interception is the designated mechanism to correctly
>and elegantly virtualize the MSR.

The gains in this two cases are similar: some operations in guest are prevented.
But the costs on performance are not. So, how do you draw the line when we can
sacrafice correctness for performance?

>
>> > 
>> > Pass through MSR_IA32_SPEC_CTRL was introduced in commit d28b387fb74d
>> > ("KVM/VMX: Allow direct access to MSR_IA32_SPEC_CTRL"). At that time there
>> > was only a few bits defined, and the changelog called out that
>> > 
>> >   No attempt is made to handle STIBP here, intentionally. Filtering
>> >   STIBP may be added in a future patch, which may require trapping all
>> >   writes if we don't want to pass it through directly to the guest.
>> > 
>> > Per my undesrstanding, it implied that we need to re-visit it when more bits
>> > added instead of following the pass-through design siliently.
>> 
>> I don't object to re-visiting the design. My point is that to prevent guests from
>> setting RRSBA_CTRL/BHI_CTRL when they are not advertised isn't a strong
>> justfication for intercepting the MSR. STIBP and other bits (except IBRS) have
>> the same problem. And the gain of fixing this is too small.
>> 
>> If passing through the SPEC_CTRL MSR to guests might cause security issues, I
>> would agree to intercept accesses to the MSR.
>
>I never buy it. How to interpret the security? If the user wants to hide one
>feature from guest but KVM allows it when KVM does have a reasonable way to
>hide it. Does it violate the security?

I would say no. By "security", I mean guest becomes vulnerable to some issues or
guest attacks host or guest can access unauthorized data.

I tried to say that if the value of intercepting IA32_SPEC_CTRL outweighs the
perform penalty, it makes sense to do that. I guess the decision has been made
when enabling STIBP and it means people care more about performance than
preventing guests from setting STIBP.
  

Patch

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 9583a110cf5f..f024c3ac2203 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -685,6 +685,10 @@  void kvm_set_cpu_caps(void)
 		SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA)
 	);
 
+	kvm_cpu_cap_init_kvm_defined(CPUID_7_2_EDX,
+		SF(RRSBA_CTRL)
+	);
+
 	kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
 		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
 		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
@@ -949,13 +953,14 @@  static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 		break;
 	/* function 7 has additional index. */
 	case 7:
-		entry->eax = min(entry->eax, 1u);
+		entry->eax = min(entry->eax, 2u);
 		cpuid_entry_override(entry, CPUID_7_0_EBX);
 		cpuid_entry_override(entry, CPUID_7_ECX);
 		cpuid_entry_override(entry, CPUID_7_EDX);
 
-		/* KVM only supports 0x7.0 and 0x7.1, capped above via min(). */
-		if (entry->eax == 1) {
+		max_idx = entry->eax;
+
+		if (max_idx >= 1) {
 			entry = do_host_cpuid(array, function, 1);
 			if (!entry)
 				goto out;
@@ -965,6 +970,17 @@  static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 			entry->ebx = 0;
 			entry->ecx = 0;
 		}
+
+		if (max_idx >= 2) {
+			entry = do_host_cpuid(array, function, 2);
+			if (!entry)
+				goto out;
+
+			cpuid_entry_override(entry, CPUID_7_2_EDX);
+			entry->eax = 0;
+			entry->ebx = 0;
+			entry->ecx = 0;
+		}
 		break;
 	case 0xa: { /* Architectural Performance Monitoring */
 		union cpuid10_eax eax;
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
index a5717282bb9c..72bad8314a9c 100644
--- a/arch/x86/kvm/reverse_cpuid.h
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -15,6 +15,7 @@  enum kvm_only_cpuid_leafs {
 	CPUID_12_EAX	 = NCAPINTS,
 	CPUID_7_1_EDX,
 	CPUID_8000_0007_EDX,
+	CPUID_7_2_EDX,
 	NR_KVM_CPU_CAPS,
 
 	NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
@@ -47,6 +48,9 @@  enum kvm_only_cpuid_leafs {
 /* CPUID level 0x80000007 (EDX). */
 #define KVM_X86_FEATURE_CONSTANT_TSC	KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8)
 
+/* Intel-defined sub-features, CPUID level 0x00000007:2 (EDX) */
+#define KVM_X86_FEATURE_RRSBA_CTRL	KVM_X86_FEATURE(CPUID_7_2_EDX, 2)
+
 struct cpuid_reg {
 	u32 function;
 	u32 index;
@@ -69,6 +73,7 @@  static const struct cpuid_reg reverse_cpuid[] = {
 	[CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
 	[CPUID_7_EDX]         = {         7, 0, CPUID_EDX},
 	[CPUID_7_1_EAX]       = {         7, 1, CPUID_EAX},
+	[CPUID_7_2_EDX]       = {         7, 2, CPUID_EDX},
 	[CPUID_12_EAX]        = {0x00000012, 0, CPUID_EAX},
 	[CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX},
 	[CPUID_7_1_EDX]       = {         7, 1, CPUID_EDX},
@@ -108,6 +113,8 @@  static __always_inline u32 __feature_translate(int x86_feature)
 		return KVM_X86_FEATURE_SGX_EDECCSSA;
 	else if (x86_feature == X86_FEATURE_CONSTANT_TSC)
 		return KVM_X86_FEATURE_CONSTANT_TSC;
+	else if (x86_feature == X86_FEATURE_RRSBA_CTRL)
+		return KVM_X86_FEATURE_RRSBA_CTRL;
 
 	return x86_feature;
 }