[v3] KVM x86/xen: add an override for PVCLOCK_TSC_STABLE_BIT

Message ID 20231101112934.631344-1-paul@xen.org
State New
Headers
Series [v3] KVM x86/xen: add an override for PVCLOCK_TSC_STABLE_BIT |

Commit Message

Paul Durrant Nov. 1, 2023, 11:29 a.m. UTC
  From: Paul Durrant <pdurrant@amazon.com>

Unless explicitly told to do so (by passing 'clocksource=tsc' and
'tsc=stable:socket', and then jumping through some hoops concerning
potential CPU hotplug) Xen will never use TSC as its clocksource.
Hence, by default, a Xen guest will not see PVCLOCK_TSC_STABLE_BIT set
in either the primary or secondary pvclock memory areas. This has
led to bugs in some guest kernels which only become evident if
PVCLOCK_TSC_STABLE_BIT *is* set in the pvclocks. Hence, to support
such guests, give the VMM a new Xen HVM config flag to tell KVM to
forcibly clear the bit in the Xen pvclocks.

Signed-off-by: Paul Durrant <pdurrant@amazon.com>
---

v3:
 - Moved clearing of PVCLOCK_TSC_STABLE_BIT the right side of the
   memcpy().
 - Added an all-vCPUs KVM_REQ_CLOCK_UPDATE when the HVM config
   flag is changed.
---
 Documentation/virt/kvm/api.rst |  6 ++++++
 arch/x86/kvm/x86.c             | 28 +++++++++++++++++++++++-----
 arch/x86/kvm/xen.c             | 15 ++++++++++++++-
 include/uapi/linux/kvm.h       |  1 +
 4 files changed, 44 insertions(+), 6 deletions(-)
  

Comments

Sean Christopherson Nov. 1, 2023, 4:46 p.m. UTC | #1
On Wed, Nov 01, 2023, Paul Durrant wrote:
> @@ -3231,12 +3245,15 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
>  	vcpu->hv_clock.flags = pvclock_flags;
>  
>  	if (vcpu->pv_time.active)
> -		kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
> +		kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false);
> +
>  	if (vcpu->xen.vcpu_info_cache.active)
>  		kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
> -					offsetof(struct compat_vcpu_info, time));
> +					offsetof(struct compat_vcpu_info, time),
> +					xen_pvclock_tsc_unstable);
>  	if (vcpu->xen.vcpu_time_info_cache.active)
> -		kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0);
> +		kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0,
> +					xen_pvclock_tsc_unstable);
>  	kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
>  	return 0;

Please rebase, this conflicts with commit ee11ab6bb04e ("KVM: X86: Reduce size of
kvm_vcpu_arch structure when CONFIG_KVM_XEN=n").  I can solve the conflict, but
I really shouldn't have to.

Also, your version of git should support --base, which makes life much easier for
others when non-trivial conflicts are encountered.  From maintainer-kvm-x86.rst:

 : Git Base
 : ~~~~~~~~
 : If you are using git version 2.9.0 or later (Googlers, this is all of you!),
 : use ``git format-patch`` with the ``--base`` flag to automatically include the
 : base tree information in the generated patches.
 : 
 : Note, ``--base=auto`` works as expected if and only if a branch's upstream is
 : set to the base topic branch, e.g. it will do the wrong thing if your upstream
 : is set to your personal repository for backup purposes.  An alternative "auto"
 : solution is to derive the names of your development branches based on their
 : KVM x86 topic, and feed that into ``--base``.  E.g. ``x86/pmu/my_branch_name``,
 : and then write a small wrapper to extract ``pmu`` from the current branch name
 : to yield ``--base=x/pmu``, where ``x`` is whatever name your repository uses to
 : track the KVM x86 remote.

> @@ -4531,7 +4548,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>  		    KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
>  		    KVM_XEN_HVM_CONFIG_SHARED_INFO |
>  		    KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
> -		    KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
> +		    KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
> +		    KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
>  		if (sched_info_on())
>  			r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
>  			     KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
> diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
> index 40edf4d1974c..7699d94f190b 100644
> --- a/arch/x86/kvm/xen.c
> +++ b/arch/x86/kvm/xen.c
> @@ -1111,9 +1111,12 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
>  
>  int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
>  {
> +	bool update_pvclock = false;
> +
>  	/* Only some feature flags need to be *enabled* by userspace */
>  	u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
> -		KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
> +		KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
> +		KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
>  
>  	if (xhc->flags & ~permitted_flags)
>  		return -EINVAL;
> @@ -1134,9 +1137,19 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
>  	else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
>  		static_branch_slow_dec_deferred(&kvm_xen_enabled);
>  
> +	if ((kvm->arch.xen_hvm_config.flags &
> +	     KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE) !=
> +	    (xhc->flags &
> +	     KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE))
> +		update_pvclock = true;

Rather than a boolean and the above, which is a bit hard to parse, what about
taking a snapshot of the old flags and then doing an XOR?

	/* Only some feature flags need to be *enabled* by userspace */
	u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
		KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
		KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
	u32 old_flags;

	if (xhc->flags & ~permitted_flags)
		return -EINVAL;

	/*
	 * With hypercall interception the kernel generates its own
	 * hypercall page so it must not be provided.
	 */
	if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) &&
	    (xhc->blob_addr_32 || xhc->blob_addr_64 ||
	     xhc->blob_size_32 || xhc->blob_size_64))
		return -EINVAL;

	mutex_lock(&kvm->arch.xen.xen_lock);

	if (xhc->msr && !kvm->arch.xen_hvm_config.msr)
		static_branch_inc(&kvm_xen_enabled.key);
	else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
		static_branch_slow_dec_deferred(&kvm_xen_enabled);

	old_flags = kvm->arch.xen_hvm_config.flags;
	memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));

	mutex_unlock(&kvm->arch.xen.xen_lock);

	if ((old_flags ^ xhc->flags) & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE)
		kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);

	return 0;
  
Paul Durrant Nov. 1, 2023, 5:02 p.m. UTC | #2
On 01/11/2023 16:46, Sean Christopherson wrote:
> On Wed, Nov 01, 2023, Paul Durrant wrote:
>> @@ -3231,12 +3245,15 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
>>   	vcpu->hv_clock.flags = pvclock_flags;
>>   
>>   	if (vcpu->pv_time.active)
>> -		kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
>> +		kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false);
>> +
>>   	if (vcpu->xen.vcpu_info_cache.active)
>>   		kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
>> -					offsetof(struct compat_vcpu_info, time));
>> +					offsetof(struct compat_vcpu_info, time),
>> +					xen_pvclock_tsc_unstable);
>>   	if (vcpu->xen.vcpu_time_info_cache.active)
>> -		kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0);
>> +		kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0,
>> +					xen_pvclock_tsc_unstable);
>>   	kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
>>   	return 0;
> 
> Please rebase, this conflicts with commit ee11ab6bb04e ("KVM: X86: Reduce size of
> kvm_vcpu_arch structure when CONFIG_KVM_XEN=n").  I can solve the conflict, but
> I really shouldn't have to.
> 
> Also, your version of git should support --base, which makes life much easier for
> others when non-trivial conflicts are encountered.  From maintainer-kvm-x86.rst:
> 
>   : Git Base
>   : ~~~~~~~~
>   : If you are using git version 2.9.0 or later (Googlers, this is all of you!),
>   : use ``git format-patch`` with the ``--base`` flag to automatically include the
>   : base tree information in the generated patches.
>   :
>   : Note, ``--base=auto`` works as expected if and only if a branch's upstream is
>   : set to the base topic branch, e.g. it will do the wrong thing if your upstream
>   : is set to your personal repository for backup purposes.  An alternative "auto"
>   : solution is to derive the names of your development branches based on their
>   : KVM x86 topic, and feed that into ``--base``.  E.g. ``x86/pmu/my_branch_name``,
>   : and then write a small wrapper to extract ``pmu`` from the current branch name
>   : to yield ``--base=x/pmu``, where ``x`` is whatever name your repository uses to
>   : track the KVM x86 remote.
> 

Ok, I'll sort that out. Thanks for the tip.

>> @@ -4531,7 +4548,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>>   		    KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
>>   		    KVM_XEN_HVM_CONFIG_SHARED_INFO |
>>   		    KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
>> -		    KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
>> +		    KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
>> +		    KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
>>   		if (sched_info_on())
>>   			r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
>>   			     KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
>> diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
>> index 40edf4d1974c..7699d94f190b 100644
>> --- a/arch/x86/kvm/xen.c
>> +++ b/arch/x86/kvm/xen.c
>> @@ -1111,9 +1111,12 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
>>   
>>   int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
>>   {
>> +	bool update_pvclock = false;
>> +
>>   	/* Only some feature flags need to be *enabled* by userspace */
>>   	u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
>> -		KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
>> +		KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
>> +		KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
>>   
>>   	if (xhc->flags & ~permitted_flags)
>>   		return -EINVAL;
>> @@ -1134,9 +1137,19 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
>>   	else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
>>   		static_branch_slow_dec_deferred(&kvm_xen_enabled);
>>   
>> +	if ((kvm->arch.xen_hvm_config.flags &
>> +	     KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE) !=
>> +	    (xhc->flags &
>> +	     KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE))
>> +		update_pvclock = true;
> 
> Rather than a boolean and the above, which is a bit hard to parse, what about
> taking a snapshot of the old flags and then doing an XOR?
> 
> 	/* Only some feature flags need to be *enabled* by userspace */
> 	u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
> 		KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
> 		KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
> 	u32 old_flags;
> 
> 	if (xhc->flags & ~permitted_flags)
> 		return -EINVAL;
> 
> 	/*
> 	 * With hypercall interception the kernel generates its own
> 	 * hypercall page so it must not be provided.
> 	 */
> 	if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) &&
> 	    (xhc->blob_addr_32 || xhc->blob_addr_64 ||
> 	     xhc->blob_size_32 || xhc->blob_size_64))
> 		return -EINVAL;
> 
> 	mutex_lock(&kvm->arch.xen.xen_lock);
> 
> 	if (xhc->msr && !kvm->arch.xen_hvm_config.msr)
> 		static_branch_inc(&kvm_xen_enabled.key);
> 	else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
> 		static_branch_slow_dec_deferred(&kvm_xen_enabled);
> 
> 	old_flags = kvm->arch.xen_hvm_config.flags;
> 	memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));
> 
> 	mutex_unlock(&kvm->arch.xen.xen_lock);
> 
> 	if ((old_flags ^ xhc->flags) & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE)
> 		kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
> 
> 	return 0;

Sure, I can do it that way if you prefer.

   Paul
  

Patch

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 21a7578142a1..9752a01270df 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -8252,6 +8252,7 @@  PVHVM guests. Valid flags are::
   #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL		(1 << 4)
   #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND		(1 << 5)
   #define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG	(1 << 6)
+  #define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE	(1 << 7)
 
 The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG
 ioctl is available, for the guest to set its hypercall page.
@@ -8295,6 +8296,11 @@  behave more correctly, not using the XEN_RUNSTATE_UPDATE flag until/unless
 specifically enabled (by the guest making the hypercall, causing the VMM
 to enable the KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG attribute).
 
+The KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE flag indicates that KVM supports
+clearing the PVCLOCK_TSC_STABLE_BIT flag in Xen pvclock sources. This will be
+done when the KVM_CAP_XEN_HVM ioctl sets the
+KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE flag.
+
 8.31 KVM_CAP_PPC_MULTITCE
 -------------------------
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 41cce5031126..4ad17ad0fc0c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3096,7 +3096,8 @@  u64 get_kvmclock_ns(struct kvm *kvm)
 
 static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
 				    struct gfn_to_pfn_cache *gpc,
-				    unsigned int offset)
+				    unsigned int offset,
+				    bool force_tsc_unstable)
 {
 	struct kvm_vcpu_arch *vcpu = &v->arch;
 	struct pvclock_vcpu_time_info *guest_hv_clock;
@@ -3133,6 +3134,10 @@  static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
 	}
 
 	memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock));
+
+	if (force_tsc_unstable)
+		guest_hv_clock->flags &= ~PVCLOCK_TSC_STABLE_BIT;
+
 	smp_wmb();
 
 	guest_hv_clock->version = ++vcpu->hv_clock.version;
@@ -3154,6 +3159,15 @@  static int kvm_guest_time_update(struct kvm_vcpu *v)
 	u8 pvclock_flags;
 	bool use_master_clock;
 
+	/*
+	 * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless
+	 * explicitly told to use TSC as its clocksource Xen will not set this bit.
+	 * This default behaviour led to bugs in some guest kernels which cause
+	 * problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags.
+	 */
+	bool xen_pvclock_tsc_unstable =
+		ka->xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
+
 	kernel_ns = 0;
 	host_tsc = 0;
 
@@ -3231,12 +3245,15 @@  static int kvm_guest_time_update(struct kvm_vcpu *v)
 	vcpu->hv_clock.flags = pvclock_flags;
 
 	if (vcpu->pv_time.active)
-		kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
+		kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false);
+
 	if (vcpu->xen.vcpu_info_cache.active)
 		kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
-					offsetof(struct compat_vcpu_info, time));
+					offsetof(struct compat_vcpu_info, time),
+					xen_pvclock_tsc_unstable);
 	if (vcpu->xen.vcpu_time_info_cache.active)
-		kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0);
+		kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0,
+					xen_pvclock_tsc_unstable);
 	kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
 	return 0;
 }
@@ -4531,7 +4548,8 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		    KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
 		    KVM_XEN_HVM_CONFIG_SHARED_INFO |
 		    KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
-		    KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
+		    KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
+		    KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
 		if (sched_info_on())
 			r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
 			     KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 40edf4d1974c..7699d94f190b 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -1111,9 +1111,12 @@  int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
 
 int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
 {
+	bool update_pvclock = false;
+
 	/* Only some feature flags need to be *enabled* by userspace */
 	u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
-		KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
+		KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
+		KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
 
 	if (xhc->flags & ~permitted_flags)
 		return -EINVAL;
@@ -1134,9 +1137,19 @@  int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
 	else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
 		static_branch_slow_dec_deferred(&kvm_xen_enabled);
 
+	if ((kvm->arch.xen_hvm_config.flags &
+	     KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE) !=
+	    (xhc->flags &
+	     KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE))
+		update_pvclock = true;
+
 	memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));
 
 	mutex_unlock(&kvm->arch.xen.xen_lock);
+
+	if (update_pvclock)
+		kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
+
 	return 0;
 }
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 13065dd96132..e21b53e8358d 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1282,6 +1282,7 @@  struct kvm_x86_mce {
 #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL	(1 << 4)
 #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND		(1 << 5)
 #define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG	(1 << 6)
+#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE	(1 << 7)
 
 struct kvm_xen_hvm_config {
 	__u32 flags;