[v3] KVM x86/xen: add an override for PVCLOCK_TSC_STABLE_BIT
Commit Message
From: Paul Durrant <pdurrant@amazon.com>
Unless explicitly told to do so (by passing 'clocksource=tsc' and
'tsc=stable:socket', and then jumping through some hoops concerning
potential CPU hotplug) Xen will never use TSC as its clocksource.
Hence, by default, a Xen guest will not see PVCLOCK_TSC_STABLE_BIT set
in either the primary or secondary pvclock memory areas. This has
led to bugs in some guest kernels which only become evident if
PVCLOCK_TSC_STABLE_BIT *is* set in the pvclocks. Hence, to support
such guests, give the VMM a new Xen HVM config flag to tell KVM to
forcibly clear the bit in the Xen pvclocks.
Signed-off-by: Paul Durrant <pdurrant@amazon.com>
---
v3:
- Moved clearing of PVCLOCK_TSC_STABLE_BIT to the right side of the
memcpy().
- Added an all-vCPUs KVM_REQ_CLOCK_UPDATE when the HVM config
flag is changed.
---
Documentation/virt/kvm/api.rst | 6 ++++++
arch/x86/kvm/x86.c | 28 +++++++++++++++++++++++-----
arch/x86/kvm/xen.c | 15 ++++++++++++++-
include/uapi/linux/kvm.h | 1 +
4 files changed, 44 insertions(+), 6 deletions(-)
Comments
On Wed, Nov 01, 2023, Paul Durrant wrote:
> @@ -3231,12 +3245,15 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
> vcpu->hv_clock.flags = pvclock_flags;
>
> if (vcpu->pv_time.active)
> - kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
> + kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false);
> +
> if (vcpu->xen.vcpu_info_cache.active)
> kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
> - offsetof(struct compat_vcpu_info, time));
> + offsetof(struct compat_vcpu_info, time),
> + xen_pvclock_tsc_unstable);
> if (vcpu->xen.vcpu_time_info_cache.active)
> - kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0);
> + kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0,
> + xen_pvclock_tsc_unstable);
> kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
> return 0;
Please rebase, this conflicts with commit ee11ab6bb04e ("KVM: X86: Reduce size of
kvm_vcpu_arch structure when CONFIG_KVM_XEN=n"). I can solve the conflict, but
I really shouldn't have to.
Also, your version of git should support --base, which makes life much easier for
others when non-trivial conflicts are encountered. From maintainer-kvm-x86.rst:
: Git Base
: ~~~~~~~~
: If you are using git version 2.9.0 or later (Googlers, this is all of you!),
: use ``git format-patch`` with the ``--base`` flag to automatically include the
: base tree information in the generated patches.
:
: Note, ``--base=auto`` works as expected if and only if a branch's upstream is
: set to the base topic branch, e.g. it will do the wrong thing if your upstream
: is set to your personal repository for backup purposes. An alternative "auto"
: solution is to derive the names of your development branches based on their
: KVM x86 topic, and feed that into ``--base``. E.g. ``x86/pmu/my_branch_name``,
: and then write a small wrapper to extract ``pmu`` from the current branch name
: to yield ``--base=x/pmu``, where ``x`` is whatever name your repository uses to
: track the KVM x86 remote.
> @@ -4531,7 +4548,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
> KVM_XEN_HVM_CONFIG_SHARED_INFO |
> KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
> - KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
> + KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
> + KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
> if (sched_info_on())
> r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
> KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
> diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
> index 40edf4d1974c..7699d94f190b 100644
> --- a/arch/x86/kvm/xen.c
> +++ b/arch/x86/kvm/xen.c
> @@ -1111,9 +1111,12 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
>
> int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
> {
> + bool update_pvclock = false;
> +
> /* Only some feature flags need to be *enabled* by userspace */
> u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
> - KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
> + KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
> + KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
>
> if (xhc->flags & ~permitted_flags)
> return -EINVAL;
> @@ -1134,9 +1137,19 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
> else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
> static_branch_slow_dec_deferred(&kvm_xen_enabled);
>
> + if ((kvm->arch.xen_hvm_config.flags &
> + KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE) !=
> + (xhc->flags &
> + KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE))
> + update_pvclock = true;
Rather than a boolean and the above, which is a bit hard to parse, what about
taking a snapshot of the old flags and then doing an XOR?
/* Only some feature flags need to be *enabled* by userspace */
u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
u32 old_flags;
if (xhc->flags & ~permitted_flags)
return -EINVAL;
/*
* With hypercall interception the kernel generates its own
* hypercall page so it must not be provided.
*/
if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) &&
(xhc->blob_addr_32 || xhc->blob_addr_64 ||
xhc->blob_size_32 || xhc->blob_size_64))
return -EINVAL;
mutex_lock(&kvm->arch.xen.xen_lock);
if (xhc->msr && !kvm->arch.xen_hvm_config.msr)
static_branch_inc(&kvm_xen_enabled.key);
else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
static_branch_slow_dec_deferred(&kvm_xen_enabled);
old_flags = kvm->arch.xen_hvm_config.flags;
memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));
mutex_unlock(&kvm->arch.xen.xen_lock);
if ((old_flags ^ xhc->flags) & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE)
kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
return 0;
On 01/11/2023 16:46, Sean Christopherson wrote:
> On Wed, Nov 01, 2023, Paul Durrant wrote:
>> @@ -3231,12 +3245,15 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
>> vcpu->hv_clock.flags = pvclock_flags;
>>
>> if (vcpu->pv_time.active)
>> - kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
>> + kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false);
>> +
>> if (vcpu->xen.vcpu_info_cache.active)
>> kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
>> - offsetof(struct compat_vcpu_info, time));
>> + offsetof(struct compat_vcpu_info, time),
>> + xen_pvclock_tsc_unstable);
>> if (vcpu->xen.vcpu_time_info_cache.active)
>> - kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0);
>> + kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0,
>> + xen_pvclock_tsc_unstable);
>> kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
>> return 0;
>
> Please rebase, this conflicts with commit ee11ab6bb04e ("KVM: X86: Reduce size of
> kvm_vcpu_arch structure when CONFIG_KVM_XEN=n"). I can solve the conflict, but
> I really shouldn't have to.
>
> Also, your version of git should support --base, which makes life much easier for
> others when non-trivial conflicts are encountered. From maintainer-kvm-x86.rst:
>
> : Git Base
> : ~~~~~~~~
> : If you are using git version 2.9.0 or later (Googlers, this is all of you!),
> : use ``git format-patch`` with the ``--base`` flag to automatically include the
> : base tree information in the generated patches.
> :
> : Note, ``--base=auto`` works as expected if and only if a branch's upstream is
> : set to the base topic branch, e.g. it will do the wrong thing if your upstream
> : is set to your personal repository for backup purposes. An alternative "auto"
> : solution is to derive the names of your development branches based on their
> : KVM x86 topic, and feed that into ``--base``. E.g. ``x86/pmu/my_branch_name``,
> : and then write a small wrapper to extract ``pmu`` from the current branch name
> : to yield ``--base=x/pmu``, where ``x`` is whatever name your repository uses to
> : track the KVM x86 remote.
>
Ok, I'll sort that out. Thanks for the tip.
>> @@ -4531,7 +4548,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>> KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
>> KVM_XEN_HVM_CONFIG_SHARED_INFO |
>> KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
>> - KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
>> + KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
>> + KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
>> if (sched_info_on())
>> r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
>> KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
>> diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
>> index 40edf4d1974c..7699d94f190b 100644
>> --- a/arch/x86/kvm/xen.c
>> +++ b/arch/x86/kvm/xen.c
>> @@ -1111,9 +1111,12 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
>>
>> int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
>> {
>> + bool update_pvclock = false;
>> +
>> /* Only some feature flags need to be *enabled* by userspace */
>> u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
>> - KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
>> + KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
>> + KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
>>
>> if (xhc->flags & ~permitted_flags)
>> return -EINVAL;
>> @@ -1134,9 +1137,19 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
>> else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
>> static_branch_slow_dec_deferred(&kvm_xen_enabled);
>>
>> + if ((kvm->arch.xen_hvm_config.flags &
>> + KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE) !=
>> + (xhc->flags &
>> + KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE))
>> + update_pvclock = true;
>
> Rather than a boolean and the above, which is a bit hard to parse, what about
> taking a snapshot of the old flags and then doing an XOR?
>
> /* Only some feature flags need to be *enabled* by userspace */
> u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
> KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
> KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
> u32 old_flags;
>
> if (xhc->flags & ~permitted_flags)
> return -EINVAL;
>
> /*
> * With hypercall interception the kernel generates its own
> * hypercall page so it must not be provided.
> */
> if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) &&
> (xhc->blob_addr_32 || xhc->blob_addr_64 ||
> xhc->blob_size_32 || xhc->blob_size_64))
> return -EINVAL;
>
> mutex_lock(&kvm->arch.xen.xen_lock);
>
> if (xhc->msr && !kvm->arch.xen_hvm_config.msr)
> static_branch_inc(&kvm_xen_enabled.key);
> else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
> static_branch_slow_dec_deferred(&kvm_xen_enabled);
>
> old_flags = kvm->arch.xen_hvm_config.flags;
> memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));
>
> mutex_unlock(&kvm->arch.xen.xen_lock);
>
> if ((old_flags ^ xhc->flags) & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE)
> kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
>
> return 0;
Sure, I can do it that way if you prefer.
Paul
@@ -8252,6 +8252,7 @@ PVHVM guests. Valid flags are::
#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4)
#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5)
#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6)
+ #define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7)
The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG
ioctl is available, for the guest to set its hypercall page.
@@ -8295,6 +8296,11 @@ behave more correctly, not using the XEN_RUNSTATE_UPDATE flag until/unless
specifically enabled (by the guest making the hypercall, causing the VMM
to enable the KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG attribute).
+The KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE flag indicates that KVM supports
+clearing the PVCLOCK_TSC_STABLE_BIT flag in Xen pvclock sources. This will be
+done when the KVM_XEN_HVM_CONFIG ioctl sets the
+KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE flag.
+
8.31 KVM_CAP_PPC_MULTITCE
-------------------------
@@ -3096,7 +3096,8 @@ u64 get_kvmclock_ns(struct kvm *kvm)
static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
struct gfn_to_pfn_cache *gpc,
- unsigned int offset)
+ unsigned int offset,
+ bool force_tsc_unstable)
{
struct kvm_vcpu_arch *vcpu = &v->arch;
struct pvclock_vcpu_time_info *guest_hv_clock;
@@ -3133,6 +3134,10 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
}
memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock));
+
+ if (force_tsc_unstable)
+ guest_hv_clock->flags &= ~PVCLOCK_TSC_STABLE_BIT;
+
smp_wmb();
guest_hv_clock->version = ++vcpu->hv_clock.version;
@@ -3154,6 +3159,15 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
u8 pvclock_flags;
bool use_master_clock;
+ /*
+ * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT because,
+ * unless explicitly told to use TSC as its clocksource, Xen never sets it.
+ * This default behaviour has led to bugs in some guest kernels, which
+ * misbehave if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags.
+ */
+ bool xen_pvclock_tsc_unstable =
+ ka->xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
+
kernel_ns = 0;
host_tsc = 0;
@@ -3231,12 +3245,15 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hv_clock.flags = pvclock_flags;
if (vcpu->pv_time.active)
- kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
+ kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false);
+
if (vcpu->xen.vcpu_info_cache.active)
kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
- offsetof(struct compat_vcpu_info, time));
+ offsetof(struct compat_vcpu_info, time),
+ xen_pvclock_tsc_unstable);
if (vcpu->xen.vcpu_time_info_cache.active)
- kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0);
+ kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0,
+ xen_pvclock_tsc_unstable);
kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
return 0;
}
@@ -4531,7 +4548,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
KVM_XEN_HVM_CONFIG_SHARED_INFO |
KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
- KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
+ KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
+ KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
if (sched_info_on())
r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
@@ -1111,9 +1111,12 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
{
+ bool update_pvclock = false;
+
/* Only some feature flags need to be *enabled* by userspace */
u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
- KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
+ KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
+ KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
if (xhc->flags & ~permitted_flags)
return -EINVAL;
@@ -1134,9 +1137,19 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
static_branch_slow_dec_deferred(&kvm_xen_enabled);
+ if ((kvm->arch.xen_hvm_config.flags &
+ KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE) !=
+ (xhc->flags &
+ KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE))
+ update_pvclock = true;
+
memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));
mutex_unlock(&kvm->arch.xen.xen_lock);
+
+ if (update_pvclock)
+ kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
+
return 0;
}
@@ -1282,6 +1282,7 @@ struct kvm_x86_mce {
#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4)
#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5)
#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6)
+#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7)
struct kvm_xen_hvm_config {
__u32 flags;