[PATCHv2,04/13] x86/kvm: Do not try to disable kvmclock if it was not enabled

Message ID 20231020151242.1814-5-kirill.shutemov@linux.intel.com
State New
Headers
Series x86/tdx: Add kexec support |

Commit Message

Kirill A. Shutemov Oct. 20, 2023, 3:12 p.m. UTC
  kvm_guest_cpu_offline() tries to disable kvmclock regardless of whether it is
present in the VM. This leads to a write to an MSR that doesn't exist on some
configurations, namely in a TDX guest:

	unchecked MSR access error: WRMSR to 0x12 (tried to write 0x0000000000000000)
	at rIP: 0xffffffff8110687c (kvmclock_disable+0x1c/0x30)

kvmclock enabling is gated by CLOCKSOURCE and CLOCKSOURCE2 KVM paravirt
features.

Do not disable kvmclock if it was not enabled.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Fixes: c02027b5742b ("x86/kvm: Disable kvmclock on all CPUs on shutdown")
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Wanpeng Li <wanpengli@tencent.com>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kernel/kvmclock.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)
  

Comments

Sean Christopherson Oct. 20, 2023, 3:32 p.m. UTC | #1
On Fri, Oct 20, 2023, Kirill A. Shutemov wrote:
> kvm_guest_cpu_offline() tries to disable kvmclock regardless if it is
> present in the VM. It leads to write to a MSR that doesn't exist on some
> configurations, namely in TDX guest:
> 
> 	unchecked MSR access error: WRMSR to 0x12 (tried to write 0x0000000000000000)
> 	at rIP: 0xffffffff8110687c (kvmclock_disable+0x1c/0x30)
> 
> kvmclock enabling is gated by CLOCKSOURCE and CLOCKSOURCE2 KVM paravirt
> features.
> 
> Do not disable kvmclock if it was not enabled.
> 
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Fixes: c02027b5742b ("x86/kvm: Disable kvmclock on all CPUs on shutdown")
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Wanpeng Li <wanpengli@tencent.com>
> Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
> Cc: Sean Christopherson <seanjc@google.com>
> ---

Reviewed-by: Sean Christopherson <seanjc@google.com>
  
Vitaly Kuznetsov Oct. 20, 2023, 3:41 p.m. UTC | #2
"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> writes:

> kvm_guest_cpu_offline() tries to disable kvmclock regardless if it is
> present in the VM. It leads to write to a MSR that doesn't exist on some
> configurations, namely in TDX guest:
>
> 	unchecked MSR access error: WRMSR to 0x12 (tried to write 0x0000000000000000)
> 	at rIP: 0xffffffff8110687c (kvmclock_disable+0x1c/0x30)
>
> kvmclock enabling is gated by CLOCKSOURCE and CLOCKSOURCE2 KVM paravirt
> features.
>
> Do not disable kvmclock if it was not enabled.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Fixes: c02027b5742b ("x86/kvm: Disable kvmclock on all CPUs on shutdown")
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Wanpeng Li <wanpengli@tencent.com>
> Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
> Cc: Sean Christopherson <seanjc@google.com>
> ---
>  arch/x86/kernel/kvmclock.c | 12 ++++++++----
>  1 file changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
> index fb8f52149be9..f2fff625576d 100644
> --- a/arch/x86/kernel/kvmclock.c
> +++ b/arch/x86/kernel/kvmclock.c
> @@ -24,8 +24,8 @@
>  
>  static int kvmclock __initdata = 1;
>  static int kvmclock_vsyscall __initdata = 1;
> -static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME;
> -static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK;
> +static int msr_kvm_system_time __ro_after_init;
> +static int msr_kvm_wall_clock __ro_after_init;
>  static u64 kvm_sched_clock_offset __ro_after_init;
>  
>  static int __init parse_no_kvmclock(char *arg)
> @@ -195,7 +195,8 @@ static void kvm_setup_secondary_clock(void)
>  
>  void kvmclock_disable(void)
>  {
> -	native_write_msr(msr_kvm_system_time, 0, 0);
> +	if (msr_kvm_system_time)
> +		native_write_msr(msr_kvm_system_time, 0, 0);
>  }
>  
>  static void __init kvmclock_init_mem(void)
> @@ -294,7 +295,10 @@ void __init kvmclock_init(void)
>  	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
>  		msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
>  		msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
> -	} else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
> +	} else if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
> +		msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
> +		msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
> +	} else {
>  		return;
>  	}

This should work, so

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>

but my personal preference would be to change kvm_guest_cpu_offline()
to check KVM features explicitly instead of checking MSRs against '0',
at least because it already does so for other features. Completely
untested:

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index b8ab9ee5896c..1ee49c98e70a 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -454,7 +454,9 @@ static void kvm_guest_cpu_offline(bool shutdown)
        kvm_pv_disable_apf();
        if (!shutdown)
                apf_task_wake_all();
-       kvmclock_disable();
+       if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2) ||
+           kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))
+               kvmclock_disable();
 }
  
Sean Christopherson Oct. 20, 2023, 5:07 p.m. UTC | #3
On Fri, Oct 20, 2023, Vitaly Kuznetsov wrote:
> > ---
> >  arch/x86/kernel/kvmclock.c | 12 ++++++++----
> >  1 file changed, 8 insertions(+), 4 deletions(-)
> >
> > diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
> > index fb8f52149be9..f2fff625576d 100644
> > --- a/arch/x86/kernel/kvmclock.c
> > +++ b/arch/x86/kernel/kvmclock.c
> > @@ -24,8 +24,8 @@
> >  
> >  static int kvmclock __initdata = 1;
> >  static int kvmclock_vsyscall __initdata = 1;
> > -static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME;
> > -static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK;
> > +static int msr_kvm_system_time __ro_after_init;
> > +static int msr_kvm_wall_clock __ro_after_init;
> >  static u64 kvm_sched_clock_offset __ro_after_init;
> >  
> >  static int __init parse_no_kvmclock(char *arg)
> > @@ -195,7 +195,8 @@ static void kvm_setup_secondary_clock(void)
> >  
> >  void kvmclock_disable(void)
> >  {
> > -	native_write_msr(msr_kvm_system_time, 0, 0);
> > +	if (msr_kvm_system_time)
> > +		native_write_msr(msr_kvm_system_time, 0, 0);
> >  }
> >  
> >  static void __init kvmclock_init_mem(void)
> > @@ -294,7 +295,10 @@ void __init kvmclock_init(void)
> >  	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
> >  		msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
> >  		msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
> > -	} else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
> > +	} else if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
> > +		msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
> > +		msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
> > +	} else {
> >  		return;
> >  	}
> 
> This should work, so
> 
> Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> 
> but my personal preference would be to change kvm_guest_cpu_offline()
> to check KVM features explicitly instead of checking MSRs against '0'
> at least becase it already does so for other features. Completely
> untested:
> 
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index b8ab9ee5896c..1ee49c98e70a 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -454,7 +454,9 @@ static void kvm_guest_cpu_offline(bool shutdown)
>         kvm_pv_disable_apf();
>         if (!shutdown)
>                 apf_task_wake_all();
> -       kvmclock_disable();
> +       if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2) ||
> +           kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))
> +               kvmclock_disable();
>  }

That would result in an unnecessary WRMSR in the case where kvmclock is disabled
on the command line.  It _should_ be benign given how the code is written, but
it's not impossible to imagine a scenario where someone disabled kvmclock in the
guest because of a hypervisor bug.  And the WRMSR would become a bogus write to
MSR 0x0 if someone made a "cleanup" to set msr_kvm_system_time if and only if
kvmclock is actually used, e.g. if someone made Kirill's change sans the check in
kvmclock_disable().
  
Vitaly Kuznetsov Oct. 23, 2023, 8:45 a.m. UTC | #4
Sean Christopherson <seanjc@google.com> writes:

> On Fri, Oct 20, 2023, Vitaly Kuznetsov wrote:
>> > ---
>> >  arch/x86/kernel/kvmclock.c | 12 ++++++++----
>> >  1 file changed, 8 insertions(+), 4 deletions(-)
>> >
>> > diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
>> > index fb8f52149be9..f2fff625576d 100644
>> > --- a/arch/x86/kernel/kvmclock.c
>> > +++ b/arch/x86/kernel/kvmclock.c
>> > @@ -24,8 +24,8 @@
>> >  
>> >  static int kvmclock __initdata = 1;
>> >  static int kvmclock_vsyscall __initdata = 1;
>> > -static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME;
>> > -static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK;
>> > +static int msr_kvm_system_time __ro_after_init;
>> > +static int msr_kvm_wall_clock __ro_after_init;
>> >  static u64 kvm_sched_clock_offset __ro_after_init;
>> >  
>> >  static int __init parse_no_kvmclock(char *arg)
>> > @@ -195,7 +195,8 @@ static void kvm_setup_secondary_clock(void)
>> >  
>> >  void kvmclock_disable(void)
>> >  {
>> > -	native_write_msr(msr_kvm_system_time, 0, 0);
>> > +	if (msr_kvm_system_time)
>> > +		native_write_msr(msr_kvm_system_time, 0, 0);
>> >  }
>> >  
>> >  static void __init kvmclock_init_mem(void)
>> > @@ -294,7 +295,10 @@ void __init kvmclock_init(void)
>> >  	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
>> >  		msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
>> >  		msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
>> > -	} else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
>> > +	} else if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
>> > +		msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
>> > +		msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
>> > +	} else {
>> >  		return;
>> >  	}
>> 
>> This should work, so
>> 
>> Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
>> 
>> but my personal preference would be to change kvm_guest_cpu_offline()
>> to check KVM features explicitly instead of checking MSRs against '0'
>> at least becase it already does so for other features. Completely
>> untested:
>> 
>> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
>> index b8ab9ee5896c..1ee49c98e70a 100644
>> --- a/arch/x86/kernel/kvm.c
>> +++ b/arch/x86/kernel/kvm.c
>> @@ -454,7 +454,9 @@ static void kvm_guest_cpu_offline(bool shutdown)
>>         kvm_pv_disable_apf();
>>         if (!shutdown)
>>                 apf_task_wake_all();
>> -       kvmclock_disable();
>> +       if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2) ||
>> +           kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))
>> +               kvmclock_disable();
>>  }
>
> That would result in an unnecessray WRMSR in the case where kvmclock is disabled
> on the command line.  It _should_ be benign given how the code is written, but
> it's not impossible to imagine a scenario where someone disabled kvmclock in the
> guest because of a hypervisor bug.  And the WRMSR would become a bogus write to
> MSR 0x0 if someone made a "cleanup" to set msr_kvm_system_time if and only if
> kvmclock is actually used, e.g. if someone made Kirill's change sans the check in
> kvmclock_disable().

True but we don't have such module params to disable other PV features so
e.g. KVM_FEATURE_PV_EOI/KVM_FEATURE_MIGRATION_CONTROL are written to
unconditionally. Wouldn't it be better to handle parameters like
'no-kvmclock' by clearing the feature bit in kvm_arch_para_features()'s
return value so all kvm_para_has_feature() calls for it just return
'false'? We can even do an umbrella "no-kvm-features=<mask>" to cover
all possible debug cases.
  
Sean Christopherson Oct. 23, 2023, 2:40 p.m. UTC | #5
On Mon, Oct 23, 2023, Vitaly Kuznetsov wrote:
> Sean Christopherson <seanjc@google.com> writes:
> 
> > On Fri, Oct 20, 2023, Vitaly Kuznetsov wrote:
> >> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> >> index b8ab9ee5896c..1ee49c98e70a 100644
> >> --- a/arch/x86/kernel/kvm.c
> >> +++ b/arch/x86/kernel/kvm.c
> >> @@ -454,7 +454,9 @@ static void kvm_guest_cpu_offline(bool shutdown)
> >>         kvm_pv_disable_apf();
> >>         if (!shutdown)
> >>                 apf_task_wake_all();
> >> -       kvmclock_disable();
> >> +       if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2) ||
> >> +           kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))
> >> +               kvmclock_disable();
> >>  }
> >
> > That would result in an unnecessray WRMSR in the case where kvmclock is disabled
> > on the command line.  It _should_ be benign given how the code is written, but
> > it's not impossible to imagine a scenario where someone disabled kvmclock in the
> > guest because of a hypervisor bug.  And the WRMSR would become a bogus write to
> > MSR 0x0 if someone made a "cleanup" to set msr_kvm_system_time if and only if
> > kvmclock is actually used, e.g. if someone made Kirill's change sans the check in
> > kvmclock_disable().
> 
> True but we don't have such module params to disable other PV features so
> e.g. KVM_FEATURE_PV_EOI/KVM_FEATURE_MIGRATION_CONTROL are written to
> unconditionally. Wouldn't it be better to handle parameters like
> 'no-kvmclock' by clearing the feature bit in kvm_arch_para_features()'s
> return value so all kvm_para_has_feature() calls for it just return
> 'false'? We can even do an umbreall "no-kvm-features=<mask>" to cover
> all possible debug cases.

I don't know that it's worth the effort, or that it'd even be a net positive.

Today, kvm_para_has_feature() goes through to CPUID every time, e.g. we'd have
to add a small bit of infrastructure to snapshot and clear bits, or rework things
to let kvm_para_has_feature peek at kvmclock.

And things like KVM_FEATURE_PV_TLB_FLUSH would be quite weird, e.g. we either end
up leaving the feature bit set while returning "false" for pv_tlb_flush_supported(),
or we'd clear the feature bit for a rather large number of conditions that don't
really have anything to do with KVM_FEATURE_PV_TLB_FLUSH being available.

  static bool pv_tlb_flush_supported(void)
  {
	return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
		!kvm_para_has_hint(KVM_HINTS_REALTIME) &&
		kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
		!boot_cpu_has(X86_FEATURE_MWAIT) &&
		(num_possible_cpus() != 1));
  }
  

Patch

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index fb8f52149be9..f2fff625576d 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -24,8 +24,8 @@ 
 
 static int kvmclock __initdata = 1;
 static int kvmclock_vsyscall __initdata = 1;
-static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME;
-static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK;
+static int msr_kvm_system_time __ro_after_init;
+static int msr_kvm_wall_clock __ro_after_init;
 static u64 kvm_sched_clock_offset __ro_after_init;
 
 static int __init parse_no_kvmclock(char *arg)
@@ -195,7 +195,8 @@  static void kvm_setup_secondary_clock(void)
 
 void kvmclock_disable(void)
 {
-	native_write_msr(msr_kvm_system_time, 0, 0);
+	if (msr_kvm_system_time)
+		native_write_msr(msr_kvm_system_time, 0, 0);
 }
 
 static void __init kvmclock_init_mem(void)
@@ -294,7 +295,10 @@  void __init kvmclock_init(void)
 	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
 		msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
 		msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
-	} else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
+	} else if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
+		msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
+		msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
+	} else {
 		return;
 	}