[v2,1/2] x86/kvm/async_pf: Use separate percpu variable to track the enabling of asyncpf

Message ID 20231025055914.1201792-2-xiaoyao.li@intel.com
State New
Series x86/asyncpf: Fixes the size of asyncpf PV data and related docs

Commit Message

Xiaoyao Li Oct. 25, 2023, 5:59 a.m. UTC
  Refer to commit fd10cde9294f ("KVM paravirt: Add async PF initialization
to PV guest") and commit 344d9588a9df ("KVM: Add PV MSR to enable
asynchronous page faults delivery"). When asyncpf was introduced, the
intent was to define the shared PV data 'struct kvm_vcpu_pv_apf_data'
with a size of 64 bytes. However, it was mistakenly defined as 68 bytes,
which fails to fit in a cache line and makes the code inconsistent with
the documentation.
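
For illustration, not part of the patch: a minimal standalone sketch of the
size mismatch. Standard stdint types stand in for the kernel's __u32/__u8,
and the '_old' struct name is hypothetical.

  #include <stdint.h>

  /* Pre-patch layout of the shared PV data, mirroring
   * arch/x86/include/uapi/asm/kvm_para.h. */
  struct kvm_vcpu_pv_apf_data_old {
          uint32_t flags;
          uint32_t token;
          uint8_t  pad[56];
          uint32_t enabled;   /* written only by the guest, never read by KVM */
  };

  /* 4 + 4 + 56 + 4 = 68 bytes: 4 bytes past a 64-byte cache line. */
  _Static_assert(sizeof(struct kvm_vcpu_pv_apf_data_old) == 68,
                 "pre-patch struct is 68 bytes");

  /* With 'enabled' dropped, the struct is the intended 64 bytes. */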

The justification below is quoted from Sean[*]:

  KVM (the host side) has *never* read kvm_vcpu_pv_apf_data.enabled, and
  the documentation clearly states that enabling is based solely on the
  bit in the synthetic MSR.

  So rather than update the documentation, fix the goof by removing the
  enabled field and using a separate percpu variable instead.
  KVM-as-a-host obviously doesn't enforce anything or consume the size,
  and changing the header will only affect guests that are rebuilt against
  the new header, so there's no chance of ABI breakage between KVM and its
  guests. The only possible breakage is if some other hypervisor is
  emulating KVM's async #PF (LOL) and relies on the guest to set
  kvm_vcpu_pv_apf_data.enabled. But (a) I highly doubt such a hypervisor
  exists, (b) that would arguably be a violation of KVM's "spec", and
  (c) the worst case scenario is that the guest would simply lose async
  #PF functionality.

[*] https://lore.kernel.org/all/ZS7ERnnRqs8Fl0ZF@google.com/T/#u

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
---
 Documentation/virt/kvm/x86/msr.rst   |  1 -
 arch/x86/include/uapi/asm/kvm_para.h |  1 -
 arch/x86/kernel/kvm.c                | 11 ++++++-----
 3 files changed, 6 insertions(+), 7 deletions(-)
  

Comments

Vitaly Kuznetsov Oct. 25, 2023, 9:10 a.m. UTC | #1
Xiaoyao Li <xiaoyao.li@intel.com> writes:

> Refer to commit fd10cde9294f ("KVM paravirt: Add async PF initialization
> to PV guest") and commit 344d9588a9df ("KVM: Add PV MSR to enable
> asynchronous page faults delivery"). When asyncpf was introduced, the
> intent was to define the shared PV data 'struct kvm_vcpu_pv_apf_data'
> with a size of 64 bytes. However, it was mistakenly defined as 68 bytes,
> which fails to fit in a cache line and makes the code inconsistent with
> the documentation.

Oh, I actually thought it was done on purpose :-) 'enabled' is not
accessed by the host, its only purpose is to cache MSR_KVM_ASYNC_PF_EN.

>
> The justification below is quoted from Sean[*]:
>
>   KVM (the host side) has *never* read kvm_vcpu_pv_apf_data.enabled, and
>   the documentation clearly states that enabling is based solely on the
>   bit in the synthetic MSR.
>
>   So rather than update the documentation, fix the goof by removing the
>   enabled field and using a separate percpu variable instead.
>   KVM-as-a-host obviously doesn't enforce anything or consume the size,
>   and changing the header will only affect guests that are rebuilt against
>   the new header, so there's no chance of ABI breakage between KVM and its
>   guests. The only possible breakage is if some other hypervisor is
>   emulating KVM's async #PF (LOL) and relies on the guest to set
>   kvm_vcpu_pv_apf_data.enabled. But (a) I highly doubt such a hypervisor
>   exists, (b) that would arguably be a violation of KVM's "spec", and
>   (c) the worst case scenario is that the guest would simply lose async
>   #PF functionality.
>
> [*] https://lore.kernel.org/all/ZS7ERnnRqs8Fl0ZF@google.com/T/#u
>
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
> ---
>  Documentation/virt/kvm/x86/msr.rst   |  1 -
>  arch/x86/include/uapi/asm/kvm_para.h |  1 -
>  arch/x86/kernel/kvm.c                | 11 ++++++-----
>  3 files changed, 6 insertions(+), 7 deletions(-)
>
> diff --git a/Documentation/virt/kvm/x86/msr.rst b/Documentation/virt/kvm/x86/msr.rst
> index 9315fc385fb0..f6d70f99a1a7 100644
> --- a/Documentation/virt/kvm/x86/msr.rst
> +++ b/Documentation/virt/kvm/x86/msr.rst
> @@ -204,7 +204,6 @@ data:
>  		__u32 token;
>  
>  		__u8 pad[56];
> -		__u32 enabled;
>  	  };
>  
>  	Bits 5-4 of the MSR are reserved and should be zero. Bit 0 is set to 1
> diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
> index 6e64b27b2c1e..605899594ebb 100644
> --- a/arch/x86/include/uapi/asm/kvm_para.h
> +++ b/arch/x86/include/uapi/asm/kvm_para.h
> @@ -142,7 +142,6 @@ struct kvm_vcpu_pv_apf_data {
>  	__u32 token;
>  
>  	__u8 pad[56];
> -	__u32 enabled;
>  };
>  
>  #define KVM_PV_EOI_BIT 0
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index b8ab9ee5896c..388a3fdd3cad 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -65,6 +65,7 @@ static int __init parse_no_stealacc(char *arg)
>  
>  early_param("no-steal-acc", parse_no_stealacc);
>  
> +static DEFINE_PER_CPU_READ_MOSTLY(bool, async_pf_enabled);

Would it make a difference if we replace this with a cpumask? I realize
that we need to access it on all CPUs from hot paths, but this mask will
rarely change, so maybe there's no real performance hit?
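
For illustration, a rough sketch of the cpumask variant being floated here;
'async_pf_mask' and the helper names are hypothetical, and the cpumask API
is from <linux/cpumask.h>:

	static struct cpumask async_pf_mask __read_mostly;

	/* Slow path: set/cleared only when a CPU enables or disables async PF. */
	static void async_pf_mark_enabled(bool enable)
	{
		if (enable)
			cpumask_set_cpu(smp_processor_id(), &async_pf_mask);
		else
			cpumask_clear_cpu(smp_processor_id(), &async_pf_mask);
	}

	/* Hot path: test one bit of a shared, rarely-written bitmap. */
	static bool async_pf_is_enabled(void)
	{
		return cpumask_test_cpu(smp_processor_id(), &async_pf_mask);
	}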

>  static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
>  DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
>  static int has_steal_clock = 0;
> @@ -244,7 +245,7 @@ noinstr u32 kvm_read_and_reset_apf_flags(void)
>  {
>  	u32 flags = 0;
>  
> -	if (__this_cpu_read(apf_reason.enabled)) {
> +	if (__this_cpu_read(async_pf_enabled)) {
>  		flags = __this_cpu_read(apf_reason.flags);
>  		__this_cpu_write(apf_reason.flags, 0);
>  	}
> @@ -295,7 +296,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
>  
>  	inc_irq_stat(irq_hv_callback_count);
>  
> -	if (__this_cpu_read(apf_reason.enabled)) {
> +	if (__this_cpu_read(async_pf_enabled)) {
>  		token = __this_cpu_read(apf_reason.token);
>  		kvm_async_pf_task_wake(token);
>  		__this_cpu_write(apf_reason.token, 0);
> @@ -362,7 +363,7 @@ static void kvm_guest_cpu_init(void)
>  		wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);
>  
>  		wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
> -		__this_cpu_write(apf_reason.enabled, 1);
> +		__this_cpu_write(async_pf_enabled, 1);

As 'async_pf_enabled' is bool, it would probably be more natural to
write

	__this_cpu_write(async_pf_enabled, true);

>  		pr_debug("setup async PF for cpu %d\n", smp_processor_id());
>  	}
>  
> @@ -383,11 +384,11 @@ static void kvm_guest_cpu_init(void)
>  
>  static void kvm_pv_disable_apf(void)
>  {
> -	if (!__this_cpu_read(apf_reason.enabled))
> +	if (!__this_cpu_read(async_pf_enabled))
>  		return;
>  
>  	wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
> -	__this_cpu_write(apf_reason.enabled, 0);
> +	__this_cpu_write(async_pf_enabled, 0);

... and 'false' here.

>  
>  	pr_debug("disable async PF for cpu %d\n", smp_processor_id());
>  }
  
Sean Christopherson Oct. 25, 2023, 2:22 p.m. UTC | #2
On Wed, Oct 25, 2023, Vitaly Kuznetsov wrote:
> Xiaoyao Li <xiaoyao.li@intel.com> writes:
> > diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> > index b8ab9ee5896c..388a3fdd3cad 100644
> > --- a/arch/x86/kernel/kvm.c
> > +++ b/arch/x86/kernel/kvm.c
> > @@ -65,6 +65,7 @@ static int __init parse_no_stealacc(char *arg)
> >  
> >  early_param("no-steal-acc", parse_no_stealacc);
> >  
> > +static DEFINE_PER_CPU_READ_MOSTLY(bool, async_pf_enabled);
> 
> Would it make a difference if we replace this with a cpumask? I realize
> that we need to access it on all CPUs from hot paths, but this mask will
> rarely change, so maybe there's no real performance hit?

FWIW, I personally prefer per-CPU booleans from a readability perspective.  I
doubt there is a meaningful performance difference for a bitmap vs. individual
booleans; the check is already gated by a static key, i.e. kernels that are NOT
running as KVM guests don't care.
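
The static-key gating mentioned here is the guest-side pattern sketched below
(a paraphrase of the kvm_handle_async_pf() wrapper, not a verbatim quote):

	/* Enabled only when running as a KVM guest with async #PF set up. */
	DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);

	/* #PF entry path: on non-KVM kernels the branch is patched out, so
	 * neither apf_reason nor any enabled flag is ever read. */
	static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token)
	{
		if (static_branch_unlikely(&kvm_async_pf_enabled))
			return __kvm_handle_async_pf(regs, token);

		return false;
	}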

Actually, if there are performance gains to be had, optimizing kvm_read_and_reset_apf_flags()
to read the "enabled" flag if and only if it's necessary is a more likely candidate.
Assuming the host isn't being malicious/stupid, then apf_reason.flags will be '0'
if PV async #PFs are disabled.  The only question is whether or not apf_reason.flags
is predictable enough for the CPU.

Aha!  In practice, the CPU already needs to resolve a branch based on apf_reason.flags;
it's just "hidden" up in __kvm_handle_async_pf().

If we really want to micro-optimize, provide an __always_inline inner helper so
that __kvm_handle_async_pf() doesn't need to make a CALL just to read the flags.
Then in the common case where a #PF isn't due to the host swapping out a page,
the paravirt happy path doesn't need a taken branch and never reads the enabled
variable.  E.g. the below generates:

   0xffffffff81939ed0 <+0>:	41 54              	push   %r12
   0xffffffff81939ed2 <+2>:	31 c0              	xor    %eax,%eax
   0xffffffff81939ed4 <+4>:	55                 	push   %rbp
   0xffffffff81939ed5 <+5>:	53                 	push   %rbx
   0xffffffff81939ed6 <+6>:	48 83 ec 08        	sub    $0x8,%rsp
   0xffffffff81939eda <+10>:	65 8b 2d df 81 6f 7e	mov    %gs:0x7e6f81df(%rip),%ebp        # 0x320c0 <apf_reason>
   0xffffffff81939ee1 <+17>:	85 ed              	test   %ebp,%ebp
   0xffffffff81939ee3 <+19>:	75 09              	jne    0xffffffff81939eee <__kvm_handle_async_pf+30>
   0xffffffff81939ee5 <+21>:	48 83 c4 08        	add    $0x8,%rsp
   0xffffffff81939ee9 <+25>:	5b                 	pop    %rbx
   0xffffffff81939eea <+26>:	5d                 	pop    %rbp
   0xffffffff81939eeb <+27>:	41 5c              	pop    %r12
   0xffffffff81939eed <+29>:	c3                 	ret


diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index b8ab9ee5896c..b24133dc0731 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -240,22 +240,29 @@ void kvm_async_pf_task_wake(u32 token)
 }
 EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
 
-noinstr u32 kvm_read_and_reset_apf_flags(void)
+static __always_inline u32 __kvm_read_and_reset_apf_flags(void)
 {
-       u32 flags = 0;
+       u32 flags = __this_cpu_read(apf_reason.flags);
 
-       if (__this_cpu_read(apf_reason.enabled)) {
-               flags = __this_cpu_read(apf_reason.flags);
-               __this_cpu_write(apf_reason.flags, 0);
+       if (unlikely(flags)) {
+               if (likely(__this_cpu_read(apf_reason.enabled)))
+                       __this_cpu_write(apf_reason.flags, 0);
+               else
+                       flags = 0;
        }
 
        return flags;
 }
+
+u32 kvm_read_and_reset_apf_flags(void)
+{
+       return __kvm_read_and_reset_apf_flags();
+}
 EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags);
 
 noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
 {
-       u32 flags = kvm_read_and_reset_apf_flags();
+       u32 flags = __kvm_read_and_reset_apf_flags();
        irqentry_state_t state;
 
        if (!flags)

> >  static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
> >  DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
> >  static int has_steal_clock = 0;
> > @@ -244,7 +245,7 @@ noinstr u32 kvm_read_and_reset_apf_flags(void)
> >  {
> >  	u32 flags = 0;
> >  
> > -	if (__this_cpu_read(apf_reason.enabled)) {
> > +	if (__this_cpu_read(async_pf_enabled)) {
> >  		flags = __this_cpu_read(apf_reason.flags);
> >  		__this_cpu_write(apf_reason.flags, 0);
> >  	}
> > @@ -295,7 +296,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
> >  
> >  	inc_irq_stat(irq_hv_callback_count);
> >  
> > -	if (__this_cpu_read(apf_reason.enabled)) {
> > +	if (__this_cpu_read(async_pf_enabled)) {
> >  		token = __this_cpu_read(apf_reason.token);
> >  		kvm_async_pf_task_wake(token);
> >  		__this_cpu_write(apf_reason.token, 0);
> > @@ -362,7 +363,7 @@ static void kvm_guest_cpu_init(void)
> >  		wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);
> >  
> >  		wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
> > -		__this_cpu_write(apf_reason.enabled, 1);
> > +		__this_cpu_write(async_pf_enabled, 1);
> 
> As 'async_pf_enabled' is bool, it would probably be more natural to
> write
> 
> 	__this_cpu_write(async_pf_enabled, true);

+1000
  
Xiaoyao Li Oct. 30, 2023, 5:17 a.m. UTC | #3
On 10/25/2023 5:10 PM, Vitaly Kuznetsov wrote:
> Xiaoyao Li <xiaoyao.li@intel.com> writes:
> 
>> Refer to commit fd10cde9294f ("KVM paravirt: Add async PF initialization
>> to PV guest") and commit 344d9588a9df ("KVM: Add PV MSR to enable
>> asynchronous page faults delivery"). When asyncpf was introduced, the
>> intent was to define the shared PV data 'struct kvm_vcpu_pv_apf_data'
>> with a size of 64 bytes. However, it was mistakenly defined as 68 bytes,
>> which fails to fit in a cache line and makes the code inconsistent with
>> the documentation.
> 
> Oh, I actually thought it was done on purpose :-) 'enabled' is not
> accessed by the host, its only purpose is to cache MSR_KVM_ASYNC_PF_EN.

I didn't find any clue to show it was done on purpose, so I thought it was
a mistake. Anyway, if it was in fact done on purpose and people still accept
it now, I can drop this patch and write another one to document that it's
intentional instead.

>>
>> The justification below is quoted from Sean[*]:
>>
>>    KVM (the host side) has *never* read kvm_vcpu_pv_apf_data.enabled, and
>>    the documentation clearly states that enabling is based solely on the
>>    bit in the synthetic MSR.
>>
>>    So rather than update the documentation, fix the goof by removing the
>>    enabled field and using a separate percpu variable instead.
>>    KVM-as-a-host obviously doesn't enforce anything or consume the size,
>>    and changing the header will only affect guests that are rebuilt against
>>    the new header, so there's no chance of ABI breakage between KVM and its
>>    guests. The only possible breakage is if some other hypervisor is
>>    emulating KVM's async #PF (LOL) and relies on the guest to set
>>    kvm_vcpu_pv_apf_data.enabled. But (a) I highly doubt such a hypervisor
>>    exists, (b) that would arguably be a violation of KVM's "spec", and
>>    (c) the worst case scenario is that the guest would simply lose async
>>    #PF functionality.
>>
>> [*] https://lore.kernel.org/all/ZS7ERnnRqs8Fl0ZF@google.com/T/#u
>>
>> Suggested-by: Sean Christopherson <seanjc@google.com>
>> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
>> ---
>>   Documentation/virt/kvm/x86/msr.rst   |  1 -
>>   arch/x86/include/uapi/asm/kvm_para.h |  1 -
>>   arch/x86/kernel/kvm.c                | 11 ++++++-----
>>   3 files changed, 6 insertions(+), 7 deletions(-)
>>
>> diff --git a/Documentation/virt/kvm/x86/msr.rst b/Documentation/virt/kvm/x86/msr.rst
>> index 9315fc385fb0..f6d70f99a1a7 100644
>> --- a/Documentation/virt/kvm/x86/msr.rst
>> +++ b/Documentation/virt/kvm/x86/msr.rst
>> @@ -204,7 +204,6 @@ data:
>>   		__u32 token;
>>   
>>   		__u8 pad[56];
>> -		__u32 enabled;
>>   	  };
>>   
>>   	Bits 5-4 of the MSR are reserved and should be zero. Bit 0 is set to 1
>> diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
>> index 6e64b27b2c1e..605899594ebb 100644
>> --- a/arch/x86/include/uapi/asm/kvm_para.h
>> +++ b/arch/x86/include/uapi/asm/kvm_para.h
>> @@ -142,7 +142,6 @@ struct kvm_vcpu_pv_apf_data {
>>   	__u32 token;
>>   
>>   	__u8 pad[56];
>> -	__u32 enabled;
>>   };
>>   
>>   #define KVM_PV_EOI_BIT 0
>> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
>> index b8ab9ee5896c..388a3fdd3cad 100644
>> --- a/arch/x86/kernel/kvm.c
>> +++ b/arch/x86/kernel/kvm.c
>> @@ -65,6 +65,7 @@ static int __init parse_no_stealacc(char *arg)
>>   
>>   early_param("no-steal-acc", parse_no_stealacc);
>>   
>> +static DEFINE_PER_CPU_READ_MOSTLY(bool, async_pf_enabled);
> 
> Would it make a difference if we replace this with a cpumask? I realize
> that we need to access it on all CPUs from hot paths, but this mask will
> rarely change, so maybe there's no real performance hit?
> 
>>   static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
>>   DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
>>   static int has_steal_clock = 0;
>> @@ -244,7 +245,7 @@ noinstr u32 kvm_read_and_reset_apf_flags(void)
>>   {
>>   	u32 flags = 0;
>>   
>> -	if (__this_cpu_read(apf_reason.enabled)) {
>> +	if (__this_cpu_read(async_pf_enabled)) {
>>   		flags = __this_cpu_read(apf_reason.flags);
>>   		__this_cpu_write(apf_reason.flags, 0);
>>   	}
>> @@ -295,7 +296,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
>>   
>>   	inc_irq_stat(irq_hv_callback_count);
>>   
>> -	if (__this_cpu_read(apf_reason.enabled)) {
>> +	if (__this_cpu_read(async_pf_enabled)) {
>>   		token = __this_cpu_read(apf_reason.token);
>>   		kvm_async_pf_task_wake(token);
>>   		__this_cpu_write(apf_reason.token, 0);
>> @@ -362,7 +363,7 @@ static void kvm_guest_cpu_init(void)
>>   		wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);
>>   
>>   		wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
>> -		__this_cpu_write(apf_reason.enabled, 1);
>> +		__this_cpu_write(async_pf_enabled, 1);
> 
> As 'async_pf_enabled' is bool, it would probably be more natural to
> write
> 
> 	__this_cpu_write(async_pf_enabled, true);
> 
>>   		pr_debug("setup async PF for cpu %d\n", smp_processor_id());
>>   	}
>>   
>> @@ -383,11 +384,11 @@ static void kvm_guest_cpu_init(void)
>>   
>>   static void kvm_pv_disable_apf(void)
>>   {
>> -	if (!__this_cpu_read(apf_reason.enabled))
>> +	if (!__this_cpu_read(async_pf_enabled))
>>   		return;
>>   
>>   	wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
>> -	__this_cpu_write(apf_reason.enabled, 0);
>> +	__this_cpu_write(async_pf_enabled, 0);
> 
> ... and 'false' here.

Sure, I can do it in a v3, if a v3 is needed.

>>   
>>   	pr_debug("disable async PF for cpu %d\n", smp_processor_id());
>>   }
>
  
Xiaoyao Li Oct. 30, 2023, 5:47 a.m. UTC | #4
On 10/25/2023 10:22 PM, Sean Christopherson wrote:
> On Wed, Oct 25, 2023, Vitaly Kuznetsov wrote:
>> Xiaoyao Li <xiaoyao.li@intel.com> writes:
>>> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
>>> index b8ab9ee5896c..388a3fdd3cad 100644
>>> --- a/arch/x86/kernel/kvm.c
>>> +++ b/arch/x86/kernel/kvm.c
>>> @@ -65,6 +65,7 @@ static int __init parse_no_stealacc(char *arg)
>>>   
>>>   early_param("no-steal-acc", parse_no_stealacc);
>>>   
>>> +static DEFINE_PER_CPU_READ_MOSTLY(bool, async_pf_enabled);
>>
>> Would it make a difference if we replace this with a cpumask? I realize
>> that we need to access it on all CPUs from hot paths, but this mask will
>> rarely change, so maybe there's no real performance hit?
> 
> FWIW, I personally prefer per-CPU booleans from a readability perspective.  I
> doubt there is a meaningful performance difference for a bitmap vs. individual
> booleans; the check is already gated by a static key, i.e. kernels that are NOT
> running as KVM guests don't care.

I agree with it.

> Actually, if there are performance gains to be had, optimizing kvm_read_and_reset_apf_flags()
> to read the "enabled" flag if and only if it's necessary is a more likely candidate.
> Assuming the host isn't being malicious/stupid, then apf_reason.flags will be '0'
> if PV async #PFs are disabled.  The only question is whether or not apf_reason.flags
> is predictable enough for the CPU.
> 
> Aha!  In practice, the CPU already needs to resolve a branch based on apf_reason.flags;
> it's just "hidden" up in __kvm_handle_async_pf().
> 
> If we really want to micro-optimize, provide an __always_inline inner helper so
> that __kvm_handle_async_pf() doesn't need to make a CALL just to read the flags.
> Then in the common case where a #PF isn't due to the host swapping out a page,
> the paravirt happy path doesn't need a taken branch and never reads the enabled
> variable.  E.g. the below generates:

If this is wanted, it can be a separate patch, unrelated to this series, I
think.

>     0xffffffff81939ed0 <+0>:	41 54              	push   %r12
>     0xffffffff81939ed2 <+2>:	31 c0              	xor    %eax,%eax
>     0xffffffff81939ed4 <+4>:	55                 	push   %rbp
>     0xffffffff81939ed5 <+5>:	53                 	push   %rbx
>     0xffffffff81939ed6 <+6>:	48 83 ec 08        	sub    $0x8,%rsp
>     0xffffffff81939eda <+10>:	65 8b 2d df 81 6f 7e	mov    %gs:0x7e6f81df(%rip),%ebp        # 0x320c0 <apf_reason>
>     0xffffffff81939ee1 <+17>:	85 ed              	test   %ebp,%ebp
>     0xffffffff81939ee3 <+19>:	75 09              	jne    0xffffffff81939eee <__kvm_handle_async_pf+30>
>     0xffffffff81939ee5 <+21>:	48 83 c4 08        	add    $0x8,%rsp
>     0xffffffff81939ee9 <+25>:	5b                 	pop    %rbx
>     0xffffffff81939eea <+26>:	5d                 	pop    %rbp
>     0xffffffff81939eeb <+27>:	41 5c              	pop    %r12
>     0xffffffff81939eed <+29>:	c3                 	ret
> 
> 
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index b8ab9ee5896c..b24133dc0731 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -240,22 +240,29 @@ void kvm_async_pf_task_wake(u32 token)
>   }
>   EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
>   
> -noinstr u32 kvm_read_and_reset_apf_flags(void)
> +static __always_inline u32 __kvm_read_and_reset_apf_flags(void)
>   {
> -       u32 flags = 0;
> +       u32 flags = __this_cpu_read(apf_reason.flags);
>   
> -       if (__this_cpu_read(apf_reason.enabled)) {
> -               flags = __this_cpu_read(apf_reason.flags);
> -               __this_cpu_write(apf_reason.flags, 0);
> +       if (unlikely(flags)) {
> +               if (likely(__this_cpu_read(apf_reason.enabled)))
> +                       __this_cpu_write(apf_reason.flags, 0);
> +               else
> +                       flags = 0;
>          }
>   
>          return flags;
>   }
> +
> +u32 kvm_read_and_reset_apf_flags(void)
> +{
> +       return __kvm_read_and_reset_apf_flags();
> +}
>   EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags);
>   
>   noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
>   {
> -       u32 flags = kvm_read_and_reset_apf_flags();
> +       u32 flags = __kvm_read_and_reset_apf_flags();
>          irqentry_state_t state;
>   
>          if (!flags)
> 
>>>   static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
>>>   DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
>>>   static int has_steal_clock = 0;
>>> @@ -244,7 +245,7 @@ noinstr u32 kvm_read_and_reset_apf_flags(void)
>>>   {
>>>   	u32 flags = 0;
>>>   
>>> -	if (__this_cpu_read(apf_reason.enabled)) {
>>> +	if (__this_cpu_read(async_pf_enabled)) {
>>>   		flags = __this_cpu_read(apf_reason.flags);
>>>   		__this_cpu_write(apf_reason.flags, 0);
>>>   	}
>>> @@ -295,7 +296,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
>>>   
>>>   	inc_irq_stat(irq_hv_callback_count);
>>>   
>>> -	if (__this_cpu_read(apf_reason.enabled)) {
>>> +	if (__this_cpu_read(async_pf_enabled)) {
>>>   		token = __this_cpu_read(apf_reason.token);
>>>   		kvm_async_pf_task_wake(token);
>>>   		__this_cpu_write(apf_reason.token, 0);
>>> @@ -362,7 +363,7 @@ static void kvm_guest_cpu_init(void)
>>>   		wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);
>>>   
>>>   		wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
>>> -		__this_cpu_write(apf_reason.enabled, 1);
>>> +		__this_cpu_write(async_pf_enabled, 1);
>>
>> As 'async_pf_enabled' is bool, it would probably be more natural to
>> write
>>
>> 	__this_cpu_write(async_pf_enabled, true);
> 
> +1000
  
Sean Christopherson Oct. 30, 2023, 11:17 p.m. UTC | #5
On Mon, Oct 30, 2023, Xiaoyao Li wrote:
> On 10/25/2023 10:22 PM, Sean Christopherson wrote:
> > On Wed, Oct 25, 2023, Vitaly Kuznetsov wrote:
> > > Xiaoyao Li <xiaoyao.li@intel.com> writes:
> > > > diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> > > > index b8ab9ee5896c..388a3fdd3cad 100644
> > > > --- a/arch/x86/kernel/kvm.c
> > > > +++ b/arch/x86/kernel/kvm.c
> > > > @@ -65,6 +65,7 @@ static int __init parse_no_stealacc(char *arg)
> > > >   early_param("no-steal-acc", parse_no_stealacc);
> > > > +static DEFINE_PER_CPU_READ_MOSTLY(bool, async_pf_enabled);
> > > 
> > > Would it make a difference if we replace this with a cpumask? I realize
> > > that we need to access it on all CPUs from hot paths, but this mask will
> > > rarely change, so maybe there's no real performance hit?
> > 
> > FWIW, I personally prefer per-CPU booleans from a readability perspective.  I
> > doubt there is a meaningful performance difference for a bitmap vs. individual
> > booleans; the check is already gated by a static key, i.e. kernels that are NOT
> > running as KVM guests don't care.
> 
> I agree with it.
> 
> > Actually, if there are performance gains to be had, optimizing kvm_read_and_reset_apf_flags()
> > to read the "enabled" flag if and only if it's necessary is a more likely candidate.
> > Assuming the host isn't being malicious/stupid, then apf_reason.flags will be '0'
> > if PV async #PFs are disabled.  The only question is whether or not apf_reason.flags
> > is predictable enough for the CPU.
> > 
> > Aha!  In practice, the CPU already needs to resolve a branch based on apf_reason.flags;
> > it's just "hidden" up in __kvm_handle_async_pf().
> > 
> > If we really want to micro-optimize, provide an __always_inline inner helper so
> > that __kvm_handle_async_pf() doesn't need to make a CALL just to read the flags.
> > Then in the common case where a #PF isn't due to the host swapping out a page,
> > the paravirt happy path doesn't need a taken branch and never reads the enabled
> > variable.  E.g. the below generates:
> 
> If this is wanted, it can be a separate patch, unrelated to this series,
> I think.

Yes, it's definitely beyond the scope of this series.
  

Patch

diff --git a/Documentation/virt/kvm/x86/msr.rst b/Documentation/virt/kvm/x86/msr.rst
index 9315fc385fb0..f6d70f99a1a7 100644
--- a/Documentation/virt/kvm/x86/msr.rst
+++ b/Documentation/virt/kvm/x86/msr.rst
@@ -204,7 +204,6 @@  data:
 		__u32 token;
 
 		__u8 pad[56];
-		__u32 enabled;
 	  };
 
 	Bits 5-4 of the MSR are reserved and should be zero. Bit 0 is set to 1
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 6e64b27b2c1e..605899594ebb 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -142,7 +142,6 @@  struct kvm_vcpu_pv_apf_data {
 	__u32 token;
 
 	__u8 pad[56];
-	__u32 enabled;
 };
 
 #define KVM_PV_EOI_BIT 0
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index b8ab9ee5896c..388a3fdd3cad 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -65,6 +65,7 @@  static int __init parse_no_stealacc(char *arg)
 
 early_param("no-steal-acc", parse_no_stealacc);
 
+static DEFINE_PER_CPU_READ_MOSTLY(bool, async_pf_enabled);
 static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
 DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
 static int has_steal_clock = 0;
@@ -244,7 +245,7 @@  noinstr u32 kvm_read_and_reset_apf_flags(void)
 {
 	u32 flags = 0;
 
-	if (__this_cpu_read(apf_reason.enabled)) {
+	if (__this_cpu_read(async_pf_enabled)) {
 		flags = __this_cpu_read(apf_reason.flags);
 		__this_cpu_write(apf_reason.flags, 0);
 	}
@@ -295,7 +296,7 @@  DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
 
 	inc_irq_stat(irq_hv_callback_count);
 
-	if (__this_cpu_read(apf_reason.enabled)) {
+	if (__this_cpu_read(async_pf_enabled)) {
 		token = __this_cpu_read(apf_reason.token);
 		kvm_async_pf_task_wake(token);
 		__this_cpu_write(apf_reason.token, 0);
@@ -362,7 +363,7 @@  static void kvm_guest_cpu_init(void)
 		wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);
 
 		wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
-		__this_cpu_write(apf_reason.enabled, 1);
+		__this_cpu_write(async_pf_enabled, 1);
 		pr_debug("setup async PF for cpu %d\n", smp_processor_id());
 	}
 
@@ -383,11 +384,11 @@  static void kvm_guest_cpu_init(void)
 
 static void kvm_pv_disable_apf(void)
 {
-	if (!__this_cpu_read(apf_reason.enabled))
+	if (!__this_cpu_read(async_pf_enabled))
 		return;
 
 	wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
-	__this_cpu_write(apf_reason.enabled, 0);
+	__this_cpu_write(async_pf_enabled, 0);
 
 	pr_debug("disable async PF for cpu %d\n", smp_processor_id());
 }