[04/16] KVM: x86/mmu: Pass full 64-bit error code when handling page faults

Message ID 20240228024147.41573-5-seanjc@google.com
State New
Headers
Series KVM: x86/mmu: Page fault and MMIO cleanups |

Commit Message

Sean Christopherson Feb. 28, 2024, 2:41 a.m. UTC
  From: Isaku Yamahata <isaku.yamahata@intel.com>

Plumb the full 64-bit error code throughout the page fault handling code
so that KVM can use the upper 32 bits, e.g. SNP's PFERR_GUEST_ENC_MASK
will be used to determine whether or not a fault is private vs. shared.

Note, passing the 64-bit error code to FNAME(walk_addr)() does NOT change
the behavior of permission_fault() when invoked in the page fault path, as
KVM explicitly clears PFERR_IMPLICIT_ACCESS in kvm_mmu_page_fault().

Continue passing '0' from the async #PF worker, as guest_memfd() and thus
private memory doesn't support async page faults.

Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
[mdr: drop references/changes on rebase, update commit message]
Signed-off-by: Michael Roth <michael.roth@amd.com>
[sean: drop truncation in call to FNAME(walk_addr)(), rewrite changelog]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/mmu/mmu.c          | 3 +--
 arch/x86/kvm/mmu/mmu_internal.h | 4 ++--
 arch/x86/kvm/mmu/mmutrace.h     | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)
  

Comments

Dongli Zhang Feb. 28, 2024, 7:30 a.m. UTC | #1
On 2/27/24 18:41, Sean Christopherson wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> Plumb the full 64-bit error code throughout the page fault handling code
> so that KVM can use the upper 32 bits, e.g. SNP's PFERR_GUEST_ENC_MASK
> will be used to determine whether or not a fault is private vs. shared.
> 
> Note, passing the 64-bit error code to FNAME(walk_addr)() does NOT change
> the behavior of permission_fault() when invoked in the page fault path, as
> KVM explicitly clears PFERR_IMPLICIT_ACCESS in kvm_mmu_page_fault().

May this lead to a WARN_ON_ONCE?

5843 int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
u64 error_code,
5844                        void *insn, int insn_len)
5845 {
.. ...
5856          */
5857         if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS))
5858                 error_code &= ~PFERR_IMPLICIT_ACCESS;

> 
> Continue passing '0' from the async #PF worker, as guest_memfd() and thus

:s/guest_memfd()/guest_memfd/ ?

Dongli Zhang



> private memory doesn't support async page faults.
> 
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> [mdr: drop references/changes on rebase, update commit message]
> Signed-off-by: Michael Roth <michael.roth@amd.com>
> [sean: drop truncation in call to FNAME(walk_addr)(), rewrite changelog]
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/x86/kvm/mmu/mmu.c          | 3 +--
>  arch/x86/kvm/mmu/mmu_internal.h | 4 ++--
>  arch/x86/kvm/mmu/mmutrace.h     | 2 +-
>  3 files changed, 4 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index e2fd74e06ff8..408969ac1291 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -5860,8 +5860,7 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
>  	}
>  
>  	if (r == RET_PF_INVALID) {
> -		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
> -					  lower_32_bits(error_code), false,
> +		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false,
>  					  &emulation_type);
>  		if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
>  			return -EIO;
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 0eea6c5a824d..1fab1f2359b5 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -190,7 +190,7 @@ static inline bool is_nx_huge_page_enabled(struct kvm *kvm)
>  struct kvm_page_fault {
>  	/* arguments to kvm_mmu_do_page_fault.  */
>  	const gpa_t addr;
> -	const u32 error_code;
> +	const u64 error_code;
>  	const bool prefetch;
>  
>  	/* Derived from error_code.  */
> @@ -288,7 +288,7 @@ static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
>  }
>  
>  static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> -					u32 err, bool prefetch, int *emulation_type)
> +					u64 err, bool prefetch, int *emulation_type)
>  {
>  	struct kvm_page_fault fault = {
>  		.addr = cr2_or_gpa,
> diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
> index ae86820cef69..195d98bc8de8 100644
> --- a/arch/x86/kvm/mmu/mmutrace.h
> +++ b/arch/x86/kvm/mmu/mmutrace.h
> @@ -260,7 +260,7 @@ TRACE_EVENT(
>  	TP_STRUCT__entry(
>  		__field(int, vcpu_id)
>  		__field(gpa_t, cr2_or_gpa)
> -		__field(u32, error_code)
> +		__field(u64, error_code)
>  		__field(u64 *, sptep)
>  		__field(u64, old_spte)
>  		__field(u64, new_spte)
  
Sean Christopherson Feb. 28, 2024, 4:22 p.m. UTC | #2
On Tue, Feb 27, 2024, Dongli Zhang wrote:
> 
> 
> On 2/27/24 18:41, Sean Christopherson wrote:
> > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > 
> > Plumb the full 64-bit error code throughout the page fault handling code
> > so that KVM can use the upper 32 bits, e.g. SNP's PFERR_GUEST_ENC_MASK
> > will be used to determine whether or not a fault is private vs. shared.
> > 
> > Note, passing the 64-bit error code to FNAME(walk_addr)() does NOT change
> > the behavior of permission_fault() when invoked in the page fault path, as
> > KVM explicitly clears PFERR_IMPLICIT_ACCESS in kvm_mmu_page_fault().
> 
> May this lead to a WARN_ON_ONCE?
> 
> 5843 int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> u64 error_code,
> 5844                        void *insn, int insn_len)
> 5845 {
> ... ...
> 5856          */
> 5857         if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS))
> 5858                 error_code &= ~PFERR_IMPLICIT_ACCESS;

Nope, it shouldn't.  PFERR_IMPLICIT_ACCESS is a synthetic, KVM-defined flag, and
should never be in the error code passed to kvm_mmu_page_fault().  If the WARN
fires, it means hardware (specifically, AMD CPUs for #NPF) has started using the
bit for something, and that we need to update KVM to use a different bit.

> > Continue passing '0' from the async #PF worker, as guest_memfd() and thus
> 
> :s/guest_memfd()/guest_memfd/ ?

I've been styling it as guest_memfd() to make it look like a syscall, e.g. like
memfd_create(), when I'm talking about a file that was created by userspace, as
opposed to GUEST_MEMFD when I'm talking about the ioctl() itself.
  
Dongli Zhang Feb. 29, 2024, 1:32 p.m. UTC | #3
On 2/28/24 08:22, Sean Christopherson wrote:
> On Tue, Feb 27, 2024, Dongli Zhang wrote:
>>
>>
>> On 2/27/24 18:41, Sean Christopherson wrote:
>>> From: Isaku Yamahata <isaku.yamahata@intel.com>
>>>
>>> Plumb the full 64-bit error code throughout the page fault handling code
>>> so that KVM can use the upper 32 bits, e.g. SNP's PFERR_GUEST_ENC_MASK
>>> will be used to determine whether or not a fault is private vs. shared.
>>>
>>> Note, passing the 64-bit error code to FNAME(walk_addr)() does NOT change
>>> the behavior of permission_fault() when invoked in the page fault path, as
>>> KVM explicitly clears PFERR_IMPLICIT_ACCESS in kvm_mmu_page_fault().
>>
>> May this lead to a WARN_ON_ONCE?
>>
>> 5843 int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>> u64 error_code,
>> 5844                        void *insn, int insn_len)
>> 5845 {
>> ... ...
>> 5856          */
>> 5857         if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS))
>> 5858                 error_code &= ~PFERR_IMPLICIT_ACCESS;
> 
> Nope, it shouldn't.  PFERR_IMPLICIT_ACCESS is a synthetic, KVM-defined flag, and
> should never be in the error code passed to kvm_mmu_page_fault().  If the WARN
> fires, it means hardware (specifically, AMD CPUs for #NPF) has started using the
> bit for something, and that we need to update KVM to use a different bit.

Thank you very much for the explanation.

I see it is impossible to have PFERR_IMPLICIT_ACCESS set here, unless there is
AMD hardware issue or Intel page fault handler morphs the error_code erroneously.

I meant the above commit message confused me when I was reading it. E.g., how
about something like:

"Note, passing the 64-bit error code to FNAME(walk_addr)() does NOT change
the behavior of permission_fault() when invoked in the page fault path, as it
should never be in the error code because ...."


Thank you very much!

Dongli Zhang

> 
>>> Continue passing '0' from the async #PF worker, as guest_memfd() and thus
>>
>> :s/guest_memfd()/guest_memfd/ ?
> 
> I've been styling it as guest_memfd() to make it look like a syscall, e.g. like
> memfd_create(), when I'm talking about a file that was created by userspace, as
> opposed to GUEST_MEMFD when I'm talking about the ioctl() itself.
  

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index e2fd74e06ff8..408969ac1291 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5860,8 +5860,7 @@  int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
 	}
 
 	if (r == RET_PF_INVALID) {
-		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
-					  lower_32_bits(error_code), false,
+		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false,
 					  &emulation_type);
 		if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
 			return -EIO;
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 0eea6c5a824d..1fab1f2359b5 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -190,7 +190,7 @@  static inline bool is_nx_huge_page_enabled(struct kvm *kvm)
 struct kvm_page_fault {
 	/* arguments to kvm_mmu_do_page_fault.  */
 	const gpa_t addr;
-	const u32 error_code;
+	const u64 error_code;
 	const bool prefetch;
 
 	/* Derived from error_code.  */
@@ -288,7 +288,7 @@  static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
 }
 
 static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
-					u32 err, bool prefetch, int *emulation_type)
+					u64 err, bool prefetch, int *emulation_type)
 {
 	struct kvm_page_fault fault = {
 		.addr = cr2_or_gpa,
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index ae86820cef69..195d98bc8de8 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -260,7 +260,7 @@  TRACE_EVENT(
 	TP_STRUCT__entry(
 		__field(int, vcpu_id)
 		__field(gpa_t, cr2_or_gpa)
-		__field(u32, error_code)
+		__field(u64, error_code)
 		__field(u64 *, sptep)
 		__field(u64, old_spte)
 		__field(u64, new_spte)