[RFC,v8,01/56] KVM: x86: Add 'fault_is_private' x86 op

Message ID 20230220183847.59159-2-michael.roth@amd.com
State New
Headers
Series Add AMD Secure Nested Paging (SEV-SNP) Hypervisor Support |

Commit Message

Michael Roth Feb. 20, 2023, 6:37 p.m. UTC
  This callback is used by the KVM MMU to check whether a #NPF was for a
private GPA or not.

In some cases the full 64-bit error code for the #NPF will be needed to
make this determination, so also update kvm_mmu_do_page_fault() to
accept the full 64-bit value so it can be plumbed through to the
callback.

Signed-off-by: Michael Roth <michael.roth@amd.com>
---
 arch/x86/include/asm/kvm-x86-ops.h |  1 +
 arch/x86/include/asm/kvm_host.h    |  1 +
 arch/x86/kvm/mmu/mmu.c             |  3 +--
 arch/x86/kvm/mmu/mmu_internal.h    | 37 +++++++++++++++++++++++++++---
 4 files changed, 37 insertions(+), 5 deletions(-)
  

Comments

Zhi Wang March 1, 2023, 10:25 a.m. UTC | #1
On Mon, 20 Feb 2023 12:37:52 -0600
Michael Roth <michael.roth@amd.com> wrote:

Basically, I don't think kvm_mmu_fault_is_private() is promising after going
through both SNP and TDX patches:

1) The fault path is critical. kvm_mmu_fault_is_private() always does a
gfn_to_memslot() no matter whether SNP/TDX is enabled or not. It might mostly hit
slots->last_used_slot, but the worst case is going through an RB-tree search.

Adding extra overhead on the generic fault path needs to be reconsidered
carefully. At least, check whether the guest is a CC (SNP/TDX) guest first.

2) Just after the gfn_to_memslot() in kvm_mmu_fault_is_private(), there is
another gfn_to_memslot():

static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                                        u64 err, bool prefetch)
{
        struct kvm_page_fault fault = {
                .addr = cr2_or_gpa,
                .error_code = lower_32_bits(err),
                .exec = err & PFERR_FETCH_MASK,
                .write = err & PFERR_WRITE_MASK,
                .present = err & PFERR_PRESENT_MASK,
                .rsvd = err & PFERR_RSVD_MASK,
                .user = err & PFERR_USER_MASK,
                .prefetch = prefetch,
                .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault),
                .nx_huge_page_workaround_enabled =
                        is_nx_huge_page_enabled(vcpu->kvm),

                .max_level = KVM_MAX_HUGEPAGE_LEVEL,
                .req_level = PG_LEVEL_4K,
                .goal_level = PG_LEVEL_4K,
                .is_private = kvm_mmu_fault_is_private(vcpu->kvm, cr2_or_gpa, err),
        };
        int r;

        if (vcpu->arch.mmu->root_role.direct) {
                fault.gfn = fault.addr >> PAGE_SHIFT;
		/* here */
                fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);
        }

I was wondering whether checking the private slot and kvm_slot_can_be_private()
is really necessary in kvm_mmu_fault_is_private().

TDP MMU expects fault.is_private to indicate whether the CPU thinks the fault
is private or not (for SNP, it is in the PF error code; for TDX, it is the shared
bit in the fault GPA). TDP MMU will check whether the slot is a private slot or
not, and leave it to userspace to handle the case where they think differently.

My points:

1) Resolving the PFER in kvm_x86_ops.fault_is_private and setting
fault.is_private is enough. The rest can be handled by the TDP MMU.

2) Put the kvm_x86_ops.fault_is_private in a separate patch so that TDX series
can include it. (64bit-error code part can stay in another patch)

> This callback is used by the KVM MMU to check whether a #NPF was for a
> private GPA or not.
> 
> In some cases the full 64-bit error code for the #NPF will be needed to
> make this determination, so also update kvm_mmu_do_page_fault() to
> accept the full 64-bit value so it can be plumbed through to the
> callback.
> 
> Signed-off-by: Michael Roth <michael.roth@amd.com>
> ---
>  arch/x86/include/asm/kvm-x86-ops.h |  1 +
>  arch/x86/include/asm/kvm_host.h    |  1 +
>  arch/x86/kvm/mmu/mmu.c             |  3 +--
>  arch/x86/kvm/mmu/mmu_internal.h    | 37 +++++++++++++++++++++++++++---
>  4 files changed, 37 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> index 8dc345cc6318..72183da010b8 100644
> --- a/arch/x86/include/asm/kvm-x86-ops.h
> +++ b/arch/x86/include/asm/kvm-x86-ops.h
> @@ -131,6 +131,7 @@ KVM_X86_OP(msr_filter_changed)
>  KVM_X86_OP(complete_emulated_msr)
>  KVM_X86_OP(vcpu_deliver_sipi_vector)
>  KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
> +KVM_X86_OP_OPTIONAL_RET0(fault_is_private);
>  
>  #undef KVM_X86_OP
>  #undef KVM_X86_OP_OPTIONAL
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index e552374f2357..f856d689dda0 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1643,6 +1643,7 @@ struct kvm_x86_ops {
>  
>  	void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
>  			     int root_level);
> +	bool (*fault_is_private)(struct kvm *kvm, gpa_t gpa, u64 error_code, bool *private_fault);
>  
>  	bool (*has_wbinvd_exit)(void);
>  
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index eda615f3951c..fb3f34b7391c 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -5724,8 +5724,7 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
>  	}
>  
>  	if (r == RET_PF_INVALID) {
> -		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
> -					  lower_32_bits(error_code), false);
> +		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false);
>  		if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
>  			return -EIO;
>  	}
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index e642d431df4b..557a001210df 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -231,6 +231,37 @@ struct kvm_page_fault {
>  
>  int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
>  
> +static bool kvm_mmu_fault_is_private(struct kvm *kvm, gpa_t gpa, u64 err)
> +{
> +	struct kvm_memory_slot *slot;
> +	bool private_fault = false;
> +	gfn_t gfn = gpa_to_gfn(gpa);
> +
> +	slot = gfn_to_memslot(kvm, gfn);
> +	if (!slot) {
> +		pr_debug("%s: no slot, GFN: 0x%llx\n", __func__, gfn);
> +		goto out;
> +	}
> +
> +	if (!kvm_slot_can_be_private(slot)) {
> +		pr_debug("%s: slot is not private, GFN: 0x%llx\n", __func__, gfn);
> +		goto out;
> +	}
> +
> +	if (static_call(kvm_x86_fault_is_private)(kvm, gpa, err, &private_fault))
> +		goto out;
> +
> +	/*
> +	 * Handling below is for UPM self-tests and guests that treat userspace
> +	 * as the authority on whether a fault should be private or not.
> +	 */
> +	private_fault = kvm_mem_is_private(kvm, gpa >> PAGE_SHIFT);
> +
> +out:
> +	pr_debug("%s: GFN: 0x%llx, private: %d\n", __func__, gfn, private_fault);
> +	return private_fault;
> +}
> +
>  /*
>   * Return values of handle_mmio_page_fault(), mmu.page_fault(), fast_page_fault(),
>   * and of course kvm_mmu_do_page_fault().
> @@ -262,11 +293,11 @@ enum {
>  };
>  
>  static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> -					u32 err, bool prefetch)
> +					u64 err, bool prefetch)
>  {
>  	struct kvm_page_fault fault = {
>  		.addr = cr2_or_gpa,
> -		.error_code = err,
> +		.error_code = lower_32_bits(err),
>  		.exec = err & PFERR_FETCH_MASK,
>  		.write = err & PFERR_WRITE_MASK,
>  		.present = err & PFERR_PRESENT_MASK,
> @@ -280,7 +311,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  		.max_level = KVM_MAX_HUGEPAGE_LEVEL,
>  		.req_level = PG_LEVEL_4K,
>  		.goal_level = PG_LEVEL_4K,
> -		.is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT),
> +		.is_private = kvm_mmu_fault_is_private(vcpu->kvm, cr2_or_gpa, err),
>  	};
>  	int r;
>
  
Isaku Yamahata March 18, 2023, 4:51 a.m. UTC | #2
On Mon, Feb 20, 2023 at 12:37:52PM -0600,
Michael Roth <michael.roth@amd.com> wrote:

> This callback is used by the KVM MMU to check whether a #NPF was for a
> private GPA or not.
> 
> In some cases the full 64-bit error code for the #NPF will be needed to
> make this determination, so also update kvm_mmu_do_page_fault() to
> accept the full 64-bit value so it can be plumbed through to the
> callback.

We can split 64-bit part into the independent patch.

> Signed-off-by: Michael Roth <michael.roth@amd.com>
> ---
>  arch/x86/include/asm/kvm-x86-ops.h |  1 +
>  arch/x86/include/asm/kvm_host.h    |  1 +
>  arch/x86/kvm/mmu/mmu.c             |  3 +--
>  arch/x86/kvm/mmu/mmu_internal.h    | 37 +++++++++++++++++++++++++++---
>  4 files changed, 37 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> index 8dc345cc6318..72183da010b8 100644
> --- a/arch/x86/include/asm/kvm-x86-ops.h
> +++ b/arch/x86/include/asm/kvm-x86-ops.h
> @@ -131,6 +131,7 @@ KVM_X86_OP(msr_filter_changed)
>  KVM_X86_OP(complete_emulated_msr)
>  KVM_X86_OP(vcpu_deliver_sipi_vector)
>  KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
> +KVM_X86_OP_OPTIONAL_RET0(fault_is_private);
>  
>  #undef KVM_X86_OP
>  #undef KVM_X86_OP_OPTIONAL
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index e552374f2357..f856d689dda0 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1643,6 +1643,7 @@ struct kvm_x86_ops {
>  
>  	void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
>  			     int root_level);
> +	bool (*fault_is_private)(struct kvm *kvm, gpa_t gpa, u64 error_code, bool *private_fault);
>  
>  	bool (*has_wbinvd_exit)(void);
>  
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index eda615f3951c..fb3f34b7391c 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -5724,8 +5724,7 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
>  	}
>  
>  	if (r == RET_PF_INVALID) {
> -		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
> -					  lower_32_bits(error_code), false);
> +		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false);
>  		if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
>  			return -EIO;
>  	}
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index e642d431df4b..557a001210df 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -231,6 +231,37 @@ struct kvm_page_fault {
>  
>  int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
>  
> +static bool kvm_mmu_fault_is_private(struct kvm *kvm, gpa_t gpa, u64 err)
> +{
> +	struct kvm_memory_slot *slot;
> +	bool private_fault = false;
> +	gfn_t gfn = gpa_to_gfn(gpa);
> +
> +	slot = gfn_to_memslot(kvm, gfn);
> +	if (!slot) {
> +		pr_debug("%s: no slot, GFN: 0x%llx\n", __func__, gfn);
> +		goto out;
> +	}
> +
> +	if (!kvm_slot_can_be_private(slot)) {
> +		pr_debug("%s: slot is not private, GFN: 0x%llx\n", __func__, gfn);
> +		goto out;
> +	}
> +
> +	if (static_call(kvm_x86_fault_is_private)(kvm, gpa, err, &private_fault))
> +		goto out;
> +
> +	/*
> +	 * Handling below is for UPM self-tests and guests that treat userspace
> +	 * as the authority on whether a fault should be private or not.
> +	 */
> +	private_fault = kvm_mem_is_private(kvm, gpa >> PAGE_SHIFT);
> +
> +out:
> +	pr_debug("%s: GFN: 0x%llx, private: %d\n", __func__, gfn, private_fault);
> +	return private_fault;
> +}
> +
>  /*
>   * Return values of handle_mmio_page_fault(), mmu.page_fault(), fast_page_fault(),
>   * and of course kvm_mmu_do_page_fault().
> @@ -262,11 +293,11 @@ enum {
>  };
>  
>  static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> -					u32 err, bool prefetch)
> +					u64 err, bool prefetch)
>  {
>  	struct kvm_page_fault fault = {
>  		.addr = cr2_or_gpa,
> -		.error_code = err,
> +		.error_code = lower_32_bits(err),
>  		.exec = err & PFERR_FETCH_MASK,
>  		.write = err & PFERR_WRITE_MASK,
>  		.present = err & PFERR_PRESENT_MASK,
> @@ -280,7 +311,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  		.max_level = KVM_MAX_HUGEPAGE_LEVEL,
>  		.req_level = PG_LEVEL_4K,
>  		.goal_level = PG_LEVEL_4K,
> -		.is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT),
> +		.is_private = kvm_mmu_fault_is_private(vcpu->kvm, cr2_or_gpa, err),

I don't think kvm_mmu_fault_is_private() is needed. It's too heavy. We can make
it its own. I.e. the following.

From b0f914a1a4d154f076c0294831ce9ef0df7eb3d3 Mon Sep 17 00:00:00 2001
Message-Id: <b0f914a1a4d154f076c0294831ce9ef0df7eb3d3.1679114841.git.isaku.yamahata@intel.com>
In-Reply-To: <428a676face7a06a90e59dca1c32941c9b6ee001.1679114841.git.isaku.yamahata@intel.com>
References: <428a676face7a06a90e59dca1c32941c9b6ee001.1679114841.git.isaku.yamahata@intel.com>
From: Isaku Yamahata <isaku.yamahata@intel.com>
Date: Fri, 17 Mar 2023 11:18:13 -0700
Subject: [PATCH 2/4] KVM: x86: Add 'fault_is_private' x86 op

This callback is used by the KVM MMU to check whether a KVM page fault was
for a private GPA or not.

Originally-by: Michael Roth <michael.roth@amd.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
 arch/x86/include/asm/kvm-x86-ops.h |  1 +
 arch/x86/include/asm/kvm_host.h    |  1 +
 arch/x86/kvm/mmu.h                 | 19 +++++++++++++++++++
 arch/x86/kvm/mmu/mmu_internal.h    |  2 +-
 arch/x86/kvm/x86.c                 |  8 ++++++++
 5 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index e1f57905c8fe..dc5f18ac0bd5 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -99,6 +99,7 @@ KVM_X86_OP_OPTIONAL_RET0(set_tss_addr)
 KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr)
 KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
 KVM_X86_OP(load_mmu_pgd)
+KVM_X86_OP(fault_is_private)
 KVM_X86_OP_OPTIONAL(link_private_spt)
 KVM_X86_OP_OPTIONAL(free_private_spt)
 KVM_X86_OP_OPTIONAL(split_private_spt)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 59196a80c3c8..0382d236fbf4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1730,6 +1730,7 @@ struct kvm_x86_ops {
 
 	void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
 			     int root_level);
+	bool (*fault_is_private)(struct kvm *kvm, gpa_t gpa, u64 error_code);
 
 	int (*link_private_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
 				void *private_spt);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 4aaef2132b97..1f21680b9b97 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -289,6 +289,25 @@ static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu,
 	return translate_nested_gpa(vcpu, gpa, access, exception);
 }
 
+static inline bool kvm_mmu_fault_is_private_default(struct kvm *kvm, gpa_t gpa, u64 err)
+{
+	struct kvm_memory_slot *slot;
+	gfn_t gfn = gpa_to_gfn(gpa);
+
+	slot = gfn_to_memslot(kvm, gfn);
+	if (!slot)
+		return false;
+
+	if (!kvm_slot_can_be_private(slot))
+		return false;
+
+	/*
+	 * Handling below is for UPM self-tests and guests that treat userspace
+	 * as the authority on whether a fault should be private or not.
+	 */
+	return kvm_mem_is_private(kvm, gfn);
+}
+
 static inline gfn_t kvm_gfn_shared_mask(const struct kvm *kvm)
 {
 #ifdef CONFIG_KVM_MMU_PRIVATE
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index bb5709f1cb57..6b54b069d1ed 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -445,7 +445,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 		.max_level = vcpu->kvm->arch.tdp_max_page_level,
 		.req_level = PG_LEVEL_4K,
 		.goal_level = PG_LEVEL_4K,
-		.is_private = kvm_is_private_gpa(vcpu->kvm, cr2_or_gpa),
+		.is_private = static_call(kvm_x86_fault_is_private)(vcpu->kvm, cr2_or_gpa, err),
 	};
 	int r;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fd14368c6bc8..0311ab450330 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9419,6 +9419,14 @@ static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
 #undef __KVM_X86_OP
 
 	kvm_pmu_ops_update(ops->pmu_ops);
+
+	/*
+	 * TODO: Once all backend fills this option, remove this and the default
+	 * function.
+	 */
+	if (!ops->runtime_ops->fault_is_private)
+		static_call_update(kvm_x86_fault_is_private,
+				   kvm_mmu_fault_is_private_default);
 }
 
 static int kvm_x86_check_processor_compatibility(void)
  
Isaku Yamahata March 18, 2023, 4:53 a.m. UTC | #3
On Mon, Feb 20, 2023 at 12:37:52PM -0600,
Michael Roth <michael.roth@amd.com> wrote:

> This callback is used by the KVM MMU to check whether a #NPF was for a
> private GPA or not.
> 
> In some cases the full 64-bit error code for the #NPF will be needed to
> make this determination, so also update kvm_mmu_do_page_fault() to
> accept the full 64-bit value so it can be plumbed through to the
> callback.

Here is a patch to change error code 64-bit.

From 428a676face7a06a90e59dca1c32941c9b6ee001 Mon Sep 17 00:00:00 2001
Message-Id: <428a676face7a06a90e59dca1c32941c9b6ee001.1679114841.git.isaku.yamahata@intel.com>
From: Isaku Yamahata <isaku.yamahata@intel.com>
Date: Fri, 17 Mar 2023 12:58:42 -0700
Subject: [PATCH 1/4] KVM: x86/mmu: Pass round full 64-bit error code for the
 KVM page fault

In some cases the full 64-bit error code for the KVM page fault will be
needed to make this determination, so also update kvm_mmu_do_page_fault()
to accept the full 64-bit value so it can be plumbed through to the
callback.

The upper 32 bits of error code is discarded at kvm_mmu_page_fault()
by lower_32_bits().  Now it's passed down as full 64 bits. It turns out
that only FNAME(page_fault) depends on it.  Move lower_32_bits() into
FNAME(page_fault).

The accesses of fault->error_code are as follows
- FNAME(page_fault): change to explicitly use lower_32_bits()
- kvm_tdp_page_fault(): explicit mask with PFERR_LEVEL_MASK
- kvm_mmu_page_fault(): explicit mask with PFERR_RSVD_MASK,
                        PFERR_NESTED_GUEST_PAGE
- mmutrace: changed u32 -> u64
- pgprintk(): change %x -> %llx

Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
 arch/x86/kvm/mmu.h              | 2 +-
 arch/x86/kvm/mmu/mmu.c          | 7 +++----
 arch/x86/kvm/mmu/mmu_internal.h | 4 ++--
 arch/x86/kvm/mmu/mmutrace.h     | 2 +-
 arch/x86/kvm/mmu/paging_tmpl.h  | 4 ++--
 5 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index de9c6b98c41b..4aaef2132b97 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -156,7 +156,7 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
 }
 
 kvm_pfn_t kvm_mmu_map_tdp_page(struct kvm_vcpu *vcpu, gpa_t gpa,
-			       u32 error_code, int max_level);
+			       u64 error_code, int max_level);
 
 /*
  * Check if a given access (described through the I/D, W/R and U/S bits of a
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 960609d72dd6..0ec94c72895c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4860,7 +4860,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
 				struct kvm_page_fault *fault)
 {
-	pgprintk("%s: gva %llx error %x\n", __func__, fault->addr, fault->error_code);
+	pgprintk("%s: gva %llx error %llx\n", __func__, fault->addr, fault->error_code);
 
 	/* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
 	fault->max_level = PG_LEVEL_2M;
@@ -4986,7 +4986,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 }
 
 kvm_pfn_t kvm_mmu_map_tdp_page(struct kvm_vcpu *vcpu, gpa_t gpa,
-			       u32 error_code, int max_level)
+			       u64 error_code, int max_level)
 {
 	int r;
 	struct kvm_page_fault fault = (struct kvm_page_fault) {
@@ -6238,8 +6238,7 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
 	}
 
 	if (r == RET_PF_INVALID) {
-		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
-					  lower_32_bits(error_code), false);
+		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false);
 		if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
 			return -EIO;
 	}
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index aa0836191b5a..bb5709f1cb57 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -341,7 +341,7 @@ static inline bool is_nx_huge_page_enabled(struct kvm *kvm)
 struct kvm_page_fault {
 	/* arguments to kvm_mmu_do_page_fault.  */
 	const gpa_t addr;
-	const u32 error_code;
+	const u64 error_code;
 	const bool prefetch;
 
 	/* Derived from error_code.  */
@@ -427,7 +427,7 @@ enum {
 };
 
 static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
-					u32 err, bool prefetch)
+					u64 err, bool prefetch)
 {
 	struct kvm_page_fault fault = {
 		.addr = cr2_or_gpa,
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index 2d7555381955..2e77883c92f6 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -261,7 +261,7 @@ TRACE_EVENT(
 	TP_STRUCT__entry(
 		__field(int, vcpu_id)
 		__field(gpa_t, cr2_or_gpa)
-		__field(u32, error_code)
+		__field(u64, error_code)
 		__field(u64 *, sptep)
 		__field(u64, old_spte)
 		__field(u64, new_spte)
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 594af2e1fd2f..cab6822709e2 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -791,7 +791,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	int r;
 	bool is_self_change_mapping;
 
-	pgprintk("%s: addr %llx err %x\n", __func__, fault->addr, fault->error_code);
+	pgprintk("%s: addr %llx err %llx\n", __func__, fault->addr, fault->error_code);
 	WARN_ON_ONCE(fault->is_tdp);
 
 	/*
@@ -800,7 +800,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	 * The bit needs to be cleared before walking guest page tables.
 	 */
 	r = FNAME(walk_addr)(&walker, vcpu, fault->addr,
-			     fault->error_code & ~PFERR_RSVD_MASK);
+			     lower_32_bits(fault->error_code) & ~PFERR_RSVD_MASK);
 
 	/*
 	 * The page is not mapped by the guest.  Let the guest handle it.
  
Michael Roth March 20, 2023, 5:46 p.m. UTC | #4
On Fri, Mar 17, 2023 at 09:51:37PM -0700, Isaku Yamahata wrote:
> On Mon, Feb 20, 2023 at 12:37:52PM -0600,
> Michael Roth <michael.roth@amd.com> wrote:
> 
> > This callback is used by the KVM MMU to check whether a #NPF was for a
> > private GPA or not.
> > 
> > In some cases the full 64-bit error code for the #NPF will be needed to
> > make this determination, so also update kvm_mmu_do_page_fault() to
> > accept the full 64-bit value so it can be plumbed through to the
> > callback.

Hi Isaku, Zhi,

Thanks for your efforts trying to get us in sync on these shared
interfaces. Would be great to have a common base we can build on for the
SNP/TDX series. You mentioned a couple patches here that I couldn't find
on the list, are you planning to submit these as a separate series?

> 
> We can split 64-bit part into the independent patch.

Agreed that makes sense.

> 
> > Signed-off-by: Michael Roth <michael.roth@amd.com>
> > ---

<snip>

> > +static bool kvm_mmu_fault_is_private(struct kvm *kvm, gpa_t gpa, u64 err)
> > +{
> > +	struct kvm_memory_slot *slot;
> > +	bool private_fault = false;
> > +	gfn_t gfn = gpa_to_gfn(gpa);
> > +
> > +	slot = gfn_to_memslot(kvm, gfn);
> > +	if (!slot) {
> > +		pr_debug("%s: no slot, GFN: 0x%llx\n", __func__, gfn);
> > +		goto out;
> > +	}
> > +
> > +	if (!kvm_slot_can_be_private(slot)) {
> > +		pr_debug("%s: slot is not private, GFN: 0x%llx\n", __func__, gfn);
> > +		goto out;
> > +	}
> > +
> > +	if (static_call(kvm_x86_fault_is_private)(kvm, gpa, err, &private_fault))
> > +		goto out;
> > +
> > +	/*
> > +	 * Handling below is for UPM self-tests and guests that treat userspace
> > +	 * as the authority on whether a fault should be private or not.
> > +	 */
> > +	private_fault = kvm_mem_is_private(kvm, gpa >> PAGE_SHIFT);
> > +
> > +out:
> > +	pr_debug("%s: GFN: 0x%llx, private: %d\n", __func__, gfn, private_fault);
> > +	return private_fault;
> > +}
> > +
> >  /*
> >   * Return values of handle_mmio_page_fault(), mmu.page_fault(), fast_page_fault(),
> >   * and of course kvm_mmu_do_page_fault().
> > @@ -262,11 +293,11 @@ enum {
> >  };
> >  
> >  static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> > -					u32 err, bool prefetch)
> > +					u64 err, bool prefetch)
> >  {
> >  	struct kvm_page_fault fault = {
> >  		.addr = cr2_or_gpa,
> > -		.error_code = err,
> > +		.error_code = lower_32_bits(err),
> >  		.exec = err & PFERR_FETCH_MASK,
> >  		.write = err & PFERR_WRITE_MASK,
> >  		.present = err & PFERR_PRESENT_MASK,
> > @@ -280,7 +311,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> >  		.max_level = KVM_MAX_HUGEPAGE_LEVEL,
> >  		.req_level = PG_LEVEL_4K,
> >  		.goal_level = PG_LEVEL_4K,
> > -		.is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT),
> > +		.is_private = kvm_mmu_fault_is_private(vcpu->kvm, cr2_or_gpa, err),
> 
> I don't think kvm_mmu_fault_is_private(). It's too heavy. We can make it
> it's own. I.e. the following.

Is it causing performance issues? If most of that is mainly due to
gfn_to_memslot()/kvm_slot_can_be_private() check, then maybe that part
can be dropped. In the past Sean has mentioned that we shouldn't have to
do kvm_slot_can_be_private() checks prior to kvm_mem_is_private(), but I
haven't tried removing those yet to see if things still work as expected.

> 
> From b0f914a1a4d154f076c0294831ce9ef0df7eb3d3 Mon Sep 17 00:00:00 2001
> Message-Id: <b0f914a1a4d154f076c0294831ce9ef0df7eb3d3.1679114841.git.isaku.yamahata@intel.com>
> In-Reply-To: <428a676face7a06a90e59dca1c32941c9b6ee001.1679114841.git.isaku.yamahata@intel.com>
> References: <428a676face7a06a90e59dca1c32941c9b6ee001.1679114841.git.isaku.yamahata@intel.com>
> From: Isaku Yamahata <isaku.yamahata@intel.com>
> Date: Fri, 17 Mar 2023 11:18:13 -0700
> Subject: [PATCH 2/4] KVM: x86: Add 'fault_is_private' x86 op
> 
> This callback is used by the KVM MMU to check whether a KVM page fault was
> for a private GPA or not.
> 
> Originally-by: Michael Roth <michael.roth@amd.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>  arch/x86/include/asm/kvm-x86-ops.h |  1 +
>  arch/x86/include/asm/kvm_host.h    |  1 +
>  arch/x86/kvm/mmu.h                 | 19 +++++++++++++++++++
>  arch/x86/kvm/mmu/mmu_internal.h    |  2 +-
>  arch/x86/kvm/x86.c                 |  8 ++++++++
>  5 files changed, 30 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> index e1f57905c8fe..dc5f18ac0bd5 100644
> --- a/arch/x86/include/asm/kvm-x86-ops.h
> +++ b/arch/x86/include/asm/kvm-x86-ops.h
> @@ -99,6 +99,7 @@ KVM_X86_OP_OPTIONAL_RET0(set_tss_addr)
>  KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr)
>  KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
>  KVM_X86_OP(load_mmu_pgd)
> +KVM_X86_OP(fault_is_private)
>  KVM_X86_OP_OPTIONAL(link_private_spt)
>  KVM_X86_OP_OPTIONAL(free_private_spt)
>  KVM_X86_OP_OPTIONAL(split_private_spt)
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 59196a80c3c8..0382d236fbf4 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1730,6 +1730,7 @@ struct kvm_x86_ops {
>  
>  	void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
>  			     int root_level);
> +	bool (*fault_is_private)(struct kvm *kvm, gpa_t gpa, u64 error_code);
>  
>  	int (*link_private_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
>  				void *private_spt);
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 4aaef2132b97..1f21680b9b97 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -289,6 +289,25 @@ static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu,
>  	return translate_nested_gpa(vcpu, gpa, access, exception);
>  }
>  
> +static inline bool kvm_mmu_fault_is_private_default(struct kvm *kvm, gpa_t gpa, u64 err)
> +{
> +	struct kvm_memory_slot *slot;
> +	gfn_t gfn = gpa_to_gfn(gpa);
> +
> +	slot = gfn_to_memslot(kvm, gfn);
> +	if (!slot)
> +		return false;
> +
> +	if (!kvm_slot_can_be_private(slot))
> +		return false;
> +
> +	/*
> +	 * Handling below is for UPM self-tests and guests that treat userspace
> +	 * as the authority on whether a fault should be private or not.
> +	 */
> +	return kvm_mem_is_private(kvm, gfn);
> +}
> +
>  static inline gfn_t kvm_gfn_shared_mask(const struct kvm *kvm)
>  {
>  #ifdef CONFIG_KVM_MMU_PRIVATE
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index bb5709f1cb57..6b54b069d1ed 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -445,7 +445,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  		.max_level = vcpu->kvm->arch.tdp_max_page_level,
>  		.req_level = PG_LEVEL_4K,
>  		.goal_level = PG_LEVEL_4K,
> -		.is_private = kvm_is_private_gpa(vcpu->kvm, cr2_or_gpa),
> +		.is_private = static_call(kvm_x86_fault_is_private)(vcpu->kvm, cr2_or_gpa, err),
>  	};
>  	int r;
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index fd14368c6bc8..0311ab450330 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -9419,6 +9419,14 @@ static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
>  #undef __KVM_X86_OP
>  
>  	kvm_pmu_ops_update(ops->pmu_ops);
> +
> +	/*
> +	 * TODO: Once all backend fills this option, remove this and the default
> +	 * function.
> +	 */
> +	if (!ops->runtime_ops->fault_is_private)
> +		static_call_update(kvm_x86_fault_is_private,
> +				   kvm_mmu_fault_is_private_default);

I'm not sure about this approach, since the self-tests (and possibly SEV
(which doesn't use a separate #NPF error bit like SNP/TDX)) currently
rely on that kvm_mem_is_private() call to determine whether to handle as
a private fault or not. But to run either of those, we would need to
load the kvm_amd module, which will have already introduced it's own
kvm_x86_fault_is_private implementation via svm_init(), so the handling
provided by kvm_mmu_fault_is_private_default would never be available and
so we wouldn't be able to run the UPM self-tests.

To me it seems like that handling always needs to be in place as a
fallback when not running SNP/TDX. It doesn't necessarily need to be in the
kvm_x86_fault_is_private handler though, maybe some generic handling for
UPM selftests can be pushed down into KVM MMU. Doing so could also
address a race that Sean mentioned between the time kvm_mem_is_private()
is called here (which happens before mmu_invalidate_seq is recorded for
the #NPF) vs. when it actually gets used in __kvm_faultin_pfn().

If we take that approach, then the requirements for specific TDX/SNP
handling are reduced as well, since we only need to check the
encryption/shared bit, and that could maybe be done as a simple setting
that where you tell KVM MMU the position of the bit, whether it
indicates shared vs. private, then both TDX/SNP could re-use a simple
helper to check the #NPF error code and set .is_private based on that.

Then KVM MMU could, if no bit is indicated, just fall back to using the
value of kvm_mem_is_private() somewhere in __kvm_fault_pfn() or
something.

I mentioned this to Sean a while back, which I think is compatible with
what he was looking for:

  https://lore.kernel.org/lkml/20230220162210.42rjdgbdwbjiextz@amd.com/

Would be good to get his input before spending too much time adding new
state/configuration stuff in KVM MMU though.

As an interim solution, would my original patch work if we could
confirm that the gfn_to_memslot()/kvm_slot_can_be_private() sequence is
no longer needed?

Thanks!

-Mike

>  }
>  
>  static int kvm_x86_check_processor_compatibility(void)
> -- 
> 2.25.1
> 
> 
> 
> 
> -- 
> Isaku Yamahata <isaku.yamahata@gmail.com>
  

Patch

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 8dc345cc6318..72183da010b8 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -131,6 +131,7 @@  KVM_X86_OP(msr_filter_changed)
 KVM_X86_OP(complete_emulated_msr)
 KVM_X86_OP(vcpu_deliver_sipi_vector)
 KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
+KVM_X86_OP_OPTIONAL_RET0(fault_is_private);
 
 #undef KVM_X86_OP
 #undef KVM_X86_OP_OPTIONAL
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e552374f2357..f856d689dda0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1643,6 +1643,7 @@  struct kvm_x86_ops {
 
 	void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
 			     int root_level);
+	bool (*fault_is_private)(struct kvm *kvm, gpa_t gpa, u64 error_code, bool *private_fault);
 
 	bool (*has_wbinvd_exit)(void);
 
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index eda615f3951c..fb3f34b7391c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5724,8 +5724,7 @@  int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
 	}
 
 	if (r == RET_PF_INVALID) {
-		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
-					  lower_32_bits(error_code), false);
+		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false);
 		if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
 			return -EIO;
 	}
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index e642d431df4b..557a001210df 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -231,6 +231,37 @@  struct kvm_page_fault {
 
 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
 
+static bool kvm_mmu_fault_is_private(struct kvm *kvm, gpa_t gpa, u64 err)
+{
+	struct kvm_memory_slot *slot;
+	bool private_fault = false;
+	gfn_t gfn = gpa_to_gfn(gpa);
+
+	slot = gfn_to_memslot(kvm, gfn);
+	if (!slot) {
+		pr_debug("%s: no slot, GFN: 0x%llx\n", __func__, gfn);
+		goto out;
+	}
+
+	if (!kvm_slot_can_be_private(slot)) {
+		pr_debug("%s: slot is not private, GFN: 0x%llx\n", __func__, gfn);
+		goto out;
+	}
+
+	if (static_call(kvm_x86_fault_is_private)(kvm, gpa, err, &private_fault))
+		goto out;
+
+	/*
+	 * Handling below is for UPM self-tests and guests that treat userspace
+	 * as the authority on whether a fault should be private or not.
+	 */
+	private_fault = kvm_mem_is_private(kvm, gpa >> PAGE_SHIFT);
+
+out:
+	pr_debug("%s: GFN: 0x%llx, private: %d\n", __func__, gfn, private_fault);
+	return private_fault;
+}
+
 /*
  * Return values of handle_mmio_page_fault(), mmu.page_fault(), fast_page_fault(),
  * and of course kvm_mmu_do_page_fault().
@@ -262,11 +293,11 @@  enum {
 };
 
 static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
-					u32 err, bool prefetch)
+					u64 err, bool prefetch)
 {
 	struct kvm_page_fault fault = {
 		.addr = cr2_or_gpa,
-		.error_code = err,
+		.error_code = lower_32_bits(err),
 		.exec = err & PFERR_FETCH_MASK,
 		.write = err & PFERR_WRITE_MASK,
 		.present = err & PFERR_PRESENT_MASK,
@@ -280,7 +311,7 @@  static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 		.max_level = KVM_MAX_HUGEPAGE_LEVEL,
 		.req_level = PG_LEVEL_4K,
 		.goal_level = PG_LEVEL_4K,
-		.is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT),
+		.is_private = kvm_mmu_fault_is_private(vcpu->kvm, cr2_or_gpa, err),
 	};
 	int r;