[v10,047/108] KVM: x86/tdp_mmu: Don't zap private pages for unsupported cases

Message ID 9e8346b692eb377576363a028c3688c66f3c0bfe.1667110240.git.isaku.yamahata@intel.com
State New
Series KVM TDX basic feature support

Commit Message

Isaku Yamahata Oct. 30, 2022, 6:22 a.m. UTC
  From: Sean Christopherson <sean.j.christopherson@intel.com>

TDX architecturally supports only the write-back (WB) memory type for
private memory, so a (virtualized) memory type change doesn't make sense
for private memory.  Also, page migration isn't supported for TDX yet.
(TDX architecturally supports page migration; this is a KVM and kernel
implementation gap.)

For memory type changes (MTRR virtualization and LAPIC page mapping
changes), pages are zapped by kvm_zap_gfn_range().  On the next KVM page
fault, an SPTE with the new memory type for the page is populated.  For
page migration, pages are zapped by the MMU notifier, and on the next KVM
page fault the newly migrated page is populated.  Don't zap private pages
on unmapping for those two cases.

When deleting/moving a KVM memory slot, zap private pages; this typically
happens when tearing down the VM.  Don't invalidate private page tables,
i.e. zap only leaf SPTEs for a KVM MMU that has a shared bit mask.  The
existing kvm_tdp_mmu_invalidate_all_roots() relies on role.invalid with
the read lock of mmu_lock so that other vCPUs can operate on the KVM MMU
concurrently; it marks the root page tables invalid and zaps their SPTEs.
The TDX module doesn't allow a protected root page table to be unlinked
from the hardware and replaced with a newly allocated one.  Instead, zap
only leaf SPTEs for a KVM MMU with the shared bit mask set.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
 arch/x86/kvm/mmu/mmu.c     | 85 ++++++++++++++++++++++++++++++++++++--
 arch/x86/kvm/mmu/tdp_mmu.c | 24 ++++++++---
 arch/x86/kvm/mmu/tdp_mmu.h |  5 ++-
 3 files changed, 103 insertions(+), 11 deletions(-)
  

Comments

Ackerley Tng Nov. 22, 2022, 9:26 p.m. UTC | #1
> From: Sean Christopherson <sean.j.christopherson@intel.com>

> TDX architecturally supports only the write-back (WB) memory type for
> private memory, so a (virtualized) memory type change doesn't make sense
> for private memory.  Also, page migration isn't supported for TDX yet.
> (TDX architecturally supports page migration; this is a KVM and kernel
> implementation gap.)

> For memory type changes (MTRR virtualization and LAPIC page mapping
> changes), pages are zapped by kvm_zap_gfn_range().  On the next KVM page
> fault, an SPTE with the new memory type for the page is populated.  For
> page migration, pages are zapped by the MMU notifier, and on the next KVM
> page fault the newly migrated page is populated.  Don't zap private pages
> on unmapping for those two cases.

> When deleting/moving a KVM memory slot, zap private pages; this typically
> happens when tearing down the VM.  Don't invalidate private page tables,
> i.e. zap only leaf SPTEs for a KVM MMU that has a shared bit mask.  The
> existing kvm_tdp_mmu_invalidate_all_roots() relies on role.invalid with
> the read lock of mmu_lock so that other vCPUs can operate on the KVM MMU
> concurrently; it marks the root page tables invalid and zaps their SPTEs.
> The TDX module doesn't allow a protected root page table to be unlinked
> from the hardware and replaced with a newly allocated one.  Instead, zap
> only leaf SPTEs for a KVM MMU with the shared bit mask set.

> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>   arch/x86/kvm/mmu/mmu.c     | 85 ++++++++++++++++++++++++++++++++++++--
>   arch/x86/kvm/mmu/tdp_mmu.c | 24 ++++++++---
>   arch/x86/kvm/mmu/tdp_mmu.h |  5 ++-
>   3 files changed, 103 insertions(+), 11 deletions(-)

> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index faf69774c7ce..0237e143299c 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -1577,8 +1577,38 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
>   	if (kvm_memslots_have_rmaps(kvm))
>   		flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap);

> -	if (is_tdp_mmu_enabled(kvm))
> -		flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
> +	if (is_tdp_mmu_enabled(kvm)) {
> +		bool zap_private;

We should initialize zap_private to true, otherwise zap_private is
uninitialized in the call

     kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush, zap_private)

if the condition

     if (kvm_slot_can_be_private(range->slot)) {

evaluates to false.
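
For reference, a minimal sketch of the change being suggested here; initializing
to true is Ackerley's proposal, not something taken from the posted patch:

-		bool zap_private;
+		bool zap_private = true;

With that, zap_private has a defined value even when the
kvm_slot_can_be_private() branch above is skipped.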

> +
> +		if (kvm_slot_can_be_private(range->slot)) {
> +			if (range->flags & KVM_GFN_RANGE_FLAGS_RESTRICTED_MEM)
> +				/*
> +				 * For private slot, the callback is triggered
> +				 * via falloc.  Mode can be allocation or punch
> +				 * hole.  Because the private-shared conversion
> +				 * is done via
> +				 * KVM_MEMORY_ENCRYPT_REG/UNREG_REGION, we can
> +				 * ignore the request from restrictedmem.
> +				 */
> +				return flush;
> +			else if (range->flags & KVM_GFN_RANGE_FLAGS_SET_MEM_ATTR) {
> +				if (range->attr == KVM_MEM_ATTR_SHARED)
> +					zap_private = true;
> +				else {
> +					WARN_ON_ONCE(range->attr != KVM_MEM_ATTR_PRIVATE);
> +					zap_private = false;
> +				}
> +			} else
> +				/*
> +				 * kvm_unmap_gfn_range() is called via mmu
> +				 * notifier.  For now page migration for private
> +				 * page isn't supported yet, don't zap private
> +				 * pages.
> +				 */
> +				zap_private = false;
> +		}
> +		flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush, zap_private);
> +	}

>   	return flush;
>   }
  
Kai Huang Dec. 14, 2022, 11:17 a.m. UTC | #2
On Sat, 2022-10-29 at 23:22 -0700, isaku.yamahata@intel.com wrote:
> From: Sean Christopherson <sean.j.christopherson@intel.com>
> 
> TDX architecturally supports only the write-back (WB) memory type for
> private memory, so a (virtualized) memory type change doesn't make sense
> for private memory.  Also, page migration isn't supported for TDX yet.
> (TDX architecturally supports page migration; this is a KVM and kernel
> implementation gap.)
> 
> For memory type changes (MTRR virtualization and LAPIC page mapping
> changes), pages are zapped by kvm_zap_gfn_range().  On the next KVM page
> fault, an SPTE with the new memory type for the page is populated.  For
> page migration, pages are zapped by the MMU notifier, and on the next KVM
> page fault the newly migrated page is populated.  Don't zap private pages
> on unmapping for those two cases.
> 
> When deleting/moving a KVM memory slot, zap private pages; this typically
> happens when tearing down the VM.  Don't invalidate private page tables,
> i.e. zap only leaf SPTEs for a KVM MMU that has a shared bit mask.  The
> existing kvm_tdp_mmu_invalidate_all_roots() relies on role.invalid with
> the read lock of mmu_lock so that other vCPUs can operate on the KVM MMU
> concurrently; it marks the root page tables invalid and zaps their SPTEs.
> The TDX module doesn't allow a protected root page table to be unlinked
> from the hardware and replaced with a newly allocated one.  Instead, zap
> only leaf SPTEs for a KVM MMU with the shared bit mask set.
> 
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>  arch/x86/kvm/mmu/mmu.c     | 85 ++++++++++++++++++++++++++++++++++++--
>  arch/x86/kvm/mmu/tdp_mmu.c | 24 ++++++++---
>  arch/x86/kvm/mmu/tdp_mmu.h |  5 ++-
>  3 files changed, 103 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index faf69774c7ce..0237e143299c 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -1577,8 +1577,38 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
>  	if (kvm_memslots_have_rmaps(kvm))
>  		flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap);
>  
> -	if (is_tdp_mmu_enabled(kvm))
> -		flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
> +	if (is_tdp_mmu_enabled(kvm)) {
> +		bool zap_private;
> +
> +		if (kvm_slot_can_be_private(range->slot)) {
> +			if (range->flags & KVM_GFN_RANGE_FLAGS_RESTRICTED_MEM)
> +				/*
> +				 * For private slot, the callback is triggered
> +				 * via falloc.  Mode can be allocation or punch
				       ^
				       fallocate(), please?

> +				 * hole.  Because the private-shared conversion
> +				 * is done via
> +				 * KVM_MEMORY_ENCRYPT_REG/UNREG_REGION, we can
> +				 * ignore the request from restrictedmem.
> +				 */
> +				return flush;

Sorry why "private-shared conversion is done via KVM_MEMORY_ENCRYPT_REG" results
in "we can ignore the requres from restrictedmem"?

If we punch a hole, the pages are de-allocated, correct? 

> +			else if (range->flags & KVM_GFN_RANGE_FLAGS_SET_MEM_ATTR) {
> +				if (range->attr == KVM_MEM_ATTR_SHARED)
> +					zap_private = true;
> +				else {
> +					WARN_ON_ONCE(range->attr != KVM_MEM_ATTR_PRIVATE);
> +					zap_private = false;
> +				}
> +			} else
> +				/*
> +				 * kvm_unmap_gfn_range() is called via mmu
> +				 * notifier.  For now page migration for private
> +				 * page isn't supported yet, don't zap private
> +				 * pages.
> +				 */
> +				zap_private = false;

Page migration is not the only reason that KVM will receive the MMU notifier --
just say something like "for now all private pages are pinned during the VM's
lifetime".


> +		}
> +		flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush, zap_private);
> +	}
>  
>  	return flush;
>  }
> @@ -6066,11 +6096,48 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
>  	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
>  }
>  
> +static void kvm_mmu_zap_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
> +{
> +	bool flush = false;
> +
> +	write_lock(&kvm->mmu_lock);
> +
> +	/*
> +	 * Zapping non-leaf SPTEs, a.k.a. not-last SPTEs, isn't required, worst
> +	 * case scenario we'll have unused shadow pages lying around until they
> +	 * are recycled due to age or when the VM is destroyed.
> +	 */
> +	if (is_tdp_mmu_enabled(kvm)) {
> +		struct kvm_gfn_range range = {
> +		      .slot = slot,
> +		      .start = slot->base_gfn,
> +		      .end = slot->base_gfn + slot->npages,
> +		      .may_block = false,
> +		};
> +
> +		/*
> +		 * this handles both private gfn and shared gfn.
> +		 * All private page should be zapped on memslot deletion.
> +		 */
> +		flush = kvm_tdp_mmu_unmap_gfn_range(kvm, &range, flush, true);
> +	} else {
> +		flush = slot_handle_level(kvm, slot, __kvm_zap_rmap, PG_LEVEL_4K,
> +					  KVM_MAX_HUGEPAGE_LEVEL, true);
> +	}
> +	if (flush)
> +		kvm_flush_remote_tlbs(kvm);
> +
> +	write_unlock(&kvm->mmu_lock);
> +}
> +
>  static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
>  			struct kvm_memory_slot *slot,
>  			struct kvm_page_track_notifier_node *node)
>  {
> -	kvm_mmu_zap_all_fast(kvm);
> +	if (kvm_gfn_shared_mask(kvm))
> +		kvm_mmu_zap_memslot(kvm, slot);
> +	else
> +		kvm_mmu_zap_all_fast(kvm);
>  }

A comment would be nice here.
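
For illustration, one possible shape for such a comment; the wording below is a
sketch, not Isaku's actual follow-up:

 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
 			struct kvm_memory_slot *slot,
 			struct kvm_page_track_notifier_node *node)
 {
+	/*
+	 * A TDX guest (non-zero kvm_gfn_shared_mask()) cannot have its
+	 * protected root replaced, so zap only the leaf SPTEs of the
+	 * affected memslot (private ones included) instead of the fast
+	 * zap-all, which would invalidate the private root.
+	 */
 	if (kvm_gfn_shared_mask(kvm))
 		kvm_mmu_zap_memslot(kvm, slot);
 	else
 		kvm_mmu_zap_all_fast(kvm);
 }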

>  
>  int kvm_mmu_init_vm(struct kvm *kvm)
> @@ -6173,8 +6240,18 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
>  
>  	if (is_tdp_mmu_enabled(kvm)) {
>  		for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
> +			/*
> +			 * zap_private = true. Zap both private/shared pages.
> +			 *
> +			 * kvm_zap_gfn_range() is used when PAT memory type was

Is it PAT or MTRR, or both (thus just memory type)?

> +			 * changed.  Later on the next kvm page fault, populate
> +			 * it with updated spte entry.
> +			 * Because only WB is supported for private pages, don't
> +			 * care of private pages.
> +			 */

Then why bother zapping private?  If I read correctly, the changelog says "don't
zap private"?

>  			flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
> -						      gfn_end, true, flush);
> +						      gfn_end, true, flush,
> +						      true);
>  	}
>  

Btw, as you mentioned in the changelog, private memory always has WB memory
type, thus cannot be virtualized.  Is it better to modify update_mtrr() to just
return early if the gfn range is purely private?

IMHO the handling of MTRR/PAT virtualization for a TDX guest deserves dedicated
patch(es) that put it all together so it's easier to review.  Now the relevant parts
are spread across multiple independent patches (MSR handling, vt_get_mt_mask(), etc).
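
For concreteness, the early return being floated here might look roughly like the
sketch below.  kvm_mtrr_msr_to_range() and kvm_range_is_private() are placeholder
names invented for illustration -- neither exists in this series or in current
KVM -- and the real update_mtrr() does more than this:

static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
{
	gfn_t start, end;

	/* Placeholder: derive the GFN range covered by the changed MTRR MSR. */
	if (!kvm_mtrr_msr_to_range(vcpu, msr, &start, &end))
		return;

	/*
	 * Private TDX memory is always WB, so a guest MTRR change cannot
	 * affect it; skip the zap when the range is entirely private.
	 */
	if (kvm_range_is_private(vcpu->kvm, start, end))
		return;

	kvm_zap_gfn_range(vcpu->kvm, start, end);
}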
  
Isaku Yamahata Dec. 15, 2022, 10:46 p.m. UTC | #3
On Wed, Dec 14, 2022 at 11:17:32AM +0000,
"Huang, Kai" <kai.huang@intel.com> wrote:

> On Sat, 2022-10-29 at 23:22 -0700, isaku.yamahata@intel.com wrote:
> > From: Sean Christopherson <sean.j.christopherson@intel.com>
> > 
> > TDX architecturally supports only the write-back (WB) memory type for
> > private memory, so a (virtualized) memory type change doesn't make sense
> > for private memory.  Also, page migration isn't supported for TDX yet.
> > (TDX architecturally supports page migration; this is a KVM and kernel
> > implementation gap.)
> > 
> > For memory type changes (MTRR virtualization and LAPIC page mapping
> > changes), pages are zapped by kvm_zap_gfn_range().  On the next KVM page
> > fault, an SPTE with the new memory type for the page is populated.  For
> > page migration, pages are zapped by the MMU notifier, and on the next KVM
> > page fault the newly migrated page is populated.  Don't zap private pages
> > on unmapping for those two cases.
> > 
> > When deleting/moving a KVM memory slot, zap private pages; this typically
> > happens when tearing down the VM.  Don't invalidate private page tables,
> > i.e. zap only leaf SPTEs for a KVM MMU that has a shared bit mask.  The
> > existing kvm_tdp_mmu_invalidate_all_roots() relies on role.invalid with
> > the read lock of mmu_lock so that other vCPUs can operate on the KVM MMU
> > concurrently; it marks the root page tables invalid and zaps their SPTEs.
> > The TDX module doesn't allow a protected root page table to be unlinked
> > from the hardware and replaced with a newly allocated one.  Instead, zap
> > only leaf SPTEs for a KVM MMU with the shared bit mask set.
> > 
> > Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> > ---
> >  arch/x86/kvm/mmu/mmu.c     | 85 ++++++++++++++++++++++++++++++++++++--
> >  arch/x86/kvm/mmu/tdp_mmu.c | 24 ++++++++---
> >  arch/x86/kvm/mmu/tdp_mmu.h |  5 ++-
> >  3 files changed, 103 insertions(+), 11 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index faf69774c7ce..0237e143299c 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -1577,8 +1577,38 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
> >  	if (kvm_memslots_have_rmaps(kvm))
> >  		flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap);
> >  
> > -	if (is_tdp_mmu_enabled(kvm))
> > -		flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
> > +	if (is_tdp_mmu_enabled(kvm)) {
> > +		bool zap_private;
> > +
> > +		if (kvm_slot_can_be_private(range->slot)) {
> > +			if (range->flags & KVM_GFN_RANGE_FLAGS_RESTRICTED_MEM)
> > +				/*
> > +				 * For private slot, the callback is triggered
> > +				 * via falloc.  Mode can be allocation or punch
> 				       ^
> 				       fallocate(), please?
> 
> > +				 * hole.  Because the private-shared conversion
> > +				 * is done via
> > +				 * KVM_MEMORY_ENCRYPT_REG/UNREG_REGION, we can
> > +				 * ignore the request from restrictedmem.
> > +				 */
> > +				return flush;
> 
> Sorry, why does "private-shared conversion is done via KVM_MEMORY_ENCRYPT_REG"
> mean that "we can ignore the request from restrictedmem"?
> 
> If we punch a hole, the pages are de-allocated, correct?

With v10 UPM, we can have zap_private = true always.

With v9 UPM, the callback is triggered both for allocation and punch-hole without
any further argument.  With v10 UPM, the callback is triggered only for punching
a hole.

> 
> > +			else if (range->flags & KVM_GFN_RANGE_FLAGS_SET_MEM_ATTR) {
> > +				if (range->attr == KVM_MEM_ATTR_SHARED)
> > +					zap_private = true;
> > +				else {
> > +					WARN_ON_ONCE(range->attr != KVM_MEM_ATTR_PRIVATE);
> > +					zap_private = false;
> > +				}
> > +			} else
> > +				/*
> > +				 * kvm_unmap_gfn_range() is called via mmu
> > +				 * notifier.  For now page migration for private
> > +				 * page isn't supported yet, don't zap private
> > +				 * pages.
> > +				 */
> > +				zap_private = false;
> 
> Page migration is not the only reason that KVM will receive the MMU notifier --
> just say something like "for now all private pages are pinned during the VM's
> lifetime".

Will update the comment.


> 
> 
> > +		}
> > +		flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush, zap_private);
> > +	}
> >  
> >  	return flush;
> >  }
> > @@ -6066,11 +6096,48 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
> >  	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
> >  }
> >  
> > +static void kvm_mmu_zap_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
> > +{
> > +	bool flush = false;
> > +
> > +	write_lock(&kvm->mmu_lock);
> > +
> > +	/*
> > +	 * Zapping non-leaf SPTEs, a.k.a. not-last SPTEs, isn't required, worst
> > +	 * case scenario we'll have unused shadow pages lying around until they
> > +	 * are recycled due to age or when the VM is destroyed.
> > +	 */
> > +	if (is_tdp_mmu_enabled(kvm)) {
> > +		struct kvm_gfn_range range = {
> > +		      .slot = slot,
> > +		      .start = slot->base_gfn,
> > +		      .end = slot->base_gfn + slot->npages,
> > +		      .may_block = false,
> > +		};
> > +
> > +		/*
> > +		 * this handles both private gfn and shared gfn.
> > +		 * All private page should be zapped on memslot deletion.
> > +		 */
> > +		flush = kvm_tdp_mmu_unmap_gfn_range(kvm, &range, flush, true);
> > +	} else {
> > +		flush = slot_handle_level(kvm, slot, __kvm_zap_rmap, PG_LEVEL_4K,
> > +					  KVM_MAX_HUGEPAGE_LEVEL, true);
> > +	}
> > +	if (flush)
> > +		kvm_flush_remote_tlbs(kvm);
> > +
> > +	write_unlock(&kvm->mmu_lock);
> > +}
> > +
> >  static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
> >  			struct kvm_memory_slot *slot,
> >  			struct kvm_page_track_notifier_node *node)
> >  {
> > -	kvm_mmu_zap_all_fast(kvm);
> > +	if (kvm_gfn_shared_mask(kvm))
> > +		kvm_mmu_zap_memslot(kvm, slot);
> > +	else
> > +		kvm_mmu_zap_all_fast(kvm);
> >  }
> 
> A comment would be nice here.

Will add a comment.


> >  
> >  int kvm_mmu_init_vm(struct kvm *kvm)
> > @@ -6173,8 +6240,18 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
> >  
> >  	if (is_tdp_mmu_enabled(kvm)) {
> >  		for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
> > +			/*
> > +			 * zap_private = true. Zap both private/shared pages.
> > +			 *
> > +			 * kvm_zap_gfn_range() is used when PAT memory type was
> 
> Is it PAT or MTRR, or both (thus just memory type)?

Both. Will update the comment.

> 
> > +			 * changed.  Later on the next kvm page fault, populate
> > +			 * it with updated spte entry.
> > +			 * Because only WB is supported for private pages, don't
> > +			 * care of private pages.
> > +			 */
> 
> Then why bother zapping private?  If I read correctly, the changelog says "don't
> zap private"?

Right. Will fix.


> >  			flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
> > -						      gfn_end, true, flush);
> > +						      gfn_end, true, flush,
> > +						      true);
> >  	}
> >  
> 
> Btw, as you mentioned in the changelog, private memory always has WB memory
> type, thus cannot be virtualized.  Is it better to modify update_mtrr() to just
> return early if the gfn range is purely private?

MTRR support in cpuid is fixed to 1, PAT in cpuid is native.
MTRR and PAT are supported on shared pages.


> IMHO the handling of MTRR/PAT virtualization for a TDX guest deserves dedicated
> patch(es) that put it all together so it's easier to review.  Now the relevant parts
> are spread across multiple independent patches (MSR handling, vt_get_mt_mask(), etc).

Ok, let me check it.
  
Kai Huang Dec. 15, 2022, 11:03 p.m. UTC | #4
On Thu, 2022-12-15 at 14:46 -0800, Isaku Yamahata wrote:
> > Btw, as you mentioned in the changelog, private memory always has WB memory
> > type, thus cannot be virtualized.  Is it better to modify update_mtrr() to just
> > return early if the gfn range is purely private?
> 
> MTRR support in cpuid is fixed to 1, PAT in cpuid is native.
> MTRR and PAT are supported on shared pages.
> 

But none of those are mentioned in the changelog; they are hidden in the spec
or in later patches far away.

Also, the handling of "load/save IA32_PAT" in the VM-exit/VM-entry VMCS controls
differs between the TDX module and KVM.  None of that is mentioned either.

All of this makes the patch hard to review.

> 
> > IMHO the handling of MTRR/PAT virtualization for a TDX guest deserves dedicated
> > patch(es) that put it all together so it's easier to review.  Now the relevant parts
> > are spread across multiple independent patches (MSR handling, vt_get_mt_mask(), etc).
> 
> Ok, let me check it.

Yes.  IMHO you should put all relevant parts together and make a clear changelog
to justify the patch(es).
  
Kai Huang Dec. 15, 2022, 11:27 p.m. UTC | #5
On Thu, 2022-12-15 at 14:46 -0800, Isaku Yamahata wrote:
> > Sorry, why does "private-shared conversion is done via KVM_MEMORY_ENCRYPT_REG"
> > mean that "we can ignore the request from restrictedmem"?
> > 
> > If we punch a hole, the pages are de-allocated, correct?
> 
> With v10 UPM, we can have zap_private = true always.
> 
> With v9 UPM, the callback is triggered both for allocation and punch-hole without
> any further argument.  With v10 UPM, the callback is triggered only for punching
> a hole.

I honestly don't quite understand why, with v9 UPM, the callback is triggered for
allocation (assuming that by "callback" you mean the invalidate notifier).
Removing it seems right.

But, this is not the point.  The point is you need to be clear about how to use
"punch hole" and/or set_memory_attributes().

Now my assumption is that "punch hole" can be done alone, w/o set_memory_attributes().
For instance, a presumably valid case is hot-removal of memory from a TDX guest,
where you don't need set_memory_attributes().

However, it does seem that set_memory_attributes(shared) should (or must) be done
after "punch hole", and we need to document that clearly.

So it seems a more reasonable way is:

1) in "punch hole", you zap everything.
2) in set_memory_attribute() from private -> shared, you try to zap all private
pages anyway (it should be very quick if "punch hole" has been done), and you
can pr_warn_once() (or WARN()) if any private page still exists.

Did I miss anything?

But I am not sure about set_memory_attribute() from shared -> private.  Should
we do fallocate() before that?
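
A rough sketch of the ordering proposed above; the handler names and
kvm_range_has_private_spte() are made up for illustration, and locking and flush
accounting are simplified to a bare remote-TLB flush:

/* 1) fallocate(PUNCH_HOLE) on restrictedmem: backing pages go away, zap everything. */
static void restrictedmem_punch_hole(struct kvm *kvm, struct kvm_gfn_range *range)
{
	if (kvm_tdp_mmu_unmap_gfn_range(kvm, range, false, /* zap_private = */ true))
		kvm_flush_remote_tlbs(kvm);
}

/*
 * 2) set_memory_attributes() private -> shared: private SPTEs should already be
 * gone if userspace punched the hole first; zap anyway and complain once if not.
 */
static void set_attr_private_to_shared(struct kvm *kvm, struct kvm_gfn_range *range)
{
	if (kvm_range_has_private_spte(kvm, range))	/* made-up helper */
		pr_warn_once("converting to shared with private SPTEs still mapped\n");

	if (kvm_tdp_mmu_unmap_gfn_range(kvm, range, false, /* zap_private = */ true))
		kvm_flush_remote_tlbs(kvm);
}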
  

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index faf69774c7ce..0237e143299c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1577,8 +1577,38 @@  bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 	if (kvm_memslots_have_rmaps(kvm))
 		flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap);
 
-	if (is_tdp_mmu_enabled(kvm))
-		flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
+	if (is_tdp_mmu_enabled(kvm)) {
+		bool zap_private;
+
+		if (kvm_slot_can_be_private(range->slot)) {
+			if (range->flags & KVM_GFN_RANGE_FLAGS_RESTRICTED_MEM)
+				/*
+				 * For private slot, the callback is triggered
+				 * via falloc.  Mode can be allocation or punch
+				 * hole.  Because the private-shared conversion
+				 * is done via
+				 * KVM_MEMORY_ENCRYPT_REG/UNREG_REGION, we can
+				 * ignore the request from restrictedmem.
+				 */
+				return flush;
+			else if (range->flags & KVM_GFN_RANGE_FLAGS_SET_MEM_ATTR) {
+				if (range->attr == KVM_MEM_ATTR_SHARED)
+					zap_private = true;
+				else {
+					WARN_ON_ONCE(range->attr != KVM_MEM_ATTR_PRIVATE);
+					zap_private = false;
+				}
+			} else
+				/*
+				 * kvm_unmap_gfn_range() is called via mmu
+				 * notifier.  For now page migration for private
+				 * page isn't supported yet, don't zap private
+				 * pages.
+				 */
+				zap_private = false;
+		}
+		flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush, zap_private);
+	}
 
 	return flush;
 }
@@ -6066,11 +6096,48 @@  static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
 	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
 }
 
+static void kvm_mmu_zap_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+	bool flush = false;
+
+	write_lock(&kvm->mmu_lock);
+
+	/*
+	 * Zapping non-leaf SPTEs, a.k.a. not-last SPTEs, isn't required, worst
+	 * case scenario we'll have unused shadow pages lying around until they
+	 * are recycled due to age or when the VM is destroyed.
+	 */
+	if (is_tdp_mmu_enabled(kvm)) {
+		struct kvm_gfn_range range = {
+		      .slot = slot,
+		      .start = slot->base_gfn,
+		      .end = slot->base_gfn + slot->npages,
+		      .may_block = false,
+		};
+
+		/*
+		 * this handles both private gfn and shared gfn.
+		 * All private page should be zapped on memslot deletion.
+		 */
+		flush = kvm_tdp_mmu_unmap_gfn_range(kvm, &range, flush, true);
+	} else {
+		flush = slot_handle_level(kvm, slot, __kvm_zap_rmap, PG_LEVEL_4K,
+					  KVM_MAX_HUGEPAGE_LEVEL, true);
+	}
+	if (flush)
+		kvm_flush_remote_tlbs(kvm);
+
+	write_unlock(&kvm->mmu_lock);
+}
+
 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
 			struct kvm_memory_slot *slot,
 			struct kvm_page_track_notifier_node *node)
 {
-	kvm_mmu_zap_all_fast(kvm);
+	if (kvm_gfn_shared_mask(kvm))
+		kvm_mmu_zap_memslot(kvm, slot);
+	else
+		kvm_mmu_zap_all_fast(kvm);
 }
 
 int kvm_mmu_init_vm(struct kvm *kvm)
@@ -6173,8 +6240,18 @@  void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 
 	if (is_tdp_mmu_enabled(kvm)) {
 		for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+			/*
+			 * zap_private = true. Zap both private/shared pages.
+			 *
+			 * kvm_zap_gfn_range() is used when PAT memory type was
+			 * changed.  Later on the next kvm page fault, populate
+			 * it with updated spte entry.
+			 * Because only WB is supported for private pages, don't
+			 * care of private pages.
+			 */
 			flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
-						      gfn_end, true, flush);
+						      gfn_end, true, flush,
+						      true);
 	}
 
 	if (flush)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index b2f56110d62d..85d990ec149e 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -948,7 +948,8 @@  bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
  * operation can cause a soft lockup.
  */
 static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
-			      gfn_t start, gfn_t end, bool can_yield, bool flush)
+			      gfn_t start, gfn_t end, bool can_yield, bool flush,
+			      bool zap_private)
 {
 	struct tdp_iter iter;
 
@@ -956,6 +957,10 @@  static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
 
 	lockdep_assert_held_write(&kvm->mmu_lock);
 
+	WARN_ON_ONCE(zap_private && !is_private_sp(root));
+	if (!zap_private && is_private_sp(root))
+		return false;
+
 	rcu_read_lock();
 
 	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
@@ -988,12 +993,13 @@  static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
  * more SPTEs were zapped since the MMU lock was last acquired.
  */
 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
-			   bool can_yield, bool flush)
+			   bool can_yield, bool flush, bool zap_private)
 {
 	struct kvm_mmu_page *root;
 
 	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
-		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
+		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush,
+					  zap_private && is_private_sp(root));
 
 	return flush;
 }
@@ -1053,6 +1059,12 @@  void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
 
 	lockdep_assert_held_write(&kvm->mmu_lock);
 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
+		/*
+		 * Skip private root since private page table
+		 * is only torn down when VM is destroyed.
+		 */
+		if (is_private_sp(root))
+			continue;
 		if (!root->role.invalid &&
 		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
 			root->role.invalid = true;
@@ -1245,11 +1257,13 @@  int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 	return ret;
 }
 
+/* Used by mmu notifier via kvm_unmap_gfn_range() */
 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
-				 bool flush)
+				 bool flush, bool zap_private)
 {
 	return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
-				     range->end, range->may_block, flush);
+				     range->end, range->may_block, flush,
+				     zap_private);
 }
 
 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index c163f7cc23ca..c98c7df449a8 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -16,7 +16,8 @@  void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
 			  bool shared);
 
 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start,
-				 gfn_t end, bool can_yield, bool flush);
+			   gfn_t end, bool can_yield, bool flush,
+			   bool zap_private);
 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
 void kvm_tdp_mmu_zap_all(struct kvm *kvm);
 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
@@ -25,7 +26,7 @@  void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
 
 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
-				 bool flush);
+				 bool flush, bool zap_private);
 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);