[v10,6/9] KVM: Unmap existing mappings when change the memory attributes

Message ID 20221202061347.1070246-7-chao.p.peng@linux.intel.com
State New
Headers
Series KVM: mm: fd-based approach for supporting KVM |

Commit Message

Chao Peng Dec. 2, 2022, 6:13 a.m. UTC
  Unmap the existing guest mappings when memory attribute is changed
between shared and private. This is needed because shared pages and
private pages are from different backends, unmapping existing ones
gives a chance for page fault handler to re-populate the mappings
according to the new attribute.

Only architecture has private memory support needs this and the
supported architecture is expected to rewrite the weak
kvm_arch_has_private_mem().

Also, during memory attribute changing and the unmapping time frame,
page fault handler may happen in the same memory range and can cause
incorrect page state, invoke kvm_mmu_invalidate_* helpers to let the
page fault handler retry during this time frame.

Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
---
 include/linux/kvm_host.h |   7 +-
 virt/kvm/kvm_main.c      | 168 ++++++++++++++++++++++++++-------------
 2 files changed, 116 insertions(+), 59 deletions(-)
  

Comments

Yuan Yao Dec. 7, 2022, 8:13 a.m. UTC | #1
On Fri, Dec 02, 2022 at 02:13:44PM +0800, Chao Peng wrote:
> Unmap the existing guest mappings when memory attribute is changed
> between shared and private. This is needed because shared pages and
> private pages are from different backends, unmapping existing ones
> gives a chance for page fault handler to re-populate the mappings
> according to the new attribute.
>
> Only architecture has private memory support needs this and the
> supported architecture is expected to rewrite the weak
> kvm_arch_has_private_mem().
>
> Also, during memory attribute changing and the unmapping time frame,
> page fault handler may happen in the same memory range and can cause
> incorrect page state, invoke kvm_mmu_invalidate_* helpers to let the
> page fault handler retry during this time frame.
>
> Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> ---
>  include/linux/kvm_host.h |   7 +-
>  virt/kvm/kvm_main.c      | 168 ++++++++++++++++++++++++++-------------
>  2 files changed, 116 insertions(+), 59 deletions(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 3d69484d2704..3331c0c92838 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -255,7 +255,6 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
>  #endif
>
> -#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
>  struct kvm_gfn_range {
>  	struct kvm_memory_slot *slot;
>  	gfn_t start;
> @@ -264,6 +263,8 @@ struct kvm_gfn_range {
>  	bool may_block;
>  };
>  bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
> +
> +#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
>  bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
>  bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
>  bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> @@ -785,11 +786,12 @@ struct kvm {
>
>  #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
>  	struct mmu_notifier mmu_notifier;
> +#endif
>  	unsigned long mmu_invalidate_seq;
>  	long mmu_invalidate_in_progress;
>  	gfn_t mmu_invalidate_range_start;
>  	gfn_t mmu_invalidate_range_end;
> -#endif
> +
>  	struct list_head devices;
>  	u64 manual_dirty_log_protect;
>  	struct dentry *debugfs_dentry;
> @@ -1480,6 +1482,7 @@ bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu);
>  int kvm_arch_post_init_vm(struct kvm *kvm);
>  void kvm_arch_pre_destroy_vm(struct kvm *kvm);
>  int kvm_arch_create_vm_debugfs(struct kvm *kvm);
> +bool kvm_arch_has_private_mem(struct kvm *kvm);
>
>  #ifndef __KVM_HAVE_ARCH_VM_ALLOC
>  /*
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index ad55dfbc75d7..4e1e1e113bf0 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -520,6 +520,62 @@ void kvm_destroy_vcpus(struct kvm *kvm)
>  }
>  EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);
>
> +void kvm_mmu_invalidate_begin(struct kvm *kvm)
> +{
> +	/*
> +	 * The count increase must become visible at unlock time as no
> +	 * spte can be established without taking the mmu_lock and
> +	 * count is also read inside the mmu_lock critical section.
> +	 */
> +	kvm->mmu_invalidate_in_progress++;
> +
> +	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> +		kvm->mmu_invalidate_range_start = INVALID_GPA;
> +		kvm->mmu_invalidate_range_end = INVALID_GPA;
> +	}
> +}
> +
> +void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
> +{
> +	WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
> +
> +	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> +		kvm->mmu_invalidate_range_start = start;
> +		kvm->mmu_invalidate_range_end = end;
> +	} else {
> +		/*
> +		 * Fully tracking multiple concurrent ranges has diminishing
> +		 * returns. Keep things simple and just find the minimal range
> +		 * which includes the current and new ranges. As there won't be
> +		 * enough information to subtract a range after its invalidate
> +		 * completes, any ranges invalidated concurrently will
> +		 * accumulate and persist until all outstanding invalidates
> +		 * complete.
> +		 */
> +		kvm->mmu_invalidate_range_start =
> +			min(kvm->mmu_invalidate_range_start, start);
> +		kvm->mmu_invalidate_range_end =
> +			max(kvm->mmu_invalidate_range_end, end);
> +	}
> +}
> +
> +void kvm_mmu_invalidate_end(struct kvm *kvm)
> +{
> +	/*
> +	 * This sequence increase will notify the kvm page fault that
> +	 * the page that is going to be mapped in the spte could have
> +	 * been freed.
> +	 */
> +	kvm->mmu_invalidate_seq++;
> +	smp_wmb();
> +	/*
> +	 * The above sequence increase must be visible before the
> +	 * below count decrease, which is ensured by the smp_wmb above
> +	 * in conjunction with the smp_rmb in mmu_invalidate_retry().
> +	 */
> +	kvm->mmu_invalidate_in_progress--;
> +}
> +
>  #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
>  static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
>  {
> @@ -714,45 +770,6 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
>  	kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
>  }
>
> -void kvm_mmu_invalidate_begin(struct kvm *kvm)
> -{
> -	/*
> -	 * The count increase must become visible at unlock time as no
> -	 * spte can be established without taking the mmu_lock and
> -	 * count is also read inside the mmu_lock critical section.
> -	 */
> -	kvm->mmu_invalidate_in_progress++;
> -
> -	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> -		kvm->mmu_invalidate_range_start = INVALID_GPA;
> -		kvm->mmu_invalidate_range_end = INVALID_GPA;
> -	}
> -}
> -
> -void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
> -{
> -	WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
> -
> -	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> -		kvm->mmu_invalidate_range_start = start;
> -		kvm->mmu_invalidate_range_end = end;
> -	} else {
> -		/*
> -		 * Fully tracking multiple concurrent ranges has diminishing
> -		 * returns. Keep things simple and just find the minimal range
> -		 * which includes the current and new ranges. As there won't be
> -		 * enough information to subtract a range after its invalidate
> -		 * completes, any ranges invalidated concurrently will
> -		 * accumulate and persist until all outstanding invalidates
> -		 * complete.
> -		 */
> -		kvm->mmu_invalidate_range_start =
> -			min(kvm->mmu_invalidate_range_start, start);
> -		kvm->mmu_invalidate_range_end =
> -			max(kvm->mmu_invalidate_range_end, end);
> -	}
> -}
> -
>  static bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
>  {
>  	kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
> @@ -806,23 +823,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
>  	return 0;
>  }
>
> -void kvm_mmu_invalidate_end(struct kvm *kvm)
> -{
> -	/*
> -	 * This sequence increase will notify the kvm page fault that
> -	 * the page that is going to be mapped in the spte could have
> -	 * been freed.
> -	 */
> -	kvm->mmu_invalidate_seq++;
> -	smp_wmb();
> -	/*
> -	 * The above sequence increase must be visible before the
> -	 * below count decrease, which is ensured by the smp_wmb above
> -	 * in conjunction with the smp_rmb in mmu_invalidate_retry().
> -	 */
> -	kvm->mmu_invalidate_in_progress--;
> -}
> -
>  static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
>  					const struct mmu_notifier_range *range)
>  {
> @@ -1140,6 +1140,11 @@ int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
>  	return 0;
>  }
>
> +bool __weak kvm_arch_has_private_mem(struct kvm *kvm)
> +{
> +	return false;
> +}
> +
>  static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
>  {
>  	struct kvm *kvm = kvm_arch_alloc_vm();
> @@ -2349,15 +2354,47 @@ static u64 kvm_supported_mem_attributes(struct kvm *kvm)
>  	return 0;
>  }
>
> +static void kvm_unmap_mem_range(struct kvm *kvm, gfn_t start, gfn_t end)
> +{
> +	struct kvm_gfn_range gfn_range;
> +	struct kvm_memory_slot *slot;
> +	struct kvm_memslots *slots;
> +	struct kvm_memslot_iter iter;
> +	int i;
> +	int r = 0;
> +
> +	gfn_range.pte = __pte(0);
> +	gfn_range.may_block = true;
> +
> +	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
> +		slots = __kvm_memslots(kvm, i);
> +
> +		kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
> +			slot = iter.slot;
> +			gfn_range.start = max(start, slot->base_gfn);
> +			gfn_range.end = min(end, slot->base_gfn + slot->npages);
> +			if (gfn_range.start >= gfn_range.end)
> +				continue;
> +			gfn_range.slot = slot;
> +
> +			r |= kvm_unmap_gfn_range(kvm, &gfn_range);
> +		}
> +	}
> +
> +	if (r)
> +		kvm_flush_remote_tlbs(kvm);
> +}
> +
>  static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
>  					   struct kvm_memory_attributes *attrs)
>  {
>  	gfn_t start, end;
>  	unsigned long i;
>  	void *entry;
> +	int idx;
>  	u64 supported_attrs = kvm_supported_mem_attributes(kvm);
>
> -	/* flags is currently not used. */
> +	/* 'flags' is currently not used. */
>  	if (attrs->flags)
>  		return -EINVAL;
>  	if (attrs->attributes & ~supported_attrs)
> @@ -2372,6 +2409,13 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
>
>  	entry = attrs->attributes ? xa_mk_value(attrs->attributes) : NULL;
>
> +	if (kvm_arch_has_private_mem(kvm)) {
> +		KVM_MMU_LOCK(kvm);
> +		kvm_mmu_invalidate_begin(kvm);
> +		kvm_mmu_invalidate_range_add(kvm, start, end);

Nit: this works for KVM_MEMORY_ATTRIBUTE_PRIVATE, but
the invalidation should be necessary yet for attribute change of:

KVM_MEMORY_ATTRIBUTE_READ
KVM_MEMORY_ATTRIBUTE_WRITE
KVM_MEMORY_ATTRIBUTE_EXECUTE

> +		KVM_MMU_UNLOCK(kvm);
> +	}
> +
>  	mutex_lock(&kvm->lock);
>  	for (i = start; i < end; i++)
>  		if (xa_err(xa_store(&kvm->mem_attr_array, i, entry,
> @@ -2379,6 +2423,16 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
>  			break;
>  	mutex_unlock(&kvm->lock);
>
> +	if (kvm_arch_has_private_mem(kvm)) {
> +		idx = srcu_read_lock(&kvm->srcu);
> +		KVM_MMU_LOCK(kvm);
> +		if (i > start)
> +			kvm_unmap_mem_range(kvm, start, i);
> +		kvm_mmu_invalidate_end(kvm);

Ditto.

> +		KVM_MMU_UNLOCK(kvm);
> +		srcu_read_unlock(&kvm->srcu, idx);
> +	}
> +
>  	attrs->address = i << PAGE_SHIFT;
>  	attrs->size = (end - i) << PAGE_SHIFT;
>
> --
> 2.25.1
>
>
  
Fuad Tabba Dec. 7, 2022, 5:16 p.m. UTC | #2
Hi,

On Fri, Dec 2, 2022 at 6:19 AM Chao Peng <chao.p.peng@linux.intel.com> wrote:
>
> Unmap the existing guest mappings when memory attribute is changed
> between shared and private. This is needed because shared pages and
> private pages are from different backends, unmapping existing ones
> gives a chance for page fault handler to re-populate the mappings
> according to the new attribute.
>
> Only architecture has private memory support needs this and the
> supported architecture is expected to rewrite the weak
> kvm_arch_has_private_mem().

This kind of ties into the discussion of being able to share memory in
place. For pKVM for example, shared and private memory would have the
same backend, and the unmapping wouldn't be needed.

So I guess that, instead of kvm_arch_has_private_mem(), can the check
be done differently, e.g., with a different function, say
kvm_arch_private_notify_attribute_change() (but maybe with a more
friendly name than what I suggested :) )?

Thanks,
/fuad

>
> Also, during memory attribute changing and the unmapping time frame,
> page fault handler may happen in the same memory range and can cause
> incorrect page state, invoke kvm_mmu_invalidate_* helpers to let the
> page fault handler retry during this time frame.
>
> Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> ---
>  include/linux/kvm_host.h |   7 +-
>  virt/kvm/kvm_main.c      | 168 ++++++++++++++++++++++++++-------------
>  2 files changed, 116 insertions(+), 59 deletions(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 3d69484d2704..3331c0c92838 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -255,7 +255,6 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
>  #endif
>
> -#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
>  struct kvm_gfn_range {
>         struct kvm_memory_slot *slot;
>         gfn_t start;
> @@ -264,6 +263,8 @@ struct kvm_gfn_range {
>         bool may_block;
>  };
>  bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
> +
> +#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
>  bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
>  bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
>  bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> @@ -785,11 +786,12 @@ struct kvm {
>
>  #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
>         struct mmu_notifier mmu_notifier;
> +#endif
>         unsigned long mmu_invalidate_seq;
>         long mmu_invalidate_in_progress;
>         gfn_t mmu_invalidate_range_start;
>         gfn_t mmu_invalidate_range_end;
> -#endif
> +
>         struct list_head devices;
>         u64 manual_dirty_log_protect;
>         struct dentry *debugfs_dentry;
> @@ -1480,6 +1482,7 @@ bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu);
>  int kvm_arch_post_init_vm(struct kvm *kvm);
>  void kvm_arch_pre_destroy_vm(struct kvm *kvm);
>  int kvm_arch_create_vm_debugfs(struct kvm *kvm);
> +bool kvm_arch_has_private_mem(struct kvm *kvm);
>
>  #ifndef __KVM_HAVE_ARCH_VM_ALLOC
>  /*
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index ad55dfbc75d7..4e1e1e113bf0 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -520,6 +520,62 @@ void kvm_destroy_vcpus(struct kvm *kvm)
>  }
>  EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);
>
> +void kvm_mmu_invalidate_begin(struct kvm *kvm)
> +{
> +       /*
> +        * The count increase must become visible at unlock time as no
> +        * spte can be established without taking the mmu_lock and
> +        * count is also read inside the mmu_lock critical section.
> +        */
> +       kvm->mmu_invalidate_in_progress++;
> +
> +       if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> +               kvm->mmu_invalidate_range_start = INVALID_GPA;
> +               kvm->mmu_invalidate_range_end = INVALID_GPA;
> +       }
> +}
> +
> +void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
> +{
> +       WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
> +
> +       if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> +               kvm->mmu_invalidate_range_start = start;
> +               kvm->mmu_invalidate_range_end = end;
> +       } else {
> +               /*
> +                * Fully tracking multiple concurrent ranges has diminishing
> +                * returns. Keep things simple and just find the minimal range
> +                * which includes the current and new ranges. As there won't be
> +                * enough information to subtract a range after its invalidate
> +                * completes, any ranges invalidated concurrently will
> +                * accumulate and persist until all outstanding invalidates
> +                * complete.
> +                */
> +               kvm->mmu_invalidate_range_start =
> +                       min(kvm->mmu_invalidate_range_start, start);
> +               kvm->mmu_invalidate_range_end =
> +                       max(kvm->mmu_invalidate_range_end, end);
> +       }
> +}
> +
> +void kvm_mmu_invalidate_end(struct kvm *kvm)
> +{
> +       /*
> +        * This sequence increase will notify the kvm page fault that
> +        * the page that is going to be mapped in the spte could have
> +        * been freed.
> +        */
> +       kvm->mmu_invalidate_seq++;
> +       smp_wmb();
> +       /*
> +        * The above sequence increase must be visible before the
> +        * below count decrease, which is ensured by the smp_wmb above
> +        * in conjunction with the smp_rmb in mmu_invalidate_retry().
> +        */
> +       kvm->mmu_invalidate_in_progress--;
> +}
> +
>  #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
>  static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
>  {
> @@ -714,45 +770,6 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
>         kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
>  }
>
> -void kvm_mmu_invalidate_begin(struct kvm *kvm)
> -{
> -       /*
> -        * The count increase must become visible at unlock time as no
> -        * spte can be established without taking the mmu_lock and
> -        * count is also read inside the mmu_lock critical section.
> -        */
> -       kvm->mmu_invalidate_in_progress++;
> -
> -       if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> -               kvm->mmu_invalidate_range_start = INVALID_GPA;
> -               kvm->mmu_invalidate_range_end = INVALID_GPA;
> -       }
> -}
> -
> -void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
> -{
> -       WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
> -
> -       if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> -               kvm->mmu_invalidate_range_start = start;
> -               kvm->mmu_invalidate_range_end = end;
> -       } else {
> -               /*
> -                * Fully tracking multiple concurrent ranges has diminishing
> -                * returns. Keep things simple and just find the minimal range
> -                * which includes the current and new ranges. As there won't be
> -                * enough information to subtract a range after its invalidate
> -                * completes, any ranges invalidated concurrently will
> -                * accumulate and persist until all outstanding invalidates
> -                * complete.
> -                */
> -               kvm->mmu_invalidate_range_start =
> -                       min(kvm->mmu_invalidate_range_start, start);
> -               kvm->mmu_invalidate_range_end =
> -                       max(kvm->mmu_invalidate_range_end, end);
> -       }
> -}
> -
>  static bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
>  {
>         kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
> @@ -806,23 +823,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
>         return 0;
>  }
>
> -void kvm_mmu_invalidate_end(struct kvm *kvm)
> -{
> -       /*
> -        * This sequence increase will notify the kvm page fault that
> -        * the page that is going to be mapped in the spte could have
> -        * been freed.
> -        */
> -       kvm->mmu_invalidate_seq++;
> -       smp_wmb();
> -       /*
> -        * The above sequence increase must be visible before the
> -        * below count decrease, which is ensured by the smp_wmb above
> -        * in conjunction with the smp_rmb in mmu_invalidate_retry().
> -        */
> -       kvm->mmu_invalidate_in_progress--;
> -}
> -
>  static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
>                                         const struct mmu_notifier_range *range)
>  {
> @@ -1140,6 +1140,11 @@ int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
>         return 0;
>  }
>
> +bool __weak kvm_arch_has_private_mem(struct kvm *kvm)
> +{
> +       return false;
> +}
> +
>  static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
>  {
>         struct kvm *kvm = kvm_arch_alloc_vm();
> @@ -2349,15 +2354,47 @@ static u64 kvm_supported_mem_attributes(struct kvm *kvm)
>         return 0;
>  }
>
> +static void kvm_unmap_mem_range(struct kvm *kvm, gfn_t start, gfn_t end)
> +{
> +       struct kvm_gfn_range gfn_range;
> +       struct kvm_memory_slot *slot;
> +       struct kvm_memslots *slots;
> +       struct kvm_memslot_iter iter;
> +       int i;
> +       int r = 0;
> +
> +       gfn_range.pte = __pte(0);
> +       gfn_range.may_block = true;
> +
> +       for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
> +               slots = __kvm_memslots(kvm, i);
> +
> +               kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
> +                       slot = iter.slot;
> +                       gfn_range.start = max(start, slot->base_gfn);
> +                       gfn_range.end = min(end, slot->base_gfn + slot->npages);
> +                       if (gfn_range.start >= gfn_range.end)
> +                               continue;
> +                       gfn_range.slot = slot;
> +
> +                       r |= kvm_unmap_gfn_range(kvm, &gfn_range);
> +               }
> +       }
> +
> +       if (r)
> +               kvm_flush_remote_tlbs(kvm);
> +}
> +
>  static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
>                                            struct kvm_memory_attributes *attrs)
>  {
>         gfn_t start, end;
>         unsigned long i;
>         void *entry;
> +       int idx;
>         u64 supported_attrs = kvm_supported_mem_attributes(kvm);
>
> -       /* flags is currently not used. */
> +       /* 'flags' is currently not used. */
>         if (attrs->flags)
>                 return -EINVAL;
>         if (attrs->attributes & ~supported_attrs)
> @@ -2372,6 +2409,13 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
>
>         entry = attrs->attributes ? xa_mk_value(attrs->attributes) : NULL;
>
> +       if (kvm_arch_has_private_mem(kvm)) {
> +               KVM_MMU_LOCK(kvm);
> +               kvm_mmu_invalidate_begin(kvm);
> +               kvm_mmu_invalidate_range_add(kvm, start, end);
> +               KVM_MMU_UNLOCK(kvm);
> +       }
> +
>         mutex_lock(&kvm->lock);
>         for (i = start; i < end; i++)
>                 if (xa_err(xa_store(&kvm->mem_attr_array, i, entry,
> @@ -2379,6 +2423,16 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
>                         break;
>         mutex_unlock(&kvm->lock);
>
> +       if (kvm_arch_has_private_mem(kvm)) {
> +               idx = srcu_read_lock(&kvm->srcu);
> +               KVM_MMU_LOCK(kvm);
> +               if (i > start)
> +                       kvm_unmap_mem_range(kvm, start, i);
> +               kvm_mmu_invalidate_end(kvm);
> +               KVM_MMU_UNLOCK(kvm);
> +               srcu_read_unlock(&kvm->srcu, idx);
> +       }
> +
>         attrs->address = i << PAGE_SHIFT;
>         attrs->size = (end - i) << PAGE_SHIFT;
>
> --
> 2.25.1
>
  
Chao Peng Dec. 8, 2022, 11:13 a.m. UTC | #3
On Wed, Dec 07, 2022 at 05:16:34PM +0000, Fuad Tabba wrote:
> Hi,
> 
> On Fri, Dec 2, 2022 at 6:19 AM Chao Peng <chao.p.peng@linux.intel.com> wrote:
> >
> > Unmap the existing guest mappings when memory attribute is changed
> > between shared and private. This is needed because shared pages and
> > private pages are from different backends, unmapping existing ones
> > gives a chance for page fault handler to re-populate the mappings
> > according to the new attribute.
> >
> > Only architecture has private memory support needs this and the
> > supported architecture is expected to rewrite the weak
> > kvm_arch_has_private_mem().
> 
> This kind of ties into the discussion of being able to share memory in
> place. For pKVM for example, shared and private memory would have the
> same backend, and the unmapping wouldn't be needed.
> 
> So I guess that, instead of kvm_arch_has_private_mem(), can the check
> be done differently, e.g., with a different function, say
> kvm_arch_private_notify_attribute_change() (but maybe with a more
> friendly name than what I suggested :) )?

Besides controlling the unmapping here, kvm_arch_has_private_mem() is
also used to gate the memslot KVM_MEM_PRIVATE flag in patch09. I know
unmapping is confirmed unnecessary for pKVM, but how about
KVM_MEM_PRIVATE? Will pKVM add its own flag or reuse KVM_MEM_PRIVATE?
If the answer is the latter, then yes we should use a different check
which only works for confidential usages here.

Thanks,
Chao
> 
> Thanks,
> /fuad
> 
> >
> > Also, during memory attribute changing and the unmapping time frame,
> > page fault handler may happen in the same memory range and can cause
> > incorrect page state, invoke kvm_mmu_invalidate_* helpers to let the
> > page fault handler retry during this time frame.
> >
> > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> > ---
> >  include/linux/kvm_host.h |   7 +-
> >  virt/kvm/kvm_main.c      | 168 ++++++++++++++++++++++++++-------------
> >  2 files changed, 116 insertions(+), 59 deletions(-)
> >
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 3d69484d2704..3331c0c92838 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -255,7 +255,6 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> >  int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
> >  #endif
> >
> > -#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
> >  struct kvm_gfn_range {
> >         struct kvm_memory_slot *slot;
> >         gfn_t start;
> > @@ -264,6 +263,8 @@ struct kvm_gfn_range {
> >         bool may_block;
> >  };
> >  bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
> > +
> > +#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
> >  bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> >  bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> >  bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> > @@ -785,11 +786,12 @@ struct kvm {
> >
> >  #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
> >         struct mmu_notifier mmu_notifier;
> > +#endif
> >         unsigned long mmu_invalidate_seq;
> >         long mmu_invalidate_in_progress;
> >         gfn_t mmu_invalidate_range_start;
> >         gfn_t mmu_invalidate_range_end;
> > -#endif
> > +
> >         struct list_head devices;
> >         u64 manual_dirty_log_protect;
> >         struct dentry *debugfs_dentry;
> > @@ -1480,6 +1482,7 @@ bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu);
> >  int kvm_arch_post_init_vm(struct kvm *kvm);
> >  void kvm_arch_pre_destroy_vm(struct kvm *kvm);
> >  int kvm_arch_create_vm_debugfs(struct kvm *kvm);
> > +bool kvm_arch_has_private_mem(struct kvm *kvm);
> >
> >  #ifndef __KVM_HAVE_ARCH_VM_ALLOC
> >  /*
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index ad55dfbc75d7..4e1e1e113bf0 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -520,6 +520,62 @@ void kvm_destroy_vcpus(struct kvm *kvm)
> >  }
> >  EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);
> >
> > +void kvm_mmu_invalidate_begin(struct kvm *kvm)
> > +{
> > +       /*
> > +        * The count increase must become visible at unlock time as no
> > +        * spte can be established without taking the mmu_lock and
> > +        * count is also read inside the mmu_lock critical section.
> > +        */
> > +       kvm->mmu_invalidate_in_progress++;
> > +
> > +       if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> > +               kvm->mmu_invalidate_range_start = INVALID_GPA;
> > +               kvm->mmu_invalidate_range_end = INVALID_GPA;
> > +       }
> > +}
> > +
> > +void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
> > +{
> > +       WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
> > +
> > +       if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> > +               kvm->mmu_invalidate_range_start = start;
> > +               kvm->mmu_invalidate_range_end = end;
> > +       } else {
> > +               /*
> > +                * Fully tracking multiple concurrent ranges has diminishing
> > +                * returns. Keep things simple and just find the minimal range
> > +                * which includes the current and new ranges. As there won't be
> > +                * enough information to subtract a range after its invalidate
> > +                * completes, any ranges invalidated concurrently will
> > +                * accumulate and persist until all outstanding invalidates
> > +                * complete.
> > +                */
> > +               kvm->mmu_invalidate_range_start =
> > +                       min(kvm->mmu_invalidate_range_start, start);
> > +               kvm->mmu_invalidate_range_end =
> > +                       max(kvm->mmu_invalidate_range_end, end);
> > +       }
> > +}
> > +
> > +void kvm_mmu_invalidate_end(struct kvm *kvm)
> > +{
> > +       /*
> > +        * This sequence increase will notify the kvm page fault that
> > +        * the page that is going to be mapped in the spte could have
> > +        * been freed.
> > +        */
> > +       kvm->mmu_invalidate_seq++;
> > +       smp_wmb();
> > +       /*
> > +        * The above sequence increase must be visible before the
> > +        * below count decrease, which is ensured by the smp_wmb above
> > +        * in conjunction with the smp_rmb in mmu_invalidate_retry().
> > +        */
> > +       kvm->mmu_invalidate_in_progress--;
> > +}
> > +
> >  #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
> >  static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
> >  {
> > @@ -714,45 +770,6 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
> >         kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
> >  }
> >
> > -void kvm_mmu_invalidate_begin(struct kvm *kvm)
> > -{
> > -       /*
> > -        * The count increase must become visible at unlock time as no
> > -        * spte can be established without taking the mmu_lock and
> > -        * count is also read inside the mmu_lock critical section.
> > -        */
> > -       kvm->mmu_invalidate_in_progress++;
> > -
> > -       if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> > -               kvm->mmu_invalidate_range_start = INVALID_GPA;
> > -               kvm->mmu_invalidate_range_end = INVALID_GPA;
> > -       }
> > -}
> > -
> > -void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
> > -{
> > -       WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
> > -
> > -       if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> > -               kvm->mmu_invalidate_range_start = start;
> > -               kvm->mmu_invalidate_range_end = end;
> > -       } else {
> > -               /*
> > -                * Fully tracking multiple concurrent ranges has diminishing
> > -                * returns. Keep things simple and just find the minimal range
> > -                * which includes the current and new ranges. As there won't be
> > -                * enough information to subtract a range after its invalidate
> > -                * completes, any ranges invalidated concurrently will
> > -                * accumulate and persist until all outstanding invalidates
> > -                * complete.
> > -                */
> > -               kvm->mmu_invalidate_range_start =
> > -                       min(kvm->mmu_invalidate_range_start, start);
> > -               kvm->mmu_invalidate_range_end =
> > -                       max(kvm->mmu_invalidate_range_end, end);
> > -       }
> > -}
> > -
> >  static bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
> >  {
> >         kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
> > @@ -806,23 +823,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
> >         return 0;
> >  }
> >
> > -void kvm_mmu_invalidate_end(struct kvm *kvm)
> > -{
> > -       /*
> > -        * This sequence increase will notify the kvm page fault that
> > -        * the page that is going to be mapped in the spte could have
> > -        * been freed.
> > -        */
> > -       kvm->mmu_invalidate_seq++;
> > -       smp_wmb();
> > -       /*
> > -        * The above sequence increase must be visible before the
> > -        * below count decrease, which is ensured by the smp_wmb above
> > -        * in conjunction with the smp_rmb in mmu_invalidate_retry().
> > -        */
> > -       kvm->mmu_invalidate_in_progress--;
> > -}
> > -
> >  static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
> >                                         const struct mmu_notifier_range *range)
> >  {
> > @@ -1140,6 +1140,11 @@ int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
> >         return 0;
> >  }
> >
> > +bool __weak kvm_arch_has_private_mem(struct kvm *kvm)
> > +{
> > +       return false;
> > +}
> > +
> >  static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
> >  {
> >         struct kvm *kvm = kvm_arch_alloc_vm();
> > @@ -2349,15 +2354,47 @@ static u64 kvm_supported_mem_attributes(struct kvm *kvm)
> >         return 0;
> >  }
> >
> > +static void kvm_unmap_mem_range(struct kvm *kvm, gfn_t start, gfn_t end)
> > +{
> > +       struct kvm_gfn_range gfn_range;
> > +       struct kvm_memory_slot *slot;
> > +       struct kvm_memslots *slots;
> > +       struct kvm_memslot_iter iter;
> > +       int i;
> > +       int r = 0;
> > +
> > +       gfn_range.pte = __pte(0);
> > +       gfn_range.may_block = true;
> > +
> > +       for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
> > +               slots = __kvm_memslots(kvm, i);
> > +
> > +               kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
> > +                       slot = iter.slot;
> > +                       gfn_range.start = max(start, slot->base_gfn);
> > +                       gfn_range.end = min(end, slot->base_gfn + slot->npages);
> > +                       if (gfn_range.start >= gfn_range.end)
> > +                               continue;
> > +                       gfn_range.slot = slot;
> > +
> > +                       r |= kvm_unmap_gfn_range(kvm, &gfn_range);
> > +               }
> > +       }
> > +
> > +       if (r)
> > +               kvm_flush_remote_tlbs(kvm);
> > +}
> > +
> >  static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
> >                                            struct kvm_memory_attributes *attrs)
> >  {
> >         gfn_t start, end;
> >         unsigned long i;
> >         void *entry;
> > +       int idx;
> >         u64 supported_attrs = kvm_supported_mem_attributes(kvm);
> >
> > -       /* flags is currently not used. */
> > +       /* 'flags' is currently not used. */
> >         if (attrs->flags)
> >                 return -EINVAL;
> >         if (attrs->attributes & ~supported_attrs)
> > @@ -2372,6 +2409,13 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
> >
> >         entry = attrs->attributes ? xa_mk_value(attrs->attributes) : NULL;
> >
> > +       if (kvm_arch_has_private_mem(kvm)) {
> > +               KVM_MMU_LOCK(kvm);
> > +               kvm_mmu_invalidate_begin(kvm);
> > +               kvm_mmu_invalidate_range_add(kvm, start, end);
> > +               KVM_MMU_UNLOCK(kvm);
> > +       }
> > +
> >         mutex_lock(&kvm->lock);
> >         for (i = start; i < end; i++)
> >                 if (xa_err(xa_store(&kvm->mem_attr_array, i, entry,
> > @@ -2379,6 +2423,16 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
> >                         break;
> >         mutex_unlock(&kvm->lock);
> >
> > +       if (kvm_arch_has_private_mem(kvm)) {
> > +               idx = srcu_read_lock(&kvm->srcu);
> > +               KVM_MMU_LOCK(kvm);
> > +               if (i > start)
> > +                       kvm_unmap_mem_range(kvm, start, i);
> > +               kvm_mmu_invalidate_end(kvm);
> > +               KVM_MMU_UNLOCK(kvm);
> > +               srcu_read_unlock(&kvm->srcu, idx);
> > +       }
> > +
> >         attrs->address = i << PAGE_SHIFT;
> >         attrs->size = (end - i) << PAGE_SHIFT;
> >
> > --
> > 2.25.1
> >
  
Chao Peng Dec. 8, 2022, 11:20 a.m. UTC | #4
On Wed, Dec 07, 2022 at 04:13:14PM +0800, Yuan Yao wrote:
> On Fri, Dec 02, 2022 at 02:13:44PM +0800, Chao Peng wrote:
> > Unmap the existing guest mappings when memory attribute is changed
> > between shared and private. This is needed because shared pages and
> > private pages are from different backends, unmapping existing ones
> > gives a chance for page fault handler to re-populate the mappings
> > according to the new attribute.
> >
> > Only architecture has private memory support needs this and the
> > supported architecture is expected to rewrite the weak
> > kvm_arch_has_private_mem().
> >
> > Also, during memory attribute changing and the unmapping time frame,
> > page fault handler may happen in the same memory range and can cause
> > incorrect page state, invoke kvm_mmu_invalidate_* helpers to let the
> > page fault handler retry during this time frame.
> >
> > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> > ---
> >  include/linux/kvm_host.h |   7 +-
> >  virt/kvm/kvm_main.c      | 168 ++++++++++++++++++++++++++-------------
> >  2 files changed, 116 insertions(+), 59 deletions(-)
> >
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 3d69484d2704..3331c0c92838 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -255,7 +255,6 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> >  int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
> >  #endif
> >
> > -#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
> >  struct kvm_gfn_range {
> >  	struct kvm_memory_slot *slot;
> >  	gfn_t start;
> > @@ -264,6 +263,8 @@ struct kvm_gfn_range {
> >  	bool may_block;
> >  };
> >  bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
> > +
> > +#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
> >  bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> >  bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> >  bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> > @@ -785,11 +786,12 @@ struct kvm {
> >
> >  #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
> >  	struct mmu_notifier mmu_notifier;
> > +#endif
> >  	unsigned long mmu_invalidate_seq;
> >  	long mmu_invalidate_in_progress;
> >  	gfn_t mmu_invalidate_range_start;
> >  	gfn_t mmu_invalidate_range_end;
> > -#endif
> > +
> >  	struct list_head devices;
> >  	u64 manual_dirty_log_protect;
> >  	struct dentry *debugfs_dentry;
> > @@ -1480,6 +1482,7 @@ bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu);
> >  int kvm_arch_post_init_vm(struct kvm *kvm);
> >  void kvm_arch_pre_destroy_vm(struct kvm *kvm);
> >  int kvm_arch_create_vm_debugfs(struct kvm *kvm);
> > +bool kvm_arch_has_private_mem(struct kvm *kvm);
> >
> >  #ifndef __KVM_HAVE_ARCH_VM_ALLOC
> >  /*
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index ad55dfbc75d7..4e1e1e113bf0 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -520,6 +520,62 @@ void kvm_destroy_vcpus(struct kvm *kvm)
> >  }
> >  EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);
> >
> > +void kvm_mmu_invalidate_begin(struct kvm *kvm)
> > +{
> > +	/*
> > +	 * The count increase must become visible at unlock time as no
> > +	 * spte can be established without taking the mmu_lock and
> > +	 * count is also read inside the mmu_lock critical section.
> > +	 */
> > +	kvm->mmu_invalidate_in_progress++;
> > +
> > +	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> > +		kvm->mmu_invalidate_range_start = INVALID_GPA;
> > +		kvm->mmu_invalidate_range_end = INVALID_GPA;
> > +	}
> > +}
> > +
> > +void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
> > +{
> > +	WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
> > +
> > +	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> > +		kvm->mmu_invalidate_range_start = start;
> > +		kvm->mmu_invalidate_range_end = end;
> > +	} else {
> > +		/*
> > +		 * Fully tracking multiple concurrent ranges has diminishing
> > +		 * returns. Keep things simple and just find the minimal range
> > +		 * which includes the current and new ranges. As there won't be
> > +		 * enough information to subtract a range after its invalidate
> > +		 * completes, any ranges invalidated concurrently will
> > +		 * accumulate and persist until all outstanding invalidates
> > +		 * complete.
> > +		 */
> > +		kvm->mmu_invalidate_range_start =
> > +			min(kvm->mmu_invalidate_range_start, start);
> > +		kvm->mmu_invalidate_range_end =
> > +			max(kvm->mmu_invalidate_range_end, end);
> > +	}
> > +}
> > +
> > +void kvm_mmu_invalidate_end(struct kvm *kvm)
> > +{
> > +	/*
> > +	 * This sequence increase will notify the kvm page fault that
> > +	 * the page that is going to be mapped in the spte could have
> > +	 * been freed.
> > +	 */
> > +	kvm->mmu_invalidate_seq++;
> > +	smp_wmb();
> > +	/*
> > +	 * The above sequence increase must be visible before the
> > +	 * below count decrease, which is ensured by the smp_wmb above
> > +	 * in conjunction with the smp_rmb in mmu_invalidate_retry().
> > +	 */
> > +	kvm->mmu_invalidate_in_progress--;
> > +}
> > +
> >  #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
> >  static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
> >  {
> > @@ -714,45 +770,6 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
> >  	kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
> >  }
> >
> > -void kvm_mmu_invalidate_begin(struct kvm *kvm)
> > -{
> > -	/*
> > -	 * The count increase must become visible at unlock time as no
> > -	 * spte can be established without taking the mmu_lock and
> > -	 * count is also read inside the mmu_lock critical section.
> > -	 */
> > -	kvm->mmu_invalidate_in_progress++;
> > -
> > -	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> > -		kvm->mmu_invalidate_range_start = INVALID_GPA;
> > -		kvm->mmu_invalidate_range_end = INVALID_GPA;
> > -	}
> > -}
> > -
> > -void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
> > -{
> > -	WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
> > -
> > -	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> > -		kvm->mmu_invalidate_range_start = start;
> > -		kvm->mmu_invalidate_range_end = end;
> > -	} else {
> > -		/*
> > -		 * Fully tracking multiple concurrent ranges has diminishing
> > -		 * returns. Keep things simple and just find the minimal range
> > -		 * which includes the current and new ranges. As there won't be
> > -		 * enough information to subtract a range after its invalidate
> > -		 * completes, any ranges invalidated concurrently will
> > -		 * accumulate and persist until all outstanding invalidates
> > -		 * complete.
> > -		 */
> > -		kvm->mmu_invalidate_range_start =
> > -			min(kvm->mmu_invalidate_range_start, start);
> > -		kvm->mmu_invalidate_range_end =
> > -			max(kvm->mmu_invalidate_range_end, end);
> > -	}
> > -}
> > -
> >  static bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
> >  {
> >  	kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
> > @@ -806,23 +823,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
> >  	return 0;
> >  }
> >
> > -void kvm_mmu_invalidate_end(struct kvm *kvm)
> > -{
> > -	/*
> > -	 * This sequence increase will notify the kvm page fault that
> > -	 * the page that is going to be mapped in the spte could have
> > -	 * been freed.
> > -	 */
> > -	kvm->mmu_invalidate_seq++;
> > -	smp_wmb();
> > -	/*
> > -	 * The above sequence increase must be visible before the
> > -	 * below count decrease, which is ensured by the smp_wmb above
> > -	 * in conjunction with the smp_rmb in mmu_invalidate_retry().
> > -	 */
> > -	kvm->mmu_invalidate_in_progress--;
> > -}
> > -
> >  static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
> >  					const struct mmu_notifier_range *range)
> >  {
> > @@ -1140,6 +1140,11 @@ int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
> >  	return 0;
> >  }
> >
> > +bool __weak kvm_arch_has_private_mem(struct kvm *kvm)
> > +{
> > +	return false;
> > +}
> > +
> >  static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
> >  {
> >  	struct kvm *kvm = kvm_arch_alloc_vm();
> > @@ -2349,15 +2354,47 @@ static u64 kvm_supported_mem_attributes(struct kvm *kvm)
> >  	return 0;
> >  }
> >
> > +static void kvm_unmap_mem_range(struct kvm *kvm, gfn_t start, gfn_t end)
> > +{
> > +	struct kvm_gfn_range gfn_range;
> > +	struct kvm_memory_slot *slot;
> > +	struct kvm_memslots *slots;
> > +	struct kvm_memslot_iter iter;
> > +	int i;
> > +	int r = 0;
> > +
> > +	gfn_range.pte = __pte(0);
> > +	gfn_range.may_block = true;
> > +
> > +	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
> > +		slots = __kvm_memslots(kvm, i);
> > +
> > +		kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
> > +			slot = iter.slot;
> > +			gfn_range.start = max(start, slot->base_gfn);
> > +			gfn_range.end = min(end, slot->base_gfn + slot->npages);
> > +			if (gfn_range.start >= gfn_range.end)
> > +				continue;
> > +			gfn_range.slot = slot;
> > +
> > +			r |= kvm_unmap_gfn_range(kvm, &gfn_range);
> > +		}
> > +	}
> > +
> > +	if (r)
> > +		kvm_flush_remote_tlbs(kvm);
> > +}
> > +
> >  static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
> >  					   struct kvm_memory_attributes *attrs)
> >  {
> >  	gfn_t start, end;
> >  	unsigned long i;
> >  	void *entry;
> > +	int idx;
> >  	u64 supported_attrs = kvm_supported_mem_attributes(kvm);
> >
> > -	/* flags is currently not used. */
> > +	/* 'flags' is currently not used. */
> >  	if (attrs->flags)
> >  		return -EINVAL;
> >  	if (attrs->attributes & ~supported_attrs)
> > @@ -2372,6 +2409,13 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
> >
> >  	entry = attrs->attributes ? xa_mk_value(attrs->attributes) : NULL;
> >
> > +	if (kvm_arch_has_private_mem(kvm)) {
> > +		KVM_MMU_LOCK(kvm);
> > +		kvm_mmu_invalidate_begin(kvm);
> > +		kvm_mmu_invalidate_range_add(kvm, start, end);
> 
> Nit: this works for KVM_MEMORY_ATTRIBUTE_PRIVATE, but
> the invalidation should be necessary yet for attribute change of:
> 
> KVM_MEMORY_ATTRIBUTE_READ
> KVM_MEMORY_ATTRIBUTE_WRITE
> KVM_MEMORY_ATTRIBUTE_EXECUTE

The unmapping is only needed for confidential usages which uses
KVM_MEMORY_ATTRIBUTE_PRIVATE only and the other flags are defined here
for other usages like pKVM. As Fuad commented in a different reply, pKVM
supports in-place remapping and unmapping is unnecessary.

Thanks,
Chao
> 
> > +		KVM_MMU_UNLOCK(kvm);
> > +	}
> > +
> >  	mutex_lock(&kvm->lock);
> >  	for (i = start; i < end; i++)
> >  		if (xa_err(xa_store(&kvm->mem_attr_array, i, entry,
> > @@ -2379,6 +2423,16 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
> >  			break;
> >  	mutex_unlock(&kvm->lock);
> >
> > +	if (kvm_arch_has_private_mem(kvm)) {
> > +		idx = srcu_read_lock(&kvm->srcu);
> > +		KVM_MMU_LOCK(kvm);
> > +		if (i > start)
> > +			kvm_unmap_mem_range(kvm, start, i);
> > +		kvm_mmu_invalidate_end(kvm);
> 
> Ditto.
> 
> > +		KVM_MMU_UNLOCK(kvm);
> > +		srcu_read_unlock(&kvm->srcu, idx);
> > +	}
> > +
> >  	attrs->address = i << PAGE_SHIFT;
> >  	attrs->size = (end - i) << PAGE_SHIFT;
> >
> > --
> > 2.25.1
> >
> >
  
Yuan Yao Dec. 9, 2022, 5:43 a.m. UTC | #5
On Thu, Dec 08, 2022 at 07:20:43PM +0800, Chao Peng wrote:
> On Wed, Dec 07, 2022 at 04:13:14PM +0800, Yuan Yao wrote:
> > On Fri, Dec 02, 2022 at 02:13:44PM +0800, Chao Peng wrote:
> > > Unmap the existing guest mappings when memory attribute is changed
> > > between shared and private. This is needed because shared pages and
> > > private pages are from different backends, unmapping existing ones
> > > gives a chance for page fault handler to re-populate the mappings
> > > according to the new attribute.
> > >
> > > Only architecture has private memory support needs this and the
> > > supported architecture is expected to rewrite the weak
> > > kvm_arch_has_private_mem().
> > >
> > > Also, during memory attribute changing and the unmapping time frame,
> > > page fault handler may happen in the same memory range and can cause
> > > incorrect page state, invoke kvm_mmu_invalidate_* helpers to let the
> > > page fault handler retry during this time frame.
> > >
> > > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> > > ---
> > >  include/linux/kvm_host.h |   7 +-
> > >  virt/kvm/kvm_main.c      | 168 ++++++++++++++++++++++++++-------------
> > >  2 files changed, 116 insertions(+), 59 deletions(-)
> > >
> > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > index 3d69484d2704..3331c0c92838 100644
> > > --- a/include/linux/kvm_host.h
> > > +++ b/include/linux/kvm_host.h
> > > @@ -255,7 +255,6 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> > >  int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
> > >  #endif
> > >
> > > -#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
> > >  struct kvm_gfn_range {
> > >  	struct kvm_memory_slot *slot;
> > >  	gfn_t start;
> > > @@ -264,6 +263,8 @@ struct kvm_gfn_range {
> > >  	bool may_block;
> > >  };
> > >  bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
> > > +
> > > +#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
> > >  bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> > >  bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> > >  bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> > > @@ -785,11 +786,12 @@ struct kvm {
> > >
> > >  #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
> > >  	struct mmu_notifier mmu_notifier;
> > > +#endif
> > >  	unsigned long mmu_invalidate_seq;
> > >  	long mmu_invalidate_in_progress;
> > >  	gfn_t mmu_invalidate_range_start;
> > >  	gfn_t mmu_invalidate_range_end;
> > > -#endif
> > > +
> > >  	struct list_head devices;
> > >  	u64 manual_dirty_log_protect;
> > >  	struct dentry *debugfs_dentry;
> > > @@ -1480,6 +1482,7 @@ bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu);
> > >  int kvm_arch_post_init_vm(struct kvm *kvm);
> > >  void kvm_arch_pre_destroy_vm(struct kvm *kvm);
> > >  int kvm_arch_create_vm_debugfs(struct kvm *kvm);
> > > +bool kvm_arch_has_private_mem(struct kvm *kvm);
> > >
> > >  #ifndef __KVM_HAVE_ARCH_VM_ALLOC
> > >  /*
> > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > index ad55dfbc75d7..4e1e1e113bf0 100644
> > > --- a/virt/kvm/kvm_main.c
> > > +++ b/virt/kvm/kvm_main.c
> > > @@ -520,6 +520,62 @@ void kvm_destroy_vcpus(struct kvm *kvm)
> > >  }
> > >  EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);
> > >
> > > +void kvm_mmu_invalidate_begin(struct kvm *kvm)
> > > +{
> > > +	/*
> > > +	 * The count increase must become visible at unlock time as no
> > > +	 * spte can be established without taking the mmu_lock and
> > > +	 * count is also read inside the mmu_lock critical section.
> > > +	 */
> > > +	kvm->mmu_invalidate_in_progress++;
> > > +
> > > +	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> > > +		kvm->mmu_invalidate_range_start = INVALID_GPA;
> > > +		kvm->mmu_invalidate_range_end = INVALID_GPA;
> > > +	}
> > > +}
> > > +
> > > +void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
> > > +{
> > > +	WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
> > > +
> > > +	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> > > +		kvm->mmu_invalidate_range_start = start;
> > > +		kvm->mmu_invalidate_range_end = end;
> > > +	} else {
> > > +		/*
> > > +		 * Fully tracking multiple concurrent ranges has diminishing
> > > +		 * returns. Keep things simple and just find the minimal range
> > > +		 * which includes the current and new ranges. As there won't be
> > > +		 * enough information to subtract a range after its invalidate
> > > +		 * completes, any ranges invalidated concurrently will
> > > +		 * accumulate and persist until all outstanding invalidates
> > > +		 * complete.
> > > +		 */
> > > +		kvm->mmu_invalidate_range_start =
> > > +			min(kvm->mmu_invalidate_range_start, start);
> > > +		kvm->mmu_invalidate_range_end =
> > > +			max(kvm->mmu_invalidate_range_end, end);
> > > +	}
> > > +}
> > > +
> > > +void kvm_mmu_invalidate_end(struct kvm *kvm)
> > > +{
> > > +	/*
> > > +	 * This sequence increase will notify the kvm page fault that
> > > +	 * the page that is going to be mapped in the spte could have
> > > +	 * been freed.
> > > +	 */
> > > +	kvm->mmu_invalidate_seq++;
> > > +	smp_wmb();
> > > +	/*
> > > +	 * The above sequence increase must be visible before the
> > > +	 * below count decrease, which is ensured by the smp_wmb above
> > > +	 * in conjunction with the smp_rmb in mmu_invalidate_retry().
> > > +	 */
> > > +	kvm->mmu_invalidate_in_progress--;
> > > +}
> > > +
> > >  #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
> > >  static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
> > >  {
> > > @@ -714,45 +770,6 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
> > >  	kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
> > >  }
> > >
> > > -void kvm_mmu_invalidate_begin(struct kvm *kvm)
> > > -{
> > > -	/*
> > > -	 * The count increase must become visible at unlock time as no
> > > -	 * spte can be established without taking the mmu_lock and
> > > -	 * count is also read inside the mmu_lock critical section.
> > > -	 */
> > > -	kvm->mmu_invalidate_in_progress++;
> > > -
> > > -	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> > > -		kvm->mmu_invalidate_range_start = INVALID_GPA;
> > > -		kvm->mmu_invalidate_range_end = INVALID_GPA;
> > > -	}
> > > -}
> > > -
> > > -void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
> > > -{
> > > -	WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
> > > -
> > > -	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> > > -		kvm->mmu_invalidate_range_start = start;
> > > -		kvm->mmu_invalidate_range_end = end;
> > > -	} else {
> > > -		/*
> > > -		 * Fully tracking multiple concurrent ranges has diminishing
> > > -		 * returns. Keep things simple and just find the minimal range
> > > -		 * which includes the current and new ranges. As there won't be
> > > -		 * enough information to subtract a range after its invalidate
> > > -		 * completes, any ranges invalidated concurrently will
> > > -		 * accumulate and persist until all outstanding invalidates
> > > -		 * complete.
> > > -		 */
> > > -		kvm->mmu_invalidate_range_start =
> > > -			min(kvm->mmu_invalidate_range_start, start);
> > > -		kvm->mmu_invalidate_range_end =
> > > -			max(kvm->mmu_invalidate_range_end, end);
> > > -	}
> > > -}
> > > -
> > >  static bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
> > >  {
> > >  	kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
> > > @@ -806,23 +823,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
> > >  	return 0;
> > >  }
> > >
> > > -void kvm_mmu_invalidate_end(struct kvm *kvm)
> > > -{
> > > -	/*
> > > -	 * This sequence increase will notify the kvm page fault that
> > > -	 * the page that is going to be mapped in the spte could have
> > > -	 * been freed.
> > > -	 */
> > > -	kvm->mmu_invalidate_seq++;
> > > -	smp_wmb();
> > > -	/*
> > > -	 * The above sequence increase must be visible before the
> > > -	 * below count decrease, which is ensured by the smp_wmb above
> > > -	 * in conjunction with the smp_rmb in mmu_invalidate_retry().
> > > -	 */
> > > -	kvm->mmu_invalidate_in_progress--;
> > > -}
> > > -
> > >  static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
> > >  					const struct mmu_notifier_range *range)
> > >  {
> > > @@ -1140,6 +1140,11 @@ int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
> > >  	return 0;
> > >  }
> > >
> > > +bool __weak kvm_arch_has_private_mem(struct kvm *kvm)
> > > +{
> > > +	return false;
> > > +}
> > > +
> > >  static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
> > >  {
> > >  	struct kvm *kvm = kvm_arch_alloc_vm();
> > > @@ -2349,15 +2354,47 @@ static u64 kvm_supported_mem_attributes(struct kvm *kvm)
> > >  	return 0;
> > >  }
> > >
> > > +static void kvm_unmap_mem_range(struct kvm *kvm, gfn_t start, gfn_t end)
> > > +{
> > > +	struct kvm_gfn_range gfn_range;
> > > +	struct kvm_memory_slot *slot;
> > > +	struct kvm_memslots *slots;
> > > +	struct kvm_memslot_iter iter;
> > > +	int i;
> > > +	int r = 0;
> > > +
> > > +	gfn_range.pte = __pte(0);
> > > +	gfn_range.may_block = true;
> > > +
> > > +	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
> > > +		slots = __kvm_memslots(kvm, i);
> > > +
> > > +		kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
> > > +			slot = iter.slot;
> > > +			gfn_range.start = max(start, slot->base_gfn);
> > > +			gfn_range.end = min(end, slot->base_gfn + slot->npages);
> > > +			if (gfn_range.start >= gfn_range.end)
> > > +				continue;
> > > +			gfn_range.slot = slot;
> > > +
> > > +			r |= kvm_unmap_gfn_range(kvm, &gfn_range);
> > > +		}
> > > +	}
> > > +
> > > +	if (r)
> > > +		kvm_flush_remote_tlbs(kvm);
> > > +}
> > > +
> > >  static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
> > >  					   struct kvm_memory_attributes *attrs)
> > >  {
> > >  	gfn_t start, end;
> > >  	unsigned long i;
> > >  	void *entry;
> > > +	int idx;
> > >  	u64 supported_attrs = kvm_supported_mem_attributes(kvm);
> > >
> > > -	/* flags is currently not used. */
> > > +	/* 'flags' is currently not used. */
> > >  	if (attrs->flags)
> > >  		return -EINVAL;
> > >  	if (attrs->attributes & ~supported_attrs)
> > > @@ -2372,6 +2409,13 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
> > >
> > >  	entry = attrs->attributes ? xa_mk_value(attrs->attributes) : NULL;
> > >
> > > +	if (kvm_arch_has_private_mem(kvm)) {
> > > +		KVM_MMU_LOCK(kvm);
> > > +		kvm_mmu_invalidate_begin(kvm);
> > > +		kvm_mmu_invalidate_range_add(kvm, start, end);
> >
> > Nit: this works for KVM_MEMORY_ATTRIBUTE_PRIVATE, but
> > the invalidation should be necessary yet for attribute change of:
> >
> > KVM_MEMORY_ATTRIBUTE_READ
> > KVM_MEMORY_ATTRIBUTE_WRITE
> > KVM_MEMORY_ATTRIBUTE_EXECUTE
>
> The unmapping is only needed for confidential usages which uses
> KVM_MEMORY_ATTRIBUTE_PRIVATE only and the other flags are defined here
> for other usages like pKVM. As Fuad commented in a different reply, pKVM
> supports in-place remapping and unmapping is unnecessary.

Ah, I see. It's fine to me, thanks.

>
> Thanks,
> Chao
> >
> > > +		KVM_MMU_UNLOCK(kvm);
> > > +	}
> > > +
> > >  	mutex_lock(&kvm->lock);
> > >  	for (i = start; i < end; i++)
> > >  		if (xa_err(xa_store(&kvm->mem_attr_array, i, entry,
> > > @@ -2379,6 +2423,16 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
> > >  			break;
> > >  	mutex_unlock(&kvm->lock);
> > >
> > > +	if (kvm_arch_has_private_mem(kvm)) {
> > > +		idx = srcu_read_lock(&kvm->srcu);
> > > +		KVM_MMU_LOCK(kvm);
> > > +		if (i > start)
> > > +			kvm_unmap_mem_range(kvm, start, i);
> > > +		kvm_mmu_invalidate_end(kvm);
> >
> > Ditto.
> >
> > > +		KVM_MMU_UNLOCK(kvm);
> > > +		srcu_read_unlock(&kvm->srcu, idx);
> > > +	}
> > > +
> > >  	attrs->address = i << PAGE_SHIFT;
> > >  	attrs->size = (end - i) << PAGE_SHIFT;
> > >
> > > --
> > > 2.25.1
> > >
> > >
  
Fuad Tabba Dec. 9, 2022, 8:57 a.m. UTC | #6
Hi,

On Thu, Dec 8, 2022 at 11:18 AM Chao Peng <chao.p.peng@linux.intel.com> wrote:
>
> On Wed, Dec 07, 2022 at 05:16:34PM +0000, Fuad Tabba wrote:
> > Hi,
> >
> > On Fri, Dec 2, 2022 at 6:19 AM Chao Peng <chao.p.peng@linux.intel.com> wrote:
> > >
> > > Unmap the existing guest mappings when memory attribute is changed
> > > between shared and private. This is needed because shared pages and
> > > private pages are from different backends, unmapping existing ones
> > > gives a chance for page fault handler to re-populate the mappings
> > > according to the new attribute.
> > >
> > > Only architecture has private memory support needs this and the
> > > supported architecture is expected to rewrite the weak
> > > kvm_arch_has_private_mem().
> >
> > This kind of ties into the discussion of being able to share memory in
> > place. For pKVM for example, shared and private memory would have the
> > same backend, and the unmapping wouldn't be needed.
> >
> > So I guess that, instead of kvm_arch_has_private_mem(), can the check
> > be done differently, e.g., with a different function, say
> > kvm_arch_private_notify_attribute_change() (but maybe with a more
> > friendly name than what I suggested :) )?
>
> Besides controlling the unmapping here, kvm_arch_has_private_mem() is
> also used to gate the memslot KVM_MEM_PRIVATE flag in patch09. I know
> unmapping is confirmed unnecessary for pKVM, but how about
> KVM_MEM_PRIVATE? Will pKVM add its own flag or reuse KVM_MEM_PRIVATE?
> If the answer is the latter, then yes we should use a different check
> which only works for confidential usages here.

I think it makes sense for pKVM to use the same flag (KVM_MEM_PRIVATE)
and not to add another one.

Thank you,
/fuad



>
> Thanks,
> Chao
> >
> > Thanks,
> > /fuad
> >
> > >
> > > Also, during memory attribute changing and the unmapping time frame,
> > > page fault handler may happen in the same memory range and can cause
> > > incorrect page state, invoke kvm_mmu_invalidate_* helpers to let the
> > > page fault handler retry during this time frame.
> > >
> > > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> > > ---
> > >  include/linux/kvm_host.h |   7 +-
> > >  virt/kvm/kvm_main.c      | 168 ++++++++++++++++++++++++++-------------
> > >  2 files changed, 116 insertions(+), 59 deletions(-)
> > >
> > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > index 3d69484d2704..3331c0c92838 100644
> > > --- a/include/linux/kvm_host.h
> > > +++ b/include/linux/kvm_host.h
> > > @@ -255,7 +255,6 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> > >  int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
> > >  #endif
> > >
> > > -#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
> > >  struct kvm_gfn_range {
> > >         struct kvm_memory_slot *slot;
> > >         gfn_t start;
> > > @@ -264,6 +263,8 @@ struct kvm_gfn_range {
> > >         bool may_block;
> > >  };
> > >  bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
> > > +
> > > +#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
> > >  bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> > >  bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> > >  bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> > > @@ -785,11 +786,12 @@ struct kvm {
> > >
> > >  #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
> > >         struct mmu_notifier mmu_notifier;
> > > +#endif
> > >         unsigned long mmu_invalidate_seq;
> > >         long mmu_invalidate_in_progress;
> > >         gfn_t mmu_invalidate_range_start;
> > >         gfn_t mmu_invalidate_range_end;
> > > -#endif
> > > +
> > >         struct list_head devices;
> > >         u64 manual_dirty_log_protect;
> > >         struct dentry *debugfs_dentry;
> > > @@ -1480,6 +1482,7 @@ bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu);
> > >  int kvm_arch_post_init_vm(struct kvm *kvm);
> > >  void kvm_arch_pre_destroy_vm(struct kvm *kvm);
> > >  int kvm_arch_create_vm_debugfs(struct kvm *kvm);
> > > +bool kvm_arch_has_private_mem(struct kvm *kvm);
> > >
> > >  #ifndef __KVM_HAVE_ARCH_VM_ALLOC
> > >  /*
> > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > index ad55dfbc75d7..4e1e1e113bf0 100644
> > > --- a/virt/kvm/kvm_main.c
> > > +++ b/virt/kvm/kvm_main.c
> > > @@ -520,6 +520,62 @@ void kvm_destroy_vcpus(struct kvm *kvm)
> > >  }
> > >  EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);
> > >
> > > +void kvm_mmu_invalidate_begin(struct kvm *kvm)
> > > +{
> > > +       /*
> > > +        * The count increase must become visible at unlock time as no
> > > +        * spte can be established without taking the mmu_lock and
> > > +        * count is also read inside the mmu_lock critical section.
> > > +        */
> > > +       kvm->mmu_invalidate_in_progress++;
> > > +
> > > +       if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> > > +               kvm->mmu_invalidate_range_start = INVALID_GPA;
> > > +               kvm->mmu_invalidate_range_end = INVALID_GPA;
> > > +       }
> > > +}
> > > +
> > > +void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
> > > +{
> > > +       WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
> > > +
> > > +       if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> > > +               kvm->mmu_invalidate_range_start = start;
> > > +               kvm->mmu_invalidate_range_end = end;
> > > +       } else {
> > > +               /*
> > > +                * Fully tracking multiple concurrent ranges has diminishing
> > > +                * returns. Keep things simple and just find the minimal range
> > > +                * which includes the current and new ranges. As there won't be
> > > +                * enough information to subtract a range after its invalidate
> > > +                * completes, any ranges invalidated concurrently will
> > > +                * accumulate and persist until all outstanding invalidates
> > > +                * complete.
> > > +                */
> > > +               kvm->mmu_invalidate_range_start =
> > > +                       min(kvm->mmu_invalidate_range_start, start);
> > > +               kvm->mmu_invalidate_range_end =
> > > +                       max(kvm->mmu_invalidate_range_end, end);
> > > +       }
> > > +}
> > > +
> > > +void kvm_mmu_invalidate_end(struct kvm *kvm)
> > > +{
> > > +       /*
> > > +        * This sequence increase will notify the kvm page fault that
> > > +        * the page that is going to be mapped in the spte could have
> > > +        * been freed.
> > > +        */
> > > +       kvm->mmu_invalidate_seq++;
> > > +       smp_wmb();
> > > +       /*
> > > +        * The above sequence increase must be visible before the
> > > +        * below count decrease, which is ensured by the smp_wmb above
> > > +        * in conjunction with the smp_rmb in mmu_invalidate_retry().
> > > +        */
> > > +       kvm->mmu_invalidate_in_progress--;
> > > +}
> > > +
> > >  #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
> > >  static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
> > >  {
> > > @@ -714,45 +770,6 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
> > >         kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
> > >  }
> > >
> > > -void kvm_mmu_invalidate_begin(struct kvm *kvm)
> > > -{
> > > -       /*
> > > -        * The count increase must become visible at unlock time as no
> > > -        * spte can be established without taking the mmu_lock and
> > > -        * count is also read inside the mmu_lock critical section.
> > > -        */
> > > -       kvm->mmu_invalidate_in_progress++;
> > > -
> > > -       if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> > > -               kvm->mmu_invalidate_range_start = INVALID_GPA;
> > > -               kvm->mmu_invalidate_range_end = INVALID_GPA;
> > > -       }
> > > -}
> > > -
> > > -void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
> > > -{
> > > -       WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
> > > -
> > > -       if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> > > -               kvm->mmu_invalidate_range_start = start;
> > > -               kvm->mmu_invalidate_range_end = end;
> > > -       } else {
> > > -               /*
> > > -                * Fully tracking multiple concurrent ranges has diminishing
> > > -                * returns. Keep things simple and just find the minimal range
> > > -                * which includes the current and new ranges. As there won't be
> > > -                * enough information to subtract a range after its invalidate
> > > -                * completes, any ranges invalidated concurrently will
> > > -                * accumulate and persist until all outstanding invalidates
> > > -                * complete.
> > > -                */
> > > -               kvm->mmu_invalidate_range_start =
> > > -                       min(kvm->mmu_invalidate_range_start, start);
> > > -               kvm->mmu_invalidate_range_end =
> > > -                       max(kvm->mmu_invalidate_range_end, end);
> > > -       }
> > > -}
> > > -
> > >  static bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
> > >  {
> > >         kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
> > > @@ -806,23 +823,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
> > >         return 0;
> > >  }
> > >
> > > -void kvm_mmu_invalidate_end(struct kvm *kvm)
> > > -{
> > > -       /*
> > > -        * This sequence increase will notify the kvm page fault that
> > > -        * the page that is going to be mapped in the spte could have
> > > -        * been freed.
> > > -        */
> > > -       kvm->mmu_invalidate_seq++;
> > > -       smp_wmb();
> > > -       /*
> > > -        * The above sequence increase must be visible before the
> > > -        * below count decrease, which is ensured by the smp_wmb above
> > > -        * in conjunction with the smp_rmb in mmu_invalidate_retry().
> > > -        */
> > > -       kvm->mmu_invalidate_in_progress--;
> > > -}
> > > -
> > >  static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
> > >                                         const struct mmu_notifier_range *range)
> > >  {
> > > @@ -1140,6 +1140,11 @@ int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
> > >         return 0;
> > >  }
> > >
> > > +bool __weak kvm_arch_has_private_mem(struct kvm *kvm)
> > > +{
> > > +       return false;
> > > +}
> > > +
> > >  static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
> > >  {
> > >         struct kvm *kvm = kvm_arch_alloc_vm();
> > > @@ -2349,15 +2354,47 @@ static u64 kvm_supported_mem_attributes(struct kvm *kvm)
> > >         return 0;
> > >  }
> > >
> > > +static void kvm_unmap_mem_range(struct kvm *kvm, gfn_t start, gfn_t end)
> > > +{
> > > +       struct kvm_gfn_range gfn_range;
> > > +       struct kvm_memory_slot *slot;
> > > +       struct kvm_memslots *slots;
> > > +       struct kvm_memslot_iter iter;
> > > +       int i;
> > > +       int r = 0;
> > > +
> > > +       gfn_range.pte = __pte(0);
> > > +       gfn_range.may_block = true;
> > > +
> > > +       for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
> > > +               slots = __kvm_memslots(kvm, i);
> > > +
> > > +               kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
> > > +                       slot = iter.slot;
> > > +                       gfn_range.start = max(start, slot->base_gfn);
> > > +                       gfn_range.end = min(end, slot->base_gfn + slot->npages);
> > > +                       if (gfn_range.start >= gfn_range.end)
> > > +                               continue;
> > > +                       gfn_range.slot = slot;
> > > +
> > > +                       r |= kvm_unmap_gfn_range(kvm, &gfn_range);
> > > +               }
> > > +       }
> > > +
> > > +       if (r)
> > > +               kvm_flush_remote_tlbs(kvm);
> > > +}
> > > +
> > >  static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
> > >                                            struct kvm_memory_attributes *attrs)
> > >  {
> > >         gfn_t start, end;
> > >         unsigned long i;
> > >         void *entry;
> > > +       int idx;
> > >         u64 supported_attrs = kvm_supported_mem_attributes(kvm);
> > >
> > > -       /* flags is currently not used. */
> > > +       /* 'flags' is currently not used. */
> > >         if (attrs->flags)
> > >                 return -EINVAL;
> > >         if (attrs->attributes & ~supported_attrs)
> > > @@ -2372,6 +2409,13 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
> > >
> > >         entry = attrs->attributes ? xa_mk_value(attrs->attributes) : NULL;
> > >
> > > +       if (kvm_arch_has_private_mem(kvm)) {
> > > +               KVM_MMU_LOCK(kvm);
> > > +               kvm_mmu_invalidate_begin(kvm);
> > > +               kvm_mmu_invalidate_range_add(kvm, start, end);
> > > +               KVM_MMU_UNLOCK(kvm);
> > > +       }
> > > +
> > >         mutex_lock(&kvm->lock);
> > >         for (i = start; i < end; i++)
> > >                 if (xa_err(xa_store(&kvm->mem_attr_array, i, entry,
> > > @@ -2379,6 +2423,16 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
> > >                         break;
> > >         mutex_unlock(&kvm->lock);
> > >
> > > +       if (kvm_arch_has_private_mem(kvm)) {
> > > +               idx = srcu_read_lock(&kvm->srcu);
> > > +               KVM_MMU_LOCK(kvm);
> > > +               if (i > start)
> > > +                       kvm_unmap_mem_range(kvm, start, i);
> > > +               kvm_mmu_invalidate_end(kvm);
> > > +               KVM_MMU_UNLOCK(kvm);
> > > +               srcu_read_unlock(&kvm->srcu, idx);
> > > +       }
> > > +
> > >         attrs->address = i << PAGE_SHIFT;
> > >         attrs->size = (end - i) << PAGE_SHIFT;
> > >
> > > --
> > > 2.25.1
> > >
  
Chao Peng Dec. 12, 2022, 7:22 a.m. UTC | #7
On Fri, Dec 09, 2022 at 08:57:31AM +0000, Fuad Tabba wrote:
> Hi,
> 
> On Thu, Dec 8, 2022 at 11:18 AM Chao Peng <chao.p.peng@linux.intel.com> wrote:
> >
> > On Wed, Dec 07, 2022 at 05:16:34PM +0000, Fuad Tabba wrote:
> > > Hi,
> > >
> > > On Fri, Dec 2, 2022 at 6:19 AM Chao Peng <chao.p.peng@linux.intel.com> wrote:
> > > >
> > > > Unmap the existing guest mappings when memory attribute is changed
> > > > between shared and private. This is needed because shared pages and
> > > > private pages are from different backends, unmapping existing ones
> > > > gives a chance for page fault handler to re-populate the mappings
> > > > according to the new attribute.
> > > >
> > > > Only architecture has private memory support needs this and the
> > > > supported architecture is expected to rewrite the weak
> > > > kvm_arch_has_private_mem().
> > >
> > > This kind of ties into the discussion of being able to share memory in
> > > place. For pKVM for example, shared and private memory would have the
> > > same backend, and the unmapping wouldn't be needed.
> > >
> > > So I guess that, instead of kvm_arch_has_private_mem(), can the check
> > > be done differently, e.g., with a different function, say
> > > kvm_arch_private_notify_attribute_change() (but maybe with a more
> > > friendly name than what I suggested :) )?
> >
> > Besides controlling the unmapping here, kvm_arch_has_private_mem() is
> > also used to gate the memslot KVM_MEM_PRIVATE flag in patch09. I know
> > unmapping is confirmed unnecessary for pKVM, but how about
> > KVM_MEM_PRIVATE? Will pKVM add its own flag or reuse KVM_MEM_PRIVATE?
> > If the answer is the latter, then yes we should use a different check
> > which only works for confidential usages here.
> 
> I think it makes sense for pKVM to use the same flag (KVM_MEM_PRIVATE)
> and not to add another one.

Thanks for the reply.
Chao
> 
> Thank you,
> /fuad
> 
> 
> 
> >
> > Thanks,
> > Chao
> > >
> > > Thanks,
> > > /fuad
> > >
> > > >
> > > > Also, during memory attribute changing and the unmapping time frame,
> > > > page fault handler may happen in the same memory range and can cause
> > > > incorrect page state, invoke kvm_mmu_invalidate_* helpers to let the
> > > > page fault handler retry during this time frame.
> > > >
> > > > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> > > > ---
> > > >  include/linux/kvm_host.h |   7 +-
> > > >  virt/kvm/kvm_main.c      | 168 ++++++++++++++++++++++++++-------------
> > > >  2 files changed, 116 insertions(+), 59 deletions(-)
> > > >
> > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > index 3d69484d2704..3331c0c92838 100644
> > > > --- a/include/linux/kvm_host.h
> > > > +++ b/include/linux/kvm_host.h
> > > > @@ -255,7 +255,6 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> > > >  int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
> > > >  #endif
> > > >
> > > > -#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
> > > >  struct kvm_gfn_range {
> > > >         struct kvm_memory_slot *slot;
> > > >         gfn_t start;
> > > > @@ -264,6 +263,8 @@ struct kvm_gfn_range {
> > > >         bool may_block;
> > > >  };
> > > >  bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
> > > > +
> > > > +#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
> > > >  bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> > > >  bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> > > >  bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> > > > @@ -785,11 +786,12 @@ struct kvm {
> > > >
> > > >  #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
> > > >         struct mmu_notifier mmu_notifier;
> > > > +#endif
> > > >         unsigned long mmu_invalidate_seq;
> > > >         long mmu_invalidate_in_progress;
> > > >         gfn_t mmu_invalidate_range_start;
> > > >         gfn_t mmu_invalidate_range_end;
> > > > -#endif
> > > > +
> > > >         struct list_head devices;
> > > >         u64 manual_dirty_log_protect;
> > > >         struct dentry *debugfs_dentry;
> > > > @@ -1480,6 +1482,7 @@ bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu);
> > > >  int kvm_arch_post_init_vm(struct kvm *kvm);
> > > >  void kvm_arch_pre_destroy_vm(struct kvm *kvm);
> > > >  int kvm_arch_create_vm_debugfs(struct kvm *kvm);
> > > > +bool kvm_arch_has_private_mem(struct kvm *kvm);
> > > >
> > > >  #ifndef __KVM_HAVE_ARCH_VM_ALLOC
> > > >  /*
> > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > > index ad55dfbc75d7..4e1e1e113bf0 100644
> > > > --- a/virt/kvm/kvm_main.c
> > > > +++ b/virt/kvm/kvm_main.c
> > > > @@ -520,6 +520,62 @@ void kvm_destroy_vcpus(struct kvm *kvm)
> > > >  }
> > > >  EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);
> > > >
> > > > +void kvm_mmu_invalidate_begin(struct kvm *kvm)
> > > > +{
> > > > +       /*
> > > > +        * The count increase must become visible at unlock time as no
> > > > +        * spte can be established without taking the mmu_lock and
> > > > +        * count is also read inside the mmu_lock critical section.
> > > > +        */
> > > > +       kvm->mmu_invalidate_in_progress++;
> > > > +
> > > > +       if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> > > > +               kvm->mmu_invalidate_range_start = INVALID_GPA;
> > > > +               kvm->mmu_invalidate_range_end = INVALID_GPA;
> > > > +       }
> > > > +}
> > > > +
> > > > +void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
> > > > +{
> > > > +       WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
> > > > +
> > > > +       if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> > > > +               kvm->mmu_invalidate_range_start = start;
> > > > +               kvm->mmu_invalidate_range_end = end;
> > > > +       } else {
> > > > +               /*
> > > > +                * Fully tracking multiple concurrent ranges has diminishing
> > > > +                * returns. Keep things simple and just find the minimal range
> > > > +                * which includes the current and new ranges. As there won't be
> > > > +                * enough information to subtract a range after its invalidate
> > > > +                * completes, any ranges invalidated concurrently will
> > > > +                * accumulate and persist until all outstanding invalidates
> > > > +                * complete.
> > > > +                */
> > > > +               kvm->mmu_invalidate_range_start =
> > > > +                       min(kvm->mmu_invalidate_range_start, start);
> > > > +               kvm->mmu_invalidate_range_end =
> > > > +                       max(kvm->mmu_invalidate_range_end, end);
> > > > +       }
> > > > +}
> > > > +
> > > > +void kvm_mmu_invalidate_end(struct kvm *kvm)
> > > > +{
> > > > +       /*
> > > > +        * This sequence increase will notify the kvm page fault that
> > > > +        * the page that is going to be mapped in the spte could have
> > > > +        * been freed.
> > > > +        */
> > > > +       kvm->mmu_invalidate_seq++;
> > > > +       smp_wmb();
> > > > +       /*
> > > > +        * The above sequence increase must be visible before the
> > > > +        * below count decrease, which is ensured by the smp_wmb above
> > > > +        * in conjunction with the smp_rmb in mmu_invalidate_retry().
> > > > +        */
> > > > +       kvm->mmu_invalidate_in_progress--;
> > > > +}
> > > > +
> > > >  #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
> > > >  static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
> > > >  {
> > > > @@ -714,45 +770,6 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
> > > >         kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
> > > >  }
> > > >
> > > > -void kvm_mmu_invalidate_begin(struct kvm *kvm)
> > > > -{
> > > > -       /*
> > > > -        * The count increase must become visible at unlock time as no
> > > > -        * spte can be established without taking the mmu_lock and
> > > > -        * count is also read inside the mmu_lock critical section.
> > > > -        */
> > > > -       kvm->mmu_invalidate_in_progress++;
> > > > -
> > > > -       if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> > > > -               kvm->mmu_invalidate_range_start = INVALID_GPA;
> > > > -               kvm->mmu_invalidate_range_end = INVALID_GPA;
> > > > -       }
> > > > -}
> > > > -
> > > > -void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
> > > > -{
> > > > -       WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
> > > > -
> > > > -       if (likely(kvm->mmu_invalidate_in_progress == 1)) {
> > > > -               kvm->mmu_invalidate_range_start = start;
> > > > -               kvm->mmu_invalidate_range_end = end;
> > > > -       } else {
> > > > -               /*
> > > > -                * Fully tracking multiple concurrent ranges has diminishing
> > > > -                * returns. Keep things simple and just find the minimal range
> > > > -                * which includes the current and new ranges. As there won't be
> > > > -                * enough information to subtract a range after its invalidate
> > > > -                * completes, any ranges invalidated concurrently will
> > > > -                * accumulate and persist until all outstanding invalidates
> > > > -                * complete.
> > > > -                */
> > > > -               kvm->mmu_invalidate_range_start =
> > > > -                       min(kvm->mmu_invalidate_range_start, start);
> > > > -               kvm->mmu_invalidate_range_end =
> > > > -                       max(kvm->mmu_invalidate_range_end, end);
> > > > -       }
> > > > -}
> > > > -
> > > >  static bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
> > > >  {
> > > >         kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
> > > > @@ -806,23 +823,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
> > > >         return 0;
> > > >  }
> > > >
> > > > -void kvm_mmu_invalidate_end(struct kvm *kvm)
> > > > -{
> > > > -       /*
> > > > -        * This sequence increase will notify the kvm page fault that
> > > > -        * the page that is going to be mapped in the spte could have
> > > > -        * been freed.
> > > > -        */
> > > > -       kvm->mmu_invalidate_seq++;
> > > > -       smp_wmb();
> > > > -       /*
> > > > -        * The above sequence increase must be visible before the
> > > > -        * below count decrease, which is ensured by the smp_wmb above
> > > > -        * in conjunction with the smp_rmb in mmu_invalidate_retry().
> > > > -        */
> > > > -       kvm->mmu_invalidate_in_progress--;
> > > > -}
> > > > -
> > > >  static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
> > > >                                         const struct mmu_notifier_range *range)
> > > >  {
> > > > @@ -1140,6 +1140,11 @@ int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
> > > >         return 0;
> > > >  }
> > > >
> > > > +bool __weak kvm_arch_has_private_mem(struct kvm *kvm)
> > > > +{
> > > > +       return false;
> > > > +}
> > > > +
> > > >  static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
> > > >  {
> > > >         struct kvm *kvm = kvm_arch_alloc_vm();
> > > > @@ -2349,15 +2354,47 @@ static u64 kvm_supported_mem_attributes(struct kvm *kvm)
> > > >         return 0;
> > > >  }
> > > >
> > > > +static void kvm_unmap_mem_range(struct kvm *kvm, gfn_t start, gfn_t end)
> > > > +{
> > > > +       struct kvm_gfn_range gfn_range;
> > > > +       struct kvm_memory_slot *slot;
> > > > +       struct kvm_memslots *slots;
> > > > +       struct kvm_memslot_iter iter;
> > > > +       int i;
> > > > +       int r = 0;
> > > > +
> > > > +       gfn_range.pte = __pte(0);
> > > > +       gfn_range.may_block = true;
> > > > +
> > > > +       for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
> > > > +               slots = __kvm_memslots(kvm, i);
> > > > +
> > > > +               kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
> > > > +                       slot = iter.slot;
> > > > +                       gfn_range.start = max(start, slot->base_gfn);
> > > > +                       gfn_range.end = min(end, slot->base_gfn + slot->npages);
> > > > +                       if (gfn_range.start >= gfn_range.end)
> > > > +                               continue;
> > > > +                       gfn_range.slot = slot;
> > > > +
> > > > +                       r |= kvm_unmap_gfn_range(kvm, &gfn_range);
> > > > +               }
> > > > +       }
> > > > +
> > > > +       if (r)
> > > > +               kvm_flush_remote_tlbs(kvm);
> > > > +}
> > > > +
> > > >  static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
> > > >                                            struct kvm_memory_attributes *attrs)
> > > >  {
> > > >         gfn_t start, end;
> > > >         unsigned long i;
> > > >         void *entry;
> > > > +       int idx;
> > > >         u64 supported_attrs = kvm_supported_mem_attributes(kvm);
> > > >
> > > > -       /* flags is currently not used. */
> > > > +       /* 'flags' is currently not used. */
> > > >         if (attrs->flags)
> > > >                 return -EINVAL;
> > > >         if (attrs->attributes & ~supported_attrs)
> > > > @@ -2372,6 +2409,13 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
> > > >
> > > >         entry = attrs->attributes ? xa_mk_value(attrs->attributes) : NULL;
> > > >
> > > > +       if (kvm_arch_has_private_mem(kvm)) {
> > > > +               KVM_MMU_LOCK(kvm);
> > > > +               kvm_mmu_invalidate_begin(kvm);
> > > > +               kvm_mmu_invalidate_range_add(kvm, start, end);
> > > > +               KVM_MMU_UNLOCK(kvm);
> > > > +       }
> > > > +
> > > >         mutex_lock(&kvm->lock);
> > > >         for (i = start; i < end; i++)
> > > >                 if (xa_err(xa_store(&kvm->mem_attr_array, i, entry,
> > > > @@ -2379,6 +2423,16 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
> > > >                         break;
> > > >         mutex_unlock(&kvm->lock);
> > > >
> > > > +       if (kvm_arch_has_private_mem(kvm)) {
> > > > +               idx = srcu_read_lock(&kvm->srcu);
> > > > +               KVM_MMU_LOCK(kvm);
> > > > +               if (i > start)
> > > > +                       kvm_unmap_mem_range(kvm, start, i);
> > > > +               kvm_mmu_invalidate_end(kvm);
> > > > +               KVM_MMU_UNLOCK(kvm);
> > > > +               srcu_read_unlock(&kvm->srcu, idx);
> > > > +       }
> > > > +
> > > >         attrs->address = i << PAGE_SHIFT;
> > > >         attrs->size = (end - i) << PAGE_SHIFT;
> > > >
> > > > --
> > > > 2.25.1
> > > >
  
Kai Huang Dec. 13, 2022, 11:51 p.m. UTC | #8
On Fri, 2022-12-02 at 14:13 +0800, Chao Peng wrote:
>  
> -	/* flags is currently not used. */
> +	/* 'flags' is currently not used. */
>  	if (attrs->flags)
>  		return -EINVAL;

Unintended code change.
  
Chao Peng Dec. 19, 2022, 7:54 a.m. UTC | #9
On Tue, Dec 13, 2022 at 11:51:25PM +0000, Huang, Kai wrote:
> On Fri, 2022-12-02 at 14:13 +0800, Chao Peng wrote:
> >  
> > -	/* flags is currently not used. */
> > +	/* 'flags' is currently not used. */
> >  	if (attrs->flags)
> >  		return -EINVAL;
> 
> Unintended code change.

Yeah!

Chao
  
Sean Christopherson Jan. 13, 2023, 10:50 p.m. UTC | #10
On Fri, Dec 02, 2022, Chao Peng wrote:
> @@ -785,11 +786,12 @@ struct kvm {
>  
>  #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
>  	struct mmu_notifier mmu_notifier;
> +#endif
>  	unsigned long mmu_invalidate_seq;
>  	long mmu_invalidate_in_progress;
>  	gfn_t mmu_invalidate_range_start;
>  	gfn_t mmu_invalidate_range_end;
> -#endif

Blech.  The existing code is a bit ugly, and trying to extend for this use case
makes things even worse.

Rather than use the base MMU_NOTIFIER Kconfig and an arbitrary define, I think we
should first add a proper Kconfig, e.g. KVM_GENERIC_MMU_NOTIFIER, to replace the
combination.  E.g

	config KVM_GENERIC_MMU_NOTIFIER
	       select MMU_NOTIFIER
	       bool

and then all architectures that currently #define KVM_ARCH_WANT_MMU_NOTIFIER can
simply select the Kconfig, which is everything except s390.  "GENERIC" again because
s390 does select MMU_NOTIFER and actually registers its own notifier for s390's
version of protected VMs (at least, I think that's what its "pv" stands for).

And then later down the line in this series, when the attributes and private mem
needs to tie into the notifiers, we can do:


	config KVM_GENERIC_MEMORY_ATTRIBUTES
	       select KVM_GENERIC_MMU_NOTIFIER
	       bool

I.e. that way this patch doesn't need to partially expose KVM's notifier stuff
and can instead just keep the soon-to-be-existing KVM_GENERIC_MMU_NOTIFIER.

Taking a depending on KVM_GENERIC_MMU_NOTIFIER for KVM_GENERIC_MEMORY_ATTRIBUTES
makes sense, because AFAICT, changing any type of attribute, e.g. RWX bits, is
going to necessitate unmapping the affected gfn range.

>  	struct list_head devices;
>  	u64 manual_dirty_log_protect;
>  	struct dentry *debugfs_dentry;
> @@ -1480,6 +1482,7 @@ bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu);
>  int kvm_arch_post_init_vm(struct kvm *kvm);
>  void kvm_arch_pre_destroy_vm(struct kvm *kvm);
>  int kvm_arch_create_vm_debugfs(struct kvm *kvm);
> +bool kvm_arch_has_private_mem(struct kvm *kvm);

The reference to private memory belongs in a later patch.  More below.

> +static void kvm_unmap_mem_range(struct kvm *kvm, gfn_t start, gfn_t end)
> +{
> +	struct kvm_gfn_range gfn_range;
> +	struct kvm_memory_slot *slot;
> +	struct kvm_memslots *slots;
> +	struct kvm_memslot_iter iter;
> +	int i;
> +	int r = 0;

The return from kvm_unmap_gfn_range() is a bool, this should be:

	bool flush = false;

> +
> +	gfn_range.pte = __pte(0);
> +	gfn_range.may_block = true;
> +
> +	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
> +		slots = __kvm_memslots(kvm, i);
> +
> +		kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
> +			slot = iter.slot;
> +			gfn_range.start = max(start, slot->base_gfn);
> +			gfn_range.end = min(end, slot->base_gfn + slot->npages);
> +			if (gfn_range.start >= gfn_range.end)
> +				continue;
> +			gfn_range.slot = slot;
> +
> +			r |= kvm_unmap_gfn_range(kvm, &gfn_range);
> +		}
> +	}
> +
> +	if (r)
> +		kvm_flush_remote_tlbs(kvm);
> +}
> +
>  static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
>  					   struct kvm_memory_attributes *attrs)
>  {
>  	gfn_t start, end;
>  	unsigned long i;
>  	void *entry;
> +	int idx;
>  	u64 supported_attrs = kvm_supported_mem_attributes(kvm);
>  
> -	/* flags is currently not used. */
> +	/* 'flags' is currently not used. */

Kind of a spurious change.

>  	if (attrs->flags)
>  		return -EINVAL;
>  	if (attrs->attributes & ~supported_attrs)
> @@ -2372,6 +2409,13 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
>  
>  	entry = attrs->attributes ? xa_mk_value(attrs->attributes) : NULL;
>  
> +	if (kvm_arch_has_private_mem(kvm)) {

I think we should assume that any future attributes will necessitate unmapping
and invalidation, i.e. drop the private mem check.  That allows introducing
kvm_arch_has_private_mem() in a later patch that is more directly related to
private memory.

> +		KVM_MMU_LOCK(kvm);
> +		kvm_mmu_invalidate_begin(kvm);
> +		kvm_mmu_invalidate_range_add(kvm, start, end);
> +		KVM_MMU_UNLOCK(kvm);
> +	}
> +
>  	mutex_lock(&kvm->lock);
>  	for (i = start; i < end; i++)
>  		if (xa_err(xa_store(&kvm->mem_attr_array, i, entry,
> @@ -2379,6 +2423,16 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
>  			break;
>  	mutex_unlock(&kvm->lock);
>  
> +	if (kvm_arch_has_private_mem(kvm)) {
> +		idx = srcu_read_lock(&kvm->srcu);

Mostly for reference, this goes away if slots_lock is used instead of kvm->lock.

> +		KVM_MMU_LOCK(kvm);
> +		if (i > start)
> +			kvm_unmap_mem_range(kvm, start, i);
> +		kvm_mmu_invalidate_end(kvm);
> +		KVM_MMU_UNLOCK(kvm);
> +		srcu_read_unlock(&kvm->srcu, idx);
> +	}
> +
>  	attrs->address = i << PAGE_SHIFT;
>  	attrs->size = (end - i) << PAGE_SHIFT;
>  
> -- 
> 2.25.1
>
  

Patch

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3d69484d2704..3331c0c92838 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -255,7 +255,6 @@  bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
-#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
 struct kvm_gfn_range {
 	struct kvm_memory_slot *slot;
 	gfn_t start;
@@ -264,6 +263,8 @@  struct kvm_gfn_range {
 	bool may_block;
 };
 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
+
+#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
@@ -785,11 +786,12 @@  struct kvm {
 
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 	struct mmu_notifier mmu_notifier;
+#endif
 	unsigned long mmu_invalidate_seq;
 	long mmu_invalidate_in_progress;
 	gfn_t mmu_invalidate_range_start;
 	gfn_t mmu_invalidate_range_end;
-#endif
+
 	struct list_head devices;
 	u64 manual_dirty_log_protect;
 	struct dentry *debugfs_dentry;
@@ -1480,6 +1482,7 @@  bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_post_init_vm(struct kvm *kvm);
 void kvm_arch_pre_destroy_vm(struct kvm *kvm);
 int kvm_arch_create_vm_debugfs(struct kvm *kvm);
+bool kvm_arch_has_private_mem(struct kvm *kvm);
 
 #ifndef __KVM_HAVE_ARCH_VM_ALLOC
 /*
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ad55dfbc75d7..4e1e1e113bf0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -520,6 +520,62 @@  void kvm_destroy_vcpus(struct kvm *kvm)
 }
 EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);
 
+void kvm_mmu_invalidate_begin(struct kvm *kvm)
+{
+	/*
+	 * The count increase must become visible at unlock time as no
+	 * spte can be established without taking the mmu_lock and
+	 * count is also read inside the mmu_lock critical section.
+	 */
+	kvm->mmu_invalidate_in_progress++;
+
+	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
+		kvm->mmu_invalidate_range_start = INVALID_GPA;
+		kvm->mmu_invalidate_range_end = INVALID_GPA;
+	}
+}
+
+void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
+{
+	WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
+
+	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
+		kvm->mmu_invalidate_range_start = start;
+		kvm->mmu_invalidate_range_end = end;
+	} else {
+		/*
+		 * Fully tracking multiple concurrent ranges has diminishing
+		 * returns. Keep things simple and just find the minimal range
+		 * which includes the current and new ranges. As there won't be
+		 * enough information to subtract a range after its invalidate
+		 * completes, any ranges invalidated concurrently will
+		 * accumulate and persist until all outstanding invalidates
+		 * complete.
+		 */
+		kvm->mmu_invalidate_range_start =
+			min(kvm->mmu_invalidate_range_start, start);
+		kvm->mmu_invalidate_range_end =
+			max(kvm->mmu_invalidate_range_end, end);
+	}
+}
+
+void kvm_mmu_invalidate_end(struct kvm *kvm)
+{
+	/*
+	 * This sequence increase will notify the kvm page fault that
+	 * the page that is going to be mapped in the spte could have
+	 * been freed.
+	 */
+	kvm->mmu_invalidate_seq++;
+	smp_wmb();
+	/*
+	 * The above sequence increase must be visible before the
+	 * below count decrease, which is ensured by the smp_wmb above
+	 * in conjunction with the smp_rmb in mmu_invalidate_retry().
+	 */
+	kvm->mmu_invalidate_in_progress--;
+}
+
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
 {
@@ -714,45 +770,6 @@  static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 	kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
 }
 
-void kvm_mmu_invalidate_begin(struct kvm *kvm)
-{
-	/*
-	 * The count increase must become visible at unlock time as no
-	 * spte can be established without taking the mmu_lock and
-	 * count is also read inside the mmu_lock critical section.
-	 */
-	kvm->mmu_invalidate_in_progress++;
-
-	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
-		kvm->mmu_invalidate_range_start = INVALID_GPA;
-		kvm->mmu_invalidate_range_end = INVALID_GPA;
-	}
-}
-
-void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
-{
-	WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
-
-	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
-		kvm->mmu_invalidate_range_start = start;
-		kvm->mmu_invalidate_range_end = end;
-	} else {
-		/*
-		 * Fully tracking multiple concurrent ranges has diminishing
-		 * returns. Keep things simple and just find the minimal range
-		 * which includes the current and new ranges. As there won't be
-		 * enough information to subtract a range after its invalidate
-		 * completes, any ranges invalidated concurrently will
-		 * accumulate and persist until all outstanding invalidates
-		 * complete.
-		 */
-		kvm->mmu_invalidate_range_start =
-			min(kvm->mmu_invalidate_range_start, start);
-		kvm->mmu_invalidate_range_end =
-			max(kvm->mmu_invalidate_range_end, end);
-	}
-}
-
 static bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
@@ -806,23 +823,6 @@  static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	return 0;
 }
 
-void kvm_mmu_invalidate_end(struct kvm *kvm)
-{
-	/*
-	 * This sequence increase will notify the kvm page fault that
-	 * the page that is going to be mapped in the spte could have
-	 * been freed.
-	 */
-	kvm->mmu_invalidate_seq++;
-	smp_wmb();
-	/*
-	 * The above sequence increase must be visible before the
-	 * below count decrease, which is ensured by the smp_wmb above
-	 * in conjunction with the smp_rmb in mmu_invalidate_retry().
-	 */
-	kvm->mmu_invalidate_in_progress--;
-}
-
 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 					const struct mmu_notifier_range *range)
 {
@@ -1140,6 +1140,11 @@  int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
 	return 0;
 }
 
+bool __weak kvm_arch_has_private_mem(struct kvm *kvm)
+{
+	return false;
+}
+
 static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 {
 	struct kvm *kvm = kvm_arch_alloc_vm();
@@ -2349,15 +2354,47 @@  static u64 kvm_supported_mem_attributes(struct kvm *kvm)
 	return 0;
 }
 
+static void kvm_unmap_mem_range(struct kvm *kvm, gfn_t start, gfn_t end)
+{
+	struct kvm_gfn_range gfn_range;
+	struct kvm_memory_slot *slot;
+	struct kvm_memslots *slots;
+	struct kvm_memslot_iter iter;
+	int i;
+	int r = 0;
+
+	gfn_range.pte = __pte(0);
+	gfn_range.may_block = true;
+
+	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+		slots = __kvm_memslots(kvm, i);
+
+		kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
+			slot = iter.slot;
+			gfn_range.start = max(start, slot->base_gfn);
+			gfn_range.end = min(end, slot->base_gfn + slot->npages);
+			if (gfn_range.start >= gfn_range.end)
+				continue;
+			gfn_range.slot = slot;
+
+			r |= kvm_unmap_gfn_range(kvm, &gfn_range);
+		}
+	}
+
+	if (r)
+		kvm_flush_remote_tlbs(kvm);
+}
+
 static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
 					   struct kvm_memory_attributes *attrs)
 {
 	gfn_t start, end;
 	unsigned long i;
 	void *entry;
+	int idx;
 	u64 supported_attrs = kvm_supported_mem_attributes(kvm);
 
-	/* flags is currently not used. */
+	/* 'flags' is currently not used. */
 	if (attrs->flags)
 		return -EINVAL;
 	if (attrs->attributes & ~supported_attrs)
@@ -2372,6 +2409,13 @@  static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
 
 	entry = attrs->attributes ? xa_mk_value(attrs->attributes) : NULL;
 
+	if (kvm_arch_has_private_mem(kvm)) {
+		KVM_MMU_LOCK(kvm);
+		kvm_mmu_invalidate_begin(kvm);
+		kvm_mmu_invalidate_range_add(kvm, start, end);
+		KVM_MMU_UNLOCK(kvm);
+	}
+
 	mutex_lock(&kvm->lock);
 	for (i = start; i < end; i++)
 		if (xa_err(xa_store(&kvm->mem_attr_array, i, entry,
@@ -2379,6 +2423,16 @@  static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
 			break;
 	mutex_unlock(&kvm->lock);
 
+	if (kvm_arch_has_private_mem(kvm)) {
+		idx = srcu_read_lock(&kvm->srcu);
+		KVM_MMU_LOCK(kvm);
+		if (i > start)
+			kvm_unmap_mem_range(kvm, start, i);
+		kvm_mmu_invalidate_end(kvm);
+		KVM_MMU_UNLOCK(kvm);
+		srcu_read_unlock(&kvm->srcu, idx);
+	}
+
 	attrs->address = i << PAGE_SHIFT;
 	attrs->size = (end - i) << PAGE_SHIFT;