[v7,2/8] KVM: Introduce __kvm_follow_pfn function

Message ID 20230704075054.3344915-3-stevensd@google.com
State New
Series KVM: allow mapping non-refcounted pages

Commit Message

David Stevens July 4, 2023, 7:50 a.m. UTC
  From: David Stevens <stevensd@chromium.org>

Introduce __kvm_follow_pfn, which will replace __gfn_to_pfn_memslot.
__kvm_follow_pfn refactors the old API's arguments into a struct and,
where possible, combines the boolean arguments into a single flags
argument.

Signed-off-by: David Stevens <stevensd@chromium.org>
---
 include/linux/kvm_host.h |  16 ++++
 virt/kvm/kvm_main.c      | 171 ++++++++++++++++++++++-----------------
 virt/kvm/kvm_mm.h        |   3 +-
 virt/kvm/pfncache.c      |   8 +-
 4 files changed, 122 insertions(+), 76 deletions(-)
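
For orientation, a minimal sketch of how a caller moves from the old
boolean-argument API to the struct-based one (assembled from the hunks
below; the surrounding fault-handling context is illustrative only):

	/* Old API: a pile of booleans and out-pointers. */
	pfn = __gfn_to_pfn_memslot(slot, gfn, false, false, &async,
				   write_fault, &writable, &hva);

	/* New API: inputs gathered in a struct, FOLL_* bits for the
	 * flags, outputs read back from the same struct.
	 */
	struct kvm_follow_pfn foll = {
		.slot = slot,
		.gfn = gfn,
		.flags = write_fault ? FOLL_WRITE : 0,
		.allow_write_mapping = true,
	};

	pfn = __kvm_follow_pfn(&foll);
	hva = foll.hva;
	writable = foll.writable;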
  

Comments

Yu Zhang July 5, 2023, 3:10 a.m. UTC | #1
> @@ -2514,35 +2512,26 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
>   * The slow path to get the pfn of the specified host virtual address,
>   * 1 indicates success, -errno is returned if error is detected.
>   */
> -static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> -			   bool interruptible, bool *writable, kvm_pfn_t *pfn)
> +static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
>  {
> -	unsigned int flags = FOLL_HWPOISON;
> +	unsigned int flags = FOLL_HWPOISON | FOLL_GET | foll->flags;
>  	struct page *page;
>  	int npages;
>  
>  	might_sleep();
>  
> -	if (writable)
> -		*writable = write_fault;
> -
> -	if (write_fault)
> -		flags |= FOLL_WRITE;
> -	if (async)
> -		flags |= FOLL_NOWAIT;
> -	if (interruptible)
> -		flags |= FOLL_INTERRUPTIBLE;
> -
> -	npages = get_user_pages_unlocked(addr, 1, &page, flags);
> +	npages = get_user_pages_unlocked(foll->hva, 1, &page, flags);
>  	if (npages != 1)
>  		return npages;
>  
> +	foll->writable = (foll->flags & FOLL_WRITE) && foll->allow_write_mapping;
> +
>  	/* map read fault as writable if possible */
> -	if (unlikely(!write_fault) && writable) {
> +	if (unlikely(!foll->writable) && foll->allow_write_mapping) {

I guess !foll->writable should be !(foll->flags & FOLL_WRITE) here.

>  		struct page *wpage;
>  
> -		if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
> -			*writable = true;
> +		if (get_user_page_fast_only(foll->hva, FOLL_WRITE, &wpage)) {
> +			foll->writable = true;
>  			put_page(page);
>  			page = wpage;
>  		}
> @@ -2572,23 +2561,23 @@ static int kvm_try_get_pfn(kvm_pfn_t pfn)
>  	return get_page_unless_zero(page);
>  }
>  
...

> +kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
> +			       bool atomic, bool interruptible, bool *async,
> +			       bool write_fault, bool *writable, hva_t *hva)
> +{
> +	kvm_pfn_t pfn;
> +	struct kvm_follow_pfn foll = {
> +		.slot = slot,
> +		.gfn = gfn,
> +		.flags = 0,
> +		.atomic = atomic,
> +		.allow_write_mapping = !!writable,
> +	};
> +
> +	if (write_fault)
> +		foll.flags |= FOLL_WRITE;
> +	if (async)
> +		foll.flags |= FOLL_NOWAIT;
> +	if (interruptible)
> +		foll.flags |= FOLL_INTERRUPTIBLE;
> +
> +	pfn = __kvm_follow_pfn(&foll);
> +	if (pfn == KVM_PFN_ERR_NEEDS_IO) {

Could we just use KVM_PFN_ERR_FAULT and foll.flags here? I.e.,
	if (pfn == KVM_PFN_ERR_FAULT && (foll.flags & FOLL_NOWAIT))?
Setting pfn to KVM_PFN_ERR_NEEDS_IO just to indicate an async fault
seems unnecessary.

> +		*async = true;
> +		pfn = KVM_PFN_ERR_FAULT;
> +	}
> +	if (hva)
> +		*hva = foll.hva;
> +	if (writable)
> +		*writable = foll.writable;
> +	return pfn;
>  }
>  EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
>  

B.R.
Yu
  
Zhi Wang July 5, 2023, 8:47 a.m. UTC | #2
On Tue,  4 Jul 2023 16:50:47 +0900
David Stevens <stevensd@chromium.org> wrote:

> From: David Stevens <stevensd@chromium.org>
> 
> Introduce __kvm_follow_pfn, which will replace __gfn_to_pfn_memslot.
> __kvm_follow_pfn refactors the old API's arguments into a struct and,
> where possible, combines the boolean arguments into a single flags
> argument.
> 
> Signed-off-by: David Stevens <stevensd@chromium.org>
> ---
>  include/linux/kvm_host.h |  16 ++++
>  virt/kvm/kvm_main.c      | 171 ++++++++++++++++++++++-----------------
>  virt/kvm/kvm_mm.h        |   3 +-
>  virt/kvm/pfncache.c      |   8 +-
>  4 files changed, 122 insertions(+), 76 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 9d3ac7720da9..ef2763c2b12e 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -97,6 +97,7 @@
>  #define KVM_PFN_ERR_HWPOISON	(KVM_PFN_ERR_MASK + 1)
>  #define KVM_PFN_ERR_RO_FAULT	(KVM_PFN_ERR_MASK + 2)
>  #define KVM_PFN_ERR_SIGPENDING	(KVM_PFN_ERR_MASK + 3)
> +#define KVM_PFN_ERR_NEEDS_IO	(KVM_PFN_ERR_MASK + 4)
>  
>  /*
>   * error pfns indicate that the gfn is in slot but faild to
> @@ -1156,6 +1157,21 @@ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn,
>  void kvm_release_page_clean(struct page *page);
>  void kvm_release_page_dirty(struct page *page);
>  
> +struct kvm_follow_pfn {
> +	const struct kvm_memory_slot *slot;
> +	gfn_t gfn;
> +	unsigned int flags;
> +	bool atomic;
> +	/* Allow a read fault to create a writeable mapping. */
> +	bool allow_write_mapping;
> +
> +	/* Outputs of __kvm_follow_pfn */
> +	hva_t hva;
> +	bool writable;
> +};
> +
> +kvm_pfn_t __kvm_follow_pfn(struct kvm_follow_pfn *foll);
> +
>  kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
>  kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
>  		      bool *writable);
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 371bd783ff2b..b13f22861d2f 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2486,24 +2486,22 @@ static inline int check_user_page_hwpoison(unsigned long addr)
>   * true indicates success, otherwise false is returned.  It's also the
>   * only part that runs if we can in atomic context.
>   */
> -static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
> -			    bool *writable, kvm_pfn_t *pfn)
> +static bool hva_to_pfn_fast(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
>  {
>  	struct page *page[1];
> +	bool write_fault = foll->flags & FOLL_WRITE;
>  
>  	/*
>  	 * Fast pin a writable pfn only if it is a write fault request
>  	 * or the caller allows to map a writable pfn for a read fault
>  	 * request.
>  	 */
> -	if (!(write_fault || writable))
> +	if (!(write_fault || foll->allow_write_mapping))
>  		return false;
>  
> -	if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
> +	if (get_user_page_fast_only(foll->hva, FOLL_WRITE, page)) {
>  		*pfn = page_to_pfn(page[0]);
> -
> -		if (writable)
> -			*writable = true;
> +		foll->writable = foll->allow_write_mapping;
>  		return true;
>  	}
>  
> @@ -2514,35 +2512,26 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
>   * The slow path to get the pfn of the specified host virtual address,
>   * 1 indicates success, -errno is returned if error is detected.
>   */
> -static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> -			   bool interruptible, bool *writable, kvm_pfn_t *pfn)
> +static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
>  {
> -	unsigned int flags = FOLL_HWPOISON;
> +	unsigned int flags = FOLL_HWPOISON | FOLL_GET | foll->flags;
>  	struct page *page;
>  	int npages;
>  
>  	might_sleep();
>  
> -	if (writable)
> -		*writable = write_fault;
> -
> -	if (write_fault)
> -		flags |= FOLL_WRITE;
> -	if (async)
> -		flags |= FOLL_NOWAIT;
> -	if (interruptible)
> -		flags |= FOLL_INTERRUPTIBLE;
> -
> -	npages = get_user_pages_unlocked(addr, 1, &page, flags);
> +	npages = get_user_pages_unlocked(foll->hva, 1, &page, flags);
>  	if (npages != 1)
>  		return npages;
>  
> +	foll->writable = (foll->flags & FOLL_WRITE) && foll->allow_write_mapping;
> +
>  	/* map read fault as writable if possible */
> -	if (unlikely(!write_fault) && writable) {
> +	if (unlikely(!foll->writable) && foll->allow_write_mapping) {
>  		struct page *wpage;
>  
> -		if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
> -			*writable = true;
> +		if (get_user_page_fast_only(foll->hva, FOLL_WRITE, &wpage)) {
> +			foll->writable = true;
>  			put_page(page);
>  			page = wpage;
>  		}
> @@ -2572,23 +2561,23 @@ static int kvm_try_get_pfn(kvm_pfn_t pfn)
>  	return get_page_unless_zero(page);
>  }
>  
> -static int hva_to_pfn_remapped(struct vm_area_struct *vma,
> -			       unsigned long addr, bool write_fault,
> -			       bool *writable, kvm_pfn_t *p_pfn)
> +static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn *foll,
> +			       kvm_pfn_t *p_pfn)
>  {
>  	kvm_pfn_t pfn;
>  	pte_t *ptep;
>  	spinlock_t *ptl;
> +	bool write_fault = foll->flags & FOLL_WRITE;
>  	int r;
>  
> -	r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
> +	r = follow_pte(vma->vm_mm, foll->hva, &ptep, &ptl);
>  	if (r) {
>  		/*
>  		 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
>  		 * not call the fault handler, so do it here.
>  		 */
>  		bool unlocked = false;
> -		r = fixup_user_fault(current->mm, addr,
> +		r = fixup_user_fault(current->mm, foll->hva,
>  				     (write_fault ? FAULT_FLAG_WRITE : 0),
>  				     &unlocked);
>  		if (unlocked)
> @@ -2596,7 +2585,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
>  		if (r)
>  			return r;
>  
> -		r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
> +		r = follow_pte(vma->vm_mm, foll->hva, &ptep, &ptl);
>  		if (r)
>  			return r;
>  	}
> @@ -2606,8 +2595,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
>  		goto out;
>  	}
>  
> -	if (writable)
> -		*writable = pte_write(*ptep);
> +	foll->writable = pte_write(*ptep) && foll->allow_write_mapping;
>  	pfn = pte_pfn(*ptep);
>  
>  	/*
> @@ -2652,24 +2640,22 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
>   * 2): @write_fault = false && @writable, @writable will tell the caller
>   *     whether the mapping is writable.
>   */
> -kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
> -		     bool *async, bool write_fault, bool *writable)
> +kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *foll)
>  {
>  	struct vm_area_struct *vma;
>  	kvm_pfn_t pfn;
>  	int npages, r;
>  
>  	/* we can do it either atomically or asynchronously, not both */
> -	BUG_ON(atomic && async);
> +	BUG_ON(foll->atomic && (foll->flags & FOLL_NOWAIT));
>  
> -	if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
> +	if (hva_to_pfn_fast(foll, &pfn))
>  		return pfn;
>  
> -	if (atomic)
> +	if (foll->atomic)
>  		return KVM_PFN_ERR_FAULT;
>  
> -	npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
> -				 writable, &pfn);
> +	npages = hva_to_pfn_slow(foll, &pfn);
>  	if (npages == 1)
>  		return pfn;
>  	if (npages == -EINTR)
> @@ -2677,83 +2663,122 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
>  
>  	mmap_read_lock(current->mm);
>  	if (npages == -EHWPOISON ||
> -	      (!async && check_user_page_hwpoison(addr))) {
> +	      (!(foll->flags & FOLL_NOWAIT) && check_user_page_hwpoison(foll->hva))) {
>  		pfn = KVM_PFN_ERR_HWPOISON;
>  		goto exit;
>  	}
>  
>  retry:
> -	vma = vma_lookup(current->mm, addr);
> +	vma = vma_lookup(current->mm, foll->hva);
>  
>  	if (vma == NULL)
>  		pfn = KVM_PFN_ERR_FAULT;
>  	else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
> -		r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
> +		r = hva_to_pfn_remapped(vma, foll, &pfn);
>  		if (r == -EAGAIN)
>  			goto retry;
>  		if (r < 0)
>  			pfn = KVM_PFN_ERR_FAULT;
>  	} else {
> -		if (async && vma_is_valid(vma, write_fault))
> -			*async = true;
> -		pfn = KVM_PFN_ERR_FAULT;
> +		if ((foll->flags & FOLL_NOWAIT) &&
> +		    vma_is_valid(vma, foll->flags & FOLL_WRITE))
> +			pfn = KVM_PFN_ERR_NEEDS_IO;
> +		else
> +			pfn = KVM_PFN_ERR_FAULT;
>  	}
>  exit:
>  	mmap_read_unlock(current->mm);
>  	return pfn;
>  }
>  
> -kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
> -			       bool atomic, bool interruptible, bool *async,
> -			       bool write_fault, bool *writable, hva_t *hva)
> +kvm_pfn_t __kvm_follow_pfn(struct kvm_follow_pfn *foll)
>  {
> -	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
> -
> -	if (hva)
> -		*hva = addr;
> +	foll->hva = __gfn_to_hva_many(foll->slot, foll->gfn, NULL,
> +				      foll->flags & FOLL_WRITE);
>  
> -	if (addr == KVM_HVA_ERR_RO_BAD) {
> -		if (writable)
> -			*writable = false;
> +	if (foll->hva == KVM_HVA_ERR_RO_BAD)
>  		return KVM_PFN_ERR_RO_FAULT;
> -	}
>  

Can you explain why updating foll->writable = false (previously *writable
= false) is omitted here?

In the callers where the struct kvm_follow_pfn is initialized, e.g.
__gfn_to_pfn_memslot()/gfn_to_pfn_prot(), .writable is not initialized.
IIUC, they expect __kvm_follow_pfn() to update it and return .writable to
the upper caller.

As one of the outputs, it would be better to either initialize it in the
caller or update it in __kvm_follow_pfn(). Otherwise
__gfn_to_pfn_memslot()/gfn_to_pfn_prot() will return random stack data to
the caller via bool *writable, which doesn't sound nice.
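
For illustration, either of the following would address this (sketch
only, not part of the posted patch):

	/* In the callee, default the output before anything can fail: */
	kvm_pfn_t __kvm_follow_pfn(struct kvm_follow_pfn *foll)
	{
		foll->writable = false;
		foll->hva = __gfn_to_hva_many(foll->slot, foll->gfn, NULL,
					      foll->flags & FOLL_WRITE);
		...
	}

or, in the callers, an explicit ".writable = false" in the designated
initializer.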

BTW: It seems both "writable" and "writeable" are used in this patch. I am
wondering whether we can make them consistent.

> -	if (kvm_is_error_hva(addr)) {
> -		if (writable)
> -			*writable = false;
> +	if (kvm_is_error_hva(foll->hva))
>  		return KVM_PFN_NOSLOT;
> -	}
>  
> -	/* Do not map writable pfn in the readonly memslot. */
> -	if (writable && memslot_is_readonly(slot)) {
> -		*writable = false;
> -		writable = NULL;
> -	}
> +	if (memslot_is_readonly(foll->slot))
> +		foll->allow_write_mapping = false;
> +
> +	return hva_to_pfn(foll);
> +}
> +EXPORT_SYMBOL_GPL(__kvm_follow_pfn);
>  
> -	return hva_to_pfn(addr, atomic, interruptible, async, write_fault,
> -			  writable);
> +kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
> +			       bool atomic, bool interruptible, bool *async,
> +			       bool write_fault, bool *writable, hva_t *hva)
> +{
> +	kvm_pfn_t pfn;
> +	struct kvm_follow_pfn foll = {
> +		.slot = slot,
> +		.gfn = gfn,
> +		.flags = 0,
> +		.atomic = atomic,
> +		.allow_write_mapping = !!writable,
> +	};
> +
> +	if (write_fault)
> +		foll.flags |= FOLL_WRITE;
> +	if (async)
> +		foll.flags |= FOLL_NOWAIT;
> +	if (interruptible)
> +		foll.flags |= FOLL_INTERRUPTIBLE;
> +
> +	pfn = __kvm_follow_pfn(&foll);
> +	if (pfn == KVM_PFN_ERR_NEEDS_IO) {
> +		*async = true;
> +		pfn = KVM_PFN_ERR_FAULT;
> +	}
> +	if (hva)
> +		*hva = foll.hva;
> +	if (writable)
> +		*writable = foll.writable;
> +	return pfn;
>  }
>  EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
>  
>  kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
>  		      bool *writable)
>  {
> -	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
> -				    NULL, write_fault, writable, NULL);
> +	kvm_pfn_t pfn;
> +	struct kvm_follow_pfn foll = {
> +		.slot = gfn_to_memslot(kvm, gfn),
> +		.gfn = gfn,
> +		.flags = write_fault ? FOLL_WRITE : 0,
> +		.allow_write_mapping = !!writable,
> +	};
> +	pfn = __kvm_follow_pfn(&foll);
> +	if (writable)
> +		*writable = foll.writable;
> +	return pfn;
>  }
>  EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
>  
>  kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
>  {
> -	return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true,
> -				    NULL, NULL);
> +	struct kvm_follow_pfn foll = {
> +		.slot = slot,
> +		.gfn = gfn,
> +		.flags = FOLL_WRITE,
> +	};
> +	return __kvm_follow_pfn(&foll);
>  }
>  EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
>  
>  kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
>  {
> -	return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true,
> -				    NULL, NULL);
> +	struct kvm_follow_pfn foll = {
> +		.slot = slot,
> +		.gfn = gfn,
> +		.flags = FOLL_WRITE,
> +		.atomic = true,
> +	};
> +	return __kvm_follow_pfn(&foll);
>  }
>  EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
>  
> diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
> index 180f1a09e6ba..ed896aee5396 100644
> --- a/virt/kvm/kvm_mm.h
> +++ b/virt/kvm/kvm_mm.h
> @@ -20,8 +20,7 @@
>  #define KVM_MMU_UNLOCK(kvm)		spin_unlock(&(kvm)->mmu_lock)
>  #endif /* KVM_HAVE_MMU_RWLOCK */
>  
> -kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
> -		     bool *async, bool write_fault, bool *writable);
> +kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *foll);
>  
>  #ifdef CONFIG_HAVE_KVM_PFNCACHE
>  void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
> diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
> index 2d6aba677830..e3fefa753a51 100644
> --- a/virt/kvm/pfncache.c
> +++ b/virt/kvm/pfncache.c
> @@ -144,6 +144,12 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
>  	kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
>  	void *new_khva = NULL;
>  	unsigned long mmu_seq;
> +	struct kvm_follow_pfn foll = {
> +		.slot = gpc->memslot,
> +		.gfn = gpa_to_gfn(gpc->gpa),
> +		.flags = FOLL_WRITE,
> +		.hva = gpc->uhva,
> +	};
>  
>  	lockdep_assert_held(&gpc->refresh_lock);
>  
> @@ -183,7 +189,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
>  		}
>  
>  		/* We always request a writeable mapping */
> -		new_pfn = hva_to_pfn(gpc->uhva, false, false, NULL, true, NULL);
> +		new_pfn = hva_to_pfn(&foll);
>  		if (is_error_noslot_pfn(new_pfn))
>  			goto out_error;
>
  
David Stevens July 5, 2023, 9:08 a.m. UTC | #3
On Wed, Jul 5, 2023 at 5:47 PM Zhi Wang <zhi.wang.linux@gmail.com> wrote:
>
> On Tue,  4 Jul 2023 16:50:47 +0900
> David Stevens <stevensd@chromium.org> wrote:
>
> > From: David Stevens <stevensd@chromium.org>
> >
> > Introduce __kvm_follow_pfn, which will replace __gfn_to_pfn_memslot.
> > __kvm_follow_pfn refactors the old API's arguments into a struct and,
> > where possible, combines the boolean arguments into a single flags
> > argument.
> >
> > Signed-off-by: David Stevens <stevensd@chromium.org>
> > ---
> >  include/linux/kvm_host.h |  16 ++++
> >  virt/kvm/kvm_main.c      | 171 ++++++++++++++++++++++-----------------
> >  virt/kvm/kvm_mm.h        |   3 +-
> >  virt/kvm/pfncache.c      |   8 +-
> >  4 files changed, 122 insertions(+), 76 deletions(-)
> >
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 9d3ac7720da9..ef2763c2b12e 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -97,6 +97,7 @@
> >  #define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1)
> >  #define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 2)
> >  #define KVM_PFN_ERR_SIGPENDING       (KVM_PFN_ERR_MASK + 3)
> > +#define KVM_PFN_ERR_NEEDS_IO (KVM_PFN_ERR_MASK + 4)
> >
> >  /*
> >   * error pfns indicate that the gfn is in slot but faild to
> > @@ -1156,6 +1157,21 @@ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn,
> >  void kvm_release_page_clean(struct page *page);
> >  void kvm_release_page_dirty(struct page *page);
> >
> > +struct kvm_follow_pfn {
> > +     const struct kvm_memory_slot *slot;
> > +     gfn_t gfn;
> > +     unsigned int flags;
> > +     bool atomic;
> > +     /* Allow a read fault to create a writeable mapping. */
> > +     bool allow_write_mapping;
> > +
> > +     /* Outputs of __kvm_follow_pfn */
> > +     hva_t hva;
> > +     bool writable;
> > +};
> > +
> > +kvm_pfn_t __kvm_follow_pfn(struct kvm_follow_pfn *foll);
> > +
> >  kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
> >  kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
> >                     bool *writable);
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 371bd783ff2b..b13f22861d2f 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -2486,24 +2486,22 @@ static inline int check_user_page_hwpoison(unsigned long addr)
> >   * true indicates success, otherwise false is returned.  It's also the
> >   * only part that runs if we can in atomic context.
> >   */
> > -static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
> > -                         bool *writable, kvm_pfn_t *pfn)
> > +static bool hva_to_pfn_fast(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
> >  {
> >       struct page *page[1];
> > +     bool write_fault = foll->flags & FOLL_WRITE;
> >
> >       /*
> >        * Fast pin a writable pfn only if it is a write fault request
> >        * or the caller allows to map a writable pfn for a read fault
> >        * request.
> >        */
> > -     if (!(write_fault || writable))
> > +     if (!(write_fault || foll->allow_write_mapping))
> >               return false;
> >
> > -     if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
> > +     if (get_user_page_fast_only(foll->hva, FOLL_WRITE, page)) {
> >               *pfn = page_to_pfn(page[0]);
> > -
> > -             if (writable)
> > -                     *writable = true;
> > +             foll->writable = foll->allow_write_mapping;
> >               return true;
> >       }
> >
> > @@ -2514,35 +2512,26 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
> >   * The slow path to get the pfn of the specified host virtual address,
> >   * 1 indicates success, -errno is returned if error is detected.
> >   */
> > -static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> > -                        bool interruptible, bool *writable, kvm_pfn_t *pfn)
> > +static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
> >  {
> > -     unsigned int flags = FOLL_HWPOISON;
> > +     unsigned int flags = FOLL_HWPOISON | FOLL_GET | foll->flags;
> >       struct page *page;
> >       int npages;
> >
> >       might_sleep();
> >
> > -     if (writable)
> > -             *writable = write_fault;
> > -
> > -     if (write_fault)
> > -             flags |= FOLL_WRITE;
> > -     if (async)
> > -             flags |= FOLL_NOWAIT;
> > -     if (interruptible)
> > -             flags |= FOLL_INTERRUPTIBLE;
> > -
> > -     npages = get_user_pages_unlocked(addr, 1, &page, flags);
> > +     npages = get_user_pages_unlocked(foll->hva, 1, &page, flags);
> >       if (npages != 1)
> >               return npages;
> >
> > +     foll->writable = (foll->flags & FOLL_WRITE) && foll->allow_write_mapping;
> > +
> >       /* map read fault as writable if possible */
> > -     if (unlikely(!write_fault) && writable) {
> > +     if (unlikely(!foll->writable) && foll->allow_write_mapping) {
> >               struct page *wpage;
> >
> > -             if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
> > -                     *writable = true;
> > +             if (get_user_page_fast_only(foll->hva, FOLL_WRITE, &wpage)) {
> > +                     foll->writable = true;
> >                       put_page(page);
> >                       page = wpage;
> >               }
> > @@ -2572,23 +2561,23 @@ static int kvm_try_get_pfn(kvm_pfn_t pfn)
> >       return get_page_unless_zero(page);
> >  }
> >
> > -static int hva_to_pfn_remapped(struct vm_area_struct *vma,
> > -                            unsigned long addr, bool write_fault,
> > -                            bool *writable, kvm_pfn_t *p_pfn)
> > +static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn *foll,
> > +                            kvm_pfn_t *p_pfn)
> >  {
> >       kvm_pfn_t pfn;
> >       pte_t *ptep;
> >       spinlock_t *ptl;
> > +     bool write_fault = foll->flags & FOLL_WRITE;
> >       int r;
> >
> > -     r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
> > +     r = follow_pte(vma->vm_mm, foll->hva, &ptep, &ptl);
> >       if (r) {
> >               /*
> >                * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
> >                * not call the fault handler, so do it here.
> >                */
> >               bool unlocked = false;
> > -             r = fixup_user_fault(current->mm, addr,
> > +             r = fixup_user_fault(current->mm, foll->hva,
> >                                    (write_fault ? FAULT_FLAG_WRITE : 0),
> >                                    &unlocked);
> >               if (unlocked)
> > @@ -2596,7 +2585,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
> >               if (r)
> >                       return r;
> >
> > -             r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
> > +             r = follow_pte(vma->vm_mm, foll->hva, &ptep, &ptl);
> >               if (r)
> >                       return r;
> >       }
> > @@ -2606,8 +2595,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
> >               goto out;
> >       }
> >
> > -     if (writable)
> > -             *writable = pte_write(*ptep);
> > +     foll->writable = pte_write(*ptep) && foll->allow_write_mapping;
> >       pfn = pte_pfn(*ptep);
> >
> >       /*
> > @@ -2652,24 +2640,22 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
> >   * 2): @write_fault = false && @writable, @writable will tell the caller
> >   *     whether the mapping is writable.
> >   */
> > -kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
> > -                  bool *async, bool write_fault, bool *writable)
> > +kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *foll)
> >  {
> >       struct vm_area_struct *vma;
> >       kvm_pfn_t pfn;
> >       int npages, r;
> >
> >       /* we can do it either atomically or asynchronously, not both */
> > -     BUG_ON(atomic && async);
> > +     BUG_ON(foll->atomic && (foll->flags & FOLL_NOWAIT));
> >
> > -     if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
> > +     if (hva_to_pfn_fast(foll, &pfn))
> >               return pfn;
> >
> > -     if (atomic)
> > +     if (foll->atomic)
> >               return KVM_PFN_ERR_FAULT;
> >
> > -     npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
> > -                              writable, &pfn);
> > +     npages = hva_to_pfn_slow(foll, &pfn);
> >       if (npages == 1)
> >               return pfn;
> >       if (npages == -EINTR)
> > @@ -2677,83 +2663,122 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
> >
> >       mmap_read_lock(current->mm);
> >       if (npages == -EHWPOISON ||
> > -           (!async && check_user_page_hwpoison(addr))) {
> > +           (!(foll->flags & FOLL_NOWAIT) && check_user_page_hwpoison(foll->hva))) {
> >               pfn = KVM_PFN_ERR_HWPOISON;
> >               goto exit;
> >       }
> >
> >  retry:
> > -     vma = vma_lookup(current->mm, addr);
> > +     vma = vma_lookup(current->mm, foll->hva);
> >
> >       if (vma == NULL)
> >               pfn = KVM_PFN_ERR_FAULT;
> >       else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
> > -             r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
> > +             r = hva_to_pfn_remapped(vma, foll, &pfn);
> >               if (r == -EAGAIN)
> >                       goto retry;
> >               if (r < 0)
> >                       pfn = KVM_PFN_ERR_FAULT;
> >       } else {
> > -             if (async && vma_is_valid(vma, write_fault))
> > -                     *async = true;
> > -             pfn = KVM_PFN_ERR_FAULT;
> > +             if ((foll->flags & FOLL_NOWAIT) &&
> > +                 vma_is_valid(vma, foll->flags & FOLL_WRITE))
> > +                     pfn = KVM_PFN_ERR_NEEDS_IO;
> > +             else
> > +                     pfn = KVM_PFN_ERR_FAULT;
> >       }
> >  exit:
> >       mmap_read_unlock(current->mm);
> >       return pfn;
> >  }
> >
> > -kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
> > -                            bool atomic, bool interruptible, bool *async,
> > -                            bool write_fault, bool *writable, hva_t *hva)
> > +kvm_pfn_t __kvm_follow_pfn(struct kvm_follow_pfn *foll)
> >  {
> > -     unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
> > -
> > -     if (hva)
> > -             *hva = addr;
> > +     foll->hva = __gfn_to_hva_many(foll->slot, foll->gfn, NULL,
> > +                                   foll->flags & FOLL_WRITE);
> >
> > -     if (addr == KVM_HVA_ERR_RO_BAD) {
> > -             if (writable)
> > -                     *writable = false;
> > +     if (foll->hva == KVM_HVA_ERR_RO_BAD)
> >               return KVM_PFN_ERR_RO_FAULT;
> > -     }
> >
>
> Can you explain why updating foll->writable = false (previously *writeable
> = false) is omitted here?
>
> In the caller where the struct kvm_follow_pfn is initialized, e.g.
> __gfn_to_pfn_memslot()/gfn_to_pfn_prot(), .writable is not initialized.
> IIUC, they expect __kvm_follow_pfn() to update it and return .writable to
> upper caller.
>
> As the one of the output, it would be better to initalize it either in the
> caller or update it in __kvm_follow_pfn(). Or
> __gfn_to_pfn_memslot()/gfn_to_pfn_prot() will return random data in the
> stack to the caller via bool *writable. It doesn't sound nice.

Entries omitted from an initializer are initialized to zero, so
.writable does get initialized in all of the patches in this series
via designated initializers. That said, you're right that explicitly
setting it to false is a good idea, in case someone someday adds a
caller that doesn't use an initializer when declaring its
kvm_follow_pfn.

-David
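
For reference, the C rule being relied on here (members omitted from a
brace or designated initializer are zero-initialized), applied to the
struct added by this patch; the second declaration shows the
hypothetical problem case:

	/* Designated initializer: every member not named is implicitly
	 * zero, so .writable starts out false and .hva starts out 0.
	 */
	struct kvm_follow_pfn foll = {
		.slot = slot,
		.gfn = gfn,
		.flags = FOLL_WRITE,
	};

	/* Plain declaration: automatic storage, members indeterminate.
	 * Reading foll2.writable before __kvm_follow_pfn() assigns it
	 * would be the "random stack data" case raised above.
	 */
	struct kvm_follow_pfn foll2;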
  
David Stevens July 5, 2023, 9:22 a.m. UTC | #4
On Wed, Jul 5, 2023 at 12:10 PM Yu Zhang <yu.c.zhang@linux.intel.com> wrote:
>
> > @@ -2514,35 +2512,26 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
> >   * The slow path to get the pfn of the specified host virtual address,
> >   * 1 indicates success, -errno is returned if error is detected.
> >   */
> > -static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> > -                        bool interruptible, bool *writable, kvm_pfn_t *pfn)
> > +static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
> >  {
> > -     unsigned int flags = FOLL_HWPOISON;
> > +     unsigned int flags = FOLL_HWPOISON | FOLL_GET | foll->flags;
> >       struct page *page;
> >       int npages;
> >
> >       might_sleep();
> >
> > -     if (writable)
> > -             *writable = write_fault;
> > -
> > -     if (write_fault)
> > -             flags |= FOLL_WRITE;
> > -     if (async)
> > -             flags |= FOLL_NOWAIT;
> > -     if (interruptible)
> > -             flags |= FOLL_INTERRUPTIBLE;
> > -
> > -     npages = get_user_pages_unlocked(addr, 1, &page, flags);
> > +     npages = get_user_pages_unlocked(foll->hva, 1, &page, flags);
> >       if (npages != 1)
> >               return npages;
> >
> > +     foll->writable = (foll->flags & FOLL_WRITE) && foll->allow_write_mapping;
> > +
> >       /* map read fault as writable if possible */
> > -     if (unlikely(!write_fault) && writable) {
> > +     if (unlikely(!foll->writable) && foll->allow_write_mapping) {
>
> I guess !foll->writable should be !(foll->flags & FOLL_WRITE) here.

The two statements are logically equivalent, although I guess using
!(foll->flags & FOLL_WRITE) may be a little clearer, if a little more
verbose.

> >               struct page *wpage;
> >
> > -             if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
> > -                     *writable = true;
> > +             if (get_user_page_fast_only(foll->hva, FOLL_WRITE, &wpage)) {
> > +                     foll->writable = true;
> >                       put_page(page);
> >                       page = wpage;
> >               }
> > @@ -2572,23 +2561,23 @@ static int kvm_try_get_pfn(kvm_pfn_t pfn)
> >       return get_page_unless_zero(page);
> >  }
> >
> ...
>
> > +kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
> > +                            bool atomic, bool interruptible, bool *async,
> > +                            bool write_fault, bool *writable, hva_t *hva)
> > +{
> > +     kvm_pfn_t pfn;
> > +     struct kvm_follow_pfn foll = {
> > +             .slot = slot,
> > +             .gfn = gfn,
> > +             .flags = 0,
> > +             .atomic = atomic,
> > +             .allow_write_mapping = !!writable,
> > +     };
> > +
> > +     if (write_fault)
> > +             foll.flags |= FOLL_WRITE;
> > +     if (async)
> > +             foll.flags |= FOLL_NOWAIT;
> > +     if (interruptible)
> > +             foll.flags |= FOLL_INTERRUPTIBLE;
> > +
> > +     pfn = __kvm_follow_pfn(&foll);
> > +     if (pfn == KVM_PFN_ERR_NEEDS_IO) {
>
> Could we just use KVM_PFN_ERR_FAULT and foll.flags here? I.e.,
>         if (pfn == KVM_PFN_ERR_FAULT && (foll.flags & FOLL_NOWAIT))?
> Setting pfn to KVM_PFN_ERR_NEEDS_IO just to indicate an async fault
> seems unnecessary.

There are cases where the fault does not fall within a vma, or where the
target vma's flags don't support the fault's access permissions. In those
cases, continuing to try to resolve the fault won't cause problems per se,
but it's wasteful and a bit confusing. Having hva_to_pfn detect whether it
may be possible to resolve the fault asynchronously, and return
KVM_PFN_ERR_NEEDS_IO if so, seems like a good idea. It also matches what
the existing code does.

-David
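
A side-by-side sketch of the two options being discussed, on the
__gfn_to_pfn_memslot() side (caller code only, based on the hunk quoted
above; the second variant is the suggested alternative, not part of the
posted patch):

	/* As posted: hva_to_pfn() returns KVM_PFN_ERR_NEEDS_IO only when
	 * a vma exists and permits the access, i.e. only when resolving
	 * the fault asynchronously can actually succeed.
	 */
	pfn = __kvm_follow_pfn(&foll);
	if (pfn == KVM_PFN_ERR_NEEDS_IO) {
		*async = true;
		pfn = KVM_PFN_ERR_FAULT;
	}

	/* Suggested alternative: reuse KVM_PFN_ERR_FAULT and key off the
	 * flags.  This would also report *async = true when there is no
	 * vma at all or the vma forbids the access, which is the
	 * wasteful case described above.
	 */
	pfn = __kvm_follow_pfn(&foll);
	if (pfn == KVM_PFN_ERR_FAULT && (foll.flags & FOLL_NOWAIT))
		*async = true;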
  
Yu Zhang July 5, 2023, 10:53 a.m. UTC | #5
On Wed, Jul 05, 2023 at 06:22:59PM +0900, David Stevens wrote:
> On Wed, Jul 5, 2023 at 12:10 PM Yu Zhang <yu.c.zhang@linux.intel.com> wrote:
> >
> > > @@ -2514,35 +2512,26 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
> > >   * The slow path to get the pfn of the specified host virtual address,
> > >   * 1 indicates success, -errno is returned if error is detected.
> > >   */
> > > -static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> > > -                        bool interruptible, bool *writable, kvm_pfn_t *pfn)
> > > +static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
> > >  {
> > > -     unsigned int flags = FOLL_HWPOISON;
> > > +     unsigned int flags = FOLL_HWPOISON | FOLL_GET | foll->flags;
> > >       struct page *page;
> > >       int npages;
> > >
> > >       might_sleep();
> > >
> > > -     if (writable)
> > > -             *writable = write_fault;
> > > -
> > > -     if (write_fault)
> > > -             flags |= FOLL_WRITE;
> > > -     if (async)
> > > -             flags |= FOLL_NOWAIT;
> > > -     if (interruptible)
> > > -             flags |= FOLL_INTERRUPTIBLE;
> > > -
> > > -     npages = get_user_pages_unlocked(addr, 1, &page, flags);
> > > +     npages = get_user_pages_unlocked(foll->hva, 1, &page, flags);
> > >       if (npages != 1)
> > >               return npages;
> > >
> > > +     foll->writable = (foll->flags & FOLL_WRITE) && foll->allow_write_mapping;
> > > +
> > >       /* map read fault as writable if possible */
> > > -     if (unlikely(!write_fault) && writable) {
> > > +     if (unlikely(!foll->writable) && foll->allow_write_mapping) {
> >
> > I guess !foll->writable should be !(foll->flags & FOLL_WRITE) here.
> 
> The two statements are logically equivalent, although I guess using
> !(foll->flags & FOLL_WRITE) may be a little clearer, if a little more
> verbose.

Well, as the comment says, we want to try to map the read fault as writable
whenever possible. And __gfn_to_pfn_memslot() will only set FOLL_WRITE
for write faults. So I guess using !foll->writable will not allow this.
Did I miss anything?

> > > +kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
> > > +                            bool atomic, bool interruptible, bool *async,
> > > +                            bool write_fault, bool *writable, hva_t *hva)
> > > +{
> > > +     kvm_pfn_t pfn;
> > > +     struct kvm_follow_pfn foll = {
> > > +             .slot = slot,
> > > +             .gfn = gfn,
> > > +             .flags = 0,
> > > +             .atomic = atomic,
> > > +             .allow_write_mapping = !!writable,
> > > +     };
> > > +
> > > +     if (write_fault)
> > > +             foll.flags |= FOLL_WRITE;
> > > +     if (async)
> > > +             foll.flags |= FOLL_NOWAIT;
> > > +     if (interruptible)
> > > +             foll.flags |= FOLL_INTERRUPTIBLE;
> > > +
> > > +     pfn = __kvm_follow_pfn(&foll);
> > > +     if (pfn == KVM_PFN_ERR_NEEDS_IO) {
> >
> > Could we just use KVM_PFN_ERR_FAULT and foll.flags here? I.e.,
> >         if (pfn == KVM_PFN_ERR_FAULT && (foll.flags & FOLL_NOWAIT))?
> > Setting pfn to KVM_PFN_ERR_NEEDS_IO just to indicate an async fault
> > seems unnecessary.
> 
> There are the cases where the fault does not fall within a vma or when
> the target vma's flags don't support the fault's access permissions.
> In those cases, continuing to try to resolve the fault won't cause
> problems per-se, but it's wasteful and a bit confusing. Having
> hva_to_pfn detect whether or not it may be possible to resolve the
> fault asynchronously and return KVM_PFN_ERR_NEEDS_IO if so seems like
> a good idea. It also matches what the existing code does.

Got it. Sounds reasonable. And thanks! :)

B.R.
Yu
  
Isaku Yamahata July 6, 2023, 1:34 a.m. UTC | #6
On Tue, Jul 04, 2023 at 04:50:47PM +0900,
David Stevens <stevensd@chromium.org> wrote:

> From: David Stevens <stevensd@chromium.org>
> 
> Introduce __kvm_follow_pfn, which will replace __gfn_to_pfn_memslot.
> __kvm_follow_pfn refactors the old API's arguments into a struct and,
> where possible, combines the boolean arguments into a single flags
> argument.
> 
> Signed-off-by: David Stevens <stevensd@chromium.org>
> ---
>  include/linux/kvm_host.h |  16 ++++
>  virt/kvm/kvm_main.c      | 171 ++++++++++++++++++++++-----------------
>  virt/kvm/kvm_mm.h        |   3 +-
>  virt/kvm/pfncache.c      |   8 +-
>  4 files changed, 122 insertions(+), 76 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 9d3ac7720da9..ef2763c2b12e 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -97,6 +97,7 @@
>  #define KVM_PFN_ERR_HWPOISON	(KVM_PFN_ERR_MASK + 1)
>  #define KVM_PFN_ERR_RO_FAULT	(KVM_PFN_ERR_MASK + 2)
>  #define KVM_PFN_ERR_SIGPENDING	(KVM_PFN_ERR_MASK + 3)
> +#define KVM_PFN_ERR_NEEDS_IO	(KVM_PFN_ERR_MASK + 4)
>  
>  /*
>   * error pfns indicate that the gfn is in slot but faild to
> @@ -1156,6 +1157,21 @@ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn,
>  void kvm_release_page_clean(struct page *page);
>  void kvm_release_page_dirty(struct page *page);
>  
> +struct kvm_follow_pfn {
> +	const struct kvm_memory_slot *slot;
> +	gfn_t gfn;
> +	unsigned int flags;
> +	bool atomic;
> +	/* Allow a read fault to create a writeable mapping. */
> +	bool allow_write_mapping;

Maybe, make them const for input arguments?


> +
> +	/* Outputs of __kvm_follow_pfn */
> +	hva_t hva;
> +	bool writable;
> +};
> +
> +kvm_pfn_t __kvm_follow_pfn(struct kvm_follow_pfn *foll);
> +
>  kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
>  kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
>  		      bool *writable);
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 371bd783ff2b..b13f22861d2f 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2486,24 +2486,22 @@ static inline int check_user_page_hwpoison(unsigned long addr)
>   * true indicates success, otherwise false is returned.  It's also the
>   * only part that runs if we can in atomic context.
>   */
> -static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
> -			    bool *writable, kvm_pfn_t *pfn)
> +static bool hva_to_pfn_fast(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
>  {
>  	struct page *page[1];
> +	bool write_fault = foll->flags & FOLL_WRITE;
>  
>  	/*
>  	 * Fast pin a writable pfn only if it is a write fault request
>  	 * or the caller allows to map a writable pfn for a read fault
>  	 * request.
>  	 */
> -	if (!(write_fault || writable))
> +	if (!(write_fault || foll->allow_write_mapping))
>  		return false;
>  
> -	if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
> +	if (get_user_page_fast_only(foll->hva, FOLL_WRITE, page)) {
>  		*pfn = page_to_pfn(page[0]);
> -
> -		if (writable)
> -			*writable = true;
> +		foll->writable = foll->allow_write_mapping;
>  		return true;
>  	}
>  
> @@ -2514,35 +2512,26 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
>   * The slow path to get the pfn of the specified host virtual address,
>   * 1 indicates success, -errno is returned if error is detected.
>   */
> -static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> -			   bool interruptible, bool *writable, kvm_pfn_t *pfn)
> +static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
>  {
> -	unsigned int flags = FOLL_HWPOISON;
> +	unsigned int flags = FOLL_HWPOISON | FOLL_GET | foll->flags;

Although adding FOLL_GET doesn't affect the behavior of
get_user_pages_unlocked(), I wondered how this affects the next change.
It's better to mention it in the commit message:
get_user_pages_*() called by hva_to_pfn_{fast, slow} imply FOLL_GET,
but __kvm_follow_pfn() doesn't imply FOLL_GET.


>  	struct page *page;
>  	int npages;
>  
>  	might_sleep();
>  
> -	if (writable)
> -		*writable = write_fault;
> -
> -	if (write_fault)
> -		flags |= FOLL_WRITE;
> -	if (async)
> -		flags |= FOLL_NOWAIT;
> -	if (interruptible)
> -		flags |= FOLL_INTERRUPTIBLE;
> -
> -	npages = get_user_pages_unlocked(addr, 1, &page, flags);
> +	npages = get_user_pages_unlocked(foll->hva, 1, &page, flags);
>  	if (npages != 1)
>  		return npages;
>  
> +	foll->writable = (foll->flags & FOLL_WRITE) && foll->allow_write_mapping;
> +
>  	/* map read fault as writable if possible */
> -	if (unlikely(!write_fault) && writable) {
> +	if (unlikely(!foll->writable) && foll->allow_write_mapping) {
>  		struct page *wpage;
>  
> -		if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
> -			*writable = true;
> +		if (get_user_page_fast_only(foll->hva, FOLL_WRITE, &wpage)) {
> +			foll->writable = true;
>  			put_page(page);
>  			page = wpage;
>  		}
> @@ -2572,23 +2561,23 @@ static int kvm_try_get_pfn(kvm_pfn_t pfn)
>  	return get_page_unless_zero(page);
>  }
>  
> -static int hva_to_pfn_remapped(struct vm_area_struct *vma,
> -			       unsigned long addr, bool write_fault,
> -			       bool *writable, kvm_pfn_t *p_pfn)
> +static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn *foll,
> +			       kvm_pfn_t *p_pfn)
>  {
>  	kvm_pfn_t pfn;
>  	pte_t *ptep;
>  	spinlock_t *ptl;
> +	bool write_fault = foll->flags & FOLL_WRITE;
>  	int r;
>  
> -	r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
> +	r = follow_pte(vma->vm_mm, foll->hva, &ptep, &ptl);
>  	if (r) {
>  		/*
>  		 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
>  		 * not call the fault handler, so do it here.
>  		 */
>  		bool unlocked = false;
> -		r = fixup_user_fault(current->mm, addr,
> +		r = fixup_user_fault(current->mm, foll->hva,
>  				     (write_fault ? FAULT_FLAG_WRITE : 0),
>  				     &unlocked);
>  		if (unlocked)
> @@ -2596,7 +2585,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
>  		if (r)
>  			return r;
>  
> -		r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
> +		r = follow_pte(vma->vm_mm, foll->hva, &ptep, &ptl);
>  		if (r)
>  			return r;
>  	}
> @@ -2606,8 +2595,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
>  		goto out;
>  	}
>  
> -	if (writable)
> -		*writable = pte_write(*ptep);
> +	foll->writable = pte_write(*ptep) && foll->allow_write_mapping;
>  	pfn = pte_pfn(*ptep);
>  
>  	/*
> @@ -2652,24 +2640,22 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
>   * 2): @write_fault = false && @writable, @writable will tell the caller
>   *     whether the mapping is writable.
>   */
> -kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
> -		     bool *async, bool write_fault, bool *writable)
> +kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *foll)
>  {
>  	struct vm_area_struct *vma;
>  	kvm_pfn_t pfn;
>  	int npages, r;
>  
>  	/* we can do it either atomically or asynchronously, not both */
> -	BUG_ON(atomic && async);
> +	BUG_ON(foll->atomic && (foll->flags & FOLL_NOWAIT));
>  
> -	if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
> +	if (hva_to_pfn_fast(foll, &pfn))
>  		return pfn;
>  
> -	if (atomic)
> +	if (foll->atomic)
>  		return KVM_PFN_ERR_FAULT;
>  
> -	npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
> -				 writable, &pfn);
> +	npages = hva_to_pfn_slow(foll, &pfn);
>  	if (npages == 1)
>  		return pfn;
>  	if (npages == -EINTR)
> @@ -2677,83 +2663,122 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
>  
>  	mmap_read_lock(current->mm);
>  	if (npages == -EHWPOISON ||
> -	      (!async && check_user_page_hwpoison(addr))) {
> +	      (!(foll->flags & FOLL_NOWAIT) && check_user_page_hwpoison(foll->hva))) {
>  		pfn = KVM_PFN_ERR_HWPOISON;
>  		goto exit;
>  	}
>  
>  retry:
> -	vma = vma_lookup(current->mm, addr);
> +	vma = vma_lookup(current->mm, foll->hva);
>  
>  	if (vma == NULL)
>  		pfn = KVM_PFN_ERR_FAULT;
>  	else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
> -		r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
> +		r = hva_to_pfn_remapped(vma, foll, &pfn);
>  		if (r == -EAGAIN)
>  			goto retry;
>  		if (r < 0)
>  			pfn = KVM_PFN_ERR_FAULT;
>  	} else {
> -		if (async && vma_is_valid(vma, write_fault))
> -			*async = true;
> -		pfn = KVM_PFN_ERR_FAULT;
> +		if ((foll->flags & FOLL_NOWAIT) &&
> +		    vma_is_valid(vma, foll->flags & FOLL_WRITE))
> +			pfn = KVM_PFN_ERR_NEEDS_IO;
> +		else
> +			pfn = KVM_PFN_ERR_FAULT;
>  	}
>  exit:
>  	mmap_read_unlock(current->mm);
>  	return pfn;
>  }
>  
> -kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
> -			       bool atomic, bool interruptible, bool *async,
> -			       bool write_fault, bool *writable, hva_t *hva)
> +kvm_pfn_t __kvm_follow_pfn(struct kvm_follow_pfn *foll)
>  {
> -	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
> -
> -	if (hva)
> -		*hva = addr;
> +	foll->hva = __gfn_to_hva_many(foll->slot, foll->gfn, NULL,
> +				      foll->flags & FOLL_WRITE);
>  
> -	if (addr == KVM_HVA_ERR_RO_BAD) {
> -		if (writable)
> -			*writable = false;
> +	if (foll->hva == KVM_HVA_ERR_RO_BAD)
>  		return KVM_PFN_ERR_RO_FAULT;
> -	}
>  
> -	if (kvm_is_error_hva(addr)) {
> -		if (writable)
> -			*writable = false;
> +	if (kvm_is_error_hva(foll->hva))
>  		return KVM_PFN_NOSLOT;
> -	}
>  
> -	/* Do not map writable pfn in the readonly memslot. */
> -	if (writable && memslot_is_readonly(slot)) {
> -		*writable = false;
> -		writable = NULL;
> -	}
> +	if (memslot_is_readonly(foll->slot))
> +		foll->allow_write_mapping = false;
> +
> +	return hva_to_pfn(foll);
> +}
> +EXPORT_SYMBOL_GPL(__kvm_follow_pfn);
>  
> -	return hva_to_pfn(addr, atomic, interruptible, async, write_fault,
> -			  writable);
> +kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
> +			       bool atomic, bool interruptible, bool *async,
> +			       bool write_fault, bool *writable, hva_t *hva)
> +{
> +	kvm_pfn_t pfn;
> +	struct kvm_follow_pfn foll = {
> +		.slot = slot,
> +		.gfn = gfn,
> +		.flags = 0,
> +		.atomic = atomic,
> +		.allow_write_mapping = !!writable,
> +	};
> +
> +	if (write_fault)
> +		foll.flags |= FOLL_WRITE;
> +	if (async)
> +		foll.flags |= FOLL_NOWAIT;
> +	if (interruptible)
> +		foll.flags |= FOLL_INTERRUPTIBLE;
> +
> +	pfn = __kvm_follow_pfn(&foll);
> +	if (pfn == KVM_PFN_ERR_NEEDS_IO) {
> +		*async = true;
> +		pfn = KVM_PFN_ERR_FAULT;
> +	}
> +	if (hva)
> +		*hva = foll.hva;
> +	if (writable)
> +		*writable = foll.writable;
> +	return pfn;
>  }
>  EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
>  
>  kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
>  		      bool *writable)
>  {
> -	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
> -				    NULL, write_fault, writable, NULL);
> +	kvm_pfn_t pfn;
> +	struct kvm_follow_pfn foll = {
> +		.slot = gfn_to_memslot(kvm, gfn),
> +		.gfn = gfn,
> +		.flags = write_fault ? FOLL_WRITE : 0,
> +		.allow_write_mapping = !!writable,
> +	};
> +	pfn = __kvm_follow_pfn(&foll);
> +	if (writable)
> +		*writable = foll.writable;
> +	return pfn;
>  }
>  EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
>  
>  kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
>  {
> -	return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true,
> -				    NULL, NULL);
> +	struct kvm_follow_pfn foll = {
> +		.slot = slot,
> +		.gfn = gfn,
> +		.flags = FOLL_WRITE,
> +	};
> +	return __kvm_follow_pfn(&foll);
>  }
>  EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
>  
>  kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
>  {
> -	return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true,
> -				    NULL, NULL);
> +	struct kvm_follow_pfn foll = {
> +		.slot = slot,
> +		.gfn = gfn,
> +		.flags = FOLL_WRITE,
> +		.atomic = true,
> +	};
> +	return __kvm_follow_pfn(&foll);
>  }
>  EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
>  
> diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
> index 180f1a09e6ba..ed896aee5396 100644
> --- a/virt/kvm/kvm_mm.h
> +++ b/virt/kvm/kvm_mm.h
> @@ -20,8 +20,7 @@
>  #define KVM_MMU_UNLOCK(kvm)		spin_unlock(&(kvm)->mmu_lock)
>  #endif /* KVM_HAVE_MMU_RWLOCK */
>  
> -kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
> -		     bool *async, bool write_fault, bool *writable);
> +kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *foll);
>  
>  #ifdef CONFIG_HAVE_KVM_PFNCACHE
>  void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
> diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
> index 2d6aba677830..e3fefa753a51 100644
> --- a/virt/kvm/pfncache.c
> +++ b/virt/kvm/pfncache.c
> @@ -144,6 +144,12 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
>  	kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
>  	void *new_khva = NULL;
>  	unsigned long mmu_seq;
> +	struct kvm_follow_pfn foll = {
> +		.slot = gpc->memslot,
> +		.gfn = gpa_to_gfn(gpc->gpa),
> +		.flags = FOLL_WRITE,
> +		.hva = gpc->uhva,
> +	};
>  
>  	lockdep_assert_held(&gpc->refresh_lock);
>  
> @@ -183,7 +189,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
>  		}
>  
>  		/* We always request a writeable mapping */
> -		new_pfn = hva_to_pfn(gpc->uhva, false, false, NULL, true, NULL);
> +		new_pfn = hva_to_pfn(&foll);
>  		if (is_error_noslot_pfn(new_pfn))
>  			goto out_error;
>  
> -- 
> 2.41.0.255.g8b1d071c50-goog
>
  
David Stevens July 6, 2023, 5:29 a.m. UTC | #7
On Wed, Jul 5, 2023 at 7:53 PM Yu Zhang <yu.c.zhang@linux.intel.com> wrote:
>
> On Wed, Jul 05, 2023 at 06:22:59PM +0900, David Stevens wrote:
> > On Wed, Jul 5, 2023 at 12:10 PM Yu Zhang <yu.c.zhang@linux.intel.com> wrote:
> > >
> > > > @@ -2514,35 +2512,26 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
> > > >   * The slow path to get the pfn of the specified host virtual address,
> > > >   * 1 indicates success, -errno is returned if error is detected.
> > > >   */
> > > > -static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> > > > -                        bool interruptible, bool *writable, kvm_pfn_t *pfn)
> > > > +static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
> > > >  {
> > > > -     unsigned int flags = FOLL_HWPOISON;
> > > > +     unsigned int flags = FOLL_HWPOISON | FOLL_GET | foll->flags;
> > > >       struct page *page;
> > > >       int npages;
> > > >
> > > >       might_sleep();
> > > >
> > > > -     if (writable)
> > > > -             *writable = write_fault;
> > > > -
> > > > -     if (write_fault)
> > > > -             flags |= FOLL_WRITE;
> > > > -     if (async)
> > > > -             flags |= FOLL_NOWAIT;
> > > > -     if (interruptible)
> > > > -             flags |= FOLL_INTERRUPTIBLE;
> > > > -
> > > > -     npages = get_user_pages_unlocked(addr, 1, &page, flags);
> > > > +     npages = get_user_pages_unlocked(foll->hva, 1, &page, flags);
> > > >       if (npages != 1)
> > > >               return npages;
> > > >
> > > > +     foll->writable = (foll->flags & FOLL_WRITE) && foll->allow_write_mapping;
> > > > +
> > > >       /* map read fault as writable if possible */
> > > > -     if (unlikely(!write_fault) && writable) {
> > > > +     if (unlikely(!foll->writable) && foll->allow_write_mapping) {
> > >
> > > I guess !foll->writable should be !(foll->flags & FOLL_WRITE) here.
> >
> > The two statements are logically equivalent, although I guess using
> > !(foll->flags & FOLL_WRITE) may be a little clearer, if a little more
> > verbose.
>
> Well, as the comment says, we wanna try to map the read fault as writable
> whenever possible. And __gfn_to_pfn_memslot() will only set the FOLL_WRITE
> for write faults. So I guess using !foll->writable will not allow this.
> Did I miss anything?

We just set the foll->writable out parameter to be equal to
((foll->flags & FOLL_WRITE) && foll->allow_write_mapping). Taking a =
foll->flags & FOLL_WRITE and b = foll->allow_write_mapping, we have
!(a && b) && b -> (!a || !b) && b -> (!a && b) || (!b && b) -> !a &&
b.

-David
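
Spelled out exhaustively, with a = (foll->flags & FOLL_WRITE) and
b = foll->allow_write_mapping:

	 a  b | writable = a && b | !writable && b | !a && b
	 0  0 |         0         |        0       |    0
	 0  1 |         0         |        1       |    1
	 1  0 |         0         |        0       |    0
	 1  1 |         1         |        0       |    0

so the two checks select exactly the same case: a read fault
(FOLL_WRITE clear) from a caller that allows a writable mapping.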
  
David Stevens July 6, 2023, 5:52 a.m. UTC | #8
On Thu, Jul 6, 2023 at 10:34 AM Isaku Yamahata <isaku.yamahata@gmail.com> wrote:
>
> On Tue, Jul 04, 2023 at 04:50:47PM +0900,
> David Stevens <stevensd@chromium.org> wrote:
>
> > From: David Stevens <stevensd@chromium.org>
> >
> > Introduce __kvm_follow_pfn, which will replace __gfn_to_pfn_memslot.
> > __kvm_follow_pfn refactors the old API's arguments into a struct and,
> > where possible, combines the boolean arguments into a single flags
> > argument.
> >
> > Signed-off-by: David Stevens <stevensd@chromium.org>
> > ---
> >  include/linux/kvm_host.h |  16 ++++
> >  virt/kvm/kvm_main.c      | 171 ++++++++++++++++++++++-----------------
> >  virt/kvm/kvm_mm.h        |   3 +-
> >  virt/kvm/pfncache.c      |   8 +-
> >  4 files changed, 122 insertions(+), 76 deletions(-)
> >
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 9d3ac7720da9..ef2763c2b12e 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -97,6 +97,7 @@
> >  #define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1)
> >  #define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 2)
> >  #define KVM_PFN_ERR_SIGPENDING       (KVM_PFN_ERR_MASK + 3)
> > +#define KVM_PFN_ERR_NEEDS_IO (KVM_PFN_ERR_MASK + 4)
> >
> >  /*
> >   * error pfns indicate that the gfn is in slot but faild to
> > @@ -1156,6 +1157,21 @@ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn,
> >  void kvm_release_page_clean(struct page *page);
> >  void kvm_release_page_dirty(struct page *page);
> >
> > +struct kvm_follow_pfn {
> > +     const struct kvm_memory_slot *slot;
> > +     gfn_t gfn;
> > +     unsigned int flags;
> > +     bool atomic;
> > +     /* Allow a read fault to create a writeable mapping. */
> > +     bool allow_write_mapping;
>
> Maybe, make them const for input arguments?

Unfortunately using const isn't straightforward as long as the kernel
continues to use -Wdeclaration-after-statement. If these fields were
const, then they would need to be specified in the initializer when
declaring the variable, but that's not necessarily always possible.

-David
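
A sketch of the conflict being described, using the caller added in this
patch (the const qualifiers here are hypothetical, not in the posted
code):

	/* Hypothetical: make the input members const. */
	struct kvm_follow_pfn {
		const struct kvm_memory_slot *const slot;
		const gfn_t gfn;
		const unsigned int flags;
		/* ... */
	};

	/* Then the pattern __gfn_to_pfn_memslot() uses in this patch no
	 * longer compiles, because .flags can only be set in the
	 * initializer:
	 */
	struct kvm_follow_pfn foll = {
		.slot = slot,
		.gfn = gfn,
		.flags = 0,
	};

	if (write_fault)
		foll.flags |= FOLL_WRITE;	/* error: read-only member */

	/* And with -Wdeclaration-after-statement the declaration cannot
	 * simply be moved below the flag computation, so the flags would
	 * have to be folded into the initializer expression itself, which
	 * is not always practical.
	 */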
  
Yu Zhang July 6, 2023, 2:52 p.m. UTC | #9
On Thu, Jul 06, 2023 at 02:29:24PM +0900, David Stevens wrote:
> On Wed, Jul 5, 2023 at 7:53 PM Yu Zhang <yu.c.zhang@linux.intel.com> wrote:
> >
> > On Wed, Jul 05, 2023 at 06:22:59PM +0900, David Stevens wrote:
> > > On Wed, Jul 5, 2023 at 12:10 PM Yu Zhang <yu.c.zhang@linux.intel.com> wrote:
> > > >
> > > > > @@ -2514,35 +2512,26 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
> > > > >   * The slow path to get the pfn of the specified host virtual address,
> > > > >   * 1 indicates success, -errno is returned if error is detected.
> > > > >   */
> > > > > -static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> > > > > -                        bool interruptible, bool *writable, kvm_pfn_t *pfn)
> > > > > +static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
> > > > >  {
> > > > > -     unsigned int flags = FOLL_HWPOISON;
> > > > > +     unsigned int flags = FOLL_HWPOISON | FOLL_GET | foll->flags;
> > > > >       struct page *page;
> > > > >       int npages;
> > > > >
> > > > >       might_sleep();
> > > > >
> > > > > -     if (writable)
> > > > > -             *writable = write_fault;
> > > > > -
> > > > > -     if (write_fault)
> > > > > -             flags |= FOLL_WRITE;
> > > > > -     if (async)
> > > > > -             flags |= FOLL_NOWAIT;
> > > > > -     if (interruptible)
> > > > > -             flags |= FOLL_INTERRUPTIBLE;
> > > > > -
> > > > > -     npages = get_user_pages_unlocked(addr, 1, &page, flags);
> > > > > +     npages = get_user_pages_unlocked(foll->hva, 1, &page, flags);
> > > > >       if (npages != 1)
> > > > >               return npages;
> > > > >
> > > > > +     foll->writable = (foll->flags & FOLL_WRITE) && foll->allow_write_mapping;
> > > > > +
> > > > >       /* map read fault as writable if possible */
> > > > > -     if (unlikely(!write_fault) && writable) {
> > > > > +     if (unlikely(!foll->writable) && foll->allow_write_mapping) {
> > > >
> > > > I guess !foll->writable should be !(foll->flags & FOLL_WRITE) here.
> > >
> > > The two statements are logically equivalent, although I guess using
> > > !(foll->flags & FOLL_WRITE) may be a little clearer, if a little more
> > > verbose.
> >
> > Well, as the comment says, we wanna try to map the read fault as writable
> > whenever possible. And __gfn_to_pfn_memslot() will only set the FOLL_WRITE
> > for write faults. So I guess using !foll->writable will not allow this.
> > Did I miss anything?
> 
> We just set the foll->writable out parameter to be equal to
> ((foll->flags & FOLL_WRITE) && foll->allow_write_mapping). Taking a =
> foll->flags & FOLL_WRITE and b = foll->allow_write_mapping, we have
> !(a && b) && b -> (!a || !b) && b -> (!a && b) || (!b && b) -> !a &&
> b.

Ouch, my bad again... I typed "!foll->writable", but missed the "!" in
my head while calculating... Thanks! :)

B.R.
Yu
  
Zhi Wang July 11, 2023, 5:37 p.m. UTC | #10
On Wed, 5 Jul 2023 18:08:17 +0900
David Stevens <stevensd@chromium.org> wrote:

> On Wed, Jul 5, 2023 at 5:47 PM Zhi Wang <zhi.wang.linux@gmail.com> wrote:
> >
> > On Tue,  4 Jul 2023 16:50:47 +0900
> > David Stevens <stevensd@chromium.org> wrote:
> >  
> > > From: David Stevens <stevensd@chromium.org>
> > >
> > > Introduce __kvm_follow_pfn, which will replace __gfn_to_pfn_memslot.
> > > __kvm_follow_pfn refactors the old API's arguments into a struct and,
> > > where possible, combines the boolean arguments into a single flags
> > > argument.
> > >
> > > Signed-off-by: David Stevens <stevensd@chromium.org>
> > > ---
> > >  include/linux/kvm_host.h |  16 ++++
> > >  virt/kvm/kvm_main.c      | 171 ++++++++++++++++++++++-----------------
> > >  virt/kvm/kvm_mm.h        |   3 +-
> > >  virt/kvm/pfncache.c      |   8 +-
> > >  4 files changed, 122 insertions(+), 76 deletions(-)
> > >
> > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > index 9d3ac7720da9..ef2763c2b12e 100644
> > > --- a/include/linux/kvm_host.h
> > > +++ b/include/linux/kvm_host.h
> > > @@ -97,6 +97,7 @@
> > >  #define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1)
> > >  #define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 2)
> > >  #define KVM_PFN_ERR_SIGPENDING       (KVM_PFN_ERR_MASK + 3)
> > > +#define KVM_PFN_ERR_NEEDS_IO (KVM_PFN_ERR_MASK + 4)
> > >
> > >  /*
> > >   * error pfns indicate that the gfn is in slot but faild to
> > > @@ -1156,6 +1157,21 @@ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn,
> > >  void kvm_release_page_clean(struct page *page);
> > >  void kvm_release_page_dirty(struct page *page);
> > >
> > > +struct kvm_follow_pfn {
> > > +     const struct kvm_memory_slot *slot;
> > > +     gfn_t gfn;
> > > +     unsigned int flags;
> > > +     bool atomic;
> > > +     /* Allow a read fault to create a writeable mapping. */
> > > +     bool allow_write_mapping;
> > > +
> > > +     /* Outputs of __kvm_follow_pfn */
> > > +     hva_t hva;
> > > +     bool writable;
> > > +};
> > > +
> > > +kvm_pfn_t __kvm_follow_pfn(struct kvm_follow_pfn *foll);
> > > +
> > >  kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
> > >  kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
> > >                     bool *writable);
> > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > index 371bd783ff2b..b13f22861d2f 100644
> > > --- a/virt/kvm/kvm_main.c
> > > +++ b/virt/kvm/kvm_main.c
> > > @@ -2486,24 +2486,22 @@ static inline int check_user_page_hwpoison(unsigned long addr)
> > >   * true indicates success, otherwise false is returned.  It's also the
> > >   * only part that runs if we can in atomic context.
> > >   */
> > > -static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
> > > -                         bool *writable, kvm_pfn_t *pfn)
> > > +static bool hva_to_pfn_fast(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
> > >  {
> > >       struct page *page[1];
> > > +     bool write_fault = foll->flags & FOLL_WRITE;
> > >
> > >       /*
> > >        * Fast pin a writable pfn only if it is a write fault request
> > >        * or the caller allows to map a writable pfn for a read fault
> > >        * request.
> > >        */
> > > -     if (!(write_fault || writable))
> > > +     if (!(write_fault || foll->allow_write_mapping))
> > >               return false;
> > >
> > > -     if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
> > > +     if (get_user_page_fast_only(foll->hva, FOLL_WRITE, page)) {
> > >               *pfn = page_to_pfn(page[0]);
> > > -
> > > -             if (writable)
> > > -                     *writable = true;
> > > +             foll->writable = foll->allow_write_mapping;
> > >               return true;
> > >       }
> > >
> > > @@ -2514,35 +2512,26 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
> > >   * The slow path to get the pfn of the specified host virtual address,
> > >   * 1 indicates success, -errno is returned if error is detected.
> > >   */
> > > -static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> > > -                        bool interruptible, bool *writable, kvm_pfn_t *pfn)
> > > +static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
> > >  {
> > > -     unsigned int flags = FOLL_HWPOISON;
> > > +     unsigned int flags = FOLL_HWPOISON | FOLL_GET | foll->flags;
> > >       struct page *page;
> > >       int npages;
> > >
> > >       might_sleep();
> > >
> > > -     if (writable)
> > > -             *writable = write_fault;
> > > -
> > > -     if (write_fault)
> > > -             flags |= FOLL_WRITE;
> > > -     if (async)
> > > -             flags |= FOLL_NOWAIT;
> > > -     if (interruptible)
> > > -             flags |= FOLL_INTERRUPTIBLE;
> > > -
> > > -     npages = get_user_pages_unlocked(addr, 1, &page, flags);
> > > +     npages = get_user_pages_unlocked(foll->hva, 1, &page, flags);
> > >       if (npages != 1)
> > >               return npages;
> > >
> > > +     foll->writable = (foll->flags & FOLL_WRITE) && foll->allow_write_mapping;
> > > +
> > >       /* map read fault as writable if possible */
> > > -     if (unlikely(!write_fault) && writable) {
> > > +     if (unlikely(!foll->writable) && foll->allow_write_mapping) {
> > >               struct page *wpage;
> > >
> > > -             if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
> > > -                     *writable = true;
> > > +             if (get_user_page_fast_only(foll->hva, FOLL_WRITE, &wpage)) {
> > > +                     foll->writable = true;
> > >                       put_page(page);
> > >                       page = wpage;
> > >               }
> > > @@ -2572,23 +2561,23 @@ static int kvm_try_get_pfn(kvm_pfn_t pfn)
> > >       return get_page_unless_zero(page);
> > >  }
> > >
> > > -static int hva_to_pfn_remapped(struct vm_area_struct *vma,
> > > -                            unsigned long addr, bool write_fault,
> > > -                            bool *writable, kvm_pfn_t *p_pfn)
> > > +static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn *foll,
> > > +                            kvm_pfn_t *p_pfn)
> > >  {
> > >       kvm_pfn_t pfn;
> > >       pte_t *ptep;
> > >       spinlock_t *ptl;
> > > +     bool write_fault = foll->flags & FOLL_WRITE;
> > >       int r;
> > >
> > > -     r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
> > > +     r = follow_pte(vma->vm_mm, foll->hva, &ptep, &ptl);
> > >       if (r) {
> > >               /*
> > >                * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
> > >                * not call the fault handler, so do it here.
> > >                */
> > >               bool unlocked = false;
> > > -             r = fixup_user_fault(current->mm, addr,
> > > +             r = fixup_user_fault(current->mm, foll->hva,
> > >                                    (write_fault ? FAULT_FLAG_WRITE : 0),
> > >                                    &unlocked);
> > >               if (unlocked)
> > > @@ -2596,7 +2585,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
> > >               if (r)
> > >                       return r;
> > >
> > > -             r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
> > > +             r = follow_pte(vma->vm_mm, foll->hva, &ptep, &ptl);
> > >               if (r)
> > >                       return r;
> > >       }
> > > @@ -2606,8 +2595,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
> > >               goto out;
> > >       }
> > >
> > > -     if (writable)
> > > -             *writable = pte_write(*ptep);
> > > +     foll->writable = pte_write(*ptep) && foll->allow_write_mapping;
> > >       pfn = pte_pfn(*ptep);
> > >
> > >       /*
> > > @@ -2652,24 +2640,22 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
> > >   * 2): @write_fault = false && @writable, @writable will tell the caller
> > >   *     whether the mapping is writable.
> > >   */
> > > -kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
> > > -                  bool *async, bool write_fault, bool *writable)
> > > +kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *foll)
> > >  {
> > >       struct vm_area_struct *vma;
> > >       kvm_pfn_t pfn;
> > >       int npages, r;
> > >
> > >       /* we can do it either atomically or asynchronously, not both */
> > > -     BUG_ON(atomic && async);
> > > +     BUG_ON(foll->atomic && (foll->flags & FOLL_NOWAIT));
> > >
> > > -     if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
> > > +     if (hva_to_pfn_fast(foll, &pfn))
> > >               return pfn;
> > >
> > > -     if (atomic)
> > > +     if (foll->atomic)
> > >               return KVM_PFN_ERR_FAULT;
> > >
> > > -     npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
> > > -                              writable, &pfn);
> > > +     npages = hva_to_pfn_slow(foll, &pfn);
> > >       if (npages == 1)
> > >               return pfn;
> > >       if (npages == -EINTR)
> > > @@ -2677,83 +2663,122 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
> > >
> > >       mmap_read_lock(current->mm);
> > >       if (npages == -EHWPOISON ||
> > > -           (!async && check_user_page_hwpoison(addr))) {
> > > +           (!(foll->flags & FOLL_NOWAIT) && check_user_page_hwpoison(foll->hva))) {
> > >               pfn = KVM_PFN_ERR_HWPOISON;
> > >               goto exit;
> > >       }
> > >
> > >  retry:
> > > -     vma = vma_lookup(current->mm, addr);
> > > +     vma = vma_lookup(current->mm, foll->hva);
> > >
> > >       if (vma == NULL)
> > >               pfn = KVM_PFN_ERR_FAULT;
> > >       else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
> > > -             r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
> > > +             r = hva_to_pfn_remapped(vma, foll, &pfn);
> > >               if (r == -EAGAIN)
> > >                       goto retry;
> > >               if (r < 0)
> > >                       pfn = KVM_PFN_ERR_FAULT;
> > >       } else {
> > > -             if (async && vma_is_valid(vma, write_fault))
> > > -                     *async = true;
> > > -             pfn = KVM_PFN_ERR_FAULT;
> > > +             if ((foll->flags & FOLL_NOWAIT) &&
> > > +                 vma_is_valid(vma, foll->flags & FOLL_WRITE))
> > > +                     pfn = KVM_PFN_ERR_NEEDS_IO;
> > > +             else
> > > +                     pfn = KVM_PFN_ERR_FAULT;
> > >       }
> > >  exit:
> > >       mmap_read_unlock(current->mm);
> > >       return pfn;
> > >  }
> > >
> > > -kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
> > > -                            bool atomic, bool interruptible, bool *async,
> > > -                            bool write_fault, bool *writable, hva_t *hva)
> > > +kvm_pfn_t __kvm_follow_pfn(struct kvm_follow_pfn *foll)
> > >  {
> > > -     unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
> > > -
> > > -     if (hva)
> > > -             *hva = addr;
> > > +     foll->hva = __gfn_to_hva_many(foll->slot, foll->gfn, NULL,
> > > +                                   foll->flags & FOLL_WRITE);
> > >
> > > -     if (addr == KVM_HVA_ERR_RO_BAD) {
> > > -             if (writable)
> > > -                     *writable = false;
> > > +     if (foll->hva == KVM_HVA_ERR_RO_BAD)
> > >               return KVM_PFN_ERR_RO_FAULT;
> > > -     }
> > >  
> >
> > Can you explain why updating foll->writable = false (previously *writable
> > = false) is omitted here?
> >
> > In the caller where the struct kvm_follow_pfn is initialized, e.g.
> > __gfn_to_pfn_memslot()/gfn_to_pfn_prot(), .writable is not initialized.
> > IIUC, they expect __kvm_follow_pfn() to update it and return .writable to
> > the upper caller.
> >
> > As one of the outputs, it would be better to initialize it either in the
> > caller or update it in __kvm_follow_pfn(). Otherwise
> > __gfn_to_pfn_memslot()/gfn_to_pfn_prot() will return random data from the
> > stack to the caller via bool *writable. It doesn't sound nice.
> 
> Entries omitted from an initializer are initialized to zero, so
> .writable does get initialized in all of the patches in this series
> via designated initializers. That said, you're right that explicitly
> setting it to false is a good idea, in case someday someone adds a
> caller that doesn't use an initializer when declaring its
> kvm_follow_pfn.
>

Nice trick and nice to know that. :) Agreed on improving readability and
avoiding that risk for future callers.
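
For reference, a minimal sketch of the zero-initialization being relied on
(placeholder slot/gfn values, standard C designated-initializer semantics):

	struct kvm_follow_pfn foll = {
		.slot = slot,
		.gfn = gfn,
		.flags = FOLL_WRITE,
		/*
		 * .hva and .writable are not named here, so C guarantees they
		 * are zero-initialized (0 / false) rather than left as stack
		 * garbage -- the same pattern every caller in this series uses.
		 */
	};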

> -David
  
Sean Christopherson Aug. 4, 2023, 10:03 p.m. UTC | #11
On Thu, Jul 06, 2023, Yu Zhang wrote:
> On Thu, Jul 06, 2023 at 02:29:24PM +0900, David Stevens wrote:
> > On Wed, Jul 5, 2023 at 7:53 PM Yu Zhang <yu.c.zhang@linux.intel.com> wrote:
> > >
> > > On Wed, Jul 05, 2023 at 06:22:59PM +0900, David Stevens wrote:
> > > > On Wed, Jul 5, 2023 at 12:10 PM Yu Zhang <yu.c.zhang@linux.intel.com> wrote:
> > > > >
> > > > > > @@ -2514,35 +2512,26 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
> > > > > >   * The slow path to get the pfn of the specified host virtual address,
> > > > > >   * 1 indicates success, -errno is returned if error is detected.
> > > > > >   */
> > > > > > -static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> > > > > > -                        bool interruptible, bool *writable, kvm_pfn_t *pfn)
> > > > > > +static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
> > > > > >  {
> > > > > > -     unsigned int flags = FOLL_HWPOISON;
> > > > > > +     unsigned int flags = FOLL_HWPOISON | FOLL_GET | foll->flags;
> > > > > >       struct page *page;
> > > > > >       int npages;
> > > > > >
> > > > > >       might_sleep();
> > > > > >
> > > > > > -     if (writable)
> > > > > > -             *writable = write_fault;
> > > > > > -
> > > > > > -     if (write_fault)
> > > > > > -             flags |= FOLL_WRITE;
> > > > > > -     if (async)
> > > > > > -             flags |= FOLL_NOWAIT;
> > > > > > -     if (interruptible)
> > > > > > -             flags |= FOLL_INTERRUPTIBLE;
> > > > > > -
> > > > > > -     npages = get_user_pages_unlocked(addr, 1, &page, flags);
> > > > > > +     npages = get_user_pages_unlocked(foll->hva, 1, &page, flags);
> > > > > >       if (npages != 1)
> > > > > >               return npages;
> > > > > >
> > > > > > +     foll->writable = (foll->flags & FOLL_WRITE) && foll->allow_write_mapping;
> > > > > > +
> > > > > >       /* map read fault as writable if possible */
> > > > > > -     if (unlikely(!write_fault) && writable) {
> > > > > > +     if (unlikely(!foll->writable) && foll->allow_write_mapping) {
> > > > >
> > > > > I guess !foll->writable should be !(foll->flags & FOLL_WRITE) here.
> > > >
> > > > The two statements are logically equivalent, although I guess using
> > > > !(foll->flags & FOLL_WRITE) may be a little clearer, if a little more
> > > > verbose.
> > >
> > > Well, as the comment says, we wanna try to map the read fault as writable
> > > whenever possible. And __gfn_to_pfn_memslot() will only set the FOLL_WRITE
> > > for write faults. So I guess using !foll->writable will not allow this.
> > > Did I miss anything?
> > 
> > We just set the foll->writable out parameter to be equal to
> > ((foll->flags & FOLL_WRITE) && foll->allow_write_mapping). Taking a =
> > foll->flags & FOLL_WRITE and b = foll->allow_write_mapping, we have
> > !(a && b) && b -> (!a || !b) && b -> (!a && b) || (!b && b) -> !a &&
> > b.
> 
> Ouch, my bad again... I typed "!foll->writable", but missed the "!" in
> my head while calculating... Thanks! :)

The code is funky and confusing though.  Specifically, FOLL_WRITE without
allow_write_mapping is nonsensical, and yields the even more nonsensical output
of a successful FOLL_WRITE with foll->writable==%false.

It "works" because callers only consume foll->writable when foll->allow_write_mapping
is true, but relying on that is ugly and completely unnecessary.   Similarly, the
"allow" terminology is misleading.  FOLL_WRITE *always* allows writable mappings.

This wasn't as much of a problem in the previous code because the lower levels took
the pointer, i.e. avoided the "allow" terminology entirely.

So we should either keep that behavior, i.e. replace "bool allow_write_mapping"
with "bool *writable", or rename allow_write_mapping to something like
opportunistically_map_writable, and then unconditionally set foll->writable
whenever KVM obtains a writable mapping, i.e. regardless of whether the original
fault was a read or a write.

My vote is for the latter.  If opportunistically_map_writable is too verbose,
try_map_writable would be another option.  Hmm, I'll make "try_map_writable" my
official vote.

Ah, and I also vote to use an if-elif instead of unconditionally setting foll->writable.
That makes the relationship between FOLL_WRITE and try_map_writable a bit more
obvious IMO.  E.g.

static bool hva_to_pfn_fast(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
{
	struct page *page[1];

	/*
	 * Fast pin a writable pfn only if it is a write fault request
	 * or the caller allows to map a writable pfn for a read fault
	 * request.
	 */
	if (!((foll->flags & FOLL_WRITE) || foll->try_map_writable))
		return false;

	if (get_user_page_fast_only(foll->hva, FOLL_WRITE, page)) {
		*pfn = page_to_pfn(page[0]);
		foll->writable = true;
		return true;
	}

	return false;
}

/*
 * The slow path to get the pfn of the specified host virtual address,
 * 1 indicates success, -errno is returned if error is detected.
 */
static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
{
	unsigned int flags = FOLL_HWPOISON | FOLL_GET | foll->flags;
	struct page *page;
	int npages;

	might_sleep();

	npages = get_user_pages_unlocked(foll->hva, 1, &page, flags);
	if (npages != 1)
		return npages;

	if (foll->flags & FOLL_WRITE) {
		foll->writable = true;
	} else if (foll->try_map_writable) {
		struct page *wpage;

		/* map read fault as writable if possible */
		if (get_user_page_fast_only(foll->hva, FOLL_WRITE, &wpage)) {
			foll->writable = true;
			put_page(page);
			page = wpage;
		}
	}
	*pfn = page_to_pfn(page);
	return npages;
}
  
Sean Christopherson Aug. 4, 2023, 10:13 p.m. UTC | #12
On Tue, Jul 04, 2023, David Stevens wrote:
> From: David Stevens <stevensd@chromium.org>
> 
> Introduce __kvm_follow_pfn, which will replace __gfn_to_pfn_memslot.
> __kvm_follow_pfn refactors the old API's arguments into a struct and,
> where possible, combines the boolean arguments into a single flags
> argument.
> 
> Signed-off-by: David Stevens <stevensd@chromium.org>
> ---
>  include/linux/kvm_host.h |  16 ++++
>  virt/kvm/kvm_main.c      | 171 ++++++++++++++++++++++-----------------
>  virt/kvm/kvm_mm.h        |   3 +-
>  virt/kvm/pfncache.c      |   8 +-
>  4 files changed, 122 insertions(+), 76 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 9d3ac7720da9..ef2763c2b12e 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -97,6 +97,7 @@
>  #define KVM_PFN_ERR_HWPOISON	(KVM_PFN_ERR_MASK + 1)
>  #define KVM_PFN_ERR_RO_FAULT	(KVM_PFN_ERR_MASK + 2)
>  #define KVM_PFN_ERR_SIGPENDING	(KVM_PFN_ERR_MASK + 3)
> +#define KVM_PFN_ERR_NEEDS_IO	(KVM_PFN_ERR_MASK + 4)

Hmm, ideally KVM_PFN_ERR_NEEDS_IO would be introduced in a separate prep patch,
e.g. by changing "bool *async" to "bool no_wait".  At a glance, I can't tell if
that's feasible though, so consider it more of a "wish" than a request.
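
Roughly, such a prep patch might look like the following (just a sketch of the
idea, assuming a no_wait input simply replaces the *async out-parameter):

	kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
			     bool no_wait, bool write_fault, bool *writable);

	/* __gfn_to_pfn_memslot() would then translate the return value itself: */
	pfn = hva_to_pfn(addr, atomic, interruptible, !!async, write_fault, writable);
	if (pfn == KVM_PFN_ERR_NEEDS_IO) {
		*async = true;
		pfn = KVM_PFN_ERR_FAULT;
	}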

> @@ -2572,23 +2561,23 @@ static int kvm_try_get_pfn(kvm_pfn_t pfn)
>  	return get_page_unless_zero(page);
>  }
>  
> -static int hva_to_pfn_remapped(struct vm_area_struct *vma,
> -			       unsigned long addr, bool write_fault,
> -			       bool *writable, kvm_pfn_t *p_pfn)
> +static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn *foll,
> +			       kvm_pfn_t *p_pfn)

Please wrap.  KVM still honors the 80 char soft limit unless there's a reason not
to, and in this case it's already wrapping:

static int hva_to_pfn_remapped(struct vm_area_struct *vma,
			       struct kvm_follow_pfn *foll, kvm_pfn_t *p_pfn)

> @@ -2606,8 +2595,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
>  		goto out;
>  	}
>  
> -	if (writable)
> -		*writable = pte_write(*ptep);
> +	foll->writable = pte_write(*ptep) && foll->allow_write_mapping;

Similar to feedback in my other response, don't condition this on try_map_writable,
i.e. just do:

	foll->writable = pte_write(...);

>  	pfn = pte_pfn(*ptep);
>  
>  	/*
> @@ -2652,24 +2640,22 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
>   * 2): @write_fault = false && @writable, @writable will tell the caller
>   *     whether the mapping is writable.
>   */
> -kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
> -		     bool *async, bool write_fault, bool *writable)
> +kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *foll)
>  {
>  	struct vm_area_struct *vma;
>  	kvm_pfn_t pfn;
>  	int npages, r;
>  
>  	/* we can do it either atomically or asynchronously, not both */
> -	BUG_ON(atomic && async);
> +	BUG_ON(foll->atomic && (foll->flags & FOLL_NOWAIT));
>  
> -	if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
> +	if (hva_to_pfn_fast(foll, &pfn))
>  		return pfn;
>  
> -	if (atomic)
> +	if (foll->atomic)
>  		return KVM_PFN_ERR_FAULT;
>  
> -	npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
> -				 writable, &pfn);
> +	npages = hva_to_pfn_slow(foll, &pfn);
>  	if (npages == 1)
>  		return pfn;
>  	if (npages == -EINTR)
> @@ -2677,83 +2663,122 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
>  
>  	mmap_read_lock(current->mm);
>  	if (npages == -EHWPOISON ||
> -	      (!async && check_user_page_hwpoison(addr))) {
> +	      (!(foll->flags & FOLL_NOWAIT) && check_user_page_hwpoison(foll->hva))) {

Opportunistically align the indentation; as an added bonus, that makes the line
length a few chars shorter, i.e.

	if (npages == -EHWPOISON ||
	    (!(foll->flags & FOLL_NOWAIT) && check_user_page_hwpoison(foll->hva))) {
		pfn = KVM_PFN_ERR_HWPOISON;
		goto exit;
	}
  

Patch

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9d3ac7720da9..ef2763c2b12e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -97,6 +97,7 @@ 
 #define KVM_PFN_ERR_HWPOISON	(KVM_PFN_ERR_MASK + 1)
 #define KVM_PFN_ERR_RO_FAULT	(KVM_PFN_ERR_MASK + 2)
 #define KVM_PFN_ERR_SIGPENDING	(KVM_PFN_ERR_MASK + 3)
+#define KVM_PFN_ERR_NEEDS_IO	(KVM_PFN_ERR_MASK + 4)
 
 /*
  * error pfns indicate that the gfn is in slot but faild to
@@ -1156,6 +1157,21 @@  unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn,
 void kvm_release_page_clean(struct page *page);
 void kvm_release_page_dirty(struct page *page);
 
+struct kvm_follow_pfn {
+	const struct kvm_memory_slot *slot;
+	gfn_t gfn;
+	unsigned int flags;
+	bool atomic;
+	/* Allow a read fault to create a writeable mapping. */
+	bool allow_write_mapping;
+
+	/* Outputs of __kvm_follow_pfn */
+	hva_t hva;
+	bool writable;
+};
+
+kvm_pfn_t __kvm_follow_pfn(struct kvm_follow_pfn *foll);
+
 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 		      bool *writable);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 371bd783ff2b..b13f22861d2f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2486,24 +2486,22 @@  static inline int check_user_page_hwpoison(unsigned long addr)
  * true indicates success, otherwise false is returned.  It's also the
  * only part that runs if we can in atomic context.
  */
-static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
-			    bool *writable, kvm_pfn_t *pfn)
+static bool hva_to_pfn_fast(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
 {
 	struct page *page[1];
+	bool write_fault = foll->flags & FOLL_WRITE;
 
 	/*
 	 * Fast pin a writable pfn only if it is a write fault request
 	 * or the caller allows to map a writable pfn for a read fault
 	 * request.
 	 */
-	if (!(write_fault || writable))
+	if (!(write_fault || foll->allow_write_mapping))
 		return false;
 
-	if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
+	if (get_user_page_fast_only(foll->hva, FOLL_WRITE, page)) {
 		*pfn = page_to_pfn(page[0]);
-
-		if (writable)
-			*writable = true;
+		foll->writable = foll->allow_write_mapping;
 		return true;
 	}
 
@@ -2514,35 +2512,26 @@  static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
  * The slow path to get the pfn of the specified host virtual address,
  * 1 indicates success, -errno is returned if error is detected.
  */
-static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
-			   bool interruptible, bool *writable, kvm_pfn_t *pfn)
+static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
 {
-	unsigned int flags = FOLL_HWPOISON;
+	unsigned int flags = FOLL_HWPOISON | FOLL_GET | foll->flags;
 	struct page *page;
 	int npages;
 
 	might_sleep();
 
-	if (writable)
-		*writable = write_fault;
-
-	if (write_fault)
-		flags |= FOLL_WRITE;
-	if (async)
-		flags |= FOLL_NOWAIT;
-	if (interruptible)
-		flags |= FOLL_INTERRUPTIBLE;
-
-	npages = get_user_pages_unlocked(addr, 1, &page, flags);
+	npages = get_user_pages_unlocked(foll->hva, 1, &page, flags);
 	if (npages != 1)
 		return npages;
 
+	foll->writable = (foll->flags & FOLL_WRITE) && foll->allow_write_mapping;
+
 	/* map read fault as writable if possible */
-	if (unlikely(!write_fault) && writable) {
+	if (unlikely(!foll->writable) && foll->allow_write_mapping) {
 		struct page *wpage;
 
-		if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
-			*writable = true;
+		if (get_user_page_fast_only(foll->hva, FOLL_WRITE, &wpage)) {
+			foll->writable = true;
 			put_page(page);
 			page = wpage;
 		}
@@ -2572,23 +2561,23 @@  static int kvm_try_get_pfn(kvm_pfn_t pfn)
 	return get_page_unless_zero(page);
 }
 
-static int hva_to_pfn_remapped(struct vm_area_struct *vma,
-			       unsigned long addr, bool write_fault,
-			       bool *writable, kvm_pfn_t *p_pfn)
+static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn *foll,
+			       kvm_pfn_t *p_pfn)
 {
 	kvm_pfn_t pfn;
 	pte_t *ptep;
 	spinlock_t *ptl;
+	bool write_fault = foll->flags & FOLL_WRITE;
 	int r;
 
-	r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
+	r = follow_pte(vma->vm_mm, foll->hva, &ptep, &ptl);
 	if (r) {
 		/*
 		 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
 		 * not call the fault handler, so do it here.
 		 */
 		bool unlocked = false;
-		r = fixup_user_fault(current->mm, addr,
+		r = fixup_user_fault(current->mm, foll->hva,
 				     (write_fault ? FAULT_FLAG_WRITE : 0),
 				     &unlocked);
 		if (unlocked)
@@ -2596,7 +2585,7 @@  static int hva_to_pfn_remapped(struct vm_area_struct *vma,
 		if (r)
 			return r;
 
-		r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
+		r = follow_pte(vma->vm_mm, foll->hva, &ptep, &ptl);
 		if (r)
 			return r;
 	}
@@ -2606,8 +2595,7 @@  static int hva_to_pfn_remapped(struct vm_area_struct *vma,
 		goto out;
 	}
 
-	if (writable)
-		*writable = pte_write(*ptep);
+	foll->writable = pte_write(*ptep) && foll->allow_write_mapping;
 	pfn = pte_pfn(*ptep);
 
 	/*
@@ -2652,24 +2640,22 @@  static int hva_to_pfn_remapped(struct vm_area_struct *vma,
  * 2): @write_fault = false && @writable, @writable will tell the caller
  *     whether the mapping is writable.
  */
-kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
-		     bool *async, bool write_fault, bool *writable)
+kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *foll)
 {
 	struct vm_area_struct *vma;
 	kvm_pfn_t pfn;
 	int npages, r;
 
 	/* we can do it either atomically or asynchronously, not both */
-	BUG_ON(atomic && async);
+	BUG_ON(foll->atomic && (foll->flags & FOLL_NOWAIT));
 
-	if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
+	if (hva_to_pfn_fast(foll, &pfn))
 		return pfn;
 
-	if (atomic)
+	if (foll->atomic)
 		return KVM_PFN_ERR_FAULT;
 
-	npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
-				 writable, &pfn);
+	npages = hva_to_pfn_slow(foll, &pfn);
 	if (npages == 1)
 		return pfn;
 	if (npages == -EINTR)
@@ -2677,83 +2663,122 @@  kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
 
 	mmap_read_lock(current->mm);
 	if (npages == -EHWPOISON ||
-	      (!async && check_user_page_hwpoison(addr))) {
+	      (!(foll->flags & FOLL_NOWAIT) && check_user_page_hwpoison(foll->hva))) {
 		pfn = KVM_PFN_ERR_HWPOISON;
 		goto exit;
 	}
 
 retry:
-	vma = vma_lookup(current->mm, addr);
+	vma = vma_lookup(current->mm, foll->hva);
 
 	if (vma == NULL)
 		pfn = KVM_PFN_ERR_FAULT;
 	else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
-		r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
+		r = hva_to_pfn_remapped(vma, foll, &pfn);
 		if (r == -EAGAIN)
 			goto retry;
 		if (r < 0)
 			pfn = KVM_PFN_ERR_FAULT;
 	} else {
-		if (async && vma_is_valid(vma, write_fault))
-			*async = true;
-		pfn = KVM_PFN_ERR_FAULT;
+		if ((foll->flags & FOLL_NOWAIT) &&
+		    vma_is_valid(vma, foll->flags & FOLL_WRITE))
+			pfn = KVM_PFN_ERR_NEEDS_IO;
+		else
+			pfn = KVM_PFN_ERR_FAULT;
 	}
 exit:
 	mmap_read_unlock(current->mm);
 	return pfn;
 }
 
-kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
-			       bool atomic, bool interruptible, bool *async,
-			       bool write_fault, bool *writable, hva_t *hva)
+kvm_pfn_t __kvm_follow_pfn(struct kvm_follow_pfn *foll)
 {
-	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
-
-	if (hva)
-		*hva = addr;
+	foll->hva = __gfn_to_hva_many(foll->slot, foll->gfn, NULL,
+				      foll->flags & FOLL_WRITE);
 
-	if (addr == KVM_HVA_ERR_RO_BAD) {
-		if (writable)
-			*writable = false;
+	if (foll->hva == KVM_HVA_ERR_RO_BAD)
 		return KVM_PFN_ERR_RO_FAULT;
-	}
 
-	if (kvm_is_error_hva(addr)) {
-		if (writable)
-			*writable = false;
+	if (kvm_is_error_hva(foll->hva))
 		return KVM_PFN_NOSLOT;
-	}
 
-	/* Do not map writable pfn in the readonly memslot. */
-	if (writable && memslot_is_readonly(slot)) {
-		*writable = false;
-		writable = NULL;
-	}
+	if (memslot_is_readonly(foll->slot))
+		foll->allow_write_mapping = false;
+
+	return hva_to_pfn(foll);
+}
+EXPORT_SYMBOL_GPL(__kvm_follow_pfn);
 
-	return hva_to_pfn(addr, atomic, interruptible, async, write_fault,
-			  writable);
+kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
+			       bool atomic, bool interruptible, bool *async,
+			       bool write_fault, bool *writable, hva_t *hva)
+{
+	kvm_pfn_t pfn;
+	struct kvm_follow_pfn foll = {
+		.slot = slot,
+		.gfn = gfn,
+		.flags = 0,
+		.atomic = atomic,
+		.allow_write_mapping = !!writable,
+	};
+
+	if (write_fault)
+		foll.flags |= FOLL_WRITE;
+	if (async)
+		foll.flags |= FOLL_NOWAIT;
+	if (interruptible)
+		foll.flags |= FOLL_INTERRUPTIBLE;
+
+	pfn = __kvm_follow_pfn(&foll);
+	if (pfn == KVM_PFN_ERR_NEEDS_IO) {
+		*async = true;
+		pfn = KVM_PFN_ERR_FAULT;
+	}
+	if (hva)
+		*hva = foll.hva;
+	if (writable)
+		*writable = foll.writable;
+	return pfn;
 }
 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
 
 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 		      bool *writable)
 {
-	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
-				    NULL, write_fault, writable, NULL);
+	kvm_pfn_t pfn;
+	struct kvm_follow_pfn foll = {
+		.slot = gfn_to_memslot(kvm, gfn),
+		.gfn = gfn,
+		.flags = write_fault ? FOLL_WRITE : 0,
+		.allow_write_mapping = !!writable,
+	};
+	pfn = __kvm_follow_pfn(&foll);
+	if (writable)
+		*writable = foll.writable;
+	return pfn;
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
 
 kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
 {
-	return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true,
-				    NULL, NULL);
+	struct kvm_follow_pfn foll = {
+		.slot = slot,
+		.gfn = gfn,
+		.flags = FOLL_WRITE,
+	};
+	return __kvm_follow_pfn(&foll);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
 
 kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
 {
-	return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true,
-				    NULL, NULL);
+	struct kvm_follow_pfn foll = {
+		.slot = slot,
+		.gfn = gfn,
+		.flags = FOLL_WRITE,
+		.atomic = true,
+	};
+	return __kvm_follow_pfn(&foll);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
 
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 180f1a09e6ba..ed896aee5396 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -20,8 +20,7 @@ 
 #define KVM_MMU_UNLOCK(kvm)		spin_unlock(&(kvm)->mmu_lock)
 #endif /* KVM_HAVE_MMU_RWLOCK */
 
-kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
-		     bool *async, bool write_fault, bool *writable);
+kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *foll);
 
 #ifdef CONFIG_HAVE_KVM_PFNCACHE
 void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 2d6aba677830..e3fefa753a51 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -144,6 +144,12 @@  static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 	kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
 	void *new_khva = NULL;
 	unsigned long mmu_seq;
+	struct kvm_follow_pfn foll = {
+		.slot = gpc->memslot,
+		.gfn = gpa_to_gfn(gpc->gpa),
+		.flags = FOLL_WRITE,
+		.hva = gpc->uhva,
+	};
 
 	lockdep_assert_held(&gpc->refresh_lock);
 
@@ -183,7 +189,7 @@  static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 		}
 
 		/* We always request a writeable mapping */
-		new_pfn = hva_to_pfn(gpc->uhva, false, false, NULL, true, NULL);
+		new_pfn = hva_to_pfn(&foll);
 		if (is_error_noslot_pfn(new_pfn))
 			goto out_error;