[RFC,09/12] mm/gup: Handle huge pud for follow_pud_mask()

Message ID 20231116012908.392077-10-peterx@redhat.com
State New
Headers
Series mm/gup: Unify hugetlb, part 2 |

Commit Message

Peter Xu Nov. 16, 2023, 1:29 a.m. UTC
  Teach follow_pud_mask() to be able to handle normal PUD pages like hugetlb.

Rename follow_devmap_pud() to follow_huge_pud(), move it out of config
CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD, instead let that macro only
covers the devmap special operations, like pgmap.

In the new follow_huge_pud(), taking care of possible CoR for hugetlb if
necessary.

Since at it, optimize the non-present check by adding a pud_present() early
check before taking the pgtable lock, failing the follow_page() early if
PUD is not present: that is required by both devmap or hugetlb.  Use
pud_huge() to also cover the pud_devmap() case.

We need to export "struct follow_page_context" along the way, so that
huge_memory.c can understand it.

One trivial more thing to mention is, introduce "pud_t pud" in the code
paths along the way, so the code doesn't dereference *pudp multiple time.
Not only because that looks less straightforward, but also because if the
dereference really happened, it's not clear whether there can be race to
see different *pudp values when it's being modified at the same time.

Setting ctx->page_mask properly for a PUD entry.  As a side effect, this
should also be able to optimize devmap GUP on PUD to be able to jump over
the whole PUD range, but not yet verified.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 include/linux/huge_mm.h | 17 +++----
 mm/gup.c                | 22 ++++-----
 mm/huge_memory.c        | 98 +++++++++++++++++++++++------------------
 3 files changed, 73 insertions(+), 64 deletions(-)
  

Comments

Christoph Hellwig Nov. 23, 2023, 7:28 a.m. UTC | #1
> We need to export "struct follow_page_context" along the way, so that
> huge_memory.c can understand it.

Again, thankfully not actually exported, just made global.

> @@ -751,24 +746,25 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
>  				    unsigned int flags,
>  				    struct follow_page_context *ctx)
>  {
> -	pud_t *pud;
> +	pud_t *pudp, pud;

This adding of pud while useful seems mostly unrelated and clutter
the patch up quite a bit.  If you have to respin this anyway it might
be worth to split it out into a little prep patch.
  
Peter Xu Nov. 23, 2023, 4:19 p.m. UTC | #2
On Wed, Nov 22, 2023 at 11:28:31PM -0800, Christoph Hellwig wrote:
> > We need to export "struct follow_page_context" along the way, so that
> > huge_memory.c can understand it.
> 
> Again, thankfully not actually exported, just made global.

In the new version that shouldn't be needed, because I just noticed
huge_memory.c is only compiled with THP=on.  Logically it may start to make
sense at some point to have thp.c for THP=on, and huge_memory.c for THP ||
HUGETLB.  But I'd rather leave that done separately even if so..

In short, for this one, I'll drop that in the commit message, as I'll leave
"struct follow_page_context" alone.

> 
> > @@ -751,24 +746,25 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
> >  				    unsigned int flags,
> >  				    struct follow_page_context *ctx)
> >  {
> > -	pud_t *pud;
> > +	pud_t *pudp, pud;
> 
> This adding of pud while useful seems mostly unrelated and clutter
> the patch up quite a bit.  If you have to respin this anyway it might
> be worth to split it out into a little prep patch.

I can do this.  I'll also try to do the same for the rest patches, if
applicable.

Thanks,
  

Patch

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ec463410aecc..84815012d3cf 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -7,6 +7,11 @@ 
 
 #include <linux/fs.h> /* only for vma_is_dax() */
 
+struct follow_page_context {
+	struct dev_pagemap *pgmap;
+	unsigned int page_mask;
+};
+
 vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
@@ -222,8 +227,6 @@  static inline bool folio_test_pmd_mappable(struct folio *folio)
 
 struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
 		pmd_t *pmd, int flags, struct dev_pagemap **pgmap);
-struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
-		pud_t *pud, int flags, struct dev_pagemap **pgmap);
 
 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
 
@@ -372,18 +375,16 @@  static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
 	return NULL;
 }
 
-static inline struct page *follow_devmap_pud(struct vm_area_struct *vma,
-	unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap)
-{
-	return NULL;
-}
-
 static inline bool thp_migration_supported(void)
 {
 	return false;
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+struct page *follow_huge_pud(struct vm_area_struct *vma, unsigned long addr,
+			     pud_t *pud, int flags,
+			     struct follow_page_context *ctx);
+
 static inline int split_folio_to_list(struct folio *folio,
 		struct list_head *list)
 {
diff --git a/mm/gup.c b/mm/gup.c
index 89c1584d68f0..55a2ae55f00f 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -25,11 +25,6 @@ 
 
 #include "internal.h"
 
-struct follow_page_context {
-	struct dev_pagemap *pgmap;
-	unsigned int page_mask;
-};
-
 static inline void sanity_check_pinned_pages(struct page **pages,
 					     unsigned long npages)
 {
@@ -751,24 +746,25 @@  static struct page *follow_pud_mask(struct vm_area_struct *vma,
 				    unsigned int flags,
 				    struct follow_page_context *ctx)
 {
-	pud_t *pud;
+	pud_t *pudp, pud;
 	spinlock_t *ptl;
 	struct page *page;
 	struct mm_struct *mm = vma->vm_mm;
 
-	pud = pud_offset(p4dp, address);
-	if (pud_none(*pud))
+	pudp = pud_offset(p4dp, address);
+	pud = *pudp;
+	if (pud_none(pud) || !pud_present(pud))
 		return no_page_table(vma, flags, address);
-	if (pud_devmap(*pud)) {
-		ptl = pud_lock(mm, pud);
-		page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
+	if (pud_huge(pud)) {
+		ptl = pud_lock(mm, pudp);
+		page = follow_huge_pud(vma, address, pudp, flags, ctx);
 		spin_unlock(ptl);
 		return page;
 	}
-	if (unlikely(pud_bad(*pud)))
+	if (unlikely(pud_bad(pud)))
 		return no_page_table(vma, flags, address);
 
-	return follow_pmd_mask(vma, address, pud, flags, ctx);
+	return follow_pmd_mask(vma, address, pudp, flags, ctx);
 }
 
 static struct page *follow_p4d_mask(struct vm_area_struct *vma,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6eb55f97a3d2..6748ef5f3fd9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1207,49 +1207,6 @@  static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
 		update_mmu_cache_pud(vma, addr, pud);
 }
 
-struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
-		pud_t *pud, int flags, struct dev_pagemap **pgmap)
-{
-	unsigned long pfn = pud_pfn(*pud);
-	struct mm_struct *mm = vma->vm_mm;
-	struct page *page;
-	int ret;
-
-	assert_spin_locked(pud_lockptr(mm, pud));
-
-	if (flags & FOLL_WRITE && !pud_write(*pud))
-		return NULL;
-
-	if (pud_present(*pud) && pud_devmap(*pud))
-		/* pass */;
-	else
-		return NULL;
-
-	if (flags & FOLL_TOUCH)
-		touch_pud(vma, addr, pud, flags & FOLL_WRITE);
-
-	/*
-	 * device mapped pages can only be returned if the
-	 * caller will manage the page reference count.
-	 *
-	 * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here:
-	 */
-	if (!(flags & (FOLL_GET | FOLL_PIN)))
-		return ERR_PTR(-EEXIST);
-
-	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
-	*pgmap = get_dev_pagemap(pfn, *pgmap);
-	if (!*pgmap)
-		return ERR_PTR(-EFAULT);
-	page = pfn_to_page(pfn);
-
-	ret = try_grab_page(page, flags);
-	if (ret)
-		page = ERR_PTR(ret);
-
-	return page;
-}
-
 int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
 		  struct vm_area_struct *vma)
@@ -1305,6 +1262,61 @@  void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
 }
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
+struct page *follow_huge_pud(struct vm_area_struct *vma, unsigned long addr,
+			     pud_t *pudp, int flags,
+			     struct follow_page_context *ctx)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct page *page;
+	pud_t pud = *pudp;
+	unsigned long pfn = pud_pfn(pud);
+	int ret;
+
+	assert_spin_locked(pud_lockptr(mm, pudp));
+
+	if ((flags & FOLL_WRITE) && !pud_write(pud))
+		return NULL;
+
+	if (!pud_present(pud))
+		return NULL;
+
+	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+	if (pud_devmap(pud)) {
+		/*
+		 * device mapped pages can only be returned if the caller
+		 * will manage the page reference count.
+		 *
+		 * At least one of FOLL_GET | FOLL_PIN must be set, so
+		 * assert that here:
+		 */
+		if (!(flags & (FOLL_GET | FOLL_PIN)))
+			return ERR_PTR(-EEXIST);
+
+		if (flags & FOLL_TOUCH)
+			touch_pud(vma, addr, pudp, flags & FOLL_WRITE);
+
+		ctx->pgmap = get_dev_pagemap(pfn, ctx->pgmap);
+		if (!ctx->pgmap)
+			return ERR_PTR(-EFAULT);
+	}
+#endif
+	page = pfn_to_page(pfn);
+
+	if (!pud_devmap(pud) && !pud_write(pud) &&
+	    gup_must_unshare(vma, flags, page))
+		return ERR_PTR(-EMLINK);
+
+	ret = try_grab_page(page, flags);
+	if (ret)
+		page = ERR_PTR(ret);
+	else
+		ctx->page_mask = HPAGE_PUD_NR - 1;
+
+	return page;
+}
+
 void huge_pmd_set_accessed(struct vm_fault *vmf)
 {
 	bool write = vmf->flags & FAULT_FLAG_WRITE;