[v2,10/13] mm/gup: Handle huge pud for follow_pud_mask()

Message ID 20240103091423.400294-11-peterx@redhat.com
State New
Headers
Series mm/gup: Unify hugetlb, part 2 |

Commit Message

Peter Xu Jan. 3, 2024, 9:14 a.m. UTC
  From: Peter Xu <peterx@redhat.com>

Teach follow_pud_mask() to be able to handle normal PUD pages like hugetlb.

Rename follow_devmap_pud() to follow_huge_pud() so that it can process
either huge devmap or hugetlb. Move it out of TRANSPARENT_HUGEPAGE_PUD and
and huge_memory.c (which relies on CONFIG_THP).

In the new follow_huge_pud(), taking care of possible CoR for hugetlb if
necessary.  touch_pud() needs to be moved out of huge_memory.c to be
accessable from gup.c even if !THP.

Since at it, optimize the non-present check by adding a pud_present() early
check before taking the pgtable lock, failing the follow_page() early if
PUD is not present: that is required by both devmap or hugetlb.  Use
pud_huge() to also cover the pud_devmap() case.

One more trivial thing to mention is, introduce "pud_t pud" in the code
paths along the way, so the code doesn't dereference *pudp multiple time.
Not only because that looks less straightforward, but also because if the
dereference really happened, it's not clear whether there can be race to
see different *pudp values when it's being modified at the same time.

Setting ctx->page_mask properly for a PUD entry.  As a side effect, this
patch should also be able to optimize devmap GUP on PUD to be able to jump
over the whole PUD range, but not yet verified.  Hugetlb already can do so
prior to this patch.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 include/linux/huge_mm.h |  8 -----
 mm/gup.c                | 70 +++++++++++++++++++++++++++++++++++++++--
 mm/huge_memory.c        | 47 ++-------------------------
 mm/internal.h           |  2 ++
 4 files changed, 71 insertions(+), 56 deletions(-)
  

Comments

Jason Gunthorpe Jan. 15, 2024, 6:49 p.m. UTC | #1
On Wed, Jan 03, 2024 at 05:14:20PM +0800, peterx@redhat.com wrote:
> diff --git a/mm/gup.c b/mm/gup.c
> index 63845b3ec44f..760406180222 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -525,6 +525,70 @@ static struct page *no_page_table(struct vm_area_struct *vma,
>  	return NULL;
>  }
>  
> +#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
> +static struct page *follow_huge_pud(struct vm_area_struct *vma,
> +				    unsigned long addr, pud_t *pudp,
> +				    int flags, struct follow_page_context *ctx)
> +{
> +	struct mm_struct *mm = vma->vm_mm;
> +	struct page *page;
> +	pud_t pud = *pudp;
> +	unsigned long pfn = pud_pfn(pud);
> +	int ret;
> +
> +	assert_spin_locked(pud_lockptr(mm, pudp));
> +
> +	if ((flags & FOLL_WRITE) && !pud_write(pud))
> +		return NULL;
> +
> +	if (!pud_present(pud))
> +		return NULL;
> +
> +	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
> +
> +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
> +	if (pud_devmap(pud)) {

Can this use IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) ?

> +		/*
> +		 * device mapped pages can only be returned if the caller
> +		 * will manage the page reference count.
> +		 *
> +		 * At least one of FOLL_GET | FOLL_PIN must be set, so
> +		 * assert that here:
> +		 */
> +		if (!(flags & (FOLL_GET | FOLL_PIN)))
> +			return ERR_PTR(-EEXIST);
> +
> +		if (flags & FOLL_TOUCH)
> +			touch_pud(vma, addr, pudp, flags & FOLL_WRITE);
> +
> +		ctx->pgmap = get_dev_pagemap(pfn, ctx->pgmap);
> +		if (!ctx->pgmap)
> +			return ERR_PTR(-EFAULT);
> +	}
> +#endif	/* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
> +	page = pfn_to_page(pfn);
> +
> +	if (!pud_devmap(pud) && !pud_write(pud) &&
> +	    gup_must_unshare(vma, flags, page))
> +		return ERR_PTR(-EMLINK);
> +
> +	ret = try_grab_page(page, flags);
> +	if (ret)
> +		page = ERR_PTR(ret);
> +	else
> +		ctx->page_mask = HPAGE_PUD_NR - 1;
> +
> +	return page;
> +}
> +#else  /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */
> +static struct page *follow_huge_pud(struct vm_area_struct *vma,
> +				    unsigned long addr, pud_t *pudp,
> +				    int flags, struct follow_page_context *ctx)
> +{
> +	return NULL;
> +}
> +#endif	/* CONFIG_PGTABLE_HAS_HUGE_LEAVES */
> +
>  static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
>  		pte_t *pte, unsigned int flags)
>  {
> @@ -760,11 +824,11 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
>  
>  	pudp = pud_offset(p4dp, address);
>  	pud = READ_ONCE(*pudp);
> -	if (pud_none(pud))
> +	if (pud_none(pud) || !pud_present(pud))
>  		return no_page_table(vma, flags, address);

Isn't 'pud_none() || !pud_present()' redundent? A none pud is
non-present, by definition?

> -	if (pud_devmap(pud)) {
> +	if (pud_huge(pud)) {
>  		ptl = pud_lock(mm, pudp);
> -		page = follow_devmap_pud(vma, address, pudp, flags, &ctx->pgmap);
> +		page = follow_huge_pud(vma, address, pudp, flags, ctx);
>  		spin_unlock(ptl);
>  		if (page)
>  			return page;

Otherwise it looks OK to me

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>

Jason
  
Peter Xu Feb. 21, 2024, 11:49 a.m. UTC | #2
On Mon, Jan 15, 2024 at 02:49:00PM -0400, Jason Gunthorpe wrote:
> On Wed, Jan 03, 2024 at 05:14:20PM +0800, peterx@redhat.com wrote:
> > diff --git a/mm/gup.c b/mm/gup.c
> > index 63845b3ec44f..760406180222 100644
> > --- a/mm/gup.c
> > +++ b/mm/gup.c
> > @@ -525,6 +525,70 @@ static struct page *no_page_table(struct vm_area_struct *vma,
> >  	return NULL;
> >  }
> >  
> > +#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
> > +static struct page *follow_huge_pud(struct vm_area_struct *vma,
> > +				    unsigned long addr, pud_t *pudp,
> > +				    int flags, struct follow_page_context *ctx)
> > +{
> > +	struct mm_struct *mm = vma->vm_mm;
> > +	struct page *page;
> > +	pud_t pud = *pudp;
> > +	unsigned long pfn = pud_pfn(pud);
> > +	int ret;
> > +
> > +	assert_spin_locked(pud_lockptr(mm, pudp));
> > +
> > +	if ((flags & FOLL_WRITE) && !pud_write(pud))
> > +		return NULL;
> > +
> > +	if (!pud_present(pud))
> > +		return NULL;
> > +
> > +	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
> > +
> > +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
> > +	if (pud_devmap(pud)) {
> 
> Can this use IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) ?

Sure.

> 
> > +		/*
> > +		 * device mapped pages can only be returned if the caller
> > +		 * will manage the page reference count.
> > +		 *
> > +		 * At least one of FOLL_GET | FOLL_PIN must be set, so
> > +		 * assert that here:
> > +		 */
> > +		if (!(flags & (FOLL_GET | FOLL_PIN)))
> > +			return ERR_PTR(-EEXIST);
> > +
> > +		if (flags & FOLL_TOUCH)
> > +			touch_pud(vma, addr, pudp, flags & FOLL_WRITE);
> > +
> > +		ctx->pgmap = get_dev_pagemap(pfn, ctx->pgmap);
> > +		if (!ctx->pgmap)
> > +			return ERR_PTR(-EFAULT);
> > +	}
> > +#endif	/* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
> > +	page = pfn_to_page(pfn);
> > +
> > +	if (!pud_devmap(pud) && !pud_write(pud) &&
> > +	    gup_must_unshare(vma, flags, page))
> > +		return ERR_PTR(-EMLINK);
> > +
> > +	ret = try_grab_page(page, flags);
> > +	if (ret)
> > +		page = ERR_PTR(ret);
> > +	else
> > +		ctx->page_mask = HPAGE_PUD_NR - 1;
> > +
> > +	return page;
> > +}
> > +#else  /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */
> > +static struct page *follow_huge_pud(struct vm_area_struct *vma,
> > +				    unsigned long addr, pud_t *pudp,
> > +				    int flags, struct follow_page_context *ctx)
> > +{
> > +	return NULL;
> > +}
> > +#endif	/* CONFIG_PGTABLE_HAS_HUGE_LEAVES */
> > +
> >  static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
> >  		pte_t *pte, unsigned int flags)
> >  {
> > @@ -760,11 +824,11 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
> >  
> >  	pudp = pud_offset(p4dp, address);
> >  	pud = READ_ONCE(*pudp);
> > -	if (pud_none(pud))
> > +	if (pud_none(pud) || !pud_present(pud))
> >  		return no_page_table(vma, flags, address);
> 
> Isn't 'pud_none() || !pud_present()' redundent? A none pud is
> non-present, by definition?

Hmm yes, seems redundant.  Let me drop it.

> 
> > -	if (pud_devmap(pud)) {
> > +	if (pud_huge(pud)) {
> >  		ptl = pud_lock(mm, pudp);
> > -		page = follow_devmap_pud(vma, address, pudp, flags, &ctx->pgmap);
> > +		page = follow_huge_pud(vma, address, pudp, flags, ctx);
> >  		spin_unlock(ptl);
> >  		if (page)
> >  			return page;
> 
> Otherwise it looks OK to me
> 
> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>

Thanks!
  

Patch

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 96bd4b5d027e..3b73d20d537e 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -345,8 +345,6 @@  static inline bool folio_test_pmd_mappable(struct folio *folio)
 
 struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
 		pmd_t *pmd, int flags, struct dev_pagemap **pgmap);
-struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
-		pud_t *pud, int flags, struct dev_pagemap **pgmap);
 
 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
 
@@ -502,12 +500,6 @@  static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
 	return NULL;
 }
 
-static inline struct page *follow_devmap_pud(struct vm_area_struct *vma,
-	unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap)
-{
-	return NULL;
-}
-
 static inline bool thp_migration_supported(void)
 {
 	return false;
diff --git a/mm/gup.c b/mm/gup.c
index 63845b3ec44f..760406180222 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -525,6 +525,70 @@  static struct page *no_page_table(struct vm_area_struct *vma,
 	return NULL;
 }
 
+#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
+static struct page *follow_huge_pud(struct vm_area_struct *vma,
+				    unsigned long addr, pud_t *pudp,
+				    int flags, struct follow_page_context *ctx)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct page *page;
+	pud_t pud = *pudp;
+	unsigned long pfn = pud_pfn(pud);
+	int ret;
+
+	assert_spin_locked(pud_lockptr(mm, pudp));
+
+	if ((flags & FOLL_WRITE) && !pud_write(pud))
+		return NULL;
+
+	if (!pud_present(pud))
+		return NULL;
+
+	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+	if (pud_devmap(pud)) {
+		/*
+		 * device mapped pages can only be returned if the caller
+		 * will manage the page reference count.
+		 *
+		 * At least one of FOLL_GET | FOLL_PIN must be set, so
+		 * assert that here:
+		 */
+		if (!(flags & (FOLL_GET | FOLL_PIN)))
+			return ERR_PTR(-EEXIST);
+
+		if (flags & FOLL_TOUCH)
+			touch_pud(vma, addr, pudp, flags & FOLL_WRITE);
+
+		ctx->pgmap = get_dev_pagemap(pfn, ctx->pgmap);
+		if (!ctx->pgmap)
+			return ERR_PTR(-EFAULT);
+	}
+#endif	/* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+	page = pfn_to_page(pfn);
+
+	if (!pud_devmap(pud) && !pud_write(pud) &&
+	    gup_must_unshare(vma, flags, page))
+		return ERR_PTR(-EMLINK);
+
+	ret = try_grab_page(page, flags);
+	if (ret)
+		page = ERR_PTR(ret);
+	else
+		ctx->page_mask = HPAGE_PUD_NR - 1;
+
+	return page;
+}
+#else  /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */
+static struct page *follow_huge_pud(struct vm_area_struct *vma,
+				    unsigned long addr, pud_t *pudp,
+				    int flags, struct follow_page_context *ctx)
+{
+	return NULL;
+}
+#endif	/* CONFIG_PGTABLE_HAS_HUGE_LEAVES */
+
 static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
 		pte_t *pte, unsigned int flags)
 {
@@ -760,11 +824,11 @@  static struct page *follow_pud_mask(struct vm_area_struct *vma,
 
 	pudp = pud_offset(p4dp, address);
 	pud = READ_ONCE(*pudp);
-	if (pud_none(pud))
+	if (pud_none(pud) || !pud_present(pud))
 		return no_page_table(vma, flags, address);
-	if (pud_devmap(pud)) {
+	if (pud_huge(pud)) {
 		ptl = pud_lock(mm, pudp);
-		page = follow_devmap_pud(vma, address, pudp, flags, &ctx->pgmap);
+		page = follow_huge_pud(vma, address, pudp, flags, ctx);
 		spin_unlock(ptl);
 		if (page)
 			return page;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 94ef5c02b459..9993d2b18809 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1373,8 +1373,8 @@  int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 }
 
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
-		      pud_t *pud, bool write)
+void touch_pud(struct vm_area_struct *vma, unsigned long addr,
+	       pud_t *pud, bool write)
 {
 	pud_t _pud;
 
@@ -1386,49 +1386,6 @@  static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
 		update_mmu_cache_pud(vma, addr, pud);
 }
 
-struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
-		pud_t *pud, int flags, struct dev_pagemap **pgmap)
-{
-	unsigned long pfn = pud_pfn(*pud);
-	struct mm_struct *mm = vma->vm_mm;
-	struct page *page;
-	int ret;
-
-	assert_spin_locked(pud_lockptr(mm, pud));
-
-	if (flags & FOLL_WRITE && !pud_write(*pud))
-		return NULL;
-
-	if (pud_present(*pud) && pud_devmap(*pud))
-		/* pass */;
-	else
-		return NULL;
-
-	if (flags & FOLL_TOUCH)
-		touch_pud(vma, addr, pud, flags & FOLL_WRITE);
-
-	/*
-	 * device mapped pages can only be returned if the
-	 * caller will manage the page reference count.
-	 *
-	 * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here:
-	 */
-	if (!(flags & (FOLL_GET | FOLL_PIN)))
-		return ERR_PTR(-EEXIST);
-
-	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
-	*pgmap = get_dev_pagemap(pfn, *pgmap);
-	if (!*pgmap)
-		return ERR_PTR(-EFAULT);
-	page = pfn_to_page(pfn);
-
-	ret = try_grab_page(page, flags);
-	if (ret)
-		page = ERR_PTR(ret);
-
-	return page;
-}
-
 int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
 		  struct vm_area_struct *vma)
diff --git a/mm/internal.h b/mm/internal.h
index f309a010d50f..5821b7a14b62 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1007,6 +1007,8 @@  int __must_check try_grab_page(struct page *page, unsigned int flags);
 /*
  * mm/huge_memory.c
  */
+void touch_pud(struct vm_area_struct *vma, unsigned long addr,
+	       pud_t *pud, bool write);
 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 				   unsigned long addr, pmd_t *pmd,
 				   unsigned int flags);