[19/46] hugetlb: add HGM support for follow_hugetlb_page

Message ID 20230105101844.1893104-20-jthoughton@google.com
State New
Series: Based on latest mm-unstable (85b44c25cd1e).

Commit Message

James Houghton Jan. 5, 2023, 10:18 a.m. UTC
  This enables high-granularity mapping support in GUP.

In case it is confusing: pfn_offset is vaddr's offset (in PAGE_SIZE
units) within the subpage that hpte points to.
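
As a worked example (hypothetical numbers, for illustration only):
suppose hpte maps a 2M subpage of a 1G hugepage and vaddr sits 20K
(0x5000) past the start of that subpage. With the masking the patch
does below:

	pfn_offset = (vaddr & ~hugetlb_pte_mask(&hpte)) >> PAGE_SHIFT
	           = 0x5000 >> PAGE_SHIFT	/* 4K pages */
	           = 5;

i.e. vaddr points at the sixth PAGE_SIZE page within the 2M subpage
that hpte maps.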

Signed-off-by: James Houghton <jthoughton@google.com>
---
 mm/hugetlb.c | 59 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 37 insertions(+), 22 deletions(-)
  

Comments

Peter Xu Jan. 5, 2023, 10:26 p.m. UTC | #1
On Thu, Jan 05, 2023 at 10:18:17AM +0000, James Houghton wrote:
> @@ -6649,13 +6655,20 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  		 */
>  		if (absent && (flags & FOLL_DUMP) &&
>  		    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
> -			if (pte)
> +			if (ptep)
>  				spin_unlock(ptl);
>  			hugetlb_vma_unlock_read(vma);
>  			remainder = 0;
>  			break;
>  		}
>  
> +		if (!absent && pte_present(pte) &&
> +				!hugetlb_pte_present_leaf(&hpte, pte)) {
> +			/* We raced with someone splitting the PTE, so retry. */
> +			spin_unlock(ptl);

vma unlock missing here.

> +			continue;
> +		}
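
For reference, a minimal sketch of this retry path with the missing
vma unlock added, mirroring the other early-exit paths in this
function, which drop the vma lock before continuing (this is a
suggested shape, not the committed fix):

	if (!absent && pte_present(pte) &&
			!hugetlb_pte_present_leaf(&hpte, pte)) {
		/* We raced with someone splitting the PTE, so retry. */
		spin_unlock(ptl);
		hugetlb_vma_unlock_read(vma);
		continue;
	}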
  
Peter Xu Jan. 12, 2023, 6:02 p.m. UTC | #2
On Thu, Jan 05, 2023 at 10:18:17AM +0000, James Houghton wrote:
> @@ -6731,22 +6746,22 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  		 * and skip the same_page loop below.
>  		 */
>  		if (!pages && !vmas && !pfn_offset &&
> -		    (vaddr + huge_page_size(h) < vma->vm_end) &&
> -		    (remainder >= pages_per_huge_page(h))) {
> -			vaddr += huge_page_size(h);
> -			remainder -= pages_per_huge_page(h);
> -			i += pages_per_huge_page(h);
> +		    (vaddr + pages_per_hpte < vma->vm_end) &&
> +		    (remainder >= pages_per_hpte)) {
> +			vaddr += pages_per_hpte;

This silently breaks hugetlb GUP; it should be

                        vaddr += hugetlb_pte_size(&hpte);

It caused mysterious MISSING events when I was playing with this tree, and
I'm surprised the root cause turned out to be here.  So far the most
time-consuming one. :)

> +			remainder -= pages_per_hpte;
> +			i += pages_per_hpte;
>  			spin_unlock(ptl);
>  			hugetlb_vma_unlock_read(vma);
>  			continue;
>  		}
  
James Houghton Jan. 12, 2023, 6:06 p.m. UTC | #3
On Thu, Jan 12, 2023 at 1:02 PM Peter Xu <peterx@redhat.com> wrote:
>
> On Thu, Jan 05, 2023 at 10:18:17AM +0000, James Houghton wrote:
> > @@ -6731,22 +6746,22 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
> >                * and skip the same_page loop below.
> >                */
> >               if (!pages && !vmas && !pfn_offset &&
> > -                 (vaddr + huge_page_size(h) < vma->vm_end) &&
> > -                 (remainder >= pages_per_huge_page(h))) {
> > -                     vaddr += huge_page_size(h);
> > -                     remainder -= pages_per_huge_page(h);
> > -                     i += pages_per_huge_page(h);
> > +                 (vaddr + pages_per_hpte < vma->vm_end) &&
> > +                 (remainder >= pages_per_hpte)) {
> > +                     vaddr += pages_per_hpte;
>
> This silently breaks hugetlb GUP; it should be
>
>                         vaddr += hugetlb_pte_size(&hpte);
>
> It caused mysterious MISSING events when I was playing with this tree, and
> I'm surprised the root cause turned out to be here.  So far the most
> time-consuming one. :)

Thanks Peter!! And the `vaddr + pages_per_hpte < vma->vm_end` should
be `vaddr + hugetlb_pte_size(&hpte) < vma->vm_end` too.
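
Putting both fixes together, the fast-path block would look something
like this (a sketch assembled from the two review comments, not the
committed fix):

	if (!pages && !vmas && !pfn_offset &&
	    (vaddr + hugetlb_pte_size(&hpte) < vma->vm_end) &&
	    (remainder >= pages_per_hpte)) {
		vaddr += hugetlb_pte_size(&hpte);
		remainder -= pages_per_hpte;
		i += pages_per_hpte;
		spin_unlock(ptl);
		hugetlb_vma_unlock_read(vma);
		continue;
	}

pages_per_hpte counts PAGE_SIZE pages (hugetlb_pte_size(&hpte) /
PAGE_SIZE in the patch below), so it is the right unit for remainder
and i, while vaddr advances in bytes and needs hugetlb_pte_size(&hpte).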
  

Patch

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 73672d806172..30fea414d9ee 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6532,11 +6532,9 @@  static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
 }
 
 static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma,
-					       unsigned int flags, pte_t *pte,
+					       unsigned int flags, pte_t pteval,
 					       bool *unshare)
 {
-	pte_t pteval = huge_ptep_get(pte);
-
 	*unshare = false;
 	if (is_swap_pte(pteval))
 		return true;
@@ -6611,11 +6609,13 @@  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	int err = -EFAULT, refs;
 
 	while (vaddr < vma->vm_end && remainder) {
-		pte_t *pte;
+		pte_t *ptep, pte;
 		spinlock_t *ptl = NULL;
 		bool unshare = false;
 		int absent;
-		struct page *page;
+		unsigned long pages_per_hpte;
+		struct page *page, *subpage;
+		struct hugetlb_pte hpte;
 
 		/*
 		 * If we have a pending SIGKILL, don't keep faulting pages and
@@ -6632,13 +6632,19 @@  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * each hugepage.  We have to make sure we get the
 		 * first, for the page indexing below to work.
 		 *
-		 * Note that page table lock is not held when pte is null.
+		 * hugetlb_full_walk will mask the address appropriately.
+		 *
+		 * Note that page table lock is not held when ptep is null.
 		 */
-		pte = hugetlb_walk(vma, vaddr & huge_page_mask(h),
-				   huge_page_size(h));
-		if (pte)
-			ptl = huge_pte_lock(h, mm, pte);
-		absent = !pte || huge_pte_none(huge_ptep_get(pte));
+		if (hugetlb_full_walk(&hpte, vma, vaddr)) {
+			ptep = NULL;
+			absent = true;
+		} else {
+			ptl = hugetlb_pte_lock(&hpte);
+			ptep = hpte.ptep;
+			pte = huge_ptep_get(ptep);
+			absent = huge_pte_none(pte);
+		}
 
 		/*
 		 * When coredumping, it suits get_dump_page if we just return
@@ -6649,13 +6655,20 @@  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 */
 		if (absent && (flags & FOLL_DUMP) &&
 		    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
-			if (pte)
+			if (ptep)
 				spin_unlock(ptl);
 			hugetlb_vma_unlock_read(vma);
 			remainder = 0;
 			break;
 		}
 
+		if (!absent && pte_present(pte) &&
+				!hugetlb_pte_present_leaf(&hpte, pte)) {
+			/* We raced with someone splitting the PTE, so retry. */
+			spin_unlock(ptl);
+			continue;
+		}
+
 		/*
 		 * We need call hugetlb_fault for both hugepages under migration
 		 * (in which case hugetlb_fault waits for the migration,) and
@@ -6671,7 +6684,7 @@  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			vm_fault_t ret;
 			unsigned int fault_flags = 0;
 
-			if (pte)
+			if (ptep)
 				spin_unlock(ptl);
 			hugetlb_vma_unlock_read(vma);
 
@@ -6720,8 +6733,10 @@  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			continue;
 		}
 
-		pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
-		page = pte_page(huge_ptep_get(pte));
+		pfn_offset = (vaddr & ~hugetlb_pte_mask(&hpte)) >> PAGE_SHIFT;
+		subpage = pte_page(pte);
+		pages_per_hpte = hugetlb_pte_size(&hpte) / PAGE_SIZE;
+		page = compound_head(subpage);
 
 		VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
 			       !PageAnonExclusive(page), page);
@@ -6731,22 +6746,22 @@  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * and skip the same_page loop below.
 		 */
 		if (!pages && !vmas && !pfn_offset &&
-		    (vaddr + huge_page_size(h) < vma->vm_end) &&
-		    (remainder >= pages_per_huge_page(h))) {
-			vaddr += huge_page_size(h);
-			remainder -= pages_per_huge_page(h);
-			i += pages_per_huge_page(h);
+		    (vaddr + pages_per_hpte < vma->vm_end) &&
+		    (remainder >= pages_per_hpte)) {
+			vaddr += pages_per_hpte;
+			remainder -= pages_per_hpte;
+			i += pages_per_hpte;
 			spin_unlock(ptl);
 			hugetlb_vma_unlock_read(vma);
 			continue;
 		}
 
 		/* vaddr may not be aligned to PAGE_SIZE */
-		refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
+		refs = min3(pages_per_hpte - pfn_offset, remainder,
 		    (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
 
 		if (pages || vmas)
-			record_subpages_vmas(nth_page(page, pfn_offset),
+			record_subpages_vmas(nth_page(subpage, pfn_offset),
 					     vma, refs,
 					     likely(pages) ? pages + i : NULL,
 					     vmas ? vmas + i : NULL);