[RFC,v2,18/47] hugetlb: enlighten follow_hugetlb_page to support HGM

Message ID 20221021163703.3218176-19-jthoughton@google.com
State New
Series hugetlb: introduce HugeTLB high-granularity mapping

Commit Message

James Houghton Oct. 21, 2022, 4:36 p.m. UTC
  This enables high-granularity mapping support in GUP.

One important change here is that, before, we never needed to grab the
VMA lock, but now, to prevent someone from collapsing the page tables
out from under us, we grab it for reading when doing high-granularity PT
walks.

In case it is confusing, pfn_offset is the offset (in PAGE_SIZE units)
that vaddr points to within the subpage that hpte points to.

Signed-off-by: James Houghton <jthoughton@google.com>
---
 mm/hugetlb.c | 76 ++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 53 insertions(+), 23 deletions(-)
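
To make the commit message concrete, here is a condensed sketch of the lookup the patched follow_hugetlb_page() performs for a single address. The hugetlb_pte_* helpers, hugetlb_hgm_walk(), and hpage_size_to_level() are introduced earlier in this series; the function name and the stripped-down control flow below are only for illustration, not part of the patch.

/*
 * Illustration only: one iteration of follow_hugetlb_page()'s lookup,
 * reduced to the walk itself.  Faulting of absent entries, unsharing,
 * and the FOLL_* handling are omitted.
 */
static struct page *hgm_lookup_subpage(struct mm_struct *mm,
				       struct vm_area_struct *vma,
				       unsigned long vaddr)
{
	struct hstate *h = hstate_vma(vma);
	struct hugetlb_pte hpte;
	struct page *subpage = NULL;
	unsigned long pfn_offset;
	spinlock_t *ptl;
	pte_t *ptep;
	pte_t pte;

	/* Hold the VMA lock for reading so the tables cannot be collapsed. */
	hugetlb_vma_lock_read(vma);

	ptep = huge_pte_offset(mm, vaddr & huge_page_mask(h),
			       huge_page_size(h));
	if (!ptep)
		goto out;

	/* Start from the hstate-level entry and descend as far as possible. */
	hugetlb_pte_populate(&hpte, ptep, huge_page_shift(h),
			     hpage_size_to_level(huge_page_size(h)));
	hugetlb_hgm_walk(mm, vma, &hpte, vaddr, PAGE_SIZE,
			 /*stop_at_none=*/true);

	ptl = hugetlb_pte_lock(mm, &hpte);
	pte = huge_ptep_get(hpte.ptep);
	if (pte_present(pte) && hugetlb_pte_present_leaf(&hpte, pte)) {
		/*
		 * pfn_offset: where vaddr falls, in PAGE_SIZE units, within
		 * the (possibly smaller-than-hugepage) page hpte maps.
		 */
		pfn_offset = (vaddr & ~hugetlb_pte_mask(&hpte)) >> PAGE_SHIFT;
		subpage = nth_page(pte_page(pte), pfn_offset);
	}
	spin_unlock(ptl);
out:
	hugetlb_vma_unlock_read(vma);
	return subpage;
}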
  

Comments

Mike Kravetz Dec. 15, 2022, 7:29 p.m. UTC | #1
On 10/21/22 16:36, James Houghton wrote:
> This enables high-granularity mapping support in GUP.
> 
> One important change here is that, before, we never needed to grab the
> VMA lock, but now, to prevent someone from collapsing the page tables
> out from under us, we grab it for reading when doing high-granularity PT
> walks.

Once again, I think Peter's series will already take the vma lock here.

> In case it is confusing, pfn_offset is the offset (in PAGE_SIZE units)
> that vaddr points to within the subpage that hpte points to.
> 
> Signed-off-by: James Houghton <jthoughton@google.com>
> ---
>  mm/hugetlb.c | 76 ++++++++++++++++++++++++++++++++++++----------------
>  1 file changed, 53 insertions(+), 23 deletions(-)
> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 2d096cef53cd..d76ab32fb6d3 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -6382,11 +6382,9 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
>  	}
>  }
>  
> -static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
> +static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t pteval,
>  					       bool *unshare)
>  {
> -	pte_t pteval = huge_ptep_get(pte);
> -
>  	*unshare = false;
>  	if (is_swap_pte(pteval))
>  		return true;
> @@ -6478,12 +6476,20 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  	struct hstate *h = hstate_vma(vma);
>  	int err = -EFAULT, refs;
>  
> +	/*
> +	 * Grab the VMA lock for reading now so no one can collapse the page
> +	 * table from under us.
> +	 */
> +	hugetlb_vma_lock_read(vma);
> +
>  	while (vaddr < vma->vm_end && remainder) {
> -		pte_t *pte;
> +		pte_t *ptep, pte;

Thanks, that really would be better as ptep in the existing code.

>  		spinlock_t *ptl = NULL;
>  		bool unshare = false;
>  		int absent;
> -		struct page *page;
> +		unsigned long pages_per_hpte;
> +		struct page *page, *subpage;
> +		struct hugetlb_pte hpte;
>  
>  		/*
>  		 * If we have a pending SIGKILL, don't keep faulting pages and
> @@ -6499,13 +6505,22 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  		 * each hugepage.  We have to make sure we get the
>  		 * first, for the page indexing below to work.
>  		 *
> -		 * Note that page table lock is not held when pte is null.
> +		 * Note that page table lock is not held when ptep is null.
>  		 */
> -		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
> -				      huge_page_size(h));
> -		if (pte)
> -			ptl = huge_pte_lock(h, mm, pte);
> -		absent = !pte || huge_pte_none(huge_ptep_get(pte));
> +		ptep = huge_pte_offset(mm, vaddr & huge_page_mask(h),
> +				       huge_page_size(h));
> +		if (ptep) {
> +			hugetlb_pte_populate(&hpte, ptep, huge_page_shift(h),
> +					hpage_size_to_level(huge_page_size(h)));
> +			hugetlb_hgm_walk(mm, vma, &hpte, vaddr,
> +					PAGE_SIZE,
> +					/*stop_at_none=*/true);
> +			ptl = hugetlb_pte_lock(mm, &hpte);
> +			ptep = hpte.ptep;
> +			pte = huge_ptep_get(ptep);
> +		}
> +
> +		absent = !ptep || huge_pte_none(pte);

In Peter's series, huge_pte_offset calls are replaced with hugetlb_walk that
takes a vma pointer.  It might make sense now to consolidate the hugetlb
page table walkers.  I know that was discussed at some time.  Just thinking
we could possibly fold much of the above into hugetlb_walk.
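
Just as a rough illustration of that idea (hypothetical, not proposed code; hugetlb_walk() being the helper from Peter's series, the rest from this one), the fold might look something like:

/*
 * Hypothetical sketch only: an hstate-level lookup via hugetlb_walk()
 * followed by the high-granularity descent, so callers get a hugetlb_pte
 * back from a single walker.
 */
static bool hugetlb_full_walk(struct vm_area_struct *vma, unsigned long addr,
			      unsigned long target_sz,
			      struct hugetlb_pte *hpte)
{
	struct hstate *h = hstate_vma(vma);
	pte_t *ptep = hugetlb_walk(vma, addr & huge_page_mask(h),
				   huge_page_size(h));

	if (!ptep)
		return false;

	hugetlb_pte_populate(hpte, ptep, huge_page_shift(h),
			     hpage_size_to_level(huge_page_size(h)));
	hugetlb_hgm_walk(vma->vm_mm, vma, hpte, addr, target_sz,
			 /*stop_at_none=*/true);
	return true;
}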

>  
>  		/*
>  		 * When coredumping, it suits get_dump_page if we just return
> @@ -6516,12 +6531,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  		 */
>  		if (absent && (flags & FOLL_DUMP) &&
>  		    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
> -			if (pte)
> +			if (ptep)
>  				spin_unlock(ptl);
>  			remainder = 0;
>  			break;
>  		}
>  
> +		if (!absent && pte_present(pte) &&
> +				!hugetlb_pte_present_leaf(&hpte, pte)) {
> +			/* We raced with someone splitting the PTE, so retry. */

I do not think I have got to the splitting code yet, but I am assuming we do
not hold vma lock for write when splitting.  We would of course hold page table
lock.
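
For reference, the retry in this hunk relies on exactly that: the split happens under the page table lock, so once ptl is taken for the hpte we walked to, a stale non-leaf entry is detectable and the walk simply restarts.  A sketch of that pattern, where hgm_walk_to() is a made-up stand-in for the walk code quoted earlier:

/*
 * Sketch only; hgm_walk_to() is hypothetical shorthand for the
 * huge_pte_offset() + hugetlb_hgm_walk() sequence quoted above.
 */
static pte_t hgm_lock_stable_leaf(struct mm_struct *mm,
				  struct vm_area_struct *vma,
				  unsigned long vaddr,
				  struct hugetlb_pte *hpte,
				  spinlock_t **ptlp)
{
	pte_t pte;

	for (;;) {
		hgm_walk_to(hpte, mm, vma, vaddr);
		*ptlp = hugetlb_pte_lock(mm, hpte);
		pte = huge_ptep_get(hpte->ptep);
		/* none/swap entries and stable leaves are fine to return. */
		if (!pte_present(pte) || hugetlb_pte_present_leaf(hpte, pte))
			return pte;
		/* Raced with a split: drop the lock and walk again. */
		spin_unlock(*ptlp);
	}
}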

> +			spin_unlock(ptl);
> +			continue;
> +		}
> +
>  		/*
>  		 * We need call hugetlb_fault for both hugepages under migration
>  		 * (in which case hugetlb_fault waits for the migration,) and
> @@ -6537,7 +6559,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  			vm_fault_t ret;
>  			unsigned int fault_flags = 0;
>  
> -			if (pte)
> +			/* Drop the lock before entering hugetlb_fault. */
> +			hugetlb_vma_unlock_read(vma);
> +
> +			if (ptep)
>  				spin_unlock(ptl);
>  			if (flags & FOLL_WRITE)
>  				fault_flags |= FAULT_FLAG_WRITE;
> @@ -6560,7 +6585,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  			if (ret & VM_FAULT_ERROR) {
>  				err = vm_fault_to_errno(ret, flags);
>  				remainder = 0;
> -				break;
> +				goto out;
>  			}
>  			if (ret & VM_FAULT_RETRY) {
>  				if (locked &&
> @@ -6578,11 +6603,14 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  				 */
>  				return i;
>  			}
> +			hugetlb_vma_lock_read(vma);
>  			continue;
>  		}
>  
> -		pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
> -		page = pte_page(huge_ptep_get(pte));
> +		pfn_offset = (vaddr & ~hugetlb_pte_mask(&hpte)) >> PAGE_SHIFT;
> +		subpage = pte_page(pte);
> +		pages_per_hpte = hugetlb_pte_size(&hpte) / PAGE_SIZE;
> +		page = compound_head(subpage);
>  
>  		VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
>  			       !PageAnonExclusive(page), page);
> @@ -6592,21 +6620,21 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  		 * and skip the same_page loop below.
>  		 */
>  		if (!pages && !vmas && !pfn_offset &&
> -		    (vaddr + huge_page_size(h) < vma->vm_end) &&
> -		    (remainder >= pages_per_huge_page(h))) {
> -			vaddr += huge_page_size(h);
> -			remainder -= pages_per_huge_page(h);
> -			i += pages_per_huge_page(h);
> +		    (vaddr + hugetlb_pte_size(&hpte) < vma->vm_end) &&
> +		    (remainder >= pages_per_hpte)) {
> +			vaddr += hugetlb_pte_size(&hpte);
> +			remainder -= pages_per_hpte;
> +			i += pages_per_hpte;
>  			spin_unlock(ptl);
>  			continue;
>  		}
>  
>  		/* vaddr may not be aligned to PAGE_SIZE */
> -		refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
> +		refs = min3(pages_per_hpte - pfn_offset, remainder,
>  		    (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
>  
>  		if (pages || vmas)
> -			record_subpages_vmas(nth_page(page, pfn_offset),
> +			record_subpages_vmas(nth_page(subpage, pfn_offset),
>  					     vma, refs,
>  					     likely(pages) ? pages + i : NULL,
>  					     vmas ? vmas + i : NULL);

Not your fault, but all the above was difficult to follow before HGM. :(
Did not notice any issues.
  

Patch

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2d096cef53cd..d76ab32fb6d3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6382,11 +6382,9 @@  static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
 	}
 }
 
-static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
+static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t pteval,
 					       bool *unshare)
 {
-	pte_t pteval = huge_ptep_get(pte);
-
 	*unshare = false;
 	if (is_swap_pte(pteval))
 		return true;
@@ -6478,12 +6476,20 @@  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct hstate *h = hstate_vma(vma);
 	int err = -EFAULT, refs;
 
+	/*
+	 * Grab the VMA lock for reading now so no one can collapse the page
+	 * table from under us.
+	 */
+	hugetlb_vma_lock_read(vma);
+
 	while (vaddr < vma->vm_end && remainder) {
-		pte_t *pte;
+		pte_t *ptep, pte;
 		spinlock_t *ptl = NULL;
 		bool unshare = false;
 		int absent;
-		struct page *page;
+		unsigned long pages_per_hpte;
+		struct page *page, *subpage;
+		struct hugetlb_pte hpte;
 
 		/*
 		 * If we have a pending SIGKILL, don't keep faulting pages and
@@ -6499,13 +6505,22 @@  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * each hugepage.  We have to make sure we get the
 		 * first, for the page indexing below to work.
 		 *
-		 * Note that page table lock is not held when pte is null.
+		 * Note that page table lock is not held when ptep is null.
 		 */
-		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
-				      huge_page_size(h));
-		if (pte)
-			ptl = huge_pte_lock(h, mm, pte);
-		absent = !pte || huge_pte_none(huge_ptep_get(pte));
+		ptep = huge_pte_offset(mm, vaddr & huge_page_mask(h),
+				       huge_page_size(h));
+		if (ptep) {
+			hugetlb_pte_populate(&hpte, ptep, huge_page_shift(h),
+					hpage_size_to_level(huge_page_size(h)));
+			hugetlb_hgm_walk(mm, vma, &hpte, vaddr,
+					PAGE_SIZE,
+					/*stop_at_none=*/true);
+			ptl = hugetlb_pte_lock(mm, &hpte);
+			ptep = hpte.ptep;
+			pte = huge_ptep_get(ptep);
+		}
+
+		absent = !ptep || huge_pte_none(pte);
 
 		/*
 		 * When coredumping, it suits get_dump_page if we just return
@@ -6516,12 +6531,19 @@  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 */
 		if (absent && (flags & FOLL_DUMP) &&
 		    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
-			if (pte)
+			if (ptep)
 				spin_unlock(ptl);
 			remainder = 0;
 			break;
 		}
 
+		if (!absent && pte_present(pte) &&
+				!hugetlb_pte_present_leaf(&hpte, pte)) {
+			/* We raced with someone splitting the PTE, so retry. */
+			spin_unlock(ptl);
+			continue;
+		}
+
 		/*
 		 * We need call hugetlb_fault for both hugepages under migration
 		 * (in which case hugetlb_fault waits for the migration,) and
@@ -6537,7 +6559,10 @@  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			vm_fault_t ret;
 			unsigned int fault_flags = 0;
 
-			if (pte)
+			/* Drop the lock before entering hugetlb_fault. */
+			hugetlb_vma_unlock_read(vma);
+
+			if (ptep)
 				spin_unlock(ptl);
 			if (flags & FOLL_WRITE)
 				fault_flags |= FAULT_FLAG_WRITE;
@@ -6560,7 +6585,7 @@  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			if (ret & VM_FAULT_ERROR) {
 				err = vm_fault_to_errno(ret, flags);
 				remainder = 0;
-				break;
+				goto out;
 			}
 			if (ret & VM_FAULT_RETRY) {
 				if (locked &&
@@ -6578,11 +6603,14 @@  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 				 */
 				return i;
 			}
+			hugetlb_vma_lock_read(vma);
 			continue;
 		}
 
-		pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
-		page = pte_page(huge_ptep_get(pte));
+		pfn_offset = (vaddr & ~hugetlb_pte_mask(&hpte)) >> PAGE_SHIFT;
+		subpage = pte_page(pte);
+		pages_per_hpte = hugetlb_pte_size(&hpte) / PAGE_SIZE;
+		page = compound_head(subpage);
 
 		VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
 			       !PageAnonExclusive(page), page);
@@ -6592,21 +6620,21 @@  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * and skip the same_page loop below.
 		 */
 		if (!pages && !vmas && !pfn_offset &&
-		    (vaddr + huge_page_size(h) < vma->vm_end) &&
-		    (remainder >= pages_per_huge_page(h))) {
-			vaddr += huge_page_size(h);
-			remainder -= pages_per_huge_page(h);
-			i += pages_per_huge_page(h);
+		    (vaddr + hugetlb_pte_size(&hpte) < vma->vm_end) &&
+		    (remainder >= pages_per_hpte)) {
+			vaddr += hugetlb_pte_size(&hpte);
+			remainder -= pages_per_hpte;
+			i += pages_per_hpte;
 			spin_unlock(ptl);
 			continue;
 		}
 
 		/* vaddr may not be aligned to PAGE_SIZE */
-		refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
+		refs = min3(pages_per_hpte - pfn_offset, remainder,
 		    (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
 
 		if (pages || vmas)
-			record_subpages_vmas(nth_page(page, pfn_offset),
+			record_subpages_vmas(nth_page(subpage, pfn_offset),
 					     vma, refs,
 					     likely(pages) ? pages + i : NULL,
 					     vmas ? vmas + i : NULL);
@@ -6637,6 +6665,8 @@  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 		spin_unlock(ptl);
 	}
+	hugetlb_vma_unlock_read(vma);
+out:
 	*nr_pages = remainder;
 	/*
 	 * setting position is actually required only if remainder is