[RFC,v2,25/47] hugetlb: add HGM support for copy_hugetlb_page_range

Message ID 20221021163703.3218176-26-jthoughton@google.com
State New
Series hugetlb: introduce HugeTLB high-granularity mapping

Commit Message

James Houghton Oct. 21, 2022, 4:36 p.m. UTC
  This allows fork() to work with high-granularity mappings. The page
table structure is copied such that partially mapped regions will remain
partially mapped in the same way for the new process.

A page's reference count is incremented for *each* portion of it that is
mapped in the page table. For example, if you have a PMD-mapped 1G page,
the reference count and mapcount will be incremented by 512.

Signed-off-by: James Houghton <jthoughton@google.com>
---
 mm/hugetlb.c | 81 +++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 65 insertions(+), 16 deletions(-)
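
To make the per-portion refcounting described above concrete, here is a
boiled-down sketch (illustration only, not part of the patch; the helper
name is made up) of what the copy loop in the diff below does for each
present, file-backed hugetlb_pte that maps a piece of the huge page:

/*
 * Illustration only: the shape of one iteration of the copy loop.  If a
 * 1G page has been split into 512 PMD-level mappings, this body runs 512
 * times, so the head page picks up 512 references and 512 mapcounts.
 */
static void copy_one_hgm_mapping(struct mm_struct *dst, unsigned long addr,
				 pte_t *dst_pte, pte_t entry)
{
	struct page *hpage = compound_head(pte_page(entry));

	get_page(hpage);			/* one reference per mapped piece */
	page_dup_file_rmap(hpage, true);	/* one mapcount per mapped piece */
	set_huge_pte_at(dst, addr, dst_pte, entry);
}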
  

Comments

Peter Xu Nov. 30, 2022, 9:32 p.m. UTC | #1
On Fri, Oct 21, 2022 at 04:36:41PM +0000, James Houghton wrote:
> This allows fork() to work with high-granularity mappings. The page
> table structure is copied such that partially mapped regions will remain
> partially mapped in the same way for the new process.
> 
> A page's reference count is incremented for *each* portion of it that is
> mapped in the page table. For example, if you have a PMD-mapped 1G page,
> the reference count and mapcount will be incremented by 512.
> 
> Signed-off-by: James Houghton <jthoughton@google.com>

I have a feeling that this path is not triggered.  See:

bcd51a3c679d ("hugetlb: lazy page table copies in fork()", 2022-07-17)

It might be helpful to have it when exploring private mapping support of
hgm on page poison in the future.  But the thing is if we want this to be
accepted we still need a way to test it. I just don't see how to test this
without the private support being there..
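
[Editorial context: since that commit, fork() only descends into
copy_hugetlb_page_range() when the destination VMA's page tables actually
need copying. Roughly paraphrased (simplified from the mm/memory.c check of
that era, not verbatim), the decision looks like this:]

/*
 * Simplified paraphrase: page tables are copied at fork() only when they
 * hold state that cannot be rebuilt from later faults.  A plain shared
 * hugetlb mapping fails all three tests, so the hugetlb copy path is
 * normally skipped.
 */
static bool vma_needs_copy(struct vm_area_struct *dst_vma,
			   struct vm_area_struct *src_vma)
{
	if (userfaultfd_wp(dst_vma))	/* uffd-wp markers live in the ptes */
		return true;
	if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
		return true;
	if (src_vma->anon_vma)		/* private/anonymous pages present */
		return true;
	return false;
}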
  
James Houghton Nov. 30, 2022, 11:18 p.m. UTC | #2
On Wed, Nov 30, 2022 at 4:32 PM Peter Xu <peterx@redhat.com> wrote:
>
> On Fri, Oct 21, 2022 at 04:36:41PM +0000, James Houghton wrote:
> > This allows fork() to work with high-granularity mappings. The page
> > table structure is copied such that partially mapped regions will remain
> > partially mapped in the same way for the new process.
> >
> > A page's reference count is incremented for *each* portion of it that is
> > mapped in the page table. For example, if you have a PMD-mapped 1G page,
> > the reference count and mapcount will be incremented by 512.
> >
> > Signed-off-by: James Houghton <jthoughton@google.com>
>
> I have a feeling that this path is not triggered.  See:
>
> bcd51a3c679d ("hugetlb: lazy page table copies in fork()", 2022-07-17)
>
> It might be helpful to have it when exploring private mapping support of
> hgm on page poison in the future.  But the thing is if we want this to be
> accepted we still need a way to test it. I just don't see how to test this
> without the private support being there..

We can trigger this behavior by registering the VMA with
uffd-writeprotect. I didn't include any self-tests for this though;
I'll make sure to actually test this path in v1.
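
A minimal userspace sketch of that trigger (hypothetical, not the eventual
selftest; assumes a 2M hugetlb pool and a kernel with
UFFD_FEATURE_WP_HUGETLBFS_SHMEM; error handling omitted):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

#define LEN (2UL * 1024 * 1024)		/* one 2M hugetlb page */

int main(void)
{
	/* Shared hugetlb memory, the case HGM targets. */
	int memfd = memfd_create("hgm-fork-test", MFD_HUGETLB);
	ftruncate(memfd, LEN);
	char *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_SHARED, memfd, 0);

	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	struct uffdio_api api = {
		.api = UFFD_API,
		.features = UFFD_FEATURE_WP_HUGETLBFS_SHMEM,
	};
	ioctl(uffd, UFFDIO_API, &api);

	/* Registering in WP mode forces fork() to copy the page tables. */
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)p, .len = LEN },
		.mode = UFFDIO_REGISTER_MODE_WP,
	};
	ioctl(uffd, UFFDIO_REGISTER, &reg);

	memset(p, 0x5a, LEN);	/* populate so there is something to copy */

	if (fork() == 0)	/* child: copy_hugetlb_page_range() has run */
		_exit(0);
	wait(NULL);
	return 0;
}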

- James

>
> --
> Peter Xu
>
  

Patch

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5783a8307a77..7d692907cbf3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4946,7 +4946,8 @@  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			    struct vm_area_struct *src_vma)
 {
 	pte_t *src_pte, *dst_pte, entry;
-	struct page *ptepage;
+	struct hugetlb_pte src_hpte, dst_hpte;
+	struct page *ptepage, *hpage;
 	unsigned long addr;
 	bool cow = is_cow_mapping(src_vma->vm_flags);
 	struct hstate *h = hstate_vma(src_vma);
@@ -4956,6 +4957,16 @@  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	unsigned long last_addr_mask;
 	int ret = 0;
 
+	if (hugetlb_hgm_enabled(src_vma)) {
+		/*
+		 * src_vma might have high-granularity PTEs, and dst_vma will
+		 * need to copy those.
+		 */
+		ret = enable_hugetlb_hgm(dst_vma);
+		if (ret)
+			return ret;
+	}
+
 	if (cow) {
 		mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma, src,
 					src_vma->vm_start,
@@ -4967,18 +4978,22 @@  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		/*
 		 * For shared mappings the vma lock must be held before
 		 * calling huge_pte_offset in the src vma. Otherwise, the
-		 * returned ptep could go away if part of a shared pmd and
-		 * another thread calls huge_pmd_unshare.
+		 * returned ptep could go away if
+		 *  - part of a shared pmd and another thread calls
+		 *    huge_pmd_unshare, or
+		 *  - another thread collapses a high-granularity mapping.
 		 */
 		hugetlb_vma_lock_read(src_vma);
 	}
 
 	last_addr_mask = hugetlb_mask_last_page(h);
-	for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
+	addr = src_vma->vm_start;
+	while (addr < src_vma->vm_end) {
 		spinlock_t *src_ptl, *dst_ptl;
+		unsigned long hpte_sz;
 		src_pte = huge_pte_offset(src, addr, sz);
 		if (!src_pte) {
-			addr |= last_addr_mask;
+			addr = (addr | last_addr_mask) + sz;
 			continue;
 		}
 		dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
@@ -4987,6 +5002,26 @@  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			break;
 		}
 
+		hugetlb_pte_populate(&src_hpte, src_pte, huge_page_shift(h),
+				hpage_size_to_level(huge_page_size(h)));
+		hugetlb_pte_populate(&dst_hpte, dst_pte, huge_page_shift(h),
+				hpage_size_to_level(huge_page_size(h)));
+
+		if (hugetlb_hgm_enabled(src_vma)) {
+			hugetlb_hgm_walk(src, src_vma, &src_hpte, addr,
+				      PAGE_SIZE, /*stop_at_none=*/true);
+			ret = hugetlb_hgm_walk(dst, dst_vma, &dst_hpte, addr,
+					hugetlb_pte_size(&src_hpte),
+					/*stop_at_none=*/false);
+			if (ret)
+				break;
+
+			src_pte = src_hpte.ptep;
+			dst_pte = dst_hpte.ptep;
+		}
+
+		hpte_sz = hugetlb_pte_size(&src_hpte);
+
 		/*
 		 * If the pagetables are shared don't copy or take references.
 		 *
@@ -4996,12 +5031,12 @@  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		 * to reliably determine whether pte is shared.
 		 */
 		if (page_count(virt_to_page(dst_pte)) > 1) {
-			addr |= last_addr_mask;
+			addr = (addr | last_addr_mask) + sz;
 			continue;
 		}
 
-		dst_ptl = huge_pte_lock(h, dst, dst_pte);
-		src_ptl = huge_pte_lockptr(huge_page_shift(h), src, src_pte);
+		dst_ptl = hugetlb_pte_lock(dst, &dst_hpte);
+		src_ptl = hugetlb_pte_lockptr(src, &src_hpte);
 		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 		entry = huge_ptep_get(src_pte);
 again:
@@ -5042,10 +5077,15 @@  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			 */
 			if (userfaultfd_wp(dst_vma))
 				set_huge_pte_at(dst, addr, dst_pte, entry);
+		} else if (!hugetlb_pte_present_leaf(&src_hpte, entry)) {
+			/* Retry the walk. */
+			spin_unlock(src_ptl);
+			spin_unlock(dst_ptl);
+			continue;
 		} else {
-			entry = huge_ptep_get(src_pte);
 			ptepage = pte_page(entry);
-			get_page(ptepage);
+			hpage = compound_head(ptepage);
+			get_page(hpage);
 
 			/*
 			 * Failing to duplicate the anon rmap is a rare case
@@ -5058,24 +5098,29 @@  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			 * sleep during the process.
 			 */
 			if (!PageAnon(ptepage)) {
-				page_dup_file_rmap(ptepage, true);
-			} else if (page_try_dup_anon_rmap(ptepage, true,
+				page_dup_file_rmap(hpage, true);
+			} else if (page_try_dup_anon_rmap(hpage, true,
 							  src_vma)) {
 				pte_t src_pte_old = entry;
 				struct page *new;
 
+				if (hugetlb_hgm_enabled(src_vma)) {
+					ret = -EINVAL;
+					break;
+				}
+
 				spin_unlock(src_ptl);
 				spin_unlock(dst_ptl);
 				/* Do not use reserve as it's private owned */
 				new = alloc_huge_page(dst_vma, addr, 1);
 				if (IS_ERR(new)) {
-					put_page(ptepage);
+					put_page(hpage);
 					ret = PTR_ERR(new);
 					break;
 				}
-				copy_user_huge_page(new, ptepage, addr, dst_vma,
+				copy_user_huge_page(new, hpage, addr, dst_vma,
 						    npages);
-				put_page(ptepage);
+				put_page(hpage);
 
 				/* Install the new huge page if src pte stable */
 				dst_ptl = huge_pte_lock(h, dst, dst_pte);
@@ -5093,6 +5138,7 @@  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 				hugetlb_install_page(dst_vma, dst_pte, addr, new);
 				spin_unlock(src_ptl);
 				spin_unlock(dst_ptl);
+				addr += hugetlb_pte_size(&src_hpte);
 				continue;
 			}
 
@@ -5109,10 +5155,13 @@  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			}
 
 			set_huge_pte_at(dst, addr, dst_pte, entry);
-			hugetlb_count_add(npages, dst);
+			hugetlb_count_add(
+					hugetlb_pte_size(&dst_hpte) / PAGE_SIZE,
+					dst);
 		}
 		spin_unlock(src_ptl);
 		spin_unlock(dst_ptl);
+		addr += hugetlb_pte_size(&src_hpte);
 	}
 
 	if (cow) {