[v2,22/46] hugetlb: add HGM support to copy_hugetlb_page_range
Commit Message
This allows fork() to work with high-granularity mappings. The page
table structure is copied such that partially mapped regions will remain
partially mapped in the same way for the new process.
A page's reference count is incremented for *each* portion of it that
is mapped in the page table. For example, if you have a PMD-mapped 1G
page, the reference count will be incremented by 512 (one reference for
each of the 1G / 2M = 512 PMD entries).
mapcount is handled similarly to THPs: if you're completely mapping a
hugepage, then the compound_mapcount is incremented. If you're mapping a
part of it, the subpages that are getting mapped will have their
mapcounts incremented.
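As a concrete illustration (assuming 4K base pages and 2M PMDs, i.e. x86-64
defaults; the numbers are not spelled out in the patch itself): mapping one
2M piece of a 1G page with a single PMD takes one reference on the head page
and bumps the mapcount of each 4K subpage covered by that PMD, i.e.

	nr_subpages = 1UL << (PMD_SHIFT - PAGE_SHIFT) = 1UL << (21 - 12) = 512

subpage mapcount increments, matching the nr_subpages computation in the
hugetlb_add_file_rmap() helper added below.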
Signed-off-by: James Houghton <jthoughton@google.com>
Comments
On Fri, Feb 17, 2023 at 4:29 PM James Houghton <jthoughton@google.com> wrote:
>
> This allows fork() to work with high-granularity mappings. The page
> table structure is copied such that partially mapped regions will remain
> partially mapped in the same way for the new process.
>
> A page's reference count is incremented for *each* portion of it that
> is mapped in the page table. For example, if you have a PMD-mapped 1G
> page, the reference count will be incremented by 512.
>
> mapcount is handled similar to THPs: if you're completely mapping a
> hugepage, then the compound_mapcount is incremented. If you're mapping a
> part of it, the subpages that are getting mapped will have their
> mapcounts incremented.
>
> Signed-off-by: James Houghton <jthoughton@google.com>
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 1a1a71868dfd..2fe1eb6897d4 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -162,6 +162,8 @@ void hugepage_put_subpool(struct hugepage_subpool *spool);
>
> void hugetlb_remove_rmap(struct page *subpage, unsigned long shift,
> struct hstate *h, struct vm_area_struct *vma);
> +void hugetlb_add_file_rmap(struct page *subpage, unsigned long shift,
> + struct hstate *h, struct vm_area_struct *vma);
>
> void hugetlb_dup_vma_private(struct vm_area_struct *vma);
> void clear_vma_resv_huge_pages(struct vm_area_struct *vma);
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 693332b7e186..210c6f2b16a5 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -141,6 +141,37 @@ void hugetlb_remove_rmap(struct page *subpage, unsigned long shift,
> page_remove_rmap(subpage, vma, false);
> }
> }
> +/*
> + * hugetlb_add_file_rmap() - increment the mapcounts for file-backed hugetlb
> + * pages appropriately.
> + *
> + * For pages that are being mapped with their hstate-level PTE (e.g., a 1G page
> + * being mapped with a 1G PUD), then we increment the compound_mapcount for the
> + * head page.
> + *
> + * For pages that are being mapped with high-granularity, we increment the
> + * mapcounts for the individual subpages that are getting mapped.
> + */
> +void hugetlb_add_file_rmap(struct page *subpage, unsigned long shift,
> + struct hstate *h, struct vm_area_struct *vma)
> +{
> + struct page *hpage = compound_head(subpage);
> +
> + if (shift == huge_page_shift(h)) {
> + VM_BUG_ON_PAGE(subpage != hpage, subpage);
> + page_add_file_rmap(hpage, vma, true);
> + } else {
> + unsigned long nr_subpages = 1UL << (shift - PAGE_SHIFT);
> + struct page *final_page = &subpage[nr_subpages];
> +
> + VM_BUG_ON_PAGE(HPageVmemmapOptimized(hpage), hpage);
> + /*
> + * Increment the mapcount on each page that is getting mapped.
> + */
> + for (; subpage < final_page; ++subpage)
> + page_add_file_rmap(subpage, vma, false);
> + }
> +}
>
> static inline bool subpool_is_free(struct hugepage_subpool *spool)
> {
> @@ -5210,7 +5241,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
> struct vm_area_struct *src_vma)
> {
> pte_t *src_pte, *dst_pte, entry;
> - struct page *ptepage;
> + struct hugetlb_pte src_hpte, dst_hpte;
> + struct page *ptepage, *hpage;
> unsigned long addr;
> bool cow = is_cow_mapping(src_vma->vm_flags);
> struct hstate *h = hstate_vma(src_vma);
> @@ -5238,18 +5270,24 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
> }
>
> last_addr_mask = hugetlb_mask_last_page(h);
> - for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
> + addr = src_vma->vm_start;
> + while (addr < src_vma->vm_end) {
> spinlock_t *src_ptl, *dst_ptl;
> - src_pte = hugetlb_walk(src_vma, addr, sz);
> - if (!src_pte) {
> - addr |= last_addr_mask;
> + unsigned long hpte_sz;
> +
> + if (hugetlb_full_walk(&src_hpte, src_vma, addr)) {
> + addr = (addr | last_addr_mask) + sz;
> continue;
> }
> - dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
> - if (!dst_pte) {
> - ret = -ENOMEM;
> + ret = hugetlb_full_walk_alloc(&dst_hpte, dst_vma, addr,
> + hugetlb_pte_size(&src_hpte));
> + if (ret)
> break;
> - }
> +
> + src_pte = src_hpte.ptep;
> + dst_pte = dst_hpte.ptep;
> +
> + hpte_sz = hugetlb_pte_size(&src_hpte);
>
> /*
> * If the pagetables are shared don't copy or take references.
> @@ -5259,13 +5297,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
> * another vma. So page_count of ptep page is checked instead
> * to reliably determine whether pte is shared.
> */
> - if (page_count(virt_to_page(dst_pte)) > 1) {
> - addr |= last_addr_mask;
> + if (hugetlb_pte_size(&dst_hpte) == sz &&
> + page_count(virt_to_page(dst_pte)) > 1) {
> + addr = (addr | last_addr_mask) + sz;
> continue;
> }
>
> - dst_ptl = huge_pte_lock(h, dst, dst_pte);
> - src_ptl = huge_pte_lockptr(huge_page_shift(h), src, src_pte);
> + dst_ptl = hugetlb_pte_lock(&dst_hpte);
> + src_ptl = hugetlb_pte_lockptr(&src_hpte);
> spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
> entry = huge_ptep_get(src_pte);
> again:
> @@ -5309,10 +5348,15 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
> */
> if (userfaultfd_wp(dst_vma))
> set_huge_pte_at(dst, addr, dst_pte, entry);
> + } else if (!hugetlb_pte_present_leaf(&src_hpte, entry)) {
> + /* Retry the walk. */
> + spin_unlock(src_ptl);
> + spin_unlock(dst_ptl);
> + continue;
> } else {
> - entry = huge_ptep_get(src_pte);
> ptepage = pte_page(entry);
> - get_page(ptepage);
> + hpage = compound_head(ptepage);
> + get_page(hpage);
>
> /*
> * Failing to duplicate the anon rmap is a rare case
> @@ -5324,13 +5368,34 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
> * need to be without the pgtable locks since we could
> * sleep during the process.
> */
> - if (!PageAnon(ptepage)) {
> - page_add_file_rmap(ptepage, src_vma, true);
> - } else if (page_try_dup_anon_rmap(ptepage, true,
> + if (!PageAnon(hpage)) {
> + hugetlb_add_file_rmap(ptepage,
> + src_hpte.shift, h, src_vma);
> + }
> + /*
> + * It is currently impossible to get anonymous HugeTLB
> + * high-granularity mappings, so we use 'hpage' here.
> + *
> + * This will need to be changed when HGM support for
> + * anon mappings is added.
> + */
> + else if (page_try_dup_anon_rmap(hpage, true,
> src_vma)) {
> pte_t src_pte_old = entry;
> struct folio *new_folio;
>
> + /*
> + * If we are mapped at high granularity, we
> + * may end up allocating lots and lots of
> + * hugepages when we only need one. Bail out
> + * now.
> + */
> + if (hugetlb_pte_size(&src_hpte) != sz) {
> + put_page(hpage);
> + ret = -EINVAL;
> + break;
> + }
> +
Although this block can never execute today (anonymous high-granularity
mappings aren't possible yet), it should come after the following
spin_unlock() calls, so that we don't break out of the copy loop with the
page table locks still held.
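That is, something along these lines (an untested sketch of the suggested
reordering, not what was posted):

			spin_unlock(src_ptl);
			spin_unlock(dst_ptl);
			/*
			 * Bail out before allocating a new hugepage. The
			 * PTLs have already been dropped here, so breaking
			 * out of the copy loop is safe.
			 */
			if (hugetlb_pte_size(&src_hpte) != sz) {
				put_page(hpage);
				ret = -EINVAL;
				break;
			}
			/* Do not use reserve as it's private owned */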
> spin_unlock(src_ptl);
> spin_unlock(dst_ptl);
> /* Do not use reserve as it's private owned */
> @@ -5342,7 +5407,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
> }
> copy_user_huge_page(&new_folio->page, ptepage, addr, dst_vma,
> npages);
> - put_page(ptepage);
> + put_page(hpage);
>
> /* Install the new hugetlb folio if src pte stable */
> dst_ptl = huge_pte_lock(h, dst, dst_pte);
> @@ -5360,6 +5425,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
> hugetlb_install_folio(dst_vma, dst_pte, addr, new_folio);
> spin_unlock(src_ptl);
> spin_unlock(dst_ptl);
> + addr += hugetlb_pte_size(&src_hpte);
> continue;
> }
>
> @@ -5376,10 +5442,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
> }
>
> set_huge_pte_at(dst, addr, dst_pte, entry);
> - hugetlb_count_add(npages, dst);
> + hugetlb_count_add(
> + hugetlb_pte_size(&dst_hpte) / PAGE_SIZE,
> + dst);
> }
> spin_unlock(src_ptl);
> spin_unlock(dst_ptl);
> + addr += hugetlb_pte_size(&src_hpte);
> }
>
> if (cow) {
> --
> 2.39.2.637.g21b0678d19-goog
>
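For reference, stripped of the copy and rmap details, the restructured loop
now steps through the VMA by the size of whatever leaf the source-side walk
finds, rather than by a fixed huge_page_size(h). Roughly (a simplified
sketch, not the literal patch code):

	addr = src_vma->vm_start;
	while (addr < src_vma->vm_end) {
		if (hugetlb_full_walk(&src_hpte, src_vma, addr)) {
			/* Nothing mapped here; skip to the next hstate-sized boundary. */
			addr = (addr | last_addr_mask) + sz;
			continue;
		}

		/* ... allocate a matching dst leaf and copy this hugetlb_pte ... */

		addr += hugetlb_pte_size(&src_hpte);
	}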