[v2,11/46] hugetlb: add hugetlb_pte to track HugeTLB page table entries

Message ID 20230218002819.1486479-12-jthoughton@google.com
State New
Headers
Series hugetlb: introduce HugeTLB high-granularity mapping |

Commit Message

James Houghton Feb. 18, 2023, 12:27 a.m. UTC
  After high-granularity mapping, page table entries for HugeTLB pages can
be of any size/type. (For example, we can have a 1G page mapped with a
mix of PMDs and PTEs.) This struct is to help keep track of a HugeTLB
PTE after we have done a page table walk.

Without this, we'd have to pass around the "size" of the PTE everywhere.
We effectively did this before; it could be fetched from the hstate,
which we pass around pretty much everywhere.

hugetlb_pte_present_leaf is included here as a helper function that will
be used frequently later on.

Signed-off-by: James Houghton <jthoughton@google.com>
  

Comments

Mina Almasry Feb. 18, 2023, 5:24 a.m. UTC | #1
On Fri, Feb 17, 2023 at 4:28 PM James Houghton <jthoughton@google.com> wrote:
>
> After high-granularity mapping, page table entries for HugeTLB pages can
> be of any size/type. (For example, we can have a 1G page mapped with a
> mix of PMDs and PTEs.) This struct is to help keep track of a HugeTLB
> PTE after we have done a page table walk.
>
> Without this, we'd have to pass around the "size" of the PTE everywhere.
> We effectively did this before; it could be fetched from the hstate,
> which we pass around pretty much everywhere.
>
> hugetlb_pte_present_leaf is included here as a helper function that will
> be used frequently later on.
>
> Signed-off-by: James Houghton <jthoughton@google.com>
>

Only nits.

Reviewed-by: Mina Almasry <almasrymina@google.com>

> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index a1ceb9417f01..eeacadf3272b 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -26,6 +26,25 @@ typedef struct { unsigned long pd; } hugepd_t;
>  #define __hugepd(x) ((hugepd_t) { (x) })
>  #endif
>
> +enum hugetlb_level {
> +       HUGETLB_LEVEL_PTE = 1,
> +       /*
> +        * We always include PMD, PUD, and P4D in this enum definition so that,
> +        * when logged as an integer, we can easily tell which level it is.
> +        */
> +       HUGETLB_LEVEL_PMD,
> +       HUGETLB_LEVEL_PUD,
> +       HUGETLB_LEVEL_P4D,
> +       HUGETLB_LEVEL_PGD,
> +};
> +
> +struct hugetlb_pte {
> +       pte_t *ptep;
> +       unsigned int shift;
> +       enum hugetlb_level level;
> +       spinlock_t *ptl;
> +};
> +
>  #ifdef CONFIG_HUGETLB_PAGE
>
>  #include <linux/mempolicy.h>
> @@ -39,6 +58,20 @@ typedef struct { unsigned long pd; } hugepd_t;
>   */
>  #define __NR_USED_SUBPAGE 3
>
> +static inline
> +unsigned long hugetlb_pte_size(const struct hugetlb_pte *hpte)
> +{
> +       return 1UL << hpte->shift;
> +}
> +
> +static inline
> +unsigned long hugetlb_pte_mask(const struct hugetlb_pte *hpte)
> +{
> +       return ~(hugetlb_pte_size(hpte) - 1);
> +}
> +
> +bool hugetlb_pte_present_leaf(const struct hugetlb_pte *hpte, pte_t pte);
> +
>  struct hugepage_subpool {
>         spinlock_t lock;
>         long count;
> @@ -1234,6 +1267,45 @@ static inline spinlock_t *huge_pte_lock(struct hstate *h,
>         return ptl;
>  }
>
> +static inline
> +spinlock_t *hugetlb_pte_lockptr(struct hugetlb_pte *hpte)
> +{
> +       return hpte->ptl;
> +}

I find this helper unnecessary. I would remove it.

> +
> +static inline
> +spinlock_t *hugetlb_pte_lock(struct hugetlb_pte *hpte)
> +{
> +       spinlock_t *ptl = hugetlb_pte_lockptr(hpte);
> +
> +       spin_lock(ptl);

Here 'spin_lock(hpte->ptl)' would be more immediately understandable
IMO, for example.

> +       return ptl;
> +}
> +
> +static inline
> +void __hugetlb_pte_init(struct hugetlb_pte *hpte, pte_t *ptep,
> +                       unsigned int shift, enum hugetlb_level level,
> +                       spinlock_t *ptl)
> +{
> +       /*
> +        * If 'shift' indicates that this PTE is contiguous, then @ptep must
> +        * be the first pte of the contiguous bunch.
> +        */

I would move the comment to above the function as a pseudo doc. It
seems to instruct the user of the function of how to use it.

> +       hpte->ptl = ptl;
> +       hpte->ptep = ptep;
> +       hpte->shift = shift;
> +       hpte->level = level;
> +}
> +
> +static inline
> +void hugetlb_pte_init(struct mm_struct *mm, struct hugetlb_pte *hpte,
> +                     pte_t *ptep, unsigned int shift,
> +                     enum hugetlb_level level)
> +{
> +       __hugetlb_pte_init(hpte, ptep, shift, level,
> +                          huge_pte_lockptr(shift, mm, ptep));
> +}
> +
>  #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
>  extern void __init hugetlb_cma_reserve(int order);
>  #else
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 5ca9eae0ac42..6c74adff43b6 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1269,6 +1269,35 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
>         return false;
>  }
>
> +bool hugetlb_pte_present_leaf(const struct hugetlb_pte *hpte, pte_t pte)
> +{
> +       pgd_t pgd;
> +       p4d_t p4d;
> +       pud_t pud;
> +       pmd_t pmd;
> +
> +       switch (hpte->level) {
> +       case HUGETLB_LEVEL_PGD:
> +               pgd = __pgd(pte_val(pte));
> +               return pgd_present(pgd) && pgd_leaf(pgd);
> +       case HUGETLB_LEVEL_P4D:
> +               p4d = __p4d(pte_val(pte));
> +               return p4d_present(p4d) && p4d_leaf(p4d);
> +       case HUGETLB_LEVEL_PUD:
> +               pud = __pud(pte_val(pte));
> +               return pud_present(pud) && pud_leaf(pud);
> +       case HUGETLB_LEVEL_PMD:
> +               pmd = __pmd(pte_val(pte));
> +               return pmd_present(pmd) && pmd_leaf(pmd);
> +       case HUGETLB_LEVEL_PTE:
> +               return pte_present(pte);
> +       default:
> +               WARN_ON_ONCE(1);
> +               return false;
> +       }
> +}
> +
> +
>  static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio)
>  {
>         int nid = folio_nid(folio);
> --
> 2.39.2.637.g21b0678d19-goog
>
  
James Houghton Feb. 21, 2023, 4:36 p.m. UTC | #2
On Fri, Feb 17, 2023 at 9:24 PM Mina Almasry <almasrymina@google.com> wrote:
>
> On Fri, Feb 17, 2023 at 4:28 PM James Houghton <jthoughton@google.com> wrote:
> >
> > After high-granularity mapping, page table entries for HugeTLB pages can
> > be of any size/type. (For example, we can have a 1G page mapped with a
> > mix of PMDs and PTEs.) This struct is to help keep track of a HugeTLB
> > PTE after we have done a page table walk.
> >
> > Without this, we'd have to pass around the "size" of the PTE everywhere.
> > We effectively did this before; it could be fetched from the hstate,
> > which we pass around pretty much everywhere.
> >
> > hugetlb_pte_present_leaf is included here as a helper function that will
> > be used frequently later on.
> >
> > Signed-off-by: James Houghton <jthoughton@google.com>
> >
>
> Only nits.
>
> Reviewed-by: Mina Almasry <almasrymina@google.com>

Thanks Mina!

>
> > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > index a1ceb9417f01..eeacadf3272b 100644
> > --- a/include/linux/hugetlb.h
> > +++ b/include/linux/hugetlb.h
> > @@ -26,6 +26,25 @@ typedef struct { unsigned long pd; } hugepd_t;
> >  #define __hugepd(x) ((hugepd_t) { (x) })
> >  #endif
> >
> > +enum hugetlb_level {
> > +       HUGETLB_LEVEL_PTE = 1,
> > +       /*
> > +        * We always include PMD, PUD, and P4D in this enum definition so that,
> > +        * when logged as an integer, we can easily tell which level it is.
> > +        */
> > +       HUGETLB_LEVEL_PMD,
> > +       HUGETLB_LEVEL_PUD,
> > +       HUGETLB_LEVEL_P4D,
> > +       HUGETLB_LEVEL_PGD,
> > +};
> > +
> > +struct hugetlb_pte {
> > +       pte_t *ptep;
> > +       unsigned int shift;
> > +       enum hugetlb_level level;
> > +       spinlock_t *ptl;
> > +};
> > +
> >  #ifdef CONFIG_HUGETLB_PAGE
> >
> >  #include <linux/mempolicy.h>
> > @@ -39,6 +58,20 @@ typedef struct { unsigned long pd; } hugepd_t;
> >   */
> >  #define __NR_USED_SUBPAGE 3
> >
> > +static inline
> > +unsigned long hugetlb_pte_size(const struct hugetlb_pte *hpte)
> > +{
> > +       return 1UL << hpte->shift;
> > +}
> > +
> > +static inline
> > +unsigned long hugetlb_pte_mask(const struct hugetlb_pte *hpte)
> > +{
> > +       return ~(hugetlb_pte_size(hpte) - 1);
> > +}
> > +
> > +bool hugetlb_pte_present_leaf(const struct hugetlb_pte *hpte, pte_t pte);
> > +
> >  struct hugepage_subpool {
> >         spinlock_t lock;
> >         long count;
> > @@ -1234,6 +1267,45 @@ static inline spinlock_t *huge_pte_lock(struct hstate *h,
> >         return ptl;
> >  }
> >
> > +static inline
> > +spinlock_t *hugetlb_pte_lockptr(struct hugetlb_pte *hpte)
> > +{
> > +       return hpte->ptl;
> > +}
>
> I find this helper unnecessary. I would remove it.

Ok. Will do.

>
> > +
> > +static inline
> > +spinlock_t *hugetlb_pte_lock(struct hugetlb_pte *hpte)
> > +{
> > +       spinlock_t *ptl = hugetlb_pte_lockptr(hpte);
> > +
> > +       spin_lock(ptl);
>
> Here 'spin_lock(hpte->ptl)' would be more immediately understandable
> IMO, for example.
>
> > +       return ptl;
> > +}
> > +
> > +static inline
> > +void __hugetlb_pte_init(struct hugetlb_pte *hpte, pte_t *ptep,
> > +                       unsigned int shift, enum hugetlb_level level,
> > +                       spinlock_t *ptl)
> > +{
> > +       /*
> > +        * If 'shift' indicates that this PTE is contiguous, then @ptep must
> > +        * be the first pte of the contiguous bunch.
> > +        */
>
> I would move the comment to above the function as a pseudo doc. It
> seems to instruct the user of the function of how to use it.

Right. Will do.

>
> > +       hpte->ptl = ptl;
> > +       hpte->ptep = ptep;
> > +       hpte->shift = shift;
> > +       hpte->level = level;
> > +}
> > +
> > +static inline
> > +void hugetlb_pte_init(struct mm_struct *mm, struct hugetlb_pte *hpte,
> > +                     pte_t *ptep, unsigned int shift,
> > +                     enum hugetlb_level level)
> > +{
> > +       __hugetlb_pte_init(hpte, ptep, shift, level,
> > +                          huge_pte_lockptr(shift, mm, ptep));
> > +}
> > +
> >  #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
> >  extern void __init hugetlb_cma_reserve(int order);
> >  #else
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 5ca9eae0ac42..6c74adff43b6 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1269,6 +1269,35 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
> >         return false;
> >  }
> >
> > +bool hugetlb_pte_present_leaf(const struct hugetlb_pte *hpte, pte_t pte)
> > +{
> > +       pgd_t pgd;
> > +       p4d_t p4d;
> > +       pud_t pud;
> > +       pmd_t pmd;
> > +
> > +       switch (hpte->level) {
> > +       case HUGETLB_LEVEL_PGD:
> > +               pgd = __pgd(pte_val(pte));
> > +               return pgd_present(pgd) && pgd_leaf(pgd);
> > +       case HUGETLB_LEVEL_P4D:
> > +               p4d = __p4d(pte_val(pte));
> > +               return p4d_present(p4d) && p4d_leaf(p4d);
> > +       case HUGETLB_LEVEL_PUD:
> > +               pud = __pud(pte_val(pte));
> > +               return pud_present(pud) && pud_leaf(pud);
> > +       case HUGETLB_LEVEL_PMD:
> > +               pmd = __pmd(pte_val(pte));
> > +               return pmd_present(pmd) && pmd_leaf(pmd);
> > +       case HUGETLB_LEVEL_PTE:
> > +               return pte_present(pte);
> > +       default:
> > +               WARN_ON_ONCE(1);
> > +               return false;
> > +       }
> > +}
> > +
> > +
> >  static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio)
> >  {
> >         int nid = folio_nid(folio);
> > --
> > 2.39.2.637.g21b0678d19-goog
> >
  
Mike Kravetz Feb. 25, 2023, 12:09 a.m. UTC | #3
On 02/18/23 00:27, James Houghton wrote:
> After high-granularity mapping, page table entries for HugeTLB pages can
> be of any size/type. (For example, we can have a 1G page mapped with a
> mix of PMDs and PTEs.) This struct is to help keep track of a HugeTLB
> PTE after we have done a page table walk.
> 
> Without this, we'd have to pass around the "size" of the PTE everywhere.
> We effectively did this before; it could be fetched from the hstate,
> which we pass around pretty much everywhere.

Agreed.  I can not think of a better way to handle the possibility of
having hugetlb page table entries at any level.  The somewhat unfortunate
part of this is that code outside hugetlbfs proper needs to know about this.
However, there is already 'special handling' with hugetlb assumptions
in those places today.

> hugetlb_pte_present_leaf is included here as a helper function that will
> be used frequently later on.
> 
> Signed-off-by: James Houghton <jthoughton@google.com>
> 
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index a1ceb9417f01..eeacadf3272b 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h

Acked-by: Mike Kravetz <mike.kravetz@oracle.com>
  

Patch

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index a1ceb9417f01..eeacadf3272b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -26,6 +26,25 @@  typedef struct { unsigned long pd; } hugepd_t;
 #define __hugepd(x) ((hugepd_t) { (x) })
 #endif
 
+enum hugetlb_level {
+	HUGETLB_LEVEL_PTE = 1,
+	/*
+	 * We always include PMD, PUD, and P4D in this enum definition so that,
+	 * when logged as an integer, we can easily tell which level it is.
+	 */
+	HUGETLB_LEVEL_PMD,
+	HUGETLB_LEVEL_PUD,
+	HUGETLB_LEVEL_P4D,
+	HUGETLB_LEVEL_PGD,
+};
+
+struct hugetlb_pte {
+	pte_t *ptep;
+	unsigned int shift;
+	enum hugetlb_level level;
+	spinlock_t *ptl;
+};
+
 #ifdef CONFIG_HUGETLB_PAGE
 
 #include <linux/mempolicy.h>
@@ -39,6 +58,20 @@  typedef struct { unsigned long pd; } hugepd_t;
  */
 #define __NR_USED_SUBPAGE 3
 
+static inline
+unsigned long hugetlb_pte_size(const struct hugetlb_pte *hpte)
+{
+	return 1UL << hpte->shift;
+}
+
+static inline
+unsigned long hugetlb_pte_mask(const struct hugetlb_pte *hpte)
+{
+	return ~(hugetlb_pte_size(hpte) - 1);
+}
+
+bool hugetlb_pte_present_leaf(const struct hugetlb_pte *hpte, pte_t pte);
+
 struct hugepage_subpool {
 	spinlock_t lock;
 	long count;
@@ -1234,6 +1267,45 @@  static inline spinlock_t *huge_pte_lock(struct hstate *h,
 	return ptl;
 }
 
+static inline
+spinlock_t *hugetlb_pte_lockptr(struct hugetlb_pte *hpte)
+{
+	return hpte->ptl;
+}
+
+static inline
+spinlock_t *hugetlb_pte_lock(struct hugetlb_pte *hpte)
+{
+	spinlock_t *ptl = hugetlb_pte_lockptr(hpte);
+
+	spin_lock(ptl);
+	return ptl;
+}
+
+static inline
+void __hugetlb_pte_init(struct hugetlb_pte *hpte, pte_t *ptep,
+			unsigned int shift, enum hugetlb_level level,
+			spinlock_t *ptl)
+{
+	/*
+	 * If 'shift' indicates that this PTE is contiguous, then @ptep must
+	 * be the first pte of the contiguous bunch.
+	 */
+	hpte->ptl = ptl;
+	hpte->ptep = ptep;
+	hpte->shift = shift;
+	hpte->level = level;
+}
+
+static inline
+void hugetlb_pte_init(struct mm_struct *mm, struct hugetlb_pte *hpte,
+		      pte_t *ptep, unsigned int shift,
+		      enum hugetlb_level level)
+{
+	__hugetlb_pte_init(hpte, ptep, shift, level,
+			   huge_pte_lockptr(shift, mm, ptep));
+}
+
 #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
 extern void __init hugetlb_cma_reserve(int order);
 #else
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5ca9eae0ac42..6c74adff43b6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1269,6 +1269,35 @@  static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
 	return false;
 }
 
+bool hugetlb_pte_present_leaf(const struct hugetlb_pte *hpte, pte_t pte)
+{
+	pgd_t pgd;
+	p4d_t p4d;
+	pud_t pud;
+	pmd_t pmd;
+
+	switch (hpte->level) {
+	case HUGETLB_LEVEL_PGD:
+		pgd = __pgd(pte_val(pte));
+		return pgd_present(pgd) && pgd_leaf(pgd);
+	case HUGETLB_LEVEL_P4D:
+		p4d = __p4d(pte_val(pte));
+		return p4d_present(p4d) && p4d_leaf(p4d);
+	case HUGETLB_LEVEL_PUD:
+		pud = __pud(pte_val(pte));
+		return pud_present(pud) && pud_leaf(pud);
+	case HUGETLB_LEVEL_PMD:
+		pmd = __pmd(pte_val(pte));
+		return pmd_present(pmd) && pmd_leaf(pmd);
+	case HUGETLB_LEVEL_PTE:
+		return pte_present(pte);
+	default:
+		WARN_ON_ONCE(1);
+		return false;
+	}
+}
+
+
 static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio)
 {
 	int nid = folio_nid(folio);