[RFC,v2,05/47] hugetlb: make hugetlb_vma_lock_alloc return its failure reason

Message ID 20221021163703.3218176-6-jthoughton@google.com
State New
Headers
Series hugetlb: introduce HugeTLB high-granularity mapping |

Commit Message

James Houghton Oct. 21, 2022, 4:36 p.m. UTC
  Currently hugetlb_vma_lock_alloc doesn't return anything, as there is no
need: if it fails, PMD sharing won't be enabled. However, HGM requires
that the VMA lock exists, so we need to verify that
hugetlb_vma_lock_alloc actually succeeded. If hugetlb_vma_lock_alloc
fails, then we can pass that up to the caller that is attempting to
enable HGM.

Signed-off-by: James Houghton <jthoughton@google.com>
---
 mm/hugetlb.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)
  

Comments

Peter Xu Nov. 16, 2022, 5:08 p.m. UTC | #1
On Fri, Oct 21, 2022 at 04:36:21PM +0000, James Houghton wrote:
> Currently hugetlb_vma_lock_alloc doesn't return anything, as there is no
> need: if it fails, PMD sharing won't be enabled. However, HGM requires
> that the VMA lock exists, so we need to verify that
> hugetlb_vma_lock_alloc actually succeeded. If hugetlb_vma_lock_alloc
> fails, then we can pass that up to the caller that is attempting to
> enable HGM.
> 
> Signed-off-by: James Houghton <jthoughton@google.com>
> ---
>  mm/hugetlb.c | 16 +++++++++-------
>  1 file changed, 9 insertions(+), 7 deletions(-)
> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 52cec5b0789e..dc82256b89dd 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -92,7 +92,7 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
>  /* Forward declaration */
>  static int hugetlb_acct_memory(struct hstate *h, long delta);
>  static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
> -static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
> +static int hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
>  static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
>  
>  static inline bool subpool_is_free(struct hugepage_subpool *spool)
> @@ -7001,17 +7001,17 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
>  	}
>  }
>  
> -static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
> +static int hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
>  {
>  	struct hugetlb_vma_lock *vma_lock;
>  
>  	/* Only establish in (flags) sharable vmas */
>  	if (!vma || !(vma->vm_flags & VM_MAYSHARE))
> -		return;
> +		return -EINVAL;
>  
> -	/* Should never get here with non-NULL vm_private_data */
> +	/* We've already allocated the lock. */
>  	if (vma->vm_private_data)
> -		return;
> +		return 0;

No objection on the patch itself, but I am just wondering what guarantees
thread-safety for this function to not leak vm_private_data when two
threads try to allocate at the same time.

I think it should be the write mmap lock.  I saw that in your latest code
base there's:

	/*
	 * We must hold the mmap lock for writing so that callers can rely on
	 * hugetlb_hgm_enabled returning a consistent result while holding
	 * the mmap lock for reading.
	 */
	mmap_assert_write_locked(vma->vm_mm);

	/* HugeTLB HGM requires the VMA lock to synchronize collapsing. */
	ret = hugetlb_vma_data_alloc(vma);
	if (ret)
		return ret;

So that's covered there.  The remaining call sites are hugetlb_vm_op_open()
and hugetlb_reserve_pages(), and they both seem fine too: hugetlb_vm_op_open()
is called during mmap(), and the latter has vma==NULL, so the allocation will
be skipped.

I'm wondering whether it would make sense to move this assert inside
hugetlb_vma_data_alloc(), after the !vma check, or to just add the same
assert there too, but for a different reason.

>  
>  	vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
>  	if (!vma_lock) {
> @@ -7026,13 +7026,14 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
>  		 * allocation failure.
>  		 */
>  		pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
> -		return;
> +		return -ENOMEM;
>  	}
>  
>  	kref_init(&vma_lock->refs);
>  	init_rwsem(&vma_lock->rw_sema);
>  	vma_lock->vma = vma;
>  	vma->vm_private_data = vma_lock;
> +	return 0;
>  }
>  
>  /*
> @@ -7160,8 +7161,9 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
>  {
>  }
>  
> -static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
> +static int hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
>  {
> +	return 0;
>  }
>  
>  pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
> -- 
> 2.38.0.135.g90850a2211-goog
> 
>
  
James Houghton Nov. 21, 2022, 6:11 p.m. UTC | #2
On Wed, Nov 16, 2022 at 9:08 AM Peter Xu <peterx@redhat.com> wrote:
>
> No objection on the patch itself, but I am just wondering what guarantees
> thread-safety for this function to not leak vm_private_data when two
> threads try to allocate at the same time.
>
> I think it should be the write mmap lock.  I saw that in your latest code
> base there's:
>
>         /*
>          * We must hold the mmap lock for writing so that callers can rely on
>          * hugetlb_hgm_enabled returning a consistent result while holding
>          * the mmap lock for reading.
>          */
>         mmap_assert_write_locked(vma->vm_mm);
>
>         /* HugeTLB HGM requires the VMA lock to synchronize collapsing. */
>         ret = hugetlb_vma_data_alloc(vma);
>         if (ret)
>                 return ret;
>
> So that's covered there.  The rest places are hugetlb_vm_op_open() and
> hugetlb_reserve_pages() and they all seem fine too: hugetlb_vm_op_open() is
> during mmap(), the latter has vma==NULL so allocation will be skipped.
>
> I'm wondering whether it would make sense to move this assert to be inside
> of hugetlb_vma_data_alloc() after the !vma check, or just add the same
> assert too but for different reason.

I think leaving the assert here and adding a new assert inside
hugetlb_vma_data_alloc() makes sense. Thanks Peter.

- James

>
> >
> >       vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
> >       if (!vma_lock) {
> > @@ -7026,13 +7026,14 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
> >                * allocation failure.
> >                */
> >               pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
> > -             return;
> > +             return -ENOMEM;
> >       }
> >
> >       kref_init(&vma_lock->refs);
> >       init_rwsem(&vma_lock->rw_sema);
> >       vma_lock->vma = vma;
> >       vma->vm_private_data = vma_lock;
> > +     return 0;
> >  }
> >
> >  /*
> > @@ -7160,8 +7161,9 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
> >  {
> >  }
> >
> > -static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
> > +static int hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
> >  {
> > +     return 0;
> >  }
> >
> >  pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
> > --
> > 2.38.0.135.g90850a2211-goog
> >
> >
>
> --
> Peter Xu
>
  
Mina Almasry Dec. 7, 2022, 11:33 p.m. UTC | #3
On Fri, Oct 21, 2022 at 9:37 AM James Houghton <jthoughton@google.com> wrote:
>
> Currently hugetlb_vma_lock_alloc doesn't return anything, as there is no
> need: if it fails, PMD sharing won't be enabled. However, HGM requires
> that the VMA lock exists, so we need to verify that
> hugetlb_vma_lock_alloc actually succeeded. If hugetlb_vma_lock_alloc
> fails, then we can pass that up to the caller that is attempting to
> enable HGM.
>
> Signed-off-by: James Houghton <jthoughton@google.com>
> ---
>  mm/hugetlb.c | 16 +++++++++-------
>  1 file changed, 9 insertions(+), 7 deletions(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 52cec5b0789e..dc82256b89dd 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -92,7 +92,7 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
>  /* Forward declaration */
>  static int hugetlb_acct_memory(struct hstate *h, long delta);
>  static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
> -static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
> +static int hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
>  static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
>
>  static inline bool subpool_is_free(struct hugepage_subpool *spool)
> @@ -7001,17 +7001,17 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
>         }
>  }
>
> -static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
> +static int hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
>  {
>         struct hugetlb_vma_lock *vma_lock;
>
>         /* Only establish in (flags) sharable vmas */
>         if (!vma || !(vma->vm_flags & VM_MAYSHARE))
> -               return;
> +               return -EINVAL;
>
> -       /* Should never get here with non-NULL vm_private_data */
> +       /* We've already allocated the lock. */
>         if (vma->vm_private_data)
> -               return;
> +               return 0;

I would have expected -EEXIST here.

Also, even if the patch looks generally fine, it's hard to provide an
Acked-by now: I need to look at the call site, which is in another
patch in the series. If there is an opportunity to squash changes to
helpers and their call sites, please do.

>
>         vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
>         if (!vma_lock) {
> @@ -7026,13 +7026,14 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
>                  * allocation failure.
>                  */
>                 pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
> -               return;
> +               return -ENOMEM;
>         }
>
>         kref_init(&vma_lock->refs);
>         init_rwsem(&vma_lock->rw_sema);
>         vma_lock->vma = vma;
>         vma->vm_private_data = vma_lock;
> +       return 0;
>  }
>
>  /*
> @@ -7160,8 +7161,9 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
>  {
>  }
>
> -static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
> +static int hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
>  {
> +       return 0;
>  }
>
>  pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
> --
> 2.38.0.135.g90850a2211-goog
>
  
Mike Kravetz Dec. 9, 2022, 10:36 p.m. UTC | #4
On 10/21/22 16:36, James Houghton wrote:
> Currently hugetlb_vma_lock_alloc doesn't return anything, as there is no
> need: if it fails, PMD sharing won't be enabled. However, HGM requires
> that the VMA lock exists, so we need to verify that
> hugetlb_vma_lock_alloc actually succeeded. If hugetlb_vma_lock_alloc
> fails, then we can pass that up to the caller that is attempting to
> enable HGM.

No serious objections to this change ...

However, there are currently only two places where hugetlb_vma_lock_alloc
is called: hugetlb_reserve_pages and hugetlb_vm_op_open.  hugetlb_reserve_pages
is not an issue.  Since hugetlb_vm_op_open (as a defined vm_operation) returns
void, I am not sure how you plan to pass up an allocation failure.
I suspect this will become evident in subsequent patches.
  

Patch

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 52cec5b0789e..dc82256b89dd 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -92,7 +92,7 @@  struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
 /* Forward declaration */
 static int hugetlb_acct_memory(struct hstate *h, long delta);
 static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
-static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
+static int hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
 static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
 
 static inline bool subpool_is_free(struct hugepage_subpool *spool)
@@ -7001,17 +7001,17 @@  static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
 	}
 }
 
-static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+static int hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
 {
 	struct hugetlb_vma_lock *vma_lock;
 
 	/* Only establish in (flags) sharable vmas */
 	if (!vma || !(vma->vm_flags & VM_MAYSHARE))
-		return;
+		return -EINVAL;
 
-	/* Should never get here with non-NULL vm_private_data */
+	/* We've already allocated the lock. */
 	if (vma->vm_private_data)
-		return;
+		return 0;
 
 	vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
 	if (!vma_lock) {
@@ -7026,13 +7026,14 @@  static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
 		 * allocation failure.
 		 */
 		pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
-		return;
+		return -ENOMEM;
 	}
 
 	kref_init(&vma_lock->refs);
 	init_rwsem(&vma_lock->rw_sema);
 	vma_lock->vma = vma;
 	vma->vm_private_data = vma_lock;
+	return 0;
 }
 
 /*
@@ -7160,8 +7161,9 @@  static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
 {
 }
 
-static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+static int hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
 {
+	return 0;
 }
 
 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,