[v2,09/10] mm/hugetlb: Introduce hugetlb_walk()

Message ID 20221207203156.651077-1-peterx@redhat.com
State New
Headers
Series [v2,01/10] mm/hugetlb: Let vma_offset_start() to return start |

Commit Message

Peter Xu Dec. 7, 2022, 8:31 p.m. UTC
  huge_pte_offset() is the main walker function for hugetlb pgtables.  The
name is not really representing what it does, though.

Instead of renaming it, introduce a wrapper function called hugetlb_walk()
which will use huge_pte_offset() inside.  Assert on the locks when walking
the pgtable.

Note, the vma lock assertion will be a no-op for private mappings.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 fs/hugetlbfs/inode.c    |  4 +---
 fs/userfaultfd.c        |  6 ++----
 include/linux/hugetlb.h | 39 +++++++++++++++++++++++++++++++++++++++
 mm/hugetlb.c            | 32 +++++++++++++-------------------
 mm/page_vma_mapped.c    |  2 +-
 mm/pagewalk.c           |  4 +---
 6 files changed, 57 insertions(+), 30 deletions(-)
  

Comments

Mike Kravetz Dec. 7, 2022, 10:27 p.m. UTC | #1
On 12/07/22 15:31, Peter Xu wrote:
> huge_pte_offset() is the main walker function for hugetlb pgtables.  The
> name is not really representing what it does, though.
> 
> Instead of renaming it, introduce a wrapper function called hugetlb_walk()
> which will use huge_pte_offset() inside.  Assert on the locks when walking
> the pgtable.
> 
> Note, the vma lock assertion will be a no-op for private mappings.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  fs/hugetlbfs/inode.c    |  4 +---
>  fs/userfaultfd.c        |  6 ++----
>  include/linux/hugetlb.h | 39 +++++++++++++++++++++++++++++++++++++++
>  mm/hugetlb.c            | 32 +++++++++++++-------------------
>  mm/page_vma_mapped.c    |  2 +-
>  mm/pagewalk.c           |  4 +---
>  6 files changed, 57 insertions(+), 30 deletions(-)

Thanks!  I like the lockdep checks in hugetlb_walk.

Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
  
John Hubbard Dec. 8, 2022, 12:12 a.m. UTC | #2
On 12/7/22 12:31, Peter Xu wrote:
> huge_pte_offset() is the main walker function for hugetlb pgtables.  The
> name is not really representing what it does, though.
> 
> Instead of renaming it, introduce a wrapper function called hugetlb_walk()
> which will use huge_pte_offset() inside.  Assert on the locks when walking
> the pgtable.
> 
> Note, the vma lock assertion will be a no-op for private mappings.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>   fs/hugetlbfs/inode.c    |  4 +---
>   fs/userfaultfd.c        |  6 ++----
>   include/linux/hugetlb.h | 39 +++++++++++++++++++++++++++++++++++++++
>   mm/hugetlb.c            | 32 +++++++++++++-------------------
>   mm/page_vma_mapped.c    |  2 +-
>   mm/pagewalk.c           |  4 +---
>   6 files changed, 57 insertions(+), 30 deletions(-)
> 
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index fdb16246f46e..48f1a8ad2243 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -388,9 +388,7 @@ static bool hugetlb_vma_maps_page(struct vm_area_struct *vma,
>   {
>   	pte_t *ptep, pte;
>   
> -	ptep = huge_pte_offset(vma->vm_mm, addr,
> -			huge_page_size(hstate_vma(vma)));
> -
> +	ptep = hugetlb_walk(vma, addr, huge_page_size(hstate_vma(vma)));
>   	if (!ptep)
>   		return false;
>   
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index a602f008dde5..f31fe1a9f4c5 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -237,14 +237,12 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
>   					 unsigned long flags,
>   					 unsigned long reason)
>   {
> -	struct mm_struct *mm = ctx->mm;
>   	pte_t *ptep, pte;
>   	bool ret = true;
>   
> -	mmap_assert_locked(mm);
> -
> -	ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
> +	mmap_assert_locked(ctx->mm);
>   
> +	ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma));
>   	if (!ptep)
>   		goto out;
>   
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 81efd9b9baa2..1c20cbbf3d22 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -2,6 +2,7 @@
>   #ifndef _LINUX_HUGETLB_H
>   #define _LINUX_HUGETLB_H
>   
> +#include <linux/mm.h>
>   #include <linux/mm_types.h>
>   #include <linux/mmdebug.h>
>   #include <linux/fs.h>
> @@ -196,6 +197,11 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
>    * huge_pte_offset(): Walk the hugetlb pgtable until the last level PTE.
>    * Returns the pte_t* if found, or NULL if the address is not mapped.
>    *
> + * IMPORTANT: we should normally not directly call this function, instead
> + * this is only a common interface to implement arch-specific walker.
> + * Please consider using the hugetlb_walk() helper to make sure of the
> + * correct locking is satisfied.

Or:

"Please use hugetlb_walk() instead, because that will attempt to verify
the locking for you."

> + *
>    * Since this function will walk all the pgtable pages (including not only
>    * high-level pgtable page, but also PUD entry that can be unshared
>    * concurrently for VM_SHARED), the caller of this function should be
> @@ -1229,4 +1235,37 @@ bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);
>   #define flush_hugetlb_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
>   #endif
>   
> +static inline bool
> +__vma_shareable_flags_pmd(struct vm_area_struct *vma)
> +{
> +	return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
> +		vma->vm_private_data;
> +}
> +
> +/*
> + * Safe version of huge_pte_offset() to check the locks.  See comments
> + * above huge_pte_offset().
> + */

It is odd to say that functionA() is a safe version of functionB(), if the
names are completely different.

At this point, it is very clear that huge_pte_offset() should be renamed.
I'd suggest something like one of these:

     __hugetlb_walk()
     hugetlb_walk_raw()

> +static inline pte_t *
> +hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz)
> +{
> +#if defined(CONFIG_HUGETLB_PAGE) && \
> +	defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_LOCKDEP)
> +	struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
> +
> +	/*
> +	 * If pmd sharing possible, locking needed to safely walk the
> +	 * hugetlb pgtables.  More information can be found at the comment
> +	 * above huge_pte_offset() in the same file.
> +	 *
> +	 * NOTE: lockdep_is_held() is only defined with CONFIG_LOCKDEP.
> +	 */
> +	if (__vma_shareable_flags_pmd(vma))
> +		WARN_ON_ONCE(!lockdep_is_held(&vma_lock->rw_sema) &&
> +			     !lockdep_is_held(
> +				 &vma->vm_file->f_mapping->i_mmap_rwsem));
> +#endif
> +	return huge_pte_offset(vma->vm_mm, addr, sz);
> +}

Let's please not slice up C functions with ifdefs. Instead, stick to the
standard approach of

#ifdef X
functionC()
{
	...implementation
}
#else
functionC()
{
	...simpler or shorter or stub implementation
}

> +
>   #endif /* _LINUX_HUGETLB_H */
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index f42399522805..e3500c087893 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -4814,7 +4814,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
>   	} else {
>   		/*
>   		 * For shared mappings the vma lock must be held before
> -		 * calling huge_pte_offset in the src vma. Otherwise, the
> +		 * calling hugetlb_walk() in the src vma. Otherwise, the
>   		 * returned ptep could go away if part of a shared pmd and
>   		 * another thread calls huge_pmd_unshare.
>   		 */
> @@ -4824,7 +4824,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
>   	last_addr_mask = hugetlb_mask_last_page(h);
>   	for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
>   		spinlock_t *src_ptl, *dst_ptl;
> -		src_pte = huge_pte_offset(src, addr, sz);
> +		src_pte = hugetlb_walk(src_vma, addr, sz);
>   		if (!src_pte) {
>   			addr |= last_addr_mask;
>   			continue;
> @@ -5028,7 +5028,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
>   	hugetlb_vma_lock_write(vma);
>   	i_mmap_lock_write(mapping);
>   	for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
> -		src_pte = huge_pte_offset(mm, old_addr, sz);
> +		src_pte = hugetlb_walk(vma, old_addr, sz);
>   		if (!src_pte) {
>   			old_addr |= last_addr_mask;
>   			new_addr |= last_addr_mask;
> @@ -5091,7 +5091,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
>   	last_addr_mask = hugetlb_mask_last_page(h);
>   	address = start;
>   	for (; address < end; address += sz) {
> -		ptep = huge_pte_offset(mm, address, sz);
> +		ptep = hugetlb_walk(vma, address, sz);
>   		if (!ptep) {
>   			address |= last_addr_mask;
>   			continue;
> @@ -5404,7 +5404,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
>   			mutex_lock(&hugetlb_fault_mutex_table[hash]);
>   			hugetlb_vma_lock_read(vma);
>   			spin_lock(ptl);
> -			ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
> +			ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
>   			if (likely(ptep &&
>   				   pte_same(huge_ptep_get(ptep), pte)))
>   				goto retry_avoidcopy;
> @@ -5442,7 +5442,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
>   	 * before the page tables are altered
>   	 */
>   	spin_lock(ptl);
> -	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
> +	ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
>   	if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
>   		/* Break COW or unshare */
>   		huge_ptep_clear_flush(vma, haddr, ptep);
> @@ -6227,7 +6227,7 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
>   		return NULL;
>   
>   	hugetlb_vma_lock_read(vma);
> -	pte = huge_pte_offset(mm, haddr, huge_page_size(h));
> +	pte = hugetlb_walk(vma, haddr, huge_page_size(h));
>   	if (!pte)
>   		goto out_unlock;
>   
> @@ -6292,8 +6292,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>   		 *
>   		 * Note that page table lock is not held when pte is null.
>   		 */
> -		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
> -				      huge_page_size(h));
> +		pte = hugetlb_walk(vma, vaddr & huge_page_mask(h),
> +				   huge_page_size(h));
>   		if (pte)
>   			ptl = huge_pte_lock(h, mm, pte);
>   		absent = !pte || huge_pte_none(huge_ptep_get(pte));
> @@ -6479,7 +6479,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
>   	last_addr_mask = hugetlb_mask_last_page(h);
>   	for (; address < end; address += psize) {
>   		spinlock_t *ptl;
> -		ptep = huge_pte_offset(mm, address, psize);
> +		ptep = hugetlb_walk(vma, address, psize);
>   		if (!ptep) {
>   			address |= last_addr_mask;
>   			continue;
> @@ -6857,12 +6857,6 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
>   		*end = ALIGN(*end, PUD_SIZE);
>   }
>   
> -static bool __vma_shareable_flags_pmd(struct vm_area_struct *vma)
> -{
> -	return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
> -		vma->vm_private_data;
> -}
> -
>   void hugetlb_vma_lock_read(struct vm_area_struct *vma)
>   {
>   	if (__vma_shareable_flags_pmd(vma)) {
> @@ -7028,8 +7022,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
>   
>   		saddr = page_table_shareable(svma, vma, addr, idx);
>   		if (saddr) {
> -			spte = huge_pte_offset(svma->vm_mm, saddr,
> -					       vma_mmu_pagesize(svma));
> +			spte = hugetlb_walk(svma, saddr,
> +					    vma_mmu_pagesize(svma));
>   			if (spte) {
>   				get_page(virt_to_page(spte));
>   				break;
> @@ -7387,7 +7381,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
>   	hugetlb_vma_lock_write(vma);
>   	i_mmap_lock_write(vma->vm_file->f_mapping);
>   	for (address = start; address < end; address += PUD_SIZE) {
> -		ptep = huge_pte_offset(mm, address, sz);
> +		ptep = hugetlb_walk(vma, address, sz);
>   		if (!ptep)
>   			continue;
>   		ptl = huge_pte_lock(h, mm, ptep);
> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> index 93e13fc17d3c..e97b2e23bd28 100644
> --- a/mm/page_vma_mapped.c
> +++ b/mm/page_vma_mapped.c
> @@ -170,7 +170,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>   			return not_found(pvmw);
>   
>   		/* when pud is not present, pte will be NULL */
> -		pvmw->pte = huge_pte_offset(mm, pvmw->address, size);
> +		pvmw->pte = hugetlb_walk(vma, pvmw->address, size);
>   		if (!pvmw->pte)
>   			return false;
>   
> diff --git a/mm/pagewalk.c b/mm/pagewalk.c
> index d98564a7be57..cb23f8a15c13 100644
> --- a/mm/pagewalk.c
> +++ b/mm/pagewalk.c
> @@ -305,13 +305,11 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
>   	hugetlb_vma_lock_read(vma);
>   	do {
>   		next = hugetlb_entry_end(h, addr, end);
> -		pte = huge_pte_offset(walk->mm, addr & hmask, sz);
> -
> +		pte = hugetlb_walk(vma, addr & hmask, sz);
>   		if (pte)
>   			err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
>   		else if (ops->pte_hole)
>   			err = ops->pte_hole(addr, next, -1, walk);
> -
>   		if (err)
>   			break;
>   	} while (addr = next, addr != end);
  
Peter Xu Dec. 8, 2022, 9:01 p.m. UTC | #3
On Wed, Dec 07, 2022 at 04:12:31PM -0800, John Hubbard wrote:
> On 12/7/22 12:31, Peter Xu wrote:
> > huge_pte_offset() is the main walker function for hugetlb pgtables.  The
> > name is not really representing what it does, though.
> > 
> > Instead of renaming it, introduce a wrapper function called hugetlb_walk()
> > which will use huge_pte_offset() inside.  Assert on the locks when walking
> > the pgtable.
> > 
> > Note, the vma lock assertion will be a no-op for private mappings.
> > 
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >   fs/hugetlbfs/inode.c    |  4 +---
> >   fs/userfaultfd.c        |  6 ++----
> >   include/linux/hugetlb.h | 39 +++++++++++++++++++++++++++++++++++++++
> >   mm/hugetlb.c            | 32 +++++++++++++-------------------
> >   mm/page_vma_mapped.c    |  2 +-
> >   mm/pagewalk.c           |  4 +---
> >   6 files changed, 57 insertions(+), 30 deletions(-)
> > 
> > diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> > index fdb16246f46e..48f1a8ad2243 100644
> > --- a/fs/hugetlbfs/inode.c
> > +++ b/fs/hugetlbfs/inode.c
> > @@ -388,9 +388,7 @@ static bool hugetlb_vma_maps_page(struct vm_area_struct *vma,
> >   {
> >   	pte_t *ptep, pte;
> > -	ptep = huge_pte_offset(vma->vm_mm, addr,
> > -			huge_page_size(hstate_vma(vma)));
> > -
> > +	ptep = hugetlb_walk(vma, addr, huge_page_size(hstate_vma(vma)));
> >   	if (!ptep)
> >   		return false;
> > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > index a602f008dde5..f31fe1a9f4c5 100644
> > --- a/fs/userfaultfd.c
> > +++ b/fs/userfaultfd.c
> > @@ -237,14 +237,12 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
> >   					 unsigned long flags,
> >   					 unsigned long reason)
> >   {
> > -	struct mm_struct *mm = ctx->mm;
> >   	pte_t *ptep, pte;
> >   	bool ret = true;
> > -	mmap_assert_locked(mm);
> > -
> > -	ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
> > +	mmap_assert_locked(ctx->mm);
> > +	ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma));
> >   	if (!ptep)
> >   		goto out;
> > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > index 81efd9b9baa2..1c20cbbf3d22 100644
> > --- a/include/linux/hugetlb.h
> > +++ b/include/linux/hugetlb.h
> > @@ -2,6 +2,7 @@
> >   #ifndef _LINUX_HUGETLB_H
> >   #define _LINUX_HUGETLB_H
> > +#include <linux/mm.h>
> >   #include <linux/mm_types.h>
> >   #include <linux/mmdebug.h>
> >   #include <linux/fs.h>
> > @@ -196,6 +197,11 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
> >    * huge_pte_offset(): Walk the hugetlb pgtable until the last level PTE.
> >    * Returns the pte_t* if found, or NULL if the address is not mapped.
> >    *
> > + * IMPORTANT: we should normally not directly call this function, instead
> > + * this is only a common interface to implement arch-specific walker.
> > + * Please consider using the hugetlb_walk() helper to make sure of the
> > + * correct locking is satisfied.
> 
> Or:
> 
> "Please use hugetlb_walk() instead, because that will attempt to verify
> the locking for you."

Ok.

> 
> > + *
> >    * Since this function will walk all the pgtable pages (including not only
> >    * high-level pgtable page, but also PUD entry that can be unshared
> >    * concurrently for VM_SHARED), the caller of this function should be
> > @@ -1229,4 +1235,37 @@ bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);
> >   #define flush_hugetlb_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
> >   #endif
> > +static inline bool
> > +__vma_shareable_flags_pmd(struct vm_area_struct *vma)
> > +{
> > +	return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
> > +		vma->vm_private_data;
> > +}
> > +
> > +/*
> > + * Safe version of huge_pte_offset() to check the locks.  See comments
> > + * above huge_pte_offset().
> > + */
> 
> It is odd to say that functionA() is a safe version of functionB(), if the
> names are completely different.
> 
> At this point, it is very clear that huge_pte_offset() should be renamed.
> I'd suggest something like one of these:
> 
>     __hugetlb_walk()
>     hugetlb_walk_raw()

We can.

Not only because that's an arch api for years (didn't want to touch more
arch code unless necessary), but also since we have hugetlb_walk() that'll
be the future interface not huge_pte_offset().

Actually it's good when that's the only thing people can find from its name
when they want to have a huge pgtable walk. :)

So totally makes sense to do so, but I don't strongly feel like doing it in
this patchset if you're okay with it.

> 
> > +static inline pte_t *
> > +hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz)
> > +{
> > +#if defined(CONFIG_HUGETLB_PAGE) && \
> > +	defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_LOCKDEP)
> > +	struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
> > +
> > +	/*
> > +	 * If pmd sharing possible, locking needed to safely walk the
> > +	 * hugetlb pgtables.  More information can be found at the comment
> > +	 * above huge_pte_offset() in the same file.
> > +	 *
> > +	 * NOTE: lockdep_is_held() is only defined with CONFIG_LOCKDEP.
> > +	 */
> > +	if (__vma_shareable_flags_pmd(vma))
> > +		WARN_ON_ONCE(!lockdep_is_held(&vma_lock->rw_sema) &&
> > +			     !lockdep_is_held(
> > +				 &vma->vm_file->f_mapping->i_mmap_rwsem));
> > +#endif
> > +	return huge_pte_offset(vma->vm_mm, addr, sz);
> > +}
> 
> Let's please not slice up C functions with ifdefs. Instead, stick to the
> standard approach of
> 
> #ifdef X
> functionC()
> {
> 	...implementation
> }
> #else
> functionC()
> {
> 	...simpler or shorter or stub implementation
> }

Personally I like the slicing because it clearly tells what's the
difference with/without the macros defined.

Thanks,
  
John Hubbard Dec. 8, 2022, 9:50 p.m. UTC | #4
On 12/8/22 13:01, Peter Xu wrote:
>> At this point, it is very clear that huge_pte_offset() should be renamed.
>> I'd suggest something like one of these:
>>
>>      __hugetlb_walk()
>>      hugetlb_walk_raw()
> 
> We can.
> 
> Not only because that's an arch api for years (didn't want to touch more
> arch code unless necessary), but also since we have hugetlb_walk() that'll
> be the future interface not huge_pte_offset().
> 
> Actually it's good when that's the only thing people can find from its name
> when they want to have a huge pgtable walk. :)
> 
> So totally makes sense to do so, but I don't strongly feel like doing it in
> this patchset if you're okay with it.
> 

Sounds good.

>>
>>> +static inline pte_t *
>>> +hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz)
>>> +{
>>> +#if defined(CONFIG_HUGETLB_PAGE) && \
>>> +	defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_LOCKDEP)
>>> +	struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
>>> +
>>> +	/*
>>> +	 * If pmd sharing possible, locking needed to safely walk the
>>> +	 * hugetlb pgtables.  More information can be found at the comment
>>> +	 * above huge_pte_offset() in the same file.
>>> +	 *
>>> +	 * NOTE: lockdep_is_held() is only defined with CONFIG_LOCKDEP.
>>> +	 */
>>> +	if (__vma_shareable_flags_pmd(vma))
>>> +		WARN_ON_ONCE(!lockdep_is_held(&vma_lock->rw_sema) &&
>>> +			     !lockdep_is_held(
>>> +				 &vma->vm_file->f_mapping->i_mmap_rwsem));
>>> +#endif
>>> +	return huge_pte_offset(vma->vm_mm, addr, sz);
>>> +}
>>
>> Let's please not slice up C functions with ifdefs. Instead, stick to the
>> standard approach of
>>
>> #ifdef X
>> functionC()
>> {
>> 	...implementation
>> }
>> #else
>> functionC()
>> {
>> 	...simpler or shorter or stub implementation
>> }
> 
> Personally I like the slicing because it clearly tells what's the
> difference with/without the macros defined.
> 

Ha, I think you have a higher tolerance for complexity on the screen.
The fact that you can see more of that complexity at once, is what
slows down human readers.

So when someone is working through the code, if they can look at one
config at a time, that's shorter and cleaner. This is why folks (I'm
very much not the only one) have this common pattern.

However, of course I won't insist here, as there are clearly preferences
in both directions. And the code is still small in either form in this
case so really a non-issue.

thanks,
  
Peter Xu Dec. 8, 2022, 11:21 p.m. UTC | #5
On Thu, Dec 08, 2022 at 01:50:17PM -0800, John Hubbard wrote:
> However, of course I won't insist here, as there are clearly preferences
> in both directions. And the code is still small in either form in this
> case so really a non-issue.

Thanks, John.
  

Patch

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index fdb16246f46e..48f1a8ad2243 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -388,9 +388,7 @@  static bool hugetlb_vma_maps_page(struct vm_area_struct *vma,
 {
 	pte_t *ptep, pte;
 
-	ptep = huge_pte_offset(vma->vm_mm, addr,
-			huge_page_size(hstate_vma(vma)));
-
+	ptep = hugetlb_walk(vma, addr, huge_page_size(hstate_vma(vma)));
 	if (!ptep)
 		return false;
 
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index a602f008dde5..f31fe1a9f4c5 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -237,14 +237,12 @@  static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
 					 unsigned long flags,
 					 unsigned long reason)
 {
-	struct mm_struct *mm = ctx->mm;
 	pte_t *ptep, pte;
 	bool ret = true;
 
-	mmap_assert_locked(mm);
-
-	ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
+	mmap_assert_locked(ctx->mm);
 
+	ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma));
 	if (!ptep)
 		goto out;
 
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 81efd9b9baa2..1c20cbbf3d22 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -2,6 +2,7 @@ 
 #ifndef _LINUX_HUGETLB_H
 #define _LINUX_HUGETLB_H
 
+#include <linux/mm.h>
 #include <linux/mm_types.h>
 #include <linux/mmdebug.h>
 #include <linux/fs.h>
@@ -196,6 +197,11 @@  pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
  * huge_pte_offset(): Walk the hugetlb pgtable until the last level PTE.
  * Returns the pte_t* if found, or NULL if the address is not mapped.
  *
+ * IMPORTANT: we should normally not directly call this function, instead
+ * this is only a common interface to implement arch-specific walker.
+ * Please consider using the hugetlb_walk() helper to make sure of the
+ * correct locking is satisfied.
+ *
  * Since this function will walk all the pgtable pages (including not only
  * high-level pgtable page, but also PUD entry that can be unshared
  * concurrently for VM_SHARED), the caller of this function should be
@@ -1229,4 +1235,37 @@  bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);
 #define flush_hugetlb_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
 #endif
 
+static inline bool
+__vma_shareable_flags_pmd(struct vm_area_struct *vma)
+{
+	return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
+		vma->vm_private_data;
+}
+
+/*
+ * Safe version of huge_pte_offset() to check the locks.  See comments
+ * above huge_pte_offset().
+ */
+static inline pte_t *
+hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz)
+{
+#if defined(CONFIG_HUGETLB_PAGE) && \
+	defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_LOCKDEP)
+	struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+	/*
+	 * If pmd sharing possible, locking needed to safely walk the
+	 * hugetlb pgtables.  More information can be found at the comment
+	 * above huge_pte_offset() in the same file.
+	 *
+	 * NOTE: lockdep_is_held() is only defined with CONFIG_LOCKDEP.
+	 */
+	if (__vma_shareable_flags_pmd(vma))
+		WARN_ON_ONCE(!lockdep_is_held(&vma_lock->rw_sema) &&
+			     !lockdep_is_held(
+				 &vma->vm_file->f_mapping->i_mmap_rwsem));
+#endif
+	return huge_pte_offset(vma->vm_mm, addr, sz);
+}
+
 #endif /* _LINUX_HUGETLB_H */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f42399522805..e3500c087893 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4814,7 +4814,7 @@  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	} else {
 		/*
 		 * For shared mappings the vma lock must be held before
-		 * calling huge_pte_offset in the src vma. Otherwise, the
+		 * calling hugetlb_walk() in the src vma. Otherwise, the
 		 * returned ptep could go away if part of a shared pmd and
 		 * another thread calls huge_pmd_unshare.
 		 */
@@ -4824,7 +4824,7 @@  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	last_addr_mask = hugetlb_mask_last_page(h);
 	for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
 		spinlock_t *src_ptl, *dst_ptl;
-		src_pte = huge_pte_offset(src, addr, sz);
+		src_pte = hugetlb_walk(src_vma, addr, sz);
 		if (!src_pte) {
 			addr |= last_addr_mask;
 			continue;
@@ -5028,7 +5028,7 @@  int move_hugetlb_page_tables(struct vm_area_struct *vma,
 	hugetlb_vma_lock_write(vma);
 	i_mmap_lock_write(mapping);
 	for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
-		src_pte = huge_pte_offset(mm, old_addr, sz);
+		src_pte = hugetlb_walk(vma, old_addr, sz);
 		if (!src_pte) {
 			old_addr |= last_addr_mask;
 			new_addr |= last_addr_mask;
@@ -5091,7 +5091,7 @@  static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
 	last_addr_mask = hugetlb_mask_last_page(h);
 	address = start;
 	for (; address < end; address += sz) {
-		ptep = huge_pte_offset(mm, address, sz);
+		ptep = hugetlb_walk(vma, address, sz);
 		if (!ptep) {
 			address |= last_addr_mask;
 			continue;
@@ -5404,7 +5404,7 @@  static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
 			mutex_lock(&hugetlb_fault_mutex_table[hash]);
 			hugetlb_vma_lock_read(vma);
 			spin_lock(ptl);
-			ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
+			ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
 			if (likely(ptep &&
 				   pte_same(huge_ptep_get(ptep), pte)))
 				goto retry_avoidcopy;
@@ -5442,7 +5442,7 @@  static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * before the page tables are altered
 	 */
 	spin_lock(ptl);
-	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
+	ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
 	if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
 		/* Break COW or unshare */
 		huge_ptep_clear_flush(vma, haddr, ptep);
@@ -6227,7 +6227,7 @@  struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
 		return NULL;
 
 	hugetlb_vma_lock_read(vma);
-	pte = huge_pte_offset(mm, haddr, huge_page_size(h));
+	pte = hugetlb_walk(vma, haddr, huge_page_size(h));
 	if (!pte)
 		goto out_unlock;
 
@@ -6292,8 +6292,8 @@  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 *
 		 * Note that page table lock is not held when pte is null.
 		 */
-		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
-				      huge_page_size(h));
+		pte = hugetlb_walk(vma, vaddr & huge_page_mask(h),
+				   huge_page_size(h));
 		if (pte)
 			ptl = huge_pte_lock(h, mm, pte);
 		absent = !pte || huge_pte_none(huge_ptep_get(pte));
@@ -6479,7 +6479,7 @@  unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	last_addr_mask = hugetlb_mask_last_page(h);
 	for (; address < end; address += psize) {
 		spinlock_t *ptl;
-		ptep = huge_pte_offset(mm, address, psize);
+		ptep = hugetlb_walk(vma, address, psize);
 		if (!ptep) {
 			address |= last_addr_mask;
 			continue;
@@ -6857,12 +6857,6 @@  void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
 		*end = ALIGN(*end, PUD_SIZE);
 }
 
-static bool __vma_shareable_flags_pmd(struct vm_area_struct *vma)
-{
-	return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
-		vma->vm_private_data;
-}
-
 void hugetlb_vma_lock_read(struct vm_area_struct *vma)
 {
 	if (__vma_shareable_flags_pmd(vma)) {
@@ -7028,8 +7022,8 @@  pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 
 		saddr = page_table_shareable(svma, vma, addr, idx);
 		if (saddr) {
-			spte = huge_pte_offset(svma->vm_mm, saddr,
-					       vma_mmu_pagesize(svma));
+			spte = hugetlb_walk(svma, saddr,
+					    vma_mmu_pagesize(svma));
 			if (spte) {
 				get_page(virt_to_page(spte));
 				break;
@@ -7387,7 +7381,7 @@  void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 	hugetlb_vma_lock_write(vma);
 	i_mmap_lock_write(vma->vm_file->f_mapping);
 	for (address = start; address < end; address += PUD_SIZE) {
-		ptep = huge_pte_offset(mm, address, sz);
+		ptep = hugetlb_walk(vma, address, sz);
 		if (!ptep)
 			continue;
 		ptl = huge_pte_lock(h, mm, ptep);
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 93e13fc17d3c..e97b2e23bd28 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -170,7 +170,7 @@  bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 			return not_found(pvmw);
 
 		/* when pud is not present, pte will be NULL */
-		pvmw->pte = huge_pte_offset(mm, pvmw->address, size);
+		pvmw->pte = hugetlb_walk(vma, pvmw->address, size);
 		if (!pvmw->pte)
 			return false;
 
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index d98564a7be57..cb23f8a15c13 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -305,13 +305,11 @@  static int walk_hugetlb_range(unsigned long addr, unsigned long end,
 	hugetlb_vma_lock_read(vma);
 	do {
 		next = hugetlb_entry_end(h, addr, end);
-		pte = huge_pte_offset(walk->mm, addr & hmask, sz);
-
+		pte = hugetlb_walk(vma, addr & hmask, sz);
 		if (pte)
 			err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
 		else if (ops->pte_hole)
 			err = ops->pte_hole(addr, next, -1, walk);
-
 		if (err)
 			break;
 	} while (addr = next, addr != end);