[1/3] hugetlbfs: extend hugetlb_vma_lock to private VMAs
Commit Message
From: Rik van Riel <riel@surriel.com>
Extend the locking scheme used to protect shared hugetlb mappings
from truncate vs page fault races, in order to protect private
hugetlb mappings (with resv_map) against MADV_DONTNEED.
Add a read-write semaphore to the resv_map data structure, and
use that from the hugetlb_vma_(un)lock_* functions, in preparation
for closing the race between MADV_DONTNEED and page faults.
Signed-off-by: Rik van Riel <riel@surriel.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
---
include/linux/hugetlb.h | 6 ++++++
mm/hugetlb.c | 41 +++++++++++++++++++++++++++++++++++++----
2 files changed, 43 insertions(+), 4 deletions(-)
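[To illustrate the race this series closes, here is a hypothetical userspace sketch, not part of the patch or the thread: one thread repeatedly faults a private hugetlb page while another zaps it with MADV_DONTNEED; without a lock serializing the two paths, the fault and the zap can race. Assumes a 2MB default huge page size with free huge pages available; build with -pthread.]

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <sys/mman.h>

#define LEN (2UL << 20)			/* one 2MB huge page */

static volatile char *map;

static void *faulter(void *arg)
{
	for (long i = 0; i < 1000000; i++)
		map[0] = 1;		/* hugetlb page fault path */
	return NULL;
}

int main(void)
{
	pthread_t t;

	map = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	pthread_create(&t, NULL, faulter, NULL);
	for (long i = 0; i < 1000000; i++)
		madvise((void *)map, LEN, MADV_DONTNEED);	/* zap path */
	pthread_join(t, NULL);
	return 0;
}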
Comments
On 09/25/23 23:10, riel@surriel.com wrote:
> From: Rik van Riel <riel@surriel.com>
>
> Extend the locking scheme used to protect shared hugetlb mappings
> from truncate vs page fault races, in order to protect private
> hugetlb mappings (with resv_map) against MADV_DONTNEED.
>
> Add a read-write semaphore to the resv_map data structure, and
> use that from the hugetlb_vma_(un)lock_* functions, in preparation
> for closing the race between MADV_DONTNEED and page faults.
>
> Signed-off-by: Rik van Riel <riel@surriel.com>
> Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
> include/linux/hugetlb.h | 6 ++++++
> mm/hugetlb.c | 41 +++++++++++++++++++++++++++++++++++++----
> 2 files changed, 43 insertions(+), 4 deletions(-)
My bad during the review of patch 2!
In reply to patch 1, I suggested the changes:
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index f906c5fa4d09..8f3d5895fffc 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -372,6 +372,11 @@ static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
> struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
>
> __hugetlb_vma_unlock_write_put(vma_lock);
> + } else if (__vma_private_lock(vma)) {
> + struct resv_map *resv_map = vma_resv_map(vma);
> +
> + /* no free for anon vmas, but still need to unlock */
> + up_write(&resv_map->rw_sema);
> }
> }
However, the check for 'if (__vma_private_lock(vma))' was dropped.
> @@ -345,6 +372,11 @@ static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
> struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
>
> __hugetlb_vma_unlock_write_put(vma_lock);
> + } else {
> + struct resv_map *resv_map = vma_resv_map(vma);
> +
> + /* no free for anon vmas, but still need to unlock */
> + up_write(&resv_map->rw_sema);
> }
> }
So, the map_high_truncate_2 (2M: 32) libhugetlbfs test still BUGs with:
BUG: kernel NULL pointer dereference
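[A sketch of the failure path, annotating the function as merged; the NULL-resv_map reasoning is an inference from the code, not spelled out in the thread:]

static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
{
	if (__vma_shareable_lock(vma)) {
		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;

		__hugetlb_vma_unlock_write_put(vma_lock);
	} else {	/* missing __vma_private_lock() check */
		/* NULL when the private VMA has no reservation map */
		struct resv_map *resv_map = vma_resv_map(vma);

		/* no free for anon vmas, but still need to unlock */
		up_write(&resv_map->rw_sema);	/* NULL pointer dereference */
	}
}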
On Fri, 2023-09-29 at 19:28 -0700, Mike Kravetz wrote:
> On 09/25/23 23:10, riel@surriel.com wrote:
>
>
> In reply to patch 1, I suggested the changes:
>
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index f906c5fa4d09..8f3d5895fffc 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -372,6 +372,11 @@ static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
> > struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
> >
> > __hugetlb_vma_unlock_write_put(vma_lock);
> > + } else if (__vma_private_lock(vma)) {
> > + struct resv_map *resv_map = vma_resv_map(vma);
> > +
> > + /* no free for anon vmas, but still need to unlock */
> > + up_write(&resv_map->rw_sema);
> > }
> > }
>
> However, the check for 'if (__vma_private_lock(vma))' was dropped.
Oh ugh. I definitely added that in somewhere, but must
have committed that to the wrong git branch :(
Let me send out a new version with that change.
Sorry about that.
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -60,6 +60,7 @@ struct resv_map {
long adds_in_progress;
struct list_head region_cache;
long region_cache_count;
+ struct rw_semaphore rw_sema;
#ifdef CONFIG_CGROUP_HUGETLB
/*
* On private mappings, the counter to uncharge reservations is stored
@@ -1231,6 +1232,11 @@ static inline bool __vma_shareable_lock(struct vm_area_struct *vma)
return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data;
}
+static inline bool __vma_private_lock(struct vm_area_struct *vma)
+{
+ return (!(vma->vm_flags & VM_MAYSHARE)) && vma->vm_private_data;
+}
+
/*
* Safe version of huge_pte_offset() to check the locks. See comments
* above huge_pte_offset().
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -97,6 +97,7 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
+static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
@@ -267,6 +268,10 @@ void hugetlb_vma_lock_read(struct vm_area_struct *vma)
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
down_read(&vma_lock->rw_sema);
+ } else if (__vma_private_lock(vma)) {
+ struct resv_map *resv_map = vma_resv_map(vma);
+
+ down_read(&resv_map->rw_sema);
}
}
@@ -276,6 +281,10 @@ void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
up_read(&vma_lock->rw_sema);
+ } else if (__vma_private_lock(vma)) {
+ struct resv_map *resv_map = vma_resv_map(vma);
+
+ up_read(&resv_map->rw_sema);
}
}
@@ -285,6 +294,10 @@ void hugetlb_vma_lock_write(struct vm_area_struct *vma)
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
down_write(&vma_lock->rw_sema);
+ } else if (__vma_private_lock(vma)) {
+ struct resv_map *resv_map = vma_resv_map(vma);
+
+ down_write(&resv_map->rw_sema);
}
}
@@ -294,17 +307,27 @@ void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
up_write(&vma_lock->rw_sema);
+ } else if (__vma_private_lock(vma)) {
+ struct resv_map *resv_map = vma_resv_map(vma);
+
+ up_write(&resv_map->rw_sema);
}
}
int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
{
- struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
- if (!__vma_shareable_lock(vma))
- return 1;
+ if (__vma_shareable_lock(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
- return down_write_trylock(&vma_lock->rw_sema);
+ return down_write_trylock(&vma_lock->rw_sema);
+ } else if (__vma_private_lock(vma)) {
+ struct resv_map *resv_map = vma_resv_map(vma);
+
+ return down_write_trylock(&resv_map->rw_sema);
+ }
+
+ return 1;
}
void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
@@ -313,6 +336,10 @@ void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
lockdep_assert_held(&vma_lock->rw_sema);
+ } else if (__vma_private_lock(vma)) {
+ struct resv_map *resv_map = vma_resv_map(vma);
+
+ lockdep_assert_held(&resv_map->rw_sema);
}
}
@@ -345,6 +372,11 @@ static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
__hugetlb_vma_unlock_write_put(vma_lock);
+ } else {
+ struct resv_map *resv_map = vma_resv_map(vma);
+
+ /* no free for anon vmas, but still need to unlock */
+ up_write(&resv_map->rw_sema);
}
}
@@ -1068,6 +1100,7 @@ struct resv_map *resv_map_alloc(void)
kref_init(&resv_map->refs);
spin_lock_init(&resv_map->lock);
INIT_LIST_HEAD(&resv_map->regions);
+ init_rwsem(&resv_map->rw_sema);
resv_map->adds_in_progress = 0;
/*