[v4,18/49] mlock: Convert mlock to vma iterator

Message ID 20230120162650.984577-19-Liam.Howlett@oracle.com
State New
Headers
Series VMA tree type safety and remove __vma_adjust() |

Commit Message

Liam R. Howlett Jan. 20, 2023, 4:26 p.m. UTC
  From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>

Use the vma iterator so that the iterator can be invalidated or updated
to avoid each caller doing so.

Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
---
 mm/mlock.c | 57 +++++++++++++++++++++++++++---------------------------
 1 file changed, 28 insertions(+), 29 deletions(-)
  

Comments

Ryan Roberts July 11, 2023, 2:08 p.m. UTC | #1
On 20/01/2023 16:26, Liam R. Howlett wrote:
> From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
> 
> Use the vma iterator so that the iterator can be invalidated or updated
> to avoid each caller doing so.

Hi,

I've bisected 2 mm selftest regressions back to this patch, so hoping someone can help debug and fix? The failures are reproducible on x86_64 and arm64.


mlock-random-test:

$ ./run_kselftest.sh -t mm:mlock-random-test
TAP version 13
1..1
# selftests: mm: mlock-random-test
mlock() failure at |0xaaaaaaab52d0(131072)| mlock:|0xaaaaaaacc65d(26551)|
not ok 1 selftests: mm: mlock-random-test # exit=255

This mallocs a buffer then loops 100 times, trying to mlock random parts of it. After this patch, the test fails after a variable number of iterations; mlock() returns ENOMEM. If I explicitly munlock at the end of each loop, it works.


mlock2-tests:

$ ./run_kselftest.sh -t mm:mlock2-tests
TAP version 13
1..1
# selftests: mm: mlock2-tests
munlock(): Cannot allocate memory
munlock(): Cannot allocate memory
not ok 1 selftests: mm: mlock2-tests # exit=2

Here, a 3 page buffer is mlock2()ed, then the middle page is munlocked. Finally the whole 3 page range is munlocked, and after this patch it fails with ENOMEM. If I modify the test to split the final munlock into 2, one for the first page and one for the last, the test passes.


Immediately prior to this patch (2286a6914c77 "mm: change mprotect_fixup to vma iterator"), both tests pass.

From a quick scan of the man page, I don't think it explicitly says that its ok to call mlock/munlock on already locked/unlocked pages, but it's certainly a change of behavior and the tests notice, so I'm guessing this wasn't intentional?

I'm not familiar with this code so it's not obvious to me exactly what the problem is, but I'm hoping someone can help debug?

Thanks,
Ryan



> 
> Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
> ---
>  mm/mlock.c | 57 +++++++++++++++++++++++++++---------------------------
>  1 file changed, 28 insertions(+), 29 deletions(-)
> 
> diff --git a/mm/mlock.c b/mm/mlock.c
> index b680f11879c3..0d09b9070071 100644
> --- a/mm/mlock.c
> +++ b/mm/mlock.c
> @@ -401,8 +401,9 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma,
>   *
>   * For vmas that pass the filters, merge/split as appropriate.
>   */
> -static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
> -	unsigned long start, unsigned long end, vm_flags_t newflags)
> +static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
> +	       struct vm_area_struct **prev, unsigned long start,
> +	       unsigned long end, vm_flags_t newflags)
>  {
>  	struct mm_struct *mm = vma->vm_mm;
>  	pgoff_t pgoff;
> @@ -417,22 +418,22 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
>  		goto out;
>  
>  	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> -	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
> -			  vma->vm_file, pgoff, vma_policy(vma),
> -			  vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> +	*prev = vmi_vma_merge(vmi, mm, *prev, start, end, newflags,
> +			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
> +			vma->vm_userfaultfd_ctx, anon_vma_name(vma));
>  	if (*prev) {
>  		vma = *prev;
>  		goto success;
>  	}
>  
>  	if (start != vma->vm_start) {
> -		ret = split_vma(mm, vma, start, 1);
> +		ret = vmi_split_vma(vmi, mm, vma, start, 1);
>  		if (ret)
>  			goto out;
>  	}
>  
>  	if (end != vma->vm_end) {
> -		ret = split_vma(mm, vma, end, 0);
> +		ret = vmi_split_vma(vmi, mm, vma, end, 0);
>  		if (ret)
>  			goto out;
>  	}
> @@ -471,7 +472,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
>  	unsigned long nstart, end, tmp;
>  	struct vm_area_struct *vma, *prev;
>  	int error;
> -	MA_STATE(mas, &current->mm->mm_mt, start, start);
> +	VMA_ITERATOR(vmi, current->mm, start);
>  
>  	VM_BUG_ON(offset_in_page(start));
>  	VM_BUG_ON(len != PAGE_ALIGN(len));
> @@ -480,39 +481,37 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
>  		return -EINVAL;
>  	if (end == start)
>  		return 0;
> -	vma = mas_walk(&mas);
> +	vma = vma_iter_load(&vmi);
>  	if (!vma)
>  		return -ENOMEM;
>  
> +	prev = vma_prev(&vmi);
>  	if (start > vma->vm_start)
>  		prev = vma;
> -	else
> -		prev = mas_prev(&mas, 0);
>  
> -	for (nstart = start ; ; ) {
> -		vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
> +	nstart = start;
> +	tmp = vma->vm_start;
> +	for_each_vma_range(vmi, vma, end) {
> +		vm_flags_t newflags;
>  
> -		newflags |= flags;
> +		if (vma->vm_start != tmp)
> +			return -ENOMEM;
>  
> +		newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
> +		newflags |= flags;
>  		/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
>  		tmp = vma->vm_end;
>  		if (tmp > end)
>  			tmp = end;
> -		error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
> +		error = mlock_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
>  		if (error)
>  			break;
>  		nstart = tmp;
> -		if (nstart < prev->vm_end)
> -			nstart = prev->vm_end;
> -		if (nstart >= end)
> -			break;
> -
> -		vma = find_vma(prev->vm_mm, prev->vm_end);
> -		if (!vma || vma->vm_start != nstart) {
> -			error = -ENOMEM;
> -			break;
> -		}
>  	}
> +
> +	if (vma_iter_end(&vmi) < end)
> +		return -ENOMEM;
> +
>  	return error;
>  }
>  
> @@ -658,7 +657,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
>   */
>  static int apply_mlockall_flags(int flags)
>  {
> -	MA_STATE(mas, &current->mm->mm_mt, 0, 0);
> +	VMA_ITERATOR(vmi, current->mm, 0);
>  	struct vm_area_struct *vma, *prev = NULL;
>  	vm_flags_t to_add = 0;
>  
> @@ -679,15 +678,15 @@ static int apply_mlockall_flags(int flags)
>  			to_add |= VM_LOCKONFAULT;
>  	}
>  
> -	mas_for_each(&mas, vma, ULONG_MAX) {
> +	for_each_vma(vmi, vma) {
>  		vm_flags_t newflags;
>  
>  		newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
>  		newflags |= to_add;
>  
>  		/* Ignore errors */
> -		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
> -		mas_pause(&mas);
> +		mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end,
> +			    newflags);
>  		cond_resched();
>  	}
>  out:
  
Liam R. Howlett July 11, 2023, 3:27 p.m. UTC | #2
* Ryan Roberts <ryan.roberts@arm.com> [230711 10:09]:
> On 20/01/2023 16:26, Liam R. Howlett wrote:
> > From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
> > 
> > Use the vma iterator so that the iterator can be invalidated or updated
> > to avoid each caller doing so.
> 
> Hi,


Hello!

> 
> I've bisected 2 mm selftest regressions back to this patch, so hoping someone can help debug and fix? The failures are reproducible on x86_64 and arm64.

Thanks!  That is a big help.  Where did you start your bisection?  I
assume 6.4?

> 
> 
> mlock-random-test:
> 
> $ ./run_kselftest.sh -t mm:mlock-random-test
> TAP version 13
> 1..1
> # selftests: mm: mlock-random-test
> mlock() failure at |0xaaaaaaab52d0(131072)| mlock:|0xaaaaaaacc65d(26551)|
> not ok 1 selftests: mm: mlock-random-test # exit=255
> 
> This mallocs a buffer then loops 100 times, trying to mlock random parts of it. After this patch, the test fails after a variable number of iterations; mlock() returns ENOMEM. If I explicitly munlock at the end of each loop, it works.
> 
> 
> mlock2-tests:
> 
> $ ./run_kselftest.sh -t mm:mlock2-tests
> TAP version 13
> 1..1
> # selftests: mm: mlock2-tests
> munlock(): Cannot allocate memory
> munlock(): Cannot allocate memory
> not ok 1 selftests: mm: mlock2-tests # exit=2
> 
> Here, a 3 page buffer is mlock2()ed, then the middle page is munlocked. Finally the whole 3 page range is munlocked, and after this patch it fails with ENOMEM. If I modify the test to split the final munlock into 2, one for the first page and one for the last, the test passes.
> 
> 
> Immediately prior to this patch (2286a6914c77 "mm: change mprotect_fixup to vma iterator"), both tests pass.
> 
> From a quick scan of the man page, I don't think it explicitly says that its ok to call mlock/munlock on already locked/unlocked pages, but it's certainly a change of behavior and the tests notice, so I'm guessing this wasn't intentional?
> 
> I'm not familiar with this code so it's not obvious to me exactly what the problem is, but I'm hoping someone can help debug?

I think I see the issue and I'm working on a fix. I appreciate the
analysis and report, it really helps narrow things down.

Regards,
Liam
  
Ryan Roberts July 11, 2023, 3:30 p.m. UTC | #3
On 11/07/2023 16:27, Liam R. Howlett wrote:
> * Ryan Roberts <ryan.roberts@arm.com> [230711 10:09]:
>> On 20/01/2023 16:26, Liam R. Howlett wrote:
>>> From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
>>>
>>> Use the vma iterator so that the iterator can be invalidated or updated
>>> to avoid each caller doing so.
>>
>> Hi,
> 
> 
> Hello!
> 
>>
>> I've bisected 2 mm selftest regressions back to this patch, so hoping someone can help debug and fix? The failures are reproducible on x86_64 and arm64.
> 
> Thanks!  That is a big help.  Where did you start your bisection?  I
> assume 6.4?

Yes, I'm working to get all the mm selftests running (and ideally passing!) on
arm64. I working on v6.4 and it was broken there. I went arbitrarily back to
v5.10 and it was working there, so bisected between them.

> 
>>
>>
>> mlock-random-test:
>>
>> $ ./run_kselftest.sh -t mm:mlock-random-test
>> TAP version 13
>> 1..1
>> # selftests: mm: mlock-random-test
>> mlock() failure at |0xaaaaaaab52d0(131072)| mlock:|0xaaaaaaacc65d(26551)|
>> not ok 1 selftests: mm: mlock-random-test # exit=255
>>
>> This mallocs a buffer then loops 100 times, trying to mlock random parts of it. After this patch, the test fails after a variable number of iterations; mlock() returns ENOMEM. If I explicitly munlock at the end of each loop, it works.
>>
>>
>> mlock2-tests:
>>
>> $ ./run_kselftest.sh -t mm:mlock2-tests
>> TAP version 13
>> 1..1
>> # selftests: mm: mlock2-tests
>> munlock(): Cannot allocate memory
>> munlock(): Cannot allocate memory
>> not ok 1 selftests: mm: mlock2-tests # exit=2
>>
>> Here, a 3 page buffer is mlock2()ed, then the middle page is munlocked. Finally the whole 3 page range is munlocked, and after this patch it fails with ENOMEM. If I modify the test to split the final munlock into 2, one for the first page and one for the last, the test passes.
>>
>>
>> Immediately prior to this patch (2286a6914c77 "mm: change mprotect_fixup to vma iterator"), both tests pass.
>>
>> From a quick scan of the man page, I don't think it explicitly says that its ok to call mlock/munlock on already locked/unlocked pages, but it's certainly a change of behavior and the tests notice, so I'm guessing this wasn't intentional?
>>
>> I'm not familiar with this code so it's not obvious to me exactly what the problem is, but I'm hoping someone can help debug?
> 
> I think I see the issue and I'm working on a fix. I appreciate the
> analysis and report, it really helps narrow things down.

You're welcome!

> 
> Regards,
> Liam
  
Liam R. Howlett July 11, 2023, 5:57 p.m. UTC | #4
* Ryan Roberts <ryan.roberts@arm.com> [230711 11:30]:
> On 11/07/2023 16:27, Liam R. Howlett wrote:
> > * Ryan Roberts <ryan.roberts@arm.com> [230711 10:09]:
> >> On 20/01/2023 16:26, Liam R. Howlett wrote:
> >>> From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
> >>>
> >>> Use the vma iterator so that the iterator can be invalidated or updated
> >>> to avoid each caller doing so.
> >>
> >> Hi,
> > 
> > 
> > Hello!
> > 
> >>
> >> I've bisected 2 mm selftest regressions back to this patch, so hoping someone can help debug and fix? The failures are reproducible on x86_64 and arm64.
> > 
> > Thanks!  That is a big help.  Where did you start your bisection?  I
> > assume 6.4?
> 
> Yes, I'm working to get all the mm selftests running (and ideally passing!) on
> arm64. I working on v6.4 and it was broken there. I went arbitrarily back to
> v5.10 and it was working there, so bisected between them.
> 

Annoyingly, this is similar to another bug I had fixed in another
iterator across VMAs.  It's the same pattern and I did go back to see if
I had broken other places but, obviously, I missed this one.

It's annoying enough that I'm trying to figure out a better way to do
this in general.. A contiguous iterator of sorts.  I will add this to my
maple tree work list [1].

[1] http://lists.infradead.org/pipermail/maple-tree/2023-July/002683.html

Thanks,
Liam
  

Patch

diff --git a/mm/mlock.c b/mm/mlock.c
index b680f11879c3..0d09b9070071 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -401,8 +401,9 @@  static void mlock_vma_pages_range(struct vm_area_struct *vma,
  *
  * For vmas that pass the filters, merge/split as appropriate.
  */
-static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
-	unsigned long start, unsigned long end, vm_flags_t newflags)
+static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
+	       struct vm_area_struct **prev, unsigned long start,
+	       unsigned long end, vm_flags_t newflags)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pgoff_t pgoff;
@@ -417,22 +418,22 @@  static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		goto out;
 
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
-	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
-			  vma->vm_file, pgoff, vma_policy(vma),
-			  vma->vm_userfaultfd_ctx, anon_vma_name(vma));
+	*prev = vmi_vma_merge(vmi, mm, *prev, start, end, newflags,
+			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+			vma->vm_userfaultfd_ctx, anon_vma_name(vma));
 	if (*prev) {
 		vma = *prev;
 		goto success;
 	}
 
 	if (start != vma->vm_start) {
-		ret = split_vma(mm, vma, start, 1);
+		ret = vmi_split_vma(vmi, mm, vma, start, 1);
 		if (ret)
 			goto out;
 	}
 
 	if (end != vma->vm_end) {
-		ret = split_vma(mm, vma, end, 0);
+		ret = vmi_split_vma(vmi, mm, vma, end, 0);
 		if (ret)
 			goto out;
 	}
@@ -471,7 +472,7 @@  static int apply_vma_lock_flags(unsigned long start, size_t len,
 	unsigned long nstart, end, tmp;
 	struct vm_area_struct *vma, *prev;
 	int error;
-	MA_STATE(mas, &current->mm->mm_mt, start, start);
+	VMA_ITERATOR(vmi, current->mm, start);
 
 	VM_BUG_ON(offset_in_page(start));
 	VM_BUG_ON(len != PAGE_ALIGN(len));
@@ -480,39 +481,37 @@  static int apply_vma_lock_flags(unsigned long start, size_t len,
 		return -EINVAL;
 	if (end == start)
 		return 0;
-	vma = mas_walk(&mas);
+	vma = vma_iter_load(&vmi);
 	if (!vma)
 		return -ENOMEM;
 
+	prev = vma_prev(&vmi);
 	if (start > vma->vm_start)
 		prev = vma;
-	else
-		prev = mas_prev(&mas, 0);
 
-	for (nstart = start ; ; ) {
-		vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+	nstart = start;
+	tmp = vma->vm_start;
+	for_each_vma_range(vmi, vma, end) {
+		vm_flags_t newflags;
 
-		newflags |= flags;
+		if (vma->vm_start != tmp)
+			return -ENOMEM;
 
+		newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+		newflags |= flags;
 		/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
 		tmp = vma->vm_end;
 		if (tmp > end)
 			tmp = end;
-		error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
+		error = mlock_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
 		if (error)
 			break;
 		nstart = tmp;
-		if (nstart < prev->vm_end)
-			nstart = prev->vm_end;
-		if (nstart >= end)
-			break;
-
-		vma = find_vma(prev->vm_mm, prev->vm_end);
-		if (!vma || vma->vm_start != nstart) {
-			error = -ENOMEM;
-			break;
-		}
 	}
+
+	if (vma_iter_end(&vmi) < end)
+		return -ENOMEM;
+
 	return error;
 }
 
@@ -658,7 +657,7 @@  SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
  */
 static int apply_mlockall_flags(int flags)
 {
-	MA_STATE(mas, &current->mm->mm_mt, 0, 0);
+	VMA_ITERATOR(vmi, current->mm, 0);
 	struct vm_area_struct *vma, *prev = NULL;
 	vm_flags_t to_add = 0;
 
@@ -679,15 +678,15 @@  static int apply_mlockall_flags(int flags)
 			to_add |= VM_LOCKONFAULT;
 	}
 
-	mas_for_each(&mas, vma, ULONG_MAX) {
+	for_each_vma(vmi, vma) {
 		vm_flags_t newflags;
 
 		newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
 		newflags |= to_add;
 
 		/* Ignore errors */
-		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
-		mas_pause(&mas);
+		mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end,
+			    newflags);
 		cond_resched();
 	}
 out: