[v2] mm: userfaultfd: avoid passing an invalid range to vma_merge()

Message ID 20230515193232.67552-1-lstoakes@gmail.com
State New
Headers
Series [v2] mm: userfaultfd: avoid passing an invalid range to vma_merge() |

Commit Message

Lorenzo Stoakes May 15, 2023, 7:32 p.m. UTC
  The userfaultfd_[un]register() functions will knowingly pass an invalid
address range to vma_merge(), then rely on it failing to merge to indicate
that the VMA should be split into a valid one.

This is not something that should be relied upon, as vma_merge() implicitly
assumes in cases 5-8 that curr->vm_start == addr. This is now enforced
since commit b0729ae0ae67 ("mm/mmap/vma_merge: explicitly assign res, vma,
extend invariants") with an explicit VM_WARN_ON() check.

Since commit 29417d292bd0 ("mm/mmap/vma_merge: always check invariants")
this check is performed unconditionally, which caused this assert to arise
in tests performed by Mark [1].

This patch fixes the issue by performing the split operations before
attempting to merge VMAs in both instances. The problematic operation is
splitting the start of the VMA since we were clamping to the end of the VMA
in any case, however it is useful to group both of the split operations
together to avoid egregious goto's and to abstract the code between the
functions.

As well as fixing the repro described in [1] this also continues to pass
uffd unit tests.

[1]:https://lore.kernel.org/all/ZFunF7DmMdK05MoF@FVFF77S0Q05N.cambridge.arm.com

Reported-by: Mark Rutland <mark.rutland@arm.com>
Closes: https://lore.kernel.org/all/ZFunF7DmMdK05MoF@FVFF77S0Q05N.cambridge.arm.com/
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
---
 fs/userfaultfd.c | 108 ++++++++++++++++++++++++++---------------------
 1 file changed, 60 insertions(+), 48 deletions(-)
  

Comments

Liam R. Howlett May 15, 2023, 8:23 p.m. UTC | #1
* Lorenzo Stoakes <lstoakes@gmail.com> [230515 15:32]:
> The userfaultfd_[un]register() functions will knowingly pass an invalid
> address range to vma_merge(), then rely on it failing to merge to indicate
> that the VMA should be split into a valid one.
> 
> This is not something that should be relied upon, as vma_merge() implicitly
> assumes in cases 5-8 that curr->vm_start == addr. This is now enforced
> since commit b0729ae0ae67 ("mm/mmap/vma_merge: explicitly assign res, vma,
> extend invariants") with an explicit VM_WARN_ON() check.
> 
> Since commit 29417d292bd0 ("mm/mmap/vma_merge: always check invariants")
> this check is performed unconditionally, which caused this assert to arise
> in tests performed by Mark [1].
> 
> This patch fixes the issue by performing the split operations before
> attempting to merge VMAs in both instances. The problematic operation is
> splitting the start of the VMA since we were clamping to the end of the VMA
> in any case, however it is useful to group both of the split operations
> together to avoid egregious goto's and to abstract the code between the
> functions.
> 
> As well as fixing the repro described in [1] this also continues to pass
> uffd unit tests.
> 
> [1]:https://lore.kernel.org/all/ZFunF7DmMdK05MoF@FVFF77S0Q05N.cambridge.arm.com
> 
> Reported-by: Mark Rutland <mark.rutland@arm.com>
> Closes: https://lore.kernel.org/all/ZFunF7DmMdK05MoF@FVFF77S0Q05N.cambridge.arm.com/
> Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>

Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>

> ---
>  fs/userfaultfd.c | 108 ++++++++++++++++++++++++++---------------------
>  1 file changed, 60 insertions(+), 48 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 0fd96d6e39ce..ef5d667ea804 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -1319,6 +1319,32 @@ static __always_inline int validate_range(struct mm_struct *mm,
>  	return 0;
>  }
>  
> +static int clamp_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
> +		       unsigned long start, unsigned long end, bool *can_merge)
> +{
> +	int ret;
> +	bool merge = true;
> +
> +	/* The range must always be clamped to the start of a VMA. */
> +	if (vma->vm_start < start) {
> +		ret = split_vma(vmi, vma, start, 1);
> +		if (ret)
> +			return ret;
> +
> +		merge = false;
> +	}
> +
> +	/* It must also be clamped to the end of a VMA. */
> +	if (vma->vm_end > end) {
> +		ret = split_vma(vmi, vma, end, 0);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	*can_merge = merge;
> +	return 0;
> +}
> +
>  static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  				unsigned long arg)
>  {
> @@ -1330,7 +1356,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  	unsigned long vm_flags, new_flags;
>  	bool found;
>  	bool basic_ioctls;
> -	unsigned long start, end, vma_end;
> +	unsigned long start, end;
>  	struct vma_iterator vmi;
>  
>  	user_uffdio_register = (struct uffdio_register __user *) arg;
> @@ -1462,6 +1488,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  
>  	ret = 0;
>  	for_each_vma_range(vmi, vma, end) {
> +		bool can_merge;
> +
>  		cond_resched();
>  
>  		BUG_ON(!vma_can_userfault(vma, vm_flags));
> @@ -1477,32 +1505,22 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  		    (vma->vm_flags & vm_flags) == vm_flags)
>  			goto skip;
>  
> -		if (vma->vm_start > start)
> -			start = vma->vm_start;
> -		vma_end = min(end, vma->vm_end);
> +		ret = clamp_range(&vmi, vma, start, end, &can_merge);
> +		if (ret)
> +			break;
>  
>  		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
> -		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
> -				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> -				 vma_policy(vma),
> -				 ((struct vm_userfaultfd_ctx){ ctx }),
> -				 anon_vma_name(vma));
> -		if (prev) {
> +		if (can_merge) {
> +			prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, new_flags,
> +					 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> +					 vma_policy(vma),
> +					 ((struct vm_userfaultfd_ctx){ ctx }),
> +					 anon_vma_name(vma));
> +
>  			/* vma_merge() invalidated the mas */
> -			vma = prev;
> -			goto next;
> -		}
> -		if (vma->vm_start < start) {
> -			ret = split_vma(&vmi, vma, start, 1);
> -			if (ret)
> -				break;
> -		}
> -		if (vma->vm_end > end) {
> -			ret = split_vma(&vmi, vma, end, 0);
> -			if (ret)
> -				break;
> +			if (prev)
> +				vma = prev;
>  		}
> -	next:
>  		/*
>  		 * In the vma_merge() successful mprotect-like case 8:
>  		 * the next vma was merged into the current one and
> @@ -1560,7 +1578,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
>  	struct uffdio_range uffdio_unregister;
>  	unsigned long new_flags;
>  	bool found;
> -	unsigned long start, end, vma_end;
> +	unsigned long start, end;
>  	const void __user *buf = (void __user *)arg;
>  	struct vma_iterator vmi;
>  
> @@ -1627,6 +1645,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
>  	prev = vma_prev(&vmi);
>  	ret = 0;
>  	for_each_vma_range(vmi, vma, end) {
> +		bool can_merge;
> +
>  		cond_resched();
>  
>  		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
> @@ -1640,9 +1660,9 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
>  
>  		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
>  
> -		if (vma->vm_start > start)
> -			start = vma->vm_start;
> -		vma_end = min(end, vma->vm_end);
> +		ret = clamp_range(&vmi, vma, start, end, &can_merge);
> +		if (ret)
> +			break;
>  
>  		if (userfaultfd_missing(vma)) {
>  			/*
> @@ -1652,35 +1672,27 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
>  			 * UFFDIO_WAKE explicitly.
>  			 */
>  			struct userfaultfd_wake_range range;
> -			range.start = start;
> -			range.len = vma_end - start;
> +			range.start = vma->vm_start;
> +			range.len = vma->vm_end - vma->vm_start;
>  			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
>  		}
>  
>  		/* Reset ptes for the whole vma range if wr-protected */
>  		if (userfaultfd_wp(vma))
> -			uffd_wp_range(vma, start, vma_end - start, false);
> +			uffd_wp_range(vma, vma->vm_start,
> +				      vma->vm_end - vma->vm_start, false);
>  
>  		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
> -		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
> -				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> -				 vma_policy(vma),
> -				 NULL_VM_UFFD_CTX, anon_vma_name(vma));
> -		if (prev) {
> -			vma = prev;
> -			goto next;
> -		}
> -		if (vma->vm_start < start) {
> -			ret = split_vma(&vmi, vma, start, 1);
> -			if (ret)
> -				break;
> -		}
> -		if (vma->vm_end > end) {
> -			ret = split_vma(&vmi, vma, end, 0);
> -			if (ret)
> -				break;
> +		if (can_merge) {
> +			prev = vma_merge(&vmi, mm, prev, vma->vm_start,
> +					 vma->vm_end, new_flags, vma->anon_vma,
> +					 vma->vm_file, vma->vm_pgoff,
> +					 vma_policy(vma),
> +					 NULL_VM_UFFD_CTX, anon_vma_name(vma));
> +			/* vma_merge() invalidated the mas */
> +			if (prev)
> +				vma = prev;
>  		}
> -	next:
>  		/*
>  		 * In the vma_merge() successful mprotect-like case 8:
>  		 * the next vma was merged into the current one and
> -- 
> 2.40.1
>
  
Peter Xu May 15, 2023, 9:27 p.m. UTC | #2
On Mon, May 15, 2023 at 08:32:32PM +0100, Lorenzo Stoakes wrote:
> The userfaultfd_[un]register() functions will knowingly pass an invalid
> address range to vma_merge(), then rely on it failing to merge to indicate
> that the VMA should be split into a valid one.
> 
> This is not something that should be relied upon,

Is there any real issue pop up to show that it's something that we cannot
rely upon before b0729ae0ae67?  Because normally if something broke with a
commit I'll say "commit xxx breaks yyy" unless there's more valid reason.. :)

> as vma_merge() implicitly assumes in cases 5-8 that curr->vm_start ==
> addr. This is now enforced since commit b0729ae0ae67 ("mm/mmap/vma_merge:
> explicitly assign res, vma, extend invariants") with an explicit
> VM_WARN_ON() check.

Doing vma_merge() before vma_split() makes some sense to me (and I noticed
that mostly all vma_merge() paths are doing so), because if a merge
happened, we 100% doesn't need a split (vma_merge took care of everything).
It's not the other way round, since if we split, we could still need a
merge.

So it fundamentally avoids unnecessary calls to split, it seems, if we
always call merge before splits.

> 
> Since commit 29417d292bd0 ("mm/mmap/vma_merge: always check invariants")
> this check is performed unconditionally, which caused this assert to arise
> in tests performed by Mark [1].
> 
> This patch fixes the issue by performing the split operations before
> attempting to merge VMAs in both instances. The problematic operation is
> splitting the start of the VMA since we were clamping to the end of the VMA
> in any case, however it is useful to group both of the split operations
> together to avoid egregious goto's and to abstract the code between the
> functions.
> 
> As well as fixing the repro described in [1] this also continues to pass
> uffd unit tests.
> 
> [1]:https://lore.kernel.org/all/ZFunF7DmMdK05MoF@FVFF77S0Q05N.cambridge.arm.com
> 
> Reported-by: Mark Rutland <mark.rutland@arm.com>
> Closes: https://lore.kernel.org/all/ZFunF7DmMdK05MoF@FVFF77S0Q05N.cambridge.arm.com/
> Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
> ---
>  fs/userfaultfd.c | 108 ++++++++++++++++++++++++++---------------------
>  1 file changed, 60 insertions(+), 48 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 0fd96d6e39ce..ef5d667ea804 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -1319,6 +1319,32 @@ static __always_inline int validate_range(struct mm_struct *mm,
>  	return 0;
>  }
>  
> +static int clamp_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
> +		       unsigned long start, unsigned long end, bool *can_merge)
> +{
> +	int ret;
> +	bool merge = true;
> +
> +	/* The range must always be clamped to the start of a VMA. */
> +	if (vma->vm_start < start) {
> +		ret = split_vma(vmi, vma, start, 1);
> +		if (ret)
> +			return ret;
> +
> +		merge = false;

Could you explain a bit why we don't need to merge in this case?

I'm considering, for example, when we have:

  vma1(range 0-9, with uffd), vma2(range 10-19, no uffd)

Then someone unregisters uffd on range (5-9), iiuc it should become:

  vma1(range 0-4, with uffd), vma2(range 5-19, no uffd)

But if no merge here it's:

  vma1(range 0-4, with uffd), vma3(range 5-9, no uffd), vma2(range 10-19, no uffd)

Maybe I missed something?

> +	}
> +
> +	/* It must also be clamped to the end of a VMA. */
> +	if (vma->vm_end > end) {
> +		ret = split_vma(vmi, vma, end, 0);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	*can_merge = merge;
> +	return 0;
> +}
> +
>  static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  				unsigned long arg)
>  {
> @@ -1330,7 +1356,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  	unsigned long vm_flags, new_flags;
>  	bool found;
>  	bool basic_ioctls;
> -	unsigned long start, end, vma_end;
> +	unsigned long start, end;
>  	struct vma_iterator vmi;
>  
>  	user_uffdio_register = (struct uffdio_register __user *) arg;
> @@ -1462,6 +1488,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  
>  	ret = 0;
>  	for_each_vma_range(vmi, vma, end) {
> +		bool can_merge;
> +
>  		cond_resched();
>  
>  		BUG_ON(!vma_can_userfault(vma, vm_flags));
> @@ -1477,32 +1505,22 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  		    (vma->vm_flags & vm_flags) == vm_flags)
>  			goto skip;
>  
> -		if (vma->vm_start > start)
> -			start = vma->vm_start;
> -		vma_end = min(end, vma->vm_end);
> +		ret = clamp_range(&vmi, vma, start, end, &can_merge);
> +		if (ret)
> +			break;
>  
>  		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
> -		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
> -				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> -				 vma_policy(vma),
> -				 ((struct vm_userfaultfd_ctx){ ctx }),
> -				 anon_vma_name(vma));
> -		if (prev) {
> +		if (can_merge) {
> +			prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, new_flags,
> +					 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> +					 vma_policy(vma),
> +					 ((struct vm_userfaultfd_ctx){ ctx }),
> +					 anon_vma_name(vma));
> +
>  			/* vma_merge() invalidated the mas */
> -			vma = prev;
> -			goto next;
> -		}
> -		if (vma->vm_start < start) {
> -			ret = split_vma(&vmi, vma, start, 1);
> -			if (ret)
> -				break;
> -		}
> -		if (vma->vm_end > end) {
> -			ret = split_vma(&vmi, vma, end, 0);
> -			if (ret)
> -				break;
> +			if (prev)
> +				vma = prev;
>  		}
> -	next:
>  		/*
>  		 * In the vma_merge() successful mprotect-like case 8:
>  		 * the next vma was merged into the current one and
> @@ -1560,7 +1578,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
>  	struct uffdio_range uffdio_unregister;
>  	unsigned long new_flags;
>  	bool found;
> -	unsigned long start, end, vma_end;
> +	unsigned long start, end;
>  	const void __user *buf = (void __user *)arg;
>  	struct vma_iterator vmi;
>  
> @@ -1627,6 +1645,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
>  	prev = vma_prev(&vmi);
>  	ret = 0;
>  	for_each_vma_range(vmi, vma, end) {
> +		bool can_merge;
> +
>  		cond_resched();
>  
>  		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
> @@ -1640,9 +1660,9 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
>  
>  		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
>  
> -		if (vma->vm_start > start)
> -			start = vma->vm_start;
> -		vma_end = min(end, vma->vm_end);
> +		ret = clamp_range(&vmi, vma, start, end, &can_merge);
> +		if (ret)
> +			break;
>  
>  		if (userfaultfd_missing(vma)) {
>  			/*
> @@ -1652,35 +1672,27 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
>  			 * UFFDIO_WAKE explicitly.
>  			 */
>  			struct userfaultfd_wake_range range;
> -			range.start = start;
> -			range.len = vma_end - start;
> +			range.start = vma->vm_start;
> +			range.len = vma->vm_end - vma->vm_start;
>  			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
>  		}
>  
>  		/* Reset ptes for the whole vma range if wr-protected */
>  		if (userfaultfd_wp(vma))
> -			uffd_wp_range(vma, start, vma_end - start, false);
> +			uffd_wp_range(vma, vma->vm_start,
> +				      vma->vm_end - vma->vm_start, false);
>  
>  		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
> -		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
> -				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> -				 vma_policy(vma),
> -				 NULL_VM_UFFD_CTX, anon_vma_name(vma));
> -		if (prev) {
> -			vma = prev;
> -			goto next;
> -		}
> -		if (vma->vm_start < start) {
> -			ret = split_vma(&vmi, vma, start, 1);
> -			if (ret)
> -				break;
> -		}
> -		if (vma->vm_end > end) {
> -			ret = split_vma(&vmi, vma, end, 0);
> -			if (ret)
> -				break;
> +		if (can_merge) {
> +			prev = vma_merge(&vmi, mm, prev, vma->vm_start,
> +					 vma->vm_end, new_flags, vma->anon_vma,
> +					 vma->vm_file, vma->vm_pgoff,
> +					 vma_policy(vma),
> +					 NULL_VM_UFFD_CTX, anon_vma_name(vma));
> +			/* vma_merge() invalidated the mas */
> +			if (prev)
> +				vma = prev;
>  		}
> -	next:
>  		/*
>  		 * In the vma_merge() successful mprotect-like case 8:
>  		 * the next vma was merged into the current one and
> -- 
> 2.40.1
>
  
Peter Xu May 15, 2023, 9:39 p.m. UTC | #3
On Mon, May 15, 2023 at 08:32:32PM +0100, Lorenzo Stoakes wrote:
> As well as fixing the repro described in [1] this also continues to pass
> uffd unit tests.

Side note on testing, not directly relevant to the patch itself..

I'm wondering whether do we have tests somewhere to just test vma
operations on split and merge, then verify it using smap or whatever.

The uffd unit test in this case is probably not gonna trigger anything
because we always mostly register with a whole vma over the testing ranges,
so not immediately helpful.

The trick here is we have quite a few ops that will call vma merge/split in
different ways, but logically we can still category them into: (1) add/del
vmas, or (2) modify vma flags, so at least for case (2) we can have a
framework to cover all the cases (mbind, mprotect, uffd reg/unreg, mlock,
etc.), the difference will be the flags we'll be looking at for different
cases, however how vmas merge/split should be somehow in the same pattern.
  
Lorenzo Stoakes May 15, 2023, 10:04 p.m. UTC | #4
On Mon, May 15, 2023 at 05:27:25PM -0400, Peter Xu wrote:
> On Mon, May 15, 2023 at 08:32:32PM +0100, Lorenzo Stoakes wrote:
> > The userfaultfd_[un]register() functions will knowingly pass an invalid
> > address range to vma_merge(), then rely on it failing to merge to indicate
> > that the VMA should be split into a valid one.
> >
> > This is not something that should be relied upon,
>
> Is there any real issue pop up to show that it's something that we cannot
> rely upon before b0729ae0ae67?  Because normally if something broke with a
> commit I'll say "commit xxx breaks yyy" unless there's more valid reason.. :)

I mean firstly this is triggering warnings in the kernel as referenced in thread
[1], this is reason enough :)

However, vma_merge() assumes that you won't give it an invalid input range (in
this instance, because you provide start that occurs prior to the beginning of
the VMA).

Giving an invalid input range like this is disasterous, as it could result in a
situation such as e.g.:-

- addr == prev->vm_end
- addr != vma->vm_start (i.e. there is a gap)
- prev and vma are otherwise compatible

This would then trigger the merge_prev cases and you'll end up with something
very broken.

I expect this is simply not possible because the input will never be like this
(it's clamped to be _within_ a VMA anyway and if the prev was compatible it'd
already have been merged), but that is relying on an implementation detail.


>
> > as vma_merge() implicitly assumes in cases 5-8 that curr->vm_start ==
> > addr. This is now enforced since commit b0729ae0ae67 ("mm/mmap/vma_merge:
> > explicitly assign res, vma, extend invariants") with an explicit
> > VM_WARN_ON() check.
>
> Doing vma_merge() before vma_split() makes some sense to me (and I noticed
> that mostly all vma_merge() paths are doing so), because if a merge
> happened, we 100% doesn't need a split (vma_merge took care of everything).
> It's not the other way round, since if we split, we could still need a
> merge.
>

You only split if start/end are not clamped to the VMA, and in cases where you'd
need to split, you could not merge. So I don't think this is true.

> So it fundamentally avoids unnecessary calls to split, it seems, if we
> always call merge before splits.

No, because for the case of splitting the beginning of the VMA, you would have
given invalid input to vma_merge() and returned NULL, then had to split anyway.

For the case of splitting the end of the VMA, that part would have to be split
in any case, this just changes the ordering.

So I disagree that this adds work, it does the same thing only this time using
vma_merge() correctly.

(update after seeing below bit) - vma_merge() is not meant to do 'partial'
merges over a non-prev VMA, and relying on it to do so is broken.

>
> >
> > Since commit 29417d292bd0 ("mm/mmap/vma_merge: always check invariants")
> > this check is performed unconditionally, which caused this assert to arise
> > in tests performed by Mark [1].
> >
> > This patch fixes the issue by performing the split operations before
> > attempting to merge VMAs in both instances. The problematic operation is
> > splitting the start of the VMA since we were clamping to the end of the VMA
> > in any case, however it is useful to group both of the split operations
> > together to avoid egregious goto's and to abstract the code between the
> > functions.
> >
> > As well as fixing the repro described in [1] this also continues to pass
> > uffd unit tests.
> >
> > [1]:https://lore.kernel.org/all/ZFunF7DmMdK05MoF@FVFF77S0Q05N.cambridge.arm.com
> >
> > Reported-by: Mark Rutland <mark.rutland@arm.com>
> > Closes: https://lore.kernel.org/all/ZFunF7DmMdK05MoF@FVFF77S0Q05N.cambridge.arm.com/
> > Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
> > ---
> >  fs/userfaultfd.c | 108 ++++++++++++++++++++++++++---------------------
> >  1 file changed, 60 insertions(+), 48 deletions(-)
> >
> > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > index 0fd96d6e39ce..ef5d667ea804 100644
> > --- a/fs/userfaultfd.c
> > +++ b/fs/userfaultfd.c
> > @@ -1319,6 +1319,32 @@ static __always_inline int validate_range(struct mm_struct *mm,
> >  	return 0;
> >  }
> >
> > +static int clamp_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > +		       unsigned long start, unsigned long end, bool *can_merge)
> > +{
> > +	int ret;
> > +	bool merge = true;
> > +
> > +	/* The range must always be clamped to the start of a VMA. */
> > +	if (vma->vm_start < start) {
> > +		ret = split_vma(vmi, vma, start, 1);
> > +		if (ret)
> > +			return ret;
> > +
> > +		merge = false;
>
> Could you explain a bit why we don't need to merge in this case?
>
> I'm considering, for example, when we have:
>
>   vma1(range 0-9, with uffd), vma2(range 10-19, no uffd)
>
> Then someone unregisters uffd on range (5-9), iiuc it should become:
>
>   vma1(range 0-4, with uffd), vma2(range 5-19, no uffd)
>
> But if no merge here it's:
>
>   vma1(range 0-4, with uffd), vma3(range 5-9, no uffd), vma2(range 10-19, no uffd)
>
> Maybe I missed something?
>

There's something really, really wrong with this. It simply isn't valid to
invoke vma_merge() over an existing VMA that != prev where you're not
specifying addr = vma->vm_start, end == vma->vm_end.

This seems like you're relying on:-

  ***
CCCCCNNNNN -> CCNNNNNNNN

By specifying parameters that are compatible with N even though you're only
partially spanning C?

This is crazy, and isn't how this should be used. vma_merge() is not
supposed to do partial merges. If it works (presumably it does) this is not
by design unless I've lost my mind and I (and others) have somehow not
noticed this??

I think you're right that now we'll end up with more fragmentation, but
what you're suggesting is not how vma_merge() is supposed to work.

As I said above, giving vma_merge() invalid parameters is very dangerous as
you could end up merging over empty ranges in theory (and could otherwise
have corruption).

I guess we should probably be passing 0 to the last parameter in
split_vma() here then to ensure we do a merge pass too. Will experiment
with this.

I'm confused as to how the remove from case 8 is not proceeding. I'll look
into this some more...

Happy to be corrected if I'm misconstruing this!

> > +	}
> > +
> > +	/* It must also be clamped to the end of a VMA. */
> > +	if (vma->vm_end > end) {
> > +		ret = split_vma(vmi, vma, end, 0);
> > +		if (ret)
> > +			return ret;
> > +	}
> > +
> > +	*can_merge = merge;
> > +	return 0;
> > +}
> > +
> >  static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> >  				unsigned long arg)
> >  {
> > @@ -1330,7 +1356,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> >  	unsigned long vm_flags, new_flags;
> >  	bool found;
> >  	bool basic_ioctls;
> > -	unsigned long start, end, vma_end;
> > +	unsigned long start, end;
> >  	struct vma_iterator vmi;
> >
> >  	user_uffdio_register = (struct uffdio_register __user *) arg;
> > @@ -1462,6 +1488,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> >
> >  	ret = 0;
> >  	for_each_vma_range(vmi, vma, end) {
> > +		bool can_merge;
> > +
> >  		cond_resched();
> >
> >  		BUG_ON(!vma_can_userfault(vma, vm_flags));
> > @@ -1477,32 +1505,22 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> >  		    (vma->vm_flags & vm_flags) == vm_flags)
> >  			goto skip;
> >
> > -		if (vma->vm_start > start)
> > -			start = vma->vm_start;
> > -		vma_end = min(end, vma->vm_end);
> > +		ret = clamp_range(&vmi, vma, start, end, &can_merge);
> > +		if (ret)
> > +			break;
> >
> >  		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
> > -		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
> > -				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> > -				 vma_policy(vma),
> > -				 ((struct vm_userfaultfd_ctx){ ctx }),
> > -				 anon_vma_name(vma));
> > -		if (prev) {
> > +		if (can_merge) {
> > +			prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, new_flags,
> > +					 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> > +					 vma_policy(vma),
> > +					 ((struct vm_userfaultfd_ctx){ ctx }),
> > +					 anon_vma_name(vma));
> > +
> >  			/* vma_merge() invalidated the mas */
> > -			vma = prev;
> > -			goto next;
> > -		}
> > -		if (vma->vm_start < start) {
> > -			ret = split_vma(&vmi, vma, start, 1);
> > -			if (ret)
> > -				break;
> > -		}
> > -		if (vma->vm_end > end) {
> > -			ret = split_vma(&vmi, vma, end, 0);
> > -			if (ret)
> > -				break;
> > +			if (prev)
> > +				vma = prev;
> >  		}
> > -	next:
> >  		/*
> >  		 * In the vma_merge() successful mprotect-like case 8:
> >  		 * the next vma was merged into the current one and
> > @@ -1560,7 +1578,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> >  	struct uffdio_range uffdio_unregister;
> >  	unsigned long new_flags;
> >  	bool found;
> > -	unsigned long start, end, vma_end;
> > +	unsigned long start, end;
> >  	const void __user *buf = (void __user *)arg;
> >  	struct vma_iterator vmi;
> >
> > @@ -1627,6 +1645,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> >  	prev = vma_prev(&vmi);
> >  	ret = 0;
> >  	for_each_vma_range(vmi, vma, end) {
> > +		bool can_merge;
> > +
> >  		cond_resched();
> >
> >  		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
> > @@ -1640,9 +1660,9 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> >
> >  		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
> >
> > -		if (vma->vm_start > start)
> > -			start = vma->vm_start;
> > -		vma_end = min(end, vma->vm_end);
> > +		ret = clamp_range(&vmi, vma, start, end, &can_merge);
> > +		if (ret)
> > +			break;
> >
> >  		if (userfaultfd_missing(vma)) {
> >  			/*
> > @@ -1652,35 +1672,27 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> >  			 * UFFDIO_WAKE explicitly.
> >  			 */
> >  			struct userfaultfd_wake_range range;
> > -			range.start = start;
> > -			range.len = vma_end - start;
> > +			range.start = vma->vm_start;
> > +			range.len = vma->vm_end - vma->vm_start;
> >  			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
> >  		}
> >
> >  		/* Reset ptes for the whole vma range if wr-protected */
> >  		if (userfaultfd_wp(vma))
> > -			uffd_wp_range(vma, start, vma_end - start, false);
> > +			uffd_wp_range(vma, vma->vm_start,
> > +				      vma->vm_end - vma->vm_start, false);
> >
> >  		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
> > -		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
> > -				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> > -				 vma_policy(vma),
> > -				 NULL_VM_UFFD_CTX, anon_vma_name(vma));
> > -		if (prev) {
> > -			vma = prev;
> > -			goto next;
> > -		}
> > -		if (vma->vm_start < start) {
> > -			ret = split_vma(&vmi, vma, start, 1);
> > -			if (ret)
> > -				break;
> > -		}
> > -		if (vma->vm_end > end) {
> > -			ret = split_vma(&vmi, vma, end, 0);
> > -			if (ret)
> > -				break;
> > +		if (can_merge) {
> > +			prev = vma_merge(&vmi, mm, prev, vma->vm_start,
> > +					 vma->vm_end, new_flags, vma->anon_vma,
> > +					 vma->vm_file, vma->vm_pgoff,
> > +					 vma_policy(vma),
> > +					 NULL_VM_UFFD_CTX, anon_vma_name(vma));
> > +			/* vma_merge() invalidated the mas */
> > +			if (prev)
> > +				vma = prev;
> >  		}
> > -	next:
> >  		/*
> >  		 * In the vma_merge() successful mprotect-like case 8:
> >  		 * the next vma was merged into the current one and
> > --
> > 2.40.1
> >
>
> --
> Peter Xu
>
  
Lorenzo Stoakes May 15, 2023, 10:10 p.m. UTC | #5
On Mon, May 15, 2023 at 05:39:25PM -0400, Peter Xu wrote:
> On Mon, May 15, 2023 at 08:32:32PM +0100, Lorenzo Stoakes wrote:
> > As well as fixing the repro described in [1] this also continues to pass
> > uffd unit tests.
>
> Side note on testing, not directly relevant to the patch itself..
>
> I'm wondering whether do we have tests somewhere to just test vma
> operations on split and merge, then verify it using smap or whatever.
>
> The uffd unit test in this case is probably not gonna trigger anything
> because we always mostly register with a whole vma over the testing ranges,
> so not immediately helpful.
>
> The trick here is we have quite a few ops that will call vma merge/split in
> different ways, but logically we can still category them into: (1) add/del
> vmas, or (2) modify vma flags, so at least for case (2) we can have a
> framework to cover all the cases (mbind, mprotect, uffd reg/unreg, mlock,
> etc.), the difference will be the flags we'll be looking at for different
> cases, however how vmas merge/split should be somehow in the same pattern.
>

I totally agree we need more testing on this. We do have some basic
self-tests for various things but I don't think we test this specifically
or certainly not in the way I'd prefer (somehow pull vma_merge() + friends
into userland and instrument with heavy unit tests).

I do intend to try to do something with this soon.

Something with /proc/$pid/[s]maps could be a good straightforward thing, I
will try to write some small test (we already have a little repro for the
reported issue) for this anyway.

> --
> Peter Xu
>
  
Lorenzo Stoakes May 15, 2023, 11:07 p.m. UTC | #6
On Mon, May 15, 2023 at 11:04:27PM +0100, Lorenzo Stoakes wrote:
[snip]
> > Could you explain a bit why we don't need to merge in this case?
> >
> > I'm considering, for example, when we have:
> >
> >   vma1(range 0-9, with uffd), vma2(range 10-19, no uffd)
> >
> > Then someone unregisters uffd on range (5-9), iiuc it should become:
> >
> >   vma1(range 0-4, with uffd), vma2(range 5-19, no uffd)
> >
> > But if no merge here it's:
> >
> >   vma1(range 0-4, with uffd), vma3(range 5-9, no uffd), vma2(range 10-19, no uffd)
> >
> > Maybe I missed something?
> >
>
> There's something really, really wrong with this. It simply isn't valid to
> invoke vma_merge() over an existing VMA that != prev where you're not
> specifying addr = vma->vm_start, end == vma->vm_end.
>
> This seems like you're relying on:-
>
>   ***
> CCCCCNNNNN -> CCNNNNNNNN
>
> By specifying parameters that are compatible with N even though you're only
> partially spanning C?
>
> This is crazy, and isn't how this should be used. vma_merge() is not
> supposed to do partial merges. If it works (presumably it does) this is not
> by design unless I've lost my mind and I (and others) have somehow not
> noticed this??
>
> I think you're right that now we'll end up with more fragmentation, but
> what you're suggesting is not how vma_merge() is supposed to work.
>
> As I said above, giving vma_merge() invalid parameters is very dangerous as
> you could end up merging over empty ranges in theory (and could otherwise
> have corruption).
>
> I guess we should probably be passing 0 to the last parameter in
> split_vma() here then to ensure we do a merge pass too. Will experiment
> with this.
>
> I'm confused as to how the remove from case 8 is not proceeding. I'll look
> into this some more...
>
> Happy to be corrected if I'm misconstruing this!
>

OK, so I wrote a small program to do perform exactly this case [0] and it seems
that the outcome is the same before and after this patch - vma_merge() is
clearly rejecting the case 8 merge (phew!) and in both instances you end up with
3 VMAs.

So this patch doesn't change this behaviour and everything is as it was
before. Ideally we'd let it go for another pass, so maybe we should change the
split to add a new VMA _afterwards_. Will experiment with that, separately.

But looks like the patch is good as it is.

(if you notice something wrong with the repro, etc. do let me know!)

[0]: https://gist.github.com/lorenzo-stoakes/a11a10f5f479e7a977fc456331266e0e

[snip]
  
Peter Xu May 16, 2023, 3:06 p.m. UTC | #7
On Tue, May 16, 2023 at 12:07:11AM +0100, Lorenzo Stoakes wrote:
> On Mon, May 15, 2023 at 11:04:27PM +0100, Lorenzo Stoakes wrote:
> [snip]
> > > Could you explain a bit why we don't need to merge in this case?
> > >
> > > I'm considering, for example, when we have:
> > >
> > >   vma1(range 0-9, with uffd), vma2(range 10-19, no uffd)
> > >
> > > Then someone unregisters uffd on range (5-9), iiuc it should become:
> > >
> > >   vma1(range 0-4, with uffd), vma2(range 5-19, no uffd)
> > >
> > > But if no merge here it's:
> > >
> > >   vma1(range 0-4, with uffd), vma3(range 5-9, no uffd), vma2(range 10-19, no uffd)
> > >
> > > Maybe I missed something?
> > >
> >
> > There's something really, really wrong with this. It simply isn't valid to
> > invoke vma_merge() over an existing VMA that != prev where you're not
> > specifying addr = vma->vm_start, end == vma->vm_end.
> >
> > This seems like you're relying on:-
> >
> >   ***
> > CCCCCNNNNN -> CCNNNNNNNN

I had a closer look today, I still think this patch is not really the right
one.  The split/merge order is something we use everywhere and I am not
convinced it must change as drastic.  At least so far it still seems to me
if we do with what current patch proposed we can have vma fragmentations.

I think I see what you meant, but here I think it's a legal case where we
should have PPPP rather than CCCC (PPPPPNNNN --> PPNNNNNNNN).

To be explicit, for register I think it _should_ be the case 0 where we
cannot merge (note: the current code is indeed wrong though, see below):

   ****
  PPPPPPNNNNNN
  cannot merge

While for the unregister case here it's case 4:

    ****
  PPPPPPNNNNNN
  might become
  PPNNNNNNNNNN
  case 4 below

Here the problem is not that we need to do split then merge (I think it'll
have the problem of vma defragmentation here), the problem is we simply
passed in the wrong "prev" vma pointer, IMHO.  I've patches attached
showing what I meant.

I checked the original commit from Andrea and I found that it _was_ correct:

commit 86039bd3b4e6a1129318cbfed4e0a6e001656635
Author: Andrea Arcangeli <aarcange@redhat.com>
Date:   Fri Sep 4 15:46:31 2015 -0700

    userfaultfd: add new syscall to provide memory externalization

I had a feeling that it's broken during the recent rework on vma (or maybe
even not that close), but I'm not yet sure and need to further check.

> >
> > By specifying parameters that are compatible with N even though you're only
> > partially spanning C?
> >
> > This is crazy, and isn't how this should be used. vma_merge() is not
> > supposed to do partial merges. If it works (presumably it does) this is not
> > by design unless I've lost my mind and I (and others) have somehow not
> > noticed this??
> >
> > I think you're right that now we'll end up with more fragmentation, but
> > what you're suggesting is not how vma_merge() is supposed to work.
> >
> > As I said above, giving vma_merge() invalid parameters is very dangerous as
> > you could end up merging over empty ranges in theory (and could otherwise
> > have corruption).
> >
> > I guess we should probably be passing 0 to the last parameter in
> > split_vma() here then to ensure we do a merge pass too. Will experiment
> > with this.
> >
> > I'm confused as to how the remove from case 8 is not proceeding. I'll look
> > into this some more...
> >
> > Happy to be corrected if I'm misconstruing this!
> >
> 
> OK, so I wrote a small program to do perform exactly this case [0] and it seems
> that the outcome is the same before and after this patch - vma_merge() is
> clearly rejecting the case 8 merge (phew!) and in both instances you end up with
> 3 VMAs.
> 
> So this patch doesn't change this behaviour and everything is as it was
> before. Ideally we'd let it go for another pass, so maybe we should change the
> split to add a new VMA _afterwards_. Will experiment with that, separately.
> 
> But looks like the patch is good as it is.
> 
> (if you notice something wrong with the repro, etc. do let me know!)
> 
> [0]: https://gist.github.com/lorenzo-stoakes/a11a10f5f479e7a977fc456331266e0e

I think your test case is fine, but... no, this is still not expected. The
vma should just merge.

So I have another closer look on this specific issue, it seems we have a
long standing bug on pgoff calculation here when passing that to
vma_merge().  I've got another patch attached to show what I meant.

To summarize.. now I've got two patches attached:

Patch 1 is something I'd like to propose to replace this patch that fixes
incorrect use of vma_merge() so it should also eliminate the assertion
being triggered (I still think this is a regression but I need to check
which I will do later; I'm not super familiar with maple tree work, maybe
you or Liam can quickly spot).

Patch 2 fixes the long standing issue of vma not being able to merge in
above case, and with patch 2 applied it should start merging all right.

Please have a look, thanks.

---8<---
From 6bc39028bba246394bb0bafdaeaab7b8dfd1694f Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Tue, 16 May 2023 09:03:22 -0400
Subject: [PATCH 1/2] mm/uffd: Fix vma operation where start addr cuts part of
 vma

It seems vma merging with uffd paths is broken with either
register/unregister, where right now we can feed wrong parameters to
vma_merge() and it's found by recent patch which moved asserts upwards in
vma_merge():

https://lore.kernel.org/all/ZFunF7DmMdK05MoF@FVFF77S0Q05N.cambridge.arm.com/

The problem is in the current code base we didn't fixup "prev" for the case
where "start" address can be within the "prev" vma section.  In that case
we should have "prev" points to the current vma rather than the previous
one when feeding to vma_merge().

This will eliminate the report and make sure vma_merge() calls will become
legal again.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 fs/userfaultfd.c | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 0fd96d6e39ce..7eb88bc74d00 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1458,10 +1458,17 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	BUG_ON(!found);
 
 	vma_iter_set(&vmi, start);
-	prev = vma_prev(&vmi);
+	vma = vma_find(&vmi, end);
+	if (!vma)
+		goto out_unlock;
+
+	if (vma->vm_start < start)
+		prev = vma;
+	else
+		prev = vma_prev(&vmi);
 
 	ret = 0;
-	for_each_vma_range(vmi, vma, end) {
+	do {
 		cond_resched();
 
 		BUG_ON(!vma_can_userfault(vma, vm_flags));
@@ -1517,7 +1524,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	skip:
 		prev = vma;
 		start = vma->vm_end;
-	}
+	} for_each_vma_range(vmi, vma, end);
 
 out_unlock:
 	mmap_write_unlock(mm);
@@ -1624,9 +1631,17 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 	BUG_ON(!found);
 
 	vma_iter_set(&vmi, start);
-	prev = vma_prev(&vmi);
+	vma = vma_find(&vmi, end);
+	if (!vma)
+		goto out_unlock;
+
+	if (vma->vm_start < start)
+		prev = vma;
+	else
+		prev = vma_prev(&vmi);
+
 	ret = 0;
-	for_each_vma_range(vmi, vma, end) {
+	do {
 		cond_resched();
 
 		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
@@ -1692,7 +1707,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 	skip:
 		prev = vma;
 		start = vma->vm_end;
-	}
+	} for_each_vma_range(vmi, vma, end);
 
 out_unlock:
 	mmap_write_unlock(mm);
  
Liam R. Howlett May 16, 2023, 4:49 p.m. UTC | #8
* Peter Xu <peterx@redhat.com> [230516 11:06]:

...

> > > This seems like you're relying on:-
> > >
> > >   ***
> > > CCCCCNNNNN -> CCNNNNNNNN
> 
> I had a closer look today, I still think this patch is not really the right
> one.  The split/merge order is something we use everywhere and I am not
> convinced it must change as drastic.  At least so far it still seems to me
> if we do with what current patch proposed we can have vma fragmentations.
> 
> I think I see what you meant, but here I think it's a legal case where we
> should have PPPP rather than CCCC (PPPPPNNNN --> PPNNNNNNNN).
> 
> To be explicit, for register I think it _should_ be the case 0 where we
> cannot merge (note: the current code is indeed wrong though, see below):
> 
>    ****
>   PPPPPPNNNNNN
>   cannot merge
> 
> While for the unregister case here it's case 4:
> 
>     ****
>   PPPPPPNNNNNN
>   might become
>   PPNNNNNNNNNN
>   case 4 below
> 
> Here the problem is not that we need to do split then merge (I think it'll
> have the problem of vma defragmentation here), the problem is we simply
> passed in the wrong "prev" vma pointer, IMHO.  I've patches attached
> showing what I meant.
> 
> I checked the original commit from Andrea and I found that it _was_ correct:
> 
> commit 86039bd3b4e6a1129318cbfed4e0a6e001656635
> Author: Andrea Arcangeli <aarcange@redhat.com>
> Date:   Fri Sep 4 15:46:31 2015 -0700
> 
>     userfaultfd: add new syscall to provide memory externalization
> 
> I had a feeling that it's broken during the recent rework on vma (or maybe
> even not that close), but I'm not yet sure and need to further check.

I believe this was 69dbe6daf1041e32e003f966d71f70f20c63af53, which is my
work on this area.  Any patches will need to be backported to before
the vma iterator for 6.1

I think keeping the asserts and ensuring we are calling vma_merge() with
arguments that make sense is safer.

> 
> > >
> > > By specifying parameters that are compatible with N even though you're only
> > > partially spanning C?
> > >
> > > This is crazy, and isn't how this should be used. vma_merge() is not
> > > supposed to do partial merges. If it works (presumably it does) this is not
> > > by design unless I've lost my mind and I (and others) have somehow not
> > > noticed this??
> > >
> > > I think you're right that now we'll end up with more fragmentation, but
> > > what you're suggesting is not how vma_merge() is supposed to work.
> > >
> > > As I said above, giving vma_merge() invalid parameters is very dangerous as
> > > you could end up merging over empty ranges in theory (and could otherwise
> > > have corruption).
> > >
> > > I guess we should probably be passing 0 to the last parameter in
> > > split_vma() here then to ensure we do a merge pass too. Will experiment
> > > with this.
> > >
> > > I'm confused as to how the remove from case 8 is not proceeding. I'll look
> > > into this some more...
> > >
> > > Happy to be corrected if I'm misconstruing this!
> > >
> > 
> > OK, so I wrote a small program to do perform exactly this case [0] and it seems
> > that the outcome is the same before and after this patch - vma_merge() is
> > clearly rejecting the case 8 merge (phew!) and in both instances you end up with
> > 3 VMAs.
> > 
> > So this patch doesn't change this behaviour and everything is as it was
> > before. Ideally we'd let it go for another pass, so maybe we should change the
> > split to add a new VMA _afterwards_. Will experiment with that, separately.
> > 
> > But looks like the patch is good as it is.
> > 
> > (if you notice something wrong with the repro, etc. do let me know!)
> > 
> > [0]: https://gist.github.com/lorenzo-stoakes/a11a10f5f479e7a977fc456331266e0e
> 
> I think your test case is fine, but... no, this is still not expected. The
> vma should just merge.
> 
> So I have another closer look on this specific issue, it seems we have a
> long standing bug on pgoff calculation here when passing that to
> vma_merge().  I've got another patch attached to show what I meant.
> 
> To summarize.. now I've got two patches attached:
> 
> Patch 1 is something I'd like to propose to replace this patch that fixes
> incorrect use of vma_merge() so it should also eliminate the assertion
> being triggered (I still think this is a regression but I need to check
> which I will do later; I'm not super familiar with maple tree work, maybe
> you or Liam can quickly spot).
> 
> Patch 2 fixes the long standing issue of vma not being able to merge in
> above case, and with patch 2 applied it should start merging all right.
> 
> Please have a look, thanks.
> 
> ---8<---
> From 6bc39028bba246394bb0bafdaeaab7b8dfd1694f Mon Sep 17 00:00:00 2001
> From: Peter Xu <peterx@redhat.com>
> Date: Tue, 16 May 2023 09:03:22 -0400
> Subject: [PATCH 1/2] mm/uffd: Fix vma operation where start addr cuts part of
>  vma
> 
> It seems vma merging with uffd paths is broken with either
> register/unregister, where right now we can feed wrong parameters to
> vma_merge() and it's found by recent patch which moved asserts upwards in
> vma_merge():
> 
> https://lore.kernel.org/all/ZFunF7DmMdK05MoF@FVFF77S0Q05N.cambridge.arm.com/
> 
> The problem is in the current code base we didn't fixup "prev" for the case
> where "start" address can be within the "prev" vma section.  In that case
> we should have "prev" points to the current vma rather than the previous
> one when feeding to vma_merge().
> 
> This will eliminate the report and make sure vma_merge() calls will become
> legal again.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  fs/userfaultfd.c | 27 +++++++++++++++++++++------
>  1 file changed, 21 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 0fd96d6e39ce..7eb88bc74d00 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -1458,10 +1458,17 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  	BUG_ON(!found);
>  
>  	vma_iter_set(&vmi, start);
> -	prev = vma_prev(&vmi);
> +	vma = vma_find(&vmi, end);
> +	if (!vma)
> +		goto out_unlock;
> +
> +	if (vma->vm_start < start)
> +		prev = vma;
> +	else
> +		prev = vma_prev(&vmi);
>  
>  	ret = 0;
> -	for_each_vma_range(vmi, vma, end) {
> +	do {

The iterator may be off by one, depending on if vma_prev() is called or
not.

Perhaps:
	prev = vma_prev(&vmi); /* could be wrong, or null */
	vma = vma_find(&vmi, end);
	if (!vma)
		goto out_unlock;

	if (vma->vm_start < start)
		prev = vma;

now we know we are at vma with the iterator..
	ret = 0
	do{
	...


>  		cond_resched();
>  
>  		BUG_ON(!vma_can_userfault(vma, vm_flags));
> @@ -1517,7 +1524,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  	skip:
>  		prev = vma;
>  		start = vma->vm_end;
> -	}
> +	} for_each_vma_range(vmi, vma, end);
>  
>  out_unlock:
>  	mmap_write_unlock(mm);
> @@ -1624,9 +1631,17 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
>  	BUG_ON(!found);
>  
>  	vma_iter_set(&vmi, start);
> -	prev = vma_prev(&vmi);
> +	vma = vma_find(&vmi, end);
> +	if (!vma)
> +		goto out_unlock;
> +
> +	if (vma->vm_start < start)
> +		prev = vma;
> +	else
> +		prev = vma_prev(&vmi);
> +
>  	ret = 0;
> -	for_each_vma_range(vmi, vma, end) {
> +	do {
>  		cond_resched();
>  
>  		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
> @@ -1692,7 +1707,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
>  	skip:
>  		prev = vma;
>  		start = vma->vm_end;
> -	}
> +	} for_each_vma_range(vmi, vma, end);
>  
>  out_unlock:
>  	mmap_write_unlock(mm);
> -- 
> 2.39.1
> 
> From bf61f3c937e9e2ab96ab2bed0005cb7dc74db231 Mon Sep 17 00:00:00 2001
> From: Peter Xu <peterx@redhat.com>
> Date: Tue, 16 May 2023 09:39:38 -0400
> Subject: [PATCH 2/2] mm/uffd: Allow vma to merge as much as possible
> 
> We used to not pass in the pgoff correctly when register/unregister uffd
> regions, it caused incorrect behavior on vma merging.
> 
> For example, when we have:
> 
>   vma1(range 0-9, with uffd), vma2(range 10-19, no uffd)
> 
> Then someone unregisters uffd on range (5-9), it should become:
> 
>   vma1(range 0-4, with uffd), vma2(range 5-19, no uffd)
> 
> But with current code it's:
> 
>   vma1(range 0-4, with uffd), vma3(range 5-9, no uffd), vma2(range 10-19, no uffd)
> 
> This patch allows such merge to happen correctly.
> 
> This behavior seems to have existed since the 1st day of uffd, keep it just
> as a performance optmization and not copy stable.
> 
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: Mike Rapoport (IBM) <rppt@kernel.org>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  fs/userfaultfd.c | 8 ++++++--
>  1 file changed, 6 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 7eb88bc74d00..891048b4799f 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -1332,6 +1332,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  	bool basic_ioctls;
>  	unsigned long start, end, vma_end;
>  	struct vma_iterator vmi;
> +	pgoff_t pgoff;
>  
>  	user_uffdio_register = (struct uffdio_register __user *) arg;
>  
> @@ -1489,8 +1490,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  		vma_end = min(end, vma->vm_end);
>  
>  		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
> +		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);

I don't think this is safe.  You are telling vma_merge() something that
is not true and will result in can_vma_merge_before() passing.  I mean,
sure it will become true after you split (unless you can't?), but I
don't know if you can just merge a VMA that doesn't pass
can_vma_merge_before(), even for a short period?

>  		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
> -				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> +				 vma->anon_vma, vma->vm_file, pgoff,
>  				 vma_policy(vma),
>  				 ((struct vm_userfaultfd_ctx){ ctx }),
>  				 anon_vma_name(vma));
> @@ -1570,6 +1572,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
>  	unsigned long start, end, vma_end;
>  	const void __user *buf = (void __user *)arg;
>  	struct vma_iterator vmi;
> +	pgoff_t pgoff;
>  
>  	ret = -EFAULT;
>  	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
> @@ -1677,8 +1680,9 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
>  			uffd_wp_range(vma, start, vma_end - start, false);
>  
>  		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
> +		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
>  		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
> -				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> +				 vma->anon_vma, vma->vm_file, pgoff,
>  				 vma_policy(vma),
>  				 NULL_VM_UFFD_CTX, anon_vma_name(vma));
>  		if (prev) {
> -- 
> 2.39.1
> ---8<---
> 
> -- 
> Peter Xu
>
  
Lorenzo Stoakes May 16, 2023, 7:25 p.m. UTC | #9
On Tue, May 16, 2023 at 11:06:40AM -0400, Peter Xu wrote:
> On Tue, May 16, 2023 at 12:07:11AM +0100, Lorenzo Stoakes wrote:
> > On Mon, May 15, 2023 at 11:04:27PM +0100, Lorenzo Stoakes wrote:
> > [snip]
> > > > Could you explain a bit why we don't need to merge in this case?
> > > >
> > > > I'm considering, for example, when we have:
> > > >
> > > >   vma1(range 0-9, with uffd), vma2(range 10-19, no uffd)
> > > >
> > > > Then someone unregisters uffd on range (5-9), iiuc it should become:
> > > >
> > > >   vma1(range 0-4, with uffd), vma2(range 5-19, no uffd)
> > > >
> > > > But if no merge here it's:
> > > >
> > > >   vma1(range 0-4, with uffd), vma3(range 5-9, no uffd), vma2(range 10-19, no uffd)
> > > >
> > > > Maybe I missed something?
> > > >
> > >
> > > There's something really, really wrong with this. It simply isn't valid to
> > > invoke vma_merge() over an existing VMA that != prev where you're not
> > > specifying addr = vma->vm_start, end == vma->vm_end.
> > >
> > > This seems like you're relying on:-
> > >
> > >   ***
> > > CCCCCNNNNN -> CCNNNNNNNN
>
> I had a closer look today, I still think this patch is not really the right
> one.  The split/merge order is something we use everywhere and I am not
> convinced it must change as drastic.  At least so far it still seems to me
> if we do with what current patch proposed we can have vma fragmentations.

'something we use everywhere' is not an argument (speak to Willy about
folios), vma_merge() expects valid input, relying on it _happening_ to be
ok or to fail in ways that _happen_ not to cause big problems is not right.

This is just further evidence that the vma_merge() interface is
fundamentally broken. Implicitly assuming you will only get a partial prev
overlap merge is far from intuitive.

I am definitely going to try to do a series addressing vma_merge() horrors
because I feel like we need a generic means of doing this split/merge pattern.

>
> I think I see what you meant, but here I think it's a legal case where we
> should have PPPP rather than CCCC (PPPPPNNNN --> PPNNNNNNNN).
>
> To be explicit, for register I think it _should_ be the case 0 where we
> cannot merge (note: the current code is indeed wrong though, see below):
>
>    ****
>   PPPPPPNNNNNN
>   cannot merge
>
> While for the unregister case here it's case 4:
>
>     ****
>   PPPPPPNNNNNN
>   might become
>   PPNNNNNNNNNN
>   case 4 below
>
> Here the problem is not that we need to do split then merge (I think it'll
> have the problem of vma defragmentation here), the problem is we simply
> passed in the wrong "prev" vma pointer, IMHO.  I've patches attached
> showing what I meant.

Yeah if you do it with prev = vma this form should _probably_ work, that's
a good point. This _is_ a case (see https://ljs.io/vma_merge_cases.png for
nice diagram of cases btw :), like 5, where we actually do split and merge
at the same time.

Liam's raised some issues with the safety of your patches, let me look at
them when I get a chance, nasty headcold = brain less functional atm.

I would say for now this fix resolves the issue in a way that should
emphatically avoid invalid input to vma_merge(), the fragmentation existed
before so this is not a new issue, so for the time being I think it's ok to
stay as-is.

>
> I checked the original commit from Andrea and I found that it _was_ correct:
>
> commit 86039bd3b4e6a1129318cbfed4e0a6e001656635
> Author: Andrea Arcangeli <aarcange@redhat.com>
> Date:   Fri Sep 4 15:46:31 2015 -0700
>
>     userfaultfd: add new syscall to provide memory externalization
>
> I had a feeling that it's broken during the recent rework on vma (or maybe
> even not that close), but I'm not yet sure and need to further check.
>
> > >
> > > By specifying parameters that are compatible with N even though you're only
> > > partially spanning C?
> > >
> > > This is crazy, and isn't how this should be used. vma_merge() is not
> > > supposed to do partial merges. If it works (presumably it does) this is not
> > > by design unless I've lost my mind and I (and others) have somehow not
> > > noticed this??
> > >
> > > I think you're right that now we'll end up with more fragmentation, but
> > > what you're suggesting is not how vma_merge() is supposed to work.
> > >
> > > As I said above, giving vma_merge() invalid parameters is very dangerous as
> > > you could end up merging over empty ranges in theory (and could otherwise
> > > have corruption).
> > >
> > > I guess we should probably be passing 0 to the last parameter in
> > > split_vma() here then to ensure we do a merge pass too. Will experiment
> > > with this.
> > >
> > > I'm confused as to how the remove from case 8 is not proceeding. I'll look
> > > into this some more...
> > >
> > > Happy to be corrected if I'm misconstruing this!
> > >
> >
> > OK, so I wrote a small program to do perform exactly this case [0] and it seems
> > that the outcome is the same before and after this patch - vma_merge() is
> > clearly rejecting the case 8 merge (phew!) and in both instances you end up with
> > 3 VMAs.
> >
> > So this patch doesn't change this behaviour and everything is as it was
> > before. Ideally we'd let it go for another pass, so maybe we should change the
> > split to add a new VMA _afterwards_. Will experiment with that, separately.
> >
> > But looks like the patch is good as it is.
> >
> > (if you notice something wrong with the repro, etc. do let me know!)
> >
> > [0]: https://gist.github.com/lorenzo-stoakes/a11a10f5f479e7a977fc456331266e0e
>
> I think your test case is fine, but... no, this is still not expected. The
> vma should just merge.
>
> So I have another closer look on this specific issue, it seems we have a
> long standing bug on pgoff calculation here when passing that to
> vma_merge().  I've got another patch attached to show what I meant.
>
> To summarize.. now I've got two patches attached:
>
> Patch 1 is something I'd like to propose to replace this patch that fixes
> incorrect use of vma_merge() so it should also eliminate the assertion
> being triggered (I still think this is a regression but I need to check
> which I will do later; I'm not super familiar with maple tree work, maybe
> you or Liam can quickly spot).
>
> Patch 2 fixes the long standing issue of vma not being able to merge in
> above case, and with patch 2 applied it should start merging all right.
>
> Please have a look, thanks.
>
> ---8<---
> From 6bc39028bba246394bb0bafdaeaab7b8dfd1694f Mon Sep 17 00:00:00 2001
> From: Peter Xu <peterx@redhat.com>
> Date: Tue, 16 May 2023 09:03:22 -0400
> Subject: [PATCH 1/2] mm/uffd: Fix vma operation where start addr cuts part of
>  vma
>
> It seems vma merging with uffd paths is broken with either
> register/unregister, where right now we can feed wrong parameters to
> vma_merge() and it's found by recent patch which moved asserts upwards in
> vma_merge():
>
> https://lore.kernel.org/all/ZFunF7DmMdK05MoF@FVFF77S0Q05N.cambridge.arm.com/
>
> The problem is in the current code base we didn't fixup "prev" for the case
> where "start" address can be within the "prev" vma section.  In that case
> we should have "prev" points to the current vma rather than the previous
> one when feeding to vma_merge().
>
> This will eliminate the report and make sure vma_merge() calls will become
> legal again.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  fs/userfaultfd.c | 27 +++++++++++++++++++++------
>  1 file changed, 21 insertions(+), 6 deletions(-)
>
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 0fd96d6e39ce..7eb88bc74d00 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -1458,10 +1458,17 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  	BUG_ON(!found);
>
>  	vma_iter_set(&vmi, start);
> -	prev = vma_prev(&vmi);
> +	vma = vma_find(&vmi, end);
> +	if (!vma)
> +		goto out_unlock;
> +
> +	if (vma->vm_start < start)
> +		prev = vma;
> +	else
> +		prev = vma_prev(&vmi);
>
>  	ret = 0;
> -	for_each_vma_range(vmi, vma, end) {
> +	do {
>  		cond_resched();
>
>  		BUG_ON(!vma_can_userfault(vma, vm_flags));
> @@ -1517,7 +1524,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  	skip:
>  		prev = vma;
>  		start = vma->vm_end;
> -	}
> +	} for_each_vma_range(vmi, vma, end);
>
>  out_unlock:
>  	mmap_write_unlock(mm);
> @@ -1624,9 +1631,17 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
>  	BUG_ON(!found);
>
>  	vma_iter_set(&vmi, start);
> -	prev = vma_prev(&vmi);
> +	vma = vma_find(&vmi, end);
> +	if (!vma)
> +		goto out_unlock;
> +
> +	if (vma->vm_start < start)
> +		prev = vma;
> +	else
> +		prev = vma_prev(&vmi);
> +
>  	ret = 0;
> -	for_each_vma_range(vmi, vma, end) {
> +	do {
>  		cond_resched();
>
>  		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
> @@ -1692,7 +1707,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
>  	skip:
>  		prev = vma;
>  		start = vma->vm_end;
> -	}
> +	} for_each_vma_range(vmi, vma, end);
>
>  out_unlock:
>  	mmap_write_unlock(mm);
> --
> 2.39.1
>
> From bf61f3c937e9e2ab96ab2bed0005cb7dc74db231 Mon Sep 17 00:00:00 2001
> From: Peter Xu <peterx@redhat.com>
> Date: Tue, 16 May 2023 09:39:38 -0400
> Subject: [PATCH 2/2] mm/uffd: Allow vma to merge as much as possible
>
> We used to not pass in the pgoff correctly when register/unregister uffd
> regions, it caused incorrect behavior on vma merging.
>
> For example, when we have:
>
>   vma1(range 0-9, with uffd), vma2(range 10-19, no uffd)
>
> Then someone unregisters uffd on range (5-9), it should become:
>
>   vma1(range 0-4, with uffd), vma2(range 5-19, no uffd)
>
> But with current code it's:
>
>   vma1(range 0-4, with uffd), vma3(range 5-9, no uffd), vma2(range 10-19, no uffd)
>
> This patch allows such merge to happen correctly.
>
> This behavior seems to have existed since the 1st day of uffd, keep it just
> as a performance optmization and not copy stable.
>
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: Mike Rapoport (IBM) <rppt@kernel.org>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  fs/userfaultfd.c | 8 ++++++--
>  1 file changed, 6 insertions(+), 2 deletions(-)
>
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 7eb88bc74d00..891048b4799f 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -1332,6 +1332,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  	bool basic_ioctls;
>  	unsigned long start, end, vma_end;
>  	struct vma_iterator vmi;
> +	pgoff_t pgoff;
>
>  	user_uffdio_register = (struct uffdio_register __user *) arg;
>
> @@ -1489,8 +1490,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  		vma_end = min(end, vma->vm_end);
>
>  		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
> +		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
>  		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
> -				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> +				 vma->anon_vma, vma->vm_file, pgoff,
>  				 vma_policy(vma),
>  				 ((struct vm_userfaultfd_ctx){ ctx }),
>  				 anon_vma_name(vma));
> @@ -1570,6 +1572,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
>  	unsigned long start, end, vma_end;
>  	const void __user *buf = (void __user *)arg;
>  	struct vma_iterator vmi;
> +	pgoff_t pgoff;
>
>  	ret = -EFAULT;
>  	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
> @@ -1677,8 +1680,9 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
>  			uffd_wp_range(vma, start, vma_end - start, false);
>
>  		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
> +		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
>  		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
> -				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> +				 vma->anon_vma, vma->vm_file, pgoff,
>  				 vma_policy(vma),
>  				 NULL_VM_UFFD_CTX, anon_vma_name(vma));
>  		if (prev) {
> --
> 2.39.1
> ---8<---
>
> --
> Peter Xu
>
  
Peter Xu May 16, 2023, 8:12 p.m. UTC | #10
Hi, Liam,

On Tue, May 16, 2023 at 12:49:07PM -0400, Liam R. Howlett wrote:
> * Peter Xu <peterx@redhat.com> [230516 11:06]:
> 
> ...
> 
> > > > This seems like you're relying on:-
> > > >
> > > >   ***
> > > > CCCCCNNNNN -> CCNNNNNNNN
> > 
> > I had a closer look today, I still think this patch is not really the right
> > one.  The split/merge order is something we use everywhere and I am not
> > convinced it must change as drastic.  At least so far it still seems to me
> > if we do with what current patch proposed we can have vma fragmentations.
> > 
> > I think I see what you meant, but here I think it's a legal case where we
> > should have PPPP rather than CCCC (PPPPPNNNN --> PPNNNNNNNN).
> > 
> > To be explicit, for register I think it _should_ be the case 0 where we
> > cannot merge (note: the current code is indeed wrong though, see below):
> > 
> >    ****
> >   PPPPPPNNNNNN
> >   cannot merge
> > 
> > While for the unregister case here it's case 4:
> > 
> >     ****
> >   PPPPPPNNNNNN
> >   might become
> >   PPNNNNNNNNNN
> >   case 4 below
> > 
> > Here the problem is not that we need to do split then merge (I think it'll
> > have the problem of vma defragmentation here), the problem is we simply
> > passed in the wrong "prev" vma pointer, IMHO.  I've patches attached
> > showing what I meant.
> > 
> > I checked the original commit from Andrea and I found that it _was_ correct:
> > 
> > commit 86039bd3b4e6a1129318cbfed4e0a6e001656635
> > Author: Andrea Arcangeli <aarcange@redhat.com>
> > Date:   Fri Sep 4 15:46:31 2015 -0700
> > 
> >     userfaultfd: add new syscall to provide memory externalization
> > 
> > I had a feeling that it's broken during the recent rework on vma (or maybe
> > even not that close), but I'm not yet sure and need to further check.
> 
> I believe this was 69dbe6daf1041e32e003f966d71f70f20c63af53, which is my
> work on this area.  Any patches will need to be backported to before
> the vma iterator for 6.1

Thanks for confirming this.

> 
> I think keeping the asserts and ensuring we are calling vma_merge() with
> arguments that make sense is safer.

Yes, definitely.

> 
> > 
> > > >
> > > > By specifying parameters that are compatible with N even though you're only
> > > > partially spanning C?
> > > >
> > > > This is crazy, and isn't how this should be used. vma_merge() is not
> > > > supposed to do partial merges. If it works (presumably it does) this is not
> > > > by design unless I've lost my mind and I (and others) have somehow not
> > > > noticed this??
> > > >
> > > > I think you're right that now we'll end up with more fragmentation, but
> > > > what you're suggesting is not how vma_merge() is supposed to work.
> > > >
> > > > As I said above, giving vma_merge() invalid parameters is very dangerous as
> > > > you could end up merging over empty ranges in theory (and could otherwise
> > > > have corruption).
> > > >
> > > > I guess we should probably be passing 0 to the last parameter in
> > > > split_vma() here then to ensure we do a merge pass too. Will experiment
> > > > with this.
> > > >
> > > > I'm confused as to how the remove from case 8 is not proceeding. I'll look
> > > > into this some more...
> > > >
> > > > Happy to be corrected if I'm misconstruing this!
> > > >
> > > 
> > > OK, so I wrote a small program to do perform exactly this case [0] and it seems
> > > that the outcome is the same before and after this patch - vma_merge() is
> > > clearly rejecting the case 8 merge (phew!) and in both instances you end up with
> > > 3 VMAs.
> > > 
> > > So this patch doesn't change this behaviour and everything is as it was
> > > before. Ideally we'd let it go for another pass, so maybe we should change the
> > > split to add a new VMA _afterwards_. Will experiment with that, separately.
> > > 
> > > But looks like the patch is good as it is.
> > > 
> > > (if you notice something wrong with the repro, etc. do let me know!)
> > > 
> > > [0]: https://gist.github.com/lorenzo-stoakes/a11a10f5f479e7a977fc456331266e0e
> > 
> > I think your test case is fine, but... no, this is still not expected. The
> > vma should just merge.
> > 
> > So I have another closer look on this specific issue, it seems we have a
> > long standing bug on pgoff calculation here when passing that to
> > vma_merge().  I've got another patch attached to show what I meant.
> > 
> > To summarize.. now I've got two patches attached:
> > 
> > Patch 1 is something I'd like to propose to replace this patch that fixes
> > incorrect use of vma_merge() so it should also eliminate the assertion
> > being triggered (I still think this is a regression but I need to check
> > which I will do later; I'm not super familiar with maple tree work, maybe
> > you or Liam can quickly spot).
> > 
> > Patch 2 fixes the long standing issue of vma not being able to merge in
> > above case, and with patch 2 applied it should start merging all right.
> > 
> > Please have a look, thanks.
> > 
> > ---8<---
> > From 6bc39028bba246394bb0bafdaeaab7b8dfd1694f Mon Sep 17 00:00:00 2001
> > From: Peter Xu <peterx@redhat.com>
> > Date: Tue, 16 May 2023 09:03:22 -0400
> > Subject: [PATCH 1/2] mm/uffd: Fix vma operation where start addr cuts part of
> >  vma
> > 
> > It seems vma merging with uffd paths is broken with either
> > register/unregister, where right now we can feed wrong parameters to
> > vma_merge() and it's found by recent patch which moved asserts upwards in
> > vma_merge():
> > 
> > https://lore.kernel.org/all/ZFunF7DmMdK05MoF@FVFF77S0Q05N.cambridge.arm.com/
> > 
> > The problem is in the current code base we didn't fixup "prev" for the case
> > where "start" address can be within the "prev" vma section.  In that case
> > we should have "prev" points to the current vma rather than the previous
> > one when feeding to vma_merge().
> > 
> > This will eliminate the report and make sure vma_merge() calls will become
> > legal again.
> > 
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >  fs/userfaultfd.c | 27 +++++++++++++++++++++------
> >  1 file changed, 21 insertions(+), 6 deletions(-)
> > 
> > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > index 0fd96d6e39ce..7eb88bc74d00 100644
> > --- a/fs/userfaultfd.c
> > +++ b/fs/userfaultfd.c
> > @@ -1458,10 +1458,17 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> >  	BUG_ON(!found);
> >  
> >  	vma_iter_set(&vmi, start);
> > -	prev = vma_prev(&vmi);
> > +	vma = vma_find(&vmi, end);
> > +	if (!vma)
> > +		goto out_unlock;
> > +
> > +	if (vma->vm_start < start)
> > +		prev = vma;
> > +	else
> > +		prev = vma_prev(&vmi);
> >  
> >  	ret = 0;
> > -	for_each_vma_range(vmi, vma, end) {
> > +	do {
> 
> The iterator may be off by one, depending on if vma_prev() is called or
> not.
> 
> Perhaps:
> 	prev = vma_prev(&vmi); /* could be wrong, or null */
> 	vma = vma_find(&vmi, end);
> 	if (!vma)
> 		goto out_unlock;
> 
> 	if (vma->vm_start < start)
> 		prev = vma;
> 
> now we know we are at vma with the iterator..
> 	ret = 0
> 	do{
> 	...

Will do, thanks.

I think I got trapped similarly when I was looking at xarray months ago
where xarray also had similar side effects to have off-by-one the iterator
behavior.

Do you think it'll make sense to have something describing such side
effects for maple tree (or the current vma api), or.. maybe there's already
some but I just didn't really know?

> 
> 
> >  		cond_resched();
> >  
> >  		BUG_ON(!vma_can_userfault(vma, vm_flags));
> > @@ -1517,7 +1524,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> >  	skip:
> >  		prev = vma;
> >  		start = vma->vm_end;
> > -	}
> > +	} for_each_vma_range(vmi, vma, end);
> >  
> >  out_unlock:
> >  	mmap_write_unlock(mm);
> > @@ -1624,9 +1631,17 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> >  	BUG_ON(!found);
> >  
> >  	vma_iter_set(&vmi, start);
> > -	prev = vma_prev(&vmi);
> > +	vma = vma_find(&vmi, end);
> > +	if (!vma)
> > +		goto out_unlock;
> > +
> > +	if (vma->vm_start < start)
> > +		prev = vma;
> > +	else
> > +		prev = vma_prev(&vmi);
> > +
> >  	ret = 0;
> > -	for_each_vma_range(vmi, vma, end) {
> > +	do {
> >  		cond_resched();
> >  
> >  		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
> > @@ -1692,7 +1707,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> >  	skip:
> >  		prev = vma;
> >  		start = vma->vm_end;
> > -	}
> > +	} for_each_vma_range(vmi, vma, end);
> >  
> >  out_unlock:
> >  	mmap_write_unlock(mm);
> > -- 
> > 2.39.1
> > 
> > From bf61f3c937e9e2ab96ab2bed0005cb7dc74db231 Mon Sep 17 00:00:00 2001
> > From: Peter Xu <peterx@redhat.com>
> > Date: Tue, 16 May 2023 09:39:38 -0400
> > Subject: [PATCH 2/2] mm/uffd: Allow vma to merge as much as possible
> > 
> > We used to not pass in the pgoff correctly when register/unregister uffd
> > regions, it caused incorrect behavior on vma merging.
> > 
> > For example, when we have:
> > 
> >   vma1(range 0-9, with uffd), vma2(range 10-19, no uffd)
> > 
> > Then someone unregisters uffd on range (5-9), it should become:
> > 
> >   vma1(range 0-4, with uffd), vma2(range 5-19, no uffd)
> > 
> > But with current code it's:
> > 
> >   vma1(range 0-4, with uffd), vma3(range 5-9, no uffd), vma2(range 10-19, no uffd)
> > 
> > This patch allows such merge to happen correctly.
> > 
> > This behavior seems to have existed since the 1st day of uffd, keep it just
> > as a performance optmization and not copy stable.
> > 
> > Cc: Andrea Arcangeli <aarcange@redhat.com>
> > Cc: Mike Rapoport (IBM) <rppt@kernel.org>
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >  fs/userfaultfd.c | 8 ++++++--
> >  1 file changed, 6 insertions(+), 2 deletions(-)
> > 
> > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > index 7eb88bc74d00..891048b4799f 100644
> > --- a/fs/userfaultfd.c
> > +++ b/fs/userfaultfd.c
> > @@ -1332,6 +1332,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> >  	bool basic_ioctls;
> >  	unsigned long start, end, vma_end;
> >  	struct vma_iterator vmi;
> > +	pgoff_t pgoff;
> >  
> >  	user_uffdio_register = (struct uffdio_register __user *) arg;
> >  
> > @@ -1489,8 +1490,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> >  		vma_end = min(end, vma->vm_end);
> >  
> >  		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
> > +		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> 
> I don't think this is safe.  You are telling vma_merge() something that
> is not true and will result in can_vma_merge_before() passing.  I mean,
> sure it will become true after you split (unless you can't?), but I
> don't know if you can just merge a VMA that doesn't pass
> can_vma_merge_before(), even for a short period?

I must admit I'm not really that handy yet to vma codes, so I could miss
something obvious.

My reasoning comes from two parts that this pgoff looks all fine:

1) It's documented in vma_merge() in that:

 * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
 * figure out ...

  So fundamentally this pgoff is part of the mapping request paired with
  all the rest of the information.  AFAICT it means it must match with what
  "addr" is describing in VA address space.  That's why I think offseting
  it makes sense here.

  It also matches my understanding in vma_merge() code on how the pgoff is
  used.

2) Uffd is nothing special in this regard, namely:

   mbind_range():

	pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT);
	merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags,
			 vma->anon_vma, vma->vm_file, pgoff, new_pol,
			 vma->vm_userfaultfd_ctx, anon_vma_name(vma));

   mlock_fixup():
   
	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(vmi, mm, *prev, start, end, newflags,
			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
			vma->vm_userfaultfd_ctx, anon_vma_name(vma));

   mprotect_fixup():

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*pprev = vma_merge(vmi, mm, *pprev, start, end, newflags,
			   vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
			   vma->vm_userfaultfd_ctx, anon_vma_name(vma));

I had a feeling that it's just something overlooked in the initial proposal
of uffd, but maybe I missed something important?

> 
> >  		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
> > -				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> > +				 vma->anon_vma, vma->vm_file, pgoff,
> >  				 vma_policy(vma),
> >  				 ((struct vm_userfaultfd_ctx){ ctx }),
> >  				 anon_vma_name(vma));
> > @@ -1570,6 +1572,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> >  	unsigned long start, end, vma_end;
> >  	const void __user *buf = (void __user *)arg;
> >  	struct vma_iterator vmi;
> > +	pgoff_t pgoff;
> >  
> >  	ret = -EFAULT;
> >  	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
> > @@ -1677,8 +1680,9 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> >  			uffd_wp_range(vma, start, vma_end - start, false);
> >  
> >  		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
> > +		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> >  		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
> > -				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> > +				 vma->anon_vma, vma->vm_file, pgoff,
> >  				 vma_policy(vma),
> >  				 NULL_VM_UFFD_CTX, anon_vma_name(vma));
> >  		if (prev) {
> > -- 
> > 2.39.1
> > ---8<---
> > 
> > -- 
> > Peter Xu
> > 
>
  
Peter Xu May 16, 2023, 8:30 p.m. UTC | #11
On Tue, May 16, 2023 at 08:25:13PM +0100, Lorenzo Stoakes wrote:
> On Tue, May 16, 2023 at 11:06:40AM -0400, Peter Xu wrote:
> > On Tue, May 16, 2023 at 12:07:11AM +0100, Lorenzo Stoakes wrote:
> > > On Mon, May 15, 2023 at 11:04:27PM +0100, Lorenzo Stoakes wrote:
> > > [snip]
> > > > > Could you explain a bit why we don't need to merge in this case?
> > > > >
> > > > > I'm considering, for example, when we have:
> > > > >
> > > > >   vma1(range 0-9, with uffd), vma2(range 10-19, no uffd)
> > > > >
> > > > > Then someone unregisters uffd on range (5-9), iiuc it should become:
> > > > >
> > > > >   vma1(range 0-4, with uffd), vma2(range 5-19, no uffd)
> > > > >
> > > > > But if no merge here it's:
> > > > >
> > > > >   vma1(range 0-4, with uffd), vma3(range 5-9, no uffd), vma2(range 10-19, no uffd)
> > > > >
> > > > > Maybe I missed something?
> > > > >
> > > >
> > > > There's something really, really wrong with this. It simply isn't valid to
> > > > invoke vma_merge() over an existing VMA that != prev where you're not
> > > > specifying addr = vma->vm_start, end == vma->vm_end.
> > > >
> > > > This seems like you're relying on:-
> > > >
> > > >   ***
> > > > CCCCCNNNNN -> CCNNNNNNNN
> >
> > I had a closer look today, I still think this patch is not really the right
> > one.  The split/merge order is something we use everywhere and I am not
> > convinced it must change as drastic.  At least so far it still seems to me
> > if we do with what current patch proposed we can have vma fragmentations.
> 
> 'something we use everywhere' is not an argument (speak to Willy about
> folios), vma_merge() expects valid input, relying on it _happening_ to be

I still think it's an argument.

I believe Matthew tried hard to justify he's correct in that aspect when
changing "something we used everywhere". Blindly referencing it doesn't
help much on a technical discussion.

If we have similar code that handles similar things, IMHO you need a reason
to modify one of them to not like the other.

If you think what you proposed is better, please consider (1) justify why
it's better, and then (2) also apply it to all the rest places where
applicable.  Please refer to my reply to Liam on the other use cases of
vma_merge().

Said that, I apprecaite a lot on your assert patch that found this problem.

> ok or to fail in ways that _happen_ not to cause big problems is not right.
> 
> This is just further evidence that the vma_merge() interface is
> fundamentally broken. Implicitly assuming you will only get a partial prev
> overlap merge is far from intuitive.
> 
> I am definitely going to try to do a series addressing vma_merge() horrors
> because I feel like we need a generic means of doing this split/merge pattern.

It'll be great if you can make it better.

> 
> >
> > I think I see what you meant, but here I think it's a legal case where we
> > should have PPPP rather than CCCC (PPPPPNNNN --> PPNNNNNNNN).
> >
> > To be explicit, for register I think it _should_ be the case 0 where we
> > cannot merge (note: the current code is indeed wrong though, see below):
> >
> >    ****
> >   PPPPPPNNNNNN
> >   cannot merge
> >
> > While for the unregister case here it's case 4:
> >
> >     ****
> >   PPPPPPNNNNNN
> >   might become
> >   PPNNNNNNNNNN
> >   case 4 below
> >
> > Here the problem is not that we need to do split then merge (I think it'll
> > have the problem of vma defragmentation here), the problem is we simply
> > passed in the wrong "prev" vma pointer, IMHO.  I've patches attached
> > showing what I meant.
> 
> Yeah if you do it with prev = vma this form should _probably_ work, that's
> a good point. This _is_ a case (see https://ljs.io/vma_merge_cases.png for
> nice diagram of cases btw :), like 5, where we actually do split and merge
> at the same time.

It's not something I came up with myself, it's just that I started looking
back to see what people did and trying to understand why they did it.
Normally people did things with good reasons.

So far it seems clearer at least to me to keep this pattern of "merge then
split".  But I'm happy to be proven wrong anytime.

> 
> Liam's raised some issues with the safety of your patches, let me look at
> them when I get a chance, nasty headcold = brain less functional atm.

Sure, no need to rush.

> 
> I would say for now this fix resolves the issue in a way that should
> emphatically avoid invalid input to vma_merge(), the fragmentation existed
> before so this is not a new issue, so for the time being I think it's ok to
> stay as-is.

So far I would still suggest having uffd code the same as the rest if
possible.

I think I'll wait for the other discussion to settle on patch 2 to see
whether I should post a formal patchset.

> 
> >
> > I checked the original commit from Andrea and I found that it _was_ correct:
> >
> > commit 86039bd3b4e6a1129318cbfed4e0a6e001656635
> > Author: Andrea Arcangeli <aarcange@redhat.com>
> > Date:   Fri Sep 4 15:46:31 2015 -0700
> >
> >     userfaultfd: add new syscall to provide memory externalization
> >
> > I had a feeling that it's broken during the recent rework on vma (or maybe
> > even not that close), but I'm not yet sure and need to further check.
> >
> > > >
> > > > By specifying parameters that are compatible with N even though you're only
> > > > partially spanning C?
> > > >
> > > > This is crazy, and isn't how this should be used. vma_merge() is not
> > > > supposed to do partial merges. If it works (presumably it does) this is not
> > > > by design unless I've lost my mind and I (and others) have somehow not
> > > > noticed this??
> > > >
> > > > I think you're right that now we'll end up with more fragmentation, but
> > > > what you're suggesting is not how vma_merge() is supposed to work.
> > > >
> > > > As I said above, giving vma_merge() invalid parameters is very dangerous as
> > > > you could end up merging over empty ranges in theory (and could otherwise
> > > > have corruption).
> > > >
> > > > I guess we should probably be passing 0 to the last parameter in
> > > > split_vma() here then to ensure we do a merge pass too. Will experiment
> > > > with this.
> > > >
> > > > I'm confused as to how the remove from case 8 is not proceeding. I'll look
> > > > into this some more...
> > > >
> > > > Happy to be corrected if I'm misconstruing this!
> > > >
> > >
> > > OK, so I wrote a small program to do perform exactly this case [0] and it seems
> > > that the outcome is the same before and after this patch - vma_merge() is
> > > clearly rejecting the case 8 merge (phew!) and in both instances you end up with
> > > 3 VMAs.
> > >
> > > So this patch doesn't change this behaviour and everything is as it was
> > > before. Ideally we'd let it go for another pass, so maybe we should change the
> > > split to add a new VMA _afterwards_. Will experiment with that, separately.
> > >
> > > But looks like the patch is good as it is.
> > >
> > > (if you notice something wrong with the repro, etc. do let me know!)
> > >
> > > [0]: https://gist.github.com/lorenzo-stoakes/a11a10f5f479e7a977fc456331266e0e
> >
> > I think your test case is fine, but... no, this is still not expected. The
> > vma should just merge.
> >
> > So I have another closer look on this specific issue, it seems we have a
> > long standing bug on pgoff calculation here when passing that to
> > vma_merge().  I've got another patch attached to show what I meant.
> >
> > To summarize.. now I've got two patches attached:
> >
> > Patch 1 is something I'd like to propose to replace this patch that fixes
> > incorrect use of vma_merge() so it should also eliminate the assertion
> > being triggered (I still think this is a regression but I need to check
> > which I will do later; I'm not super familiar with maple tree work, maybe
> > you or Liam can quickly spot).
> >
> > Patch 2 fixes the long standing issue of vma not being able to merge in
> > above case, and with patch 2 applied it should start merging all right.
> >
> > Please have a look, thanks.
> >
> > ---8<---
> > From 6bc39028bba246394bb0bafdaeaab7b8dfd1694f Mon Sep 17 00:00:00 2001
> > From: Peter Xu <peterx@redhat.com>
> > Date: Tue, 16 May 2023 09:03:22 -0400
> > Subject: [PATCH 1/2] mm/uffd: Fix vma operation where start addr cuts part of
> >  vma
> >
> > It seems vma merging with uffd paths is broken with either
> > register/unregister, where right now we can feed wrong parameters to
> > vma_merge() and it's found by recent patch which moved asserts upwards in
> > vma_merge():
> >
> > https://lore.kernel.org/all/ZFunF7DmMdK05MoF@FVFF77S0Q05N.cambridge.arm.com/
> >
> > The problem is in the current code base we didn't fixup "prev" for the case
> > where "start" address can be within the "prev" vma section.  In that case
> > we should have "prev" points to the current vma rather than the previous
> > one when feeding to vma_merge().
> >
> > This will eliminate the report and make sure vma_merge() calls will become
> > legal again.
> >
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >  fs/userfaultfd.c | 27 +++++++++++++++++++++------
> >  1 file changed, 21 insertions(+), 6 deletions(-)
> >
> > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > index 0fd96d6e39ce..7eb88bc74d00 100644
> > --- a/fs/userfaultfd.c
> > +++ b/fs/userfaultfd.c
> > @@ -1458,10 +1458,17 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> >  	BUG_ON(!found);
> >
> >  	vma_iter_set(&vmi, start);
> > -	prev = vma_prev(&vmi);
> > +	vma = vma_find(&vmi, end);
> > +	if (!vma)
> > +		goto out_unlock;
> > +
> > +	if (vma->vm_start < start)
> > +		prev = vma;
> > +	else
> > +		prev = vma_prev(&vmi);
> >
> >  	ret = 0;
> > -	for_each_vma_range(vmi, vma, end) {
> > +	do {
> >  		cond_resched();
> >
> >  		BUG_ON(!vma_can_userfault(vma, vm_flags));
> > @@ -1517,7 +1524,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> >  	skip:
> >  		prev = vma;
> >  		start = vma->vm_end;
> > -	}
> > +	} for_each_vma_range(vmi, vma, end);
> >
> >  out_unlock:
> >  	mmap_write_unlock(mm);
> > @@ -1624,9 +1631,17 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> >  	BUG_ON(!found);
> >
> >  	vma_iter_set(&vmi, start);
> > -	prev = vma_prev(&vmi);
> > +	vma = vma_find(&vmi, end);
> > +	if (!vma)
> > +		goto out_unlock;
> > +
> > +	if (vma->vm_start < start)
> > +		prev = vma;
> > +	else
> > +		prev = vma_prev(&vmi);
> > +
> >  	ret = 0;
> > -	for_each_vma_range(vmi, vma, end) {
> > +	do {
> >  		cond_resched();
> >
> >  		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
> > @@ -1692,7 +1707,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> >  	skip:
> >  		prev = vma;
> >  		start = vma->vm_end;
> > -	}
> > +	} for_each_vma_range(vmi, vma, end);
> >
> >  out_unlock:
> >  	mmap_write_unlock(mm);
> > --
> > 2.39.1
> >
> > From bf61f3c937e9e2ab96ab2bed0005cb7dc74db231 Mon Sep 17 00:00:00 2001
> > From: Peter Xu <peterx@redhat.com>
> > Date: Tue, 16 May 2023 09:39:38 -0400
> > Subject: [PATCH 2/2] mm/uffd: Allow vma to merge as much as possible
> >
> > We used to not pass in the pgoff correctly when register/unregister uffd
> > regions, it caused incorrect behavior on vma merging.
> >
> > For example, when we have:
> >
> >   vma1(range 0-9, with uffd), vma2(range 10-19, no uffd)
> >
> > Then someone unregisters uffd on range (5-9), it should become:
> >
> >   vma1(range 0-4, with uffd), vma2(range 5-19, no uffd)
> >
> > But with current code it's:
> >
> >   vma1(range 0-4, with uffd), vma3(range 5-9, no uffd), vma2(range 10-19, no uffd)
> >
> > This patch allows such merge to happen correctly.
> >
> > This behavior seems to have existed since the 1st day of uffd, keep it just
> > as a performance optmization and not copy stable.
> >
> > Cc: Andrea Arcangeli <aarcange@redhat.com>
> > Cc: Mike Rapoport (IBM) <rppt@kernel.org>
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >  fs/userfaultfd.c | 8 ++++++--
> >  1 file changed, 6 insertions(+), 2 deletions(-)
> >
> > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > index 7eb88bc74d00..891048b4799f 100644
> > --- a/fs/userfaultfd.c
> > +++ b/fs/userfaultfd.c
> > @@ -1332,6 +1332,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> >  	bool basic_ioctls;
> >  	unsigned long start, end, vma_end;
> >  	struct vma_iterator vmi;
> > +	pgoff_t pgoff;
> >
> >  	user_uffdio_register = (struct uffdio_register __user *) arg;
> >
> > @@ -1489,8 +1490,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> >  		vma_end = min(end, vma->vm_end);
> >
> >  		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
> > +		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> >  		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
> > -				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> > +				 vma->anon_vma, vma->vm_file, pgoff,
> >  				 vma_policy(vma),
> >  				 ((struct vm_userfaultfd_ctx){ ctx }),
> >  				 anon_vma_name(vma));
> > @@ -1570,6 +1572,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> >  	unsigned long start, end, vma_end;
> >  	const void __user *buf = (void __user *)arg;
> >  	struct vma_iterator vmi;
> > +	pgoff_t pgoff;
> >
> >  	ret = -EFAULT;
> >  	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
> > @@ -1677,8 +1680,9 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> >  			uffd_wp_range(vma, start, vma_end - start, false);
> >
> >  		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
> > +		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> >  		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
> > -				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> > +				 vma->anon_vma, vma->vm_file, pgoff,
> >  				 vma_policy(vma),
> >  				 NULL_VM_UFFD_CTX, anon_vma_name(vma));
> >  		if (prev) {
> > --
> > 2.39.1
> > ---8<---
> >
> > --
> > Peter Xu
> >
>
  
Lorenzo Stoakes May 16, 2023, 9:01 p.m. UTC | #12
On Tue, May 16, 2023 at 04:30:11PM -0400, Peter Xu wrote:
> On Tue, May 16, 2023 at 08:25:13PM +0100, Lorenzo Stoakes wrote:
> > On Tue, May 16, 2023 at 11:06:40AM -0400, Peter Xu wrote:
> > > On Tue, May 16, 2023 at 12:07:11AM +0100, Lorenzo Stoakes wrote:
> > > > On Mon, May 15, 2023 at 11:04:27PM +0100, Lorenzo Stoakes wrote:
> > > > [snip]
> > > > > > Could you explain a bit why we don't need to merge in this case?
> > > > > >
> > > > > > I'm considering, for example, when we have:
> > > > > >
> > > > > >   vma1(range 0-9, with uffd), vma2(range 10-19, no uffd)
> > > > > >
> > > > > > Then someone unregisters uffd on range (5-9), iiuc it should become:
> > > > > >
> > > > > >   vma1(range 0-4, with uffd), vma2(range 5-19, no uffd)
> > > > > >
> > > > > > But if no merge here it's:
> > > > > >
> > > > > >   vma1(range 0-4, with uffd), vma3(range 5-9, no uffd), vma2(range 10-19, no uffd)
> > > > > >
> > > > > > Maybe I missed something?
> > > > > >
> > > > >
> > > > > There's something really, really wrong with this. It simply isn't valid to
> > > > > invoke vma_merge() over an existing VMA that != prev where you're not
> > > > > specifying addr = vma->vm_start, end == vma->vm_end.
> > > > >
> > > > > This seems like you're relying on:-
> > > > >
> > > > >   ***
> > > > > CCCCCNNNNN -> CCNNNNNNNN
> > >
> > > I had a closer look today, I still think this patch is not really the right
> > > one.  The split/merge order is something we use everywhere and I am not
> > > convinced it must change as drastic.  At least so far it still seems to me
> > > if we do with what current patch proposed we can have vma fragmentations.
> >
> > 'something we use everywhere' is not an argument (speak to Willy about
> > folios), vma_merge() expects valid input, relying on it _happening_ to be
>
> I still think it's an argument.
>
> I believe Matthew tried hard to justify he's correct in that aspect when
> changing "something we used everywhere". Blindly referencing it doesn't
> help much on a technical discussion.
>
> If we have similar code that handles similar things, IMHO you need a reason
> to modify one of them to not like the other.
>
> If you think what you proposed is better, please consider (1) justify why
> it's better, and then (2) also apply it to all the rest places where
> applicable.  Please refer to my reply to Liam on the other use cases of
> vma_merge().
>
> Said that, I apprecaite a lot on your assert patch that found this problem.
>
> > ok or to fail in ways that _happen_ not to cause big problems is not right.
> >
> > This is just further evidence that the vma_merge() interface is
> > fundamentally broken. Implicitly assuming you will only get a partial prev
> > overlap merge is far from intuitive.
> >
> > I am definitely going to try to do a series addressing vma_merge() horrors
> > because I feel like we need a generic means of doing this split/merge pattern.
>
> It'll be great if you can make it better.
>
> >
> > >
> > > I think I see what you meant, but here I think it's a legal case where we
> > > should have PPPP rather than CCCC (PPPPPNNNN --> PPNNNNNNNN).
> > >
> > > To be explicit, for register I think it _should_ be the case 0 where we
> > > cannot merge (note: the current code is indeed wrong though, see below):
> > >
> > >    ****
> > >   PPPPPPNNNNNN
> > >   cannot merge
> > >
> > > While for the unregister case here it's case 4:
> > >
> > >     ****
> > >   PPPPPPNNNNNN
> > >   might become
> > >   PPNNNNNNNNNN
> > >   case 4 below
> > >
> > > Here the problem is not that we need to do split then merge (I think it'll
> > > have the problem of vma defragmentation here), the problem is we simply
> > > passed in the wrong "prev" vma pointer, IMHO.  I've patches attached
> > > showing what I meant.
> >
> > Yeah if you do it with prev = vma this form should _probably_ work, that's
> > a good point. This _is_ a case (see https://ljs.io/vma_merge_cases.png for
> > nice diagram of cases btw :), like 5, where we actually do split and merge
> > at the same time.
>
> It's not something I came up with myself, it's just that I started looking
> back to see what people did and trying to understand why they did it.
> Normally people did things with good reasons.
>

Yup, it's an iffy pattern in each case. I can see why you chose to do it,
it's not a criticism of that, it's just that the incorrect (but
accidentally ok to be incorrect seemingly) input is made more obvious by
recent changes.

> So far it seems clearer at least to me to keep this pattern of "merge then
> split".  But I'm happy to be proven wrong anytime.

But you're not, you're doing a vma_merge() and (without it being clear)
hoping it does a merge AND SPLIT in case in an instance where that might be
required followed by you doing any further splits required.

But it's _not your fault_ that this is the standard approach, it's the
interface that's wrong absolutely.

To me doing this is quite a bit less clear than simply 'splitting so this
is mergable first then try to merge' but obviously this current patch does
not avoid the previously introduced fragmentation. However it does maintain
existing behaviour.

>
> >
> > Liam's raised some issues with the safety of your patches, let me look at
> > them when I get a chance, nasty headcold = brain less functional atm.
>
> Sure, no need to rush.
>
> >
> > I would say for now this fix resolves the issue in a way that should
> > emphatically avoid invalid input to vma_merge(), the fragmentation existed
> > before so this is not a new issue, so for the time being I think it's ok to
> > stay as-is.
>
> So far I would still suggest having uffd code the same as the rest if
> possible.
>

Absolutely, but in the immediate term, we are seeing VM_WARN_ON()'s here
and not from other callers (which is actually surprising).

Again, we absolutely do need an abstracted solution for the whole. And
that's something I'll try to work on!

> I think I'll wait for the other discussion to settle on patch 2 to see
> whether I should post a formal patchset.

My suggestion is that it's ok to proceed as-is, not because this is the
final change that should be applied, but rather because it resolves the
issue while maintaining existing behaviour.

I will be more than happy to see patches land after this one that replace
it if necessary but I think it's safe for this to land as a mainline fixup,
even if temporary, is all I am saying :)

>
> >
> > >
> > > I checked the original commit from Andrea and I found that it _was_ correct:
> > >
> > > commit 86039bd3b4e6a1129318cbfed4e0a6e001656635
> > > Author: Andrea Arcangeli <aarcange@redhat.com>
> > > Date:   Fri Sep 4 15:46:31 2015 -0700
> > >
> > >     userfaultfd: add new syscall to provide memory externalization
> > >
> > > I had a feeling that it's broken during the recent rework on vma (or maybe
> > > even not that close), but I'm not yet sure and need to further check.
> > >
> > > > >
> > > > > By specifying parameters that are compatible with N even though you're only
> > > > > partially spanning C?
> > > > >
> > > > > This is crazy, and isn't how this should be used. vma_merge() is not
> > > > > supposed to do partial merges. If it works (presumably it does) this is not
> > > > > by design unless I've lost my mind and I (and others) have somehow not
> > > > > noticed this??
> > > > >
> > > > > I think you're right that now we'll end up with more fragmentation, but
> > > > > what you're suggesting is not how vma_merge() is supposed to work.
> > > > >
> > > > > As I said above, giving vma_merge() invalid parameters is very dangerous as
> > > > > you could end up merging over empty ranges in theory (and could otherwise
> > > > > have corruption).
> > > > >
> > > > > I guess we should probably be passing 0 to the last parameter in
> > > > > split_vma() here then to ensure we do a merge pass too. Will experiment
> > > > > with this.
> > > > >
> > > > > I'm confused as to how the remove from case 8 is not proceeding. I'll look
> > > > > into this some more...
> > > > >
> > > > > Happy to be corrected if I'm misconstruing this!
> > > > >
> > > >
> > > > OK, so I wrote a small program to do perform exactly this case [0] and it seems
> > > > that the outcome is the same before and after this patch - vma_merge() is
> > > > clearly rejecting the case 8 merge (phew!) and in both instances you end up with
> > > > 3 VMAs.
> > > >
> > > > So this patch doesn't change this behaviour and everything is as it was
> > > > before. Ideally we'd let it go for another pass, so maybe we should change the
> > > > split to add a new VMA _afterwards_. Will experiment with that, separately.
> > > >
> > > > But looks like the patch is good as it is.
> > > >
> > > > (if you notice something wrong with the repro, etc. do let me know!)
> > > >
> > > > [0]: https://gist.github.com/lorenzo-stoakes/a11a10f5f479e7a977fc456331266e0e
> > >
> > > I think your test case is fine, but... no, this is still not expected. The
> > > vma should just merge.
> > >
> > > So I have another closer look on this specific issue, it seems we have a
> > > long standing bug on pgoff calculation here when passing that to
> > > vma_merge().  I've got another patch attached to show what I meant.
> > >
> > > To summarize.. now I've got two patches attached:
> > >
> > > Patch 1 is something I'd like to propose to replace this patch that fixes
> > > incorrect use of vma_merge() so it should also eliminate the assertion
> > > being triggered (I still think this is a regression but I need to check
> > > which I will do later; I'm not super familiar with maple tree work, maybe
> > > you or Liam can quickly spot).
> > >
> > > Patch 2 fixes the long standing issue of vma not being able to merge in
> > > above case, and with patch 2 applied it should start merging all right.
> > >
> > > Please have a look, thanks.
> > >
> > > ---8<---
> > > From 6bc39028bba246394bb0bafdaeaab7b8dfd1694f Mon Sep 17 00:00:00 2001
> > > From: Peter Xu <peterx@redhat.com>
> > > Date: Tue, 16 May 2023 09:03:22 -0400
> > > Subject: [PATCH 1/2] mm/uffd: Fix vma operation where start addr cuts part of
> > >  vma
> > >
> > > It seems vma merging with uffd paths is broken with either
> > > register/unregister, where right now we can feed wrong parameters to
> > > vma_merge() and it's found by recent patch which moved asserts upwards in
> > > vma_merge():
> > >
> > > https://lore.kernel.org/all/ZFunF7DmMdK05MoF@FVFF77S0Q05N.cambridge.arm.com/
> > >
> > > The problem is in the current code base we didn't fixup "prev" for the case
> > > where "start" address can be within the "prev" vma section.  In that case
> > > we should have "prev" points to the current vma rather than the previous
> > > one when feeding to vma_merge().
> > >
> > > This will eliminate the report and make sure vma_merge() calls will become
> > > legal again.
> > >
> > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > ---
> > >  fs/userfaultfd.c | 27 +++++++++++++++++++++------
> > >  1 file changed, 21 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > index 0fd96d6e39ce..7eb88bc74d00 100644
> > > --- a/fs/userfaultfd.c
> > > +++ b/fs/userfaultfd.c
> > > @@ -1458,10 +1458,17 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > >  	BUG_ON(!found);
> > >
> > >  	vma_iter_set(&vmi, start);
> > > -	prev = vma_prev(&vmi);
> > > +	vma = vma_find(&vmi, end);
> > > +	if (!vma)
> > > +		goto out_unlock;
> > > +
> > > +	if (vma->vm_start < start)
> > > +		prev = vma;
> > > +	else
> > > +		prev = vma_prev(&vmi);
> > >
> > >  	ret = 0;
> > > -	for_each_vma_range(vmi, vma, end) {
> > > +	do {
> > >  		cond_resched();
> > >
> > >  		BUG_ON(!vma_can_userfault(vma, vm_flags));
> > > @@ -1517,7 +1524,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > >  	skip:
> > >  		prev = vma;
> > >  		start = vma->vm_end;
> > > -	}
> > > +	} for_each_vma_range(vmi, vma, end);
> > >
> > >  out_unlock:
> > >  	mmap_write_unlock(mm);
> > > @@ -1624,9 +1631,17 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> > >  	BUG_ON(!found);
> > >
> > >  	vma_iter_set(&vmi, start);
> > > -	prev = vma_prev(&vmi);
> > > +	vma = vma_find(&vmi, end);
> > > +	if (!vma)
> > > +		goto out_unlock;
> > > +
> > > +	if (vma->vm_start < start)
> > > +		prev = vma;
> > > +	else
> > > +		prev = vma_prev(&vmi);
> > > +
> > >  	ret = 0;
> > > -	for_each_vma_range(vmi, vma, end) {
> > > +	do {
> > >  		cond_resched();
> > >
> > >  		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
> > > @@ -1692,7 +1707,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> > >  	skip:
> > >  		prev = vma;
> > >  		start = vma->vm_end;
> > > -	}
> > > +	} for_each_vma_range(vmi, vma, end);
> > >
> > >  out_unlock:
> > >  	mmap_write_unlock(mm);
> > > --
> > > 2.39.1
> > >
> > > From bf61f3c937e9e2ab96ab2bed0005cb7dc74db231 Mon Sep 17 00:00:00 2001
> > > From: Peter Xu <peterx@redhat.com>
> > > Date: Tue, 16 May 2023 09:39:38 -0400
> > > Subject: [PATCH 2/2] mm/uffd: Allow vma to merge as much as possible
> > >
> > > We used to not pass in the pgoff correctly when register/unregister uffd
> > > regions, it caused incorrect behavior on vma merging.
> > >
> > > For example, when we have:
> > >
> > >   vma1(range 0-9, with uffd), vma2(range 10-19, no uffd)
> > >
> > > Then someone unregisters uffd on range (5-9), it should become:
> > >
> > >   vma1(range 0-4, with uffd), vma2(range 5-19, no uffd)
> > >
> > > But with current code it's:
> > >
> > >   vma1(range 0-4, with uffd), vma3(range 5-9, no uffd), vma2(range 10-19, no uffd)
> > >
> > > This patch allows such merge to happen correctly.
> > >
> > > This behavior seems to have existed since the 1st day of uffd, keep it just
> > > as a performance optmization and not copy stable.
> > >
> > > Cc: Andrea Arcangeli <aarcange@redhat.com>
> > > Cc: Mike Rapoport (IBM) <rppt@kernel.org>
> > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > ---
> > >  fs/userfaultfd.c | 8 ++++++--
> > >  1 file changed, 6 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > index 7eb88bc74d00..891048b4799f 100644
> > > --- a/fs/userfaultfd.c
> > > +++ b/fs/userfaultfd.c
> > > @@ -1332,6 +1332,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > >  	bool basic_ioctls;
> > >  	unsigned long start, end, vma_end;
> > >  	struct vma_iterator vmi;
> > > +	pgoff_t pgoff;
> > >
> > >  	user_uffdio_register = (struct uffdio_register __user *) arg;
> > >
> > > @@ -1489,8 +1490,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > >  		vma_end = min(end, vma->vm_end);
> > >
> > >  		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
> > > +		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> > >  		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
> > > -				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> > > +				 vma->anon_vma, vma->vm_file, pgoff,
> > >  				 vma_policy(vma),
> > >  				 ((struct vm_userfaultfd_ctx){ ctx }),
> > >  				 anon_vma_name(vma));
> > > @@ -1570,6 +1572,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> > >  	unsigned long start, end, vma_end;
> > >  	const void __user *buf = (void __user *)arg;
> > >  	struct vma_iterator vmi;
> > > +	pgoff_t pgoff;
> > >
> > >  	ret = -EFAULT;
> > >  	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
> > > @@ -1677,8 +1680,9 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> > >  			uffd_wp_range(vma, start, vma_end - start, false);
> > >
> > >  		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
> > > +		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> > >  		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
> > > -				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> > > +				 vma->anon_vma, vma->vm_file, pgoff,
> > >  				 vma_policy(vma),
> > >  				 NULL_VM_UFFD_CTX, anon_vma_name(vma));
> > >  		if (prev) {
> > > --
> > > 2.39.1
> > > ---8<---
> > >
> > > --
> > > Peter Xu
> > >
> >
>
> --
> Peter Xu
>
>
  
Peter Xu May 16, 2023, 9:39 p.m. UTC | #13
On Tue, May 16, 2023 at 10:01:06PM +0100, Lorenzo Stoakes wrote:
> On Tue, May 16, 2023 at 04:30:11PM -0400, Peter Xu wrote:
> > On Tue, May 16, 2023 at 08:25:13PM +0100, Lorenzo Stoakes wrote:
> > > On Tue, May 16, 2023 at 11:06:40AM -0400, Peter Xu wrote:
> > > > On Tue, May 16, 2023 at 12:07:11AM +0100, Lorenzo Stoakes wrote:
> > > > > On Mon, May 15, 2023 at 11:04:27PM +0100, Lorenzo Stoakes wrote:
> > > > > [snip]
> > > > > > > Could you explain a bit why we don't need to merge in this case?
> > > > > > >
> > > > > > > I'm considering, for example, when we have:
> > > > > > >
> > > > > > >   vma1(range 0-9, with uffd), vma2(range 10-19, no uffd)
> > > > > > >
> > > > > > > Then someone unregisters uffd on range (5-9), iiuc it should become:
> > > > > > >
> > > > > > >   vma1(range 0-4, with uffd), vma2(range 5-19, no uffd)
> > > > > > >
> > > > > > > But if no merge here it's:
> > > > > > >
> > > > > > >   vma1(range 0-4, with uffd), vma3(range 5-9, no uffd), vma2(range 10-19, no uffd)
> > > > > > >
> > > > > > > Maybe I missed something?
> > > > > > >
> > > > > >
> > > > > > There's something really, really wrong with this. It simply isn't valid to
> > > > > > invoke vma_merge() over an existing VMA that != prev where you're not
> > > > > > specifying addr = vma->vm_start, end == vma->vm_end.
> > > > > >
> > > > > > This seems like you're relying on:-
> > > > > >
> > > > > >   ***
> > > > > > CCCCCNNNNN -> CCNNNNNNNN
> > > >
> > > > I had a closer look today, I still think this patch is not really the right
> > > > one.  The split/merge order is something we use everywhere and I am not
> > > > convinced it must change as drastic.  At least so far it still seems to me
> > > > if we do with what current patch proposed we can have vma fragmentations.
> > >
> > > 'something we use everywhere' is not an argument (speak to Willy about
> > > folios), vma_merge() expects valid input, relying on it _happening_ to be
> >
> > I still think it's an argument.
> >
> > I believe Matthew tried hard to justify he's correct in that aspect when
> > changing "something we used everywhere". Blindly referencing it doesn't
> > help much on a technical discussion.
> >
> > If we have similar code that handles similar things, IMHO you need a reason
> > to modify one of them to not like the other.
> >
> > If you think what you proposed is better, please consider (1) justify why
> > it's better, and then (2) also apply it to all the rest places where
> > applicable.  Please refer to my reply to Liam on the other use cases of
> > vma_merge().
> >
> > Said that, I apprecaite a lot on your assert patch that found this problem.
> >
> > > ok or to fail in ways that _happen_ not to cause big problems is not right.
> > >
> > > This is just further evidence that the vma_merge() interface is
> > > fundamentally broken. Implicitly assuming you will only get a partial prev
> > > overlap merge is far from intuitive.
> > >
> > > I am definitely going to try to do a series addressing vma_merge() horrors
> > > because I feel like we need a generic means of doing this split/merge pattern.
> >
> > It'll be great if you can make it better.
> >
> > >
> > > >
> > > > I think I see what you meant, but here I think it's a legal case where we
> > > > should have PPPP rather than CCCC (PPPPPNNNN --> PPNNNNNNNN).
> > > >
> > > > To be explicit, for register I think it _should_ be the case 0 where we
> > > > cannot merge (note: the current code is indeed wrong though, see below):
> > > >
> > > >    ****
> > > >   PPPPPPNNNNNN
> > > >   cannot merge
> > > >
> > > > While for the unregister case here it's case 4:
> > > >
> > > >     ****
> > > >   PPPPPPNNNNNN
> > > >   might become
> > > >   PPNNNNNNNNNN
> > > >   case 4 below
> > > >
> > > > Here the problem is not that we need to do split then merge (I think it'll
> > > > have the problem of vma defragmentation here), the problem is we simply
> > > > passed in the wrong "prev" vma pointer, IMHO.  I've patches attached
> > > > showing what I meant.
> > >
> > > Yeah if you do it with prev = vma this form should _probably_ work, that's
> > > a good point. This _is_ a case (see https://ljs.io/vma_merge_cases.png for
> > > nice diagram of cases btw :), like 5, where we actually do split and merge
> > > at the same time.
> >
> > It's not something I came up with myself, it's just that I started looking
> > back to see what people did and trying to understand why they did it.
> > Normally people did things with good reasons.
> >
> 
> Yup, it's an iffy pattern in each case. I can see why you chose to do it,
> it's not a criticism of that, it's just that the incorrect (but
> accidentally ok to be incorrect seemingly) input is made more obvious by
> recent changes.
> 
> > So far it seems clearer at least to me to keep this pattern of "merge then
> > split".  But I'm happy to be proven wrong anytime.
> 
> But you're not, you're doing a vma_merge() and (without it being clear)
> hoping it does a merge AND SPLIT in case in an instance where that might be
> required followed by you doing any further splits required.

I don't quite get the reason that you seem to keep saying this is
"incorrect input" to vma_merge().  vma_merge() definitely includes splits
too where it can, to be explicit, for case 4 & 5.

It seems to me what you're trying to explain is we shouldn't handle any
split in vma_merge() so we should move cases 4 & 5 out of vma_merge().  If
we split first then merge, cases 4 & 5 will become case 2 & 3 after split.
My question would be: if it worked perfect in the past few years and it
looks all good enough, why bother..

> 
> But it's _not your fault_ that this is the standard approach, it's the
> interface that's wrong absolutely.
> 
> To me doing this is quite a bit less clear than simply 'splitting so this
> is mergable first then try to merge' but obviously this current patch does
> not avoid the previously introduced fragmentation. However it does maintain
> existing behaviour.
> 
> >
> > >
> > > Liam's raised some issues with the safety of your patches, let me look at
> > > them when I get a chance, nasty headcold = brain less functional atm.
> >
> > Sure, no need to rush.
> >
> > >
> > > I would say for now this fix resolves the issue in a way that should
> > > emphatically avoid invalid input to vma_merge(), the fragmentation existed
> > > before so this is not a new issue, so for the time being I think it's ok to
> > > stay as-is.
> >
> > So far I would still suggest having uffd code the same as the rest if
> > possible.
> >
> 
> Absolutely, but in the immediate term, we are seeing VM_WARN_ON()'s here
> and not from other callers (which is actually surprising).

Not surprising anymore to me, simply because the uffd path was the only one
got overlooked when converting to maple in commit 69dbe6daf104, as Liam
rightfully pointed out.

For example, mprotect() has similar handling when the start addr of the
range can be in the middle of a vma, then in do_mprotect_pkey() there is:

	prev = vma_prev(&vmi);
	if (start > vma->vm_start)
		prev = vma;

IMHO it's the same thing.

> 
> Again, we absolutely do need an abstracted solution for the whole. And
> that's something I'll try to work on!
> 
> > I think I'll wait for the other discussion to settle on patch 2 to see
> > whether I should post a formal patchset.
> 
> My suggestion is that it's ok to proceed as-is, not because this is the
> final change that should be applied, but rather because it resolves the
> issue while maintaining existing behaviour.
> 
> I will be more than happy to see patches land after this one that replace
> it if necessary but I think it's safe for this to land as a mainline fixup,
> even if temporary, is all I am saying :)

I'd hope we can replace this patch with my patch 1 if possible because I
_think_ this patch is still in hotfixes-unstable (while patch 2 doesn't
need to copy stable in all cases).  Andrew may know better on how to
proceed.

If this lands first, I'll probably propose a full revert otherwise as the
1st patch of the patchset to post, then make uffd the same as others.

Before that I'd like to know whether you agree that the new patch 1 (I'll
fixup the vma_prev() side effect) could be a better solution than the
current one, no matter whether we need a full revert or not.

Thanks,

> 
> >
> > >
> > > >
> > > > I checked the original commit from Andrea and I found that it _was_ correct:
> > > >
> > > > commit 86039bd3b4e6a1129318cbfed4e0a6e001656635
> > > > Author: Andrea Arcangeli <aarcange@redhat.com>
> > > > Date:   Fri Sep 4 15:46:31 2015 -0700
> > > >
> > > >     userfaultfd: add new syscall to provide memory externalization
> > > >
> > > > I had a feeling that it's broken during the recent rework on vma (or maybe
> > > > even not that close), but I'm not yet sure and need to further check.
> > > >
> > > > > >
> > > > > > By specifying parameters that are compatible with N even though you're only
> > > > > > partially spanning C?
> > > > > >
> > > > > > This is crazy, and isn't how this should be used. vma_merge() is not
> > > > > > supposed to do partial merges. If it works (presumably it does) this is not
> > > > > > by design unless I've lost my mind and I (and others) have somehow not
> > > > > > noticed this??
> > > > > >
> > > > > > I think you're right that now we'll end up with more fragmentation, but
> > > > > > what you're suggesting is not how vma_merge() is supposed to work.
> > > > > >
> > > > > > As I said above, giving vma_merge() invalid parameters is very dangerous as
> > > > > > you could end up merging over empty ranges in theory (and could otherwise
> > > > > > have corruption).
> > > > > >
> > > > > > I guess we should probably be passing 0 to the last parameter in
> > > > > > split_vma() here then to ensure we do a merge pass too. Will experiment
> > > > > > with this.
> > > > > >
> > > > > > I'm confused as to how the remove from case 8 is not proceeding. I'll look
> > > > > > into this some more...
> > > > > >
> > > > > > Happy to be corrected if I'm misconstruing this!
> > > > > >
> > > > >
> > > > > OK, so I wrote a small program to do perform exactly this case [0] and it seems
> > > > > that the outcome is the same before and after this patch - vma_merge() is
> > > > > clearly rejecting the case 8 merge (phew!) and in both instances you end up with
> > > > > 3 VMAs.
> > > > >
> > > > > So this patch doesn't change this behaviour and everything is as it was
> > > > > before. Ideally we'd let it go for another pass, so maybe we should change the
> > > > > split to add a new VMA _afterwards_. Will experiment with that, separately.
> > > > >
> > > > > But looks like the patch is good as it is.
> > > > >
> > > > > (if you notice something wrong with the repro, etc. do let me know!)
> > > > >
> > > > > [0]: https://gist.github.com/lorenzo-stoakes/a11a10f5f479e7a977fc456331266e0e
> > > >
> > > > I think your test case is fine, but... no, this is still not expected. The
> > > > vma should just merge.
> > > >
> > > > So I have another closer look on this specific issue, it seems we have a
> > > > long standing bug on pgoff calculation here when passing that to
> > > > vma_merge().  I've got another patch attached to show what I meant.
> > > >
> > > > To summarize.. now I've got two patches attached:
> > > >
> > > > Patch 1 is something I'd like to propose to replace this patch that fixes
> > > > incorrect use of vma_merge() so it should also eliminate the assertion
> > > > being triggered (I still think this is a regression but I need to check
> > > > which I will do later; I'm not super familiar with maple tree work, maybe
> > > > you or Liam can quickly spot).
> > > >
> > > > Patch 2 fixes the long standing issue of vma not being able to merge in
> > > > above case, and with patch 2 applied it should start merging all right.
> > > >
> > > > Please have a look, thanks.
> > > >
> > > > ---8<---
> > > > From 6bc39028bba246394bb0bafdaeaab7b8dfd1694f Mon Sep 17 00:00:00 2001
> > > > From: Peter Xu <peterx@redhat.com>
> > > > Date: Tue, 16 May 2023 09:03:22 -0400
> > > > Subject: [PATCH 1/2] mm/uffd: Fix vma operation where start addr cuts part of
> > > >  vma
> > > >
> > > > It seems vma merging with uffd paths is broken with either
> > > > register/unregister, where right now we can feed wrong parameters to
> > > > vma_merge() and it's found by recent patch which moved asserts upwards in
> > > > vma_merge():
> > > >
> > > > https://lore.kernel.org/all/ZFunF7DmMdK05MoF@FVFF77S0Q05N.cambridge.arm.com/
> > > >
> > > > The problem is in the current code base we didn't fixup "prev" for the case
> > > > where "start" address can be within the "prev" vma section.  In that case
> > > > we should have "prev" points to the current vma rather than the previous
> > > > one when feeding to vma_merge().
> > > >
> > > > This will eliminate the report and make sure vma_merge() calls will become
> > > > legal again.
> > > >
> > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > ---
> > > >  fs/userfaultfd.c | 27 +++++++++++++++++++++------
> > > >  1 file changed, 21 insertions(+), 6 deletions(-)
> > > >
> > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > > index 0fd96d6e39ce..7eb88bc74d00 100644
> > > > --- a/fs/userfaultfd.c
> > > > +++ b/fs/userfaultfd.c
> > > > @@ -1458,10 +1458,17 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > >  	BUG_ON(!found);
> > > >
> > > >  	vma_iter_set(&vmi, start);
> > > > -	prev = vma_prev(&vmi);
> > > > +	vma = vma_find(&vmi, end);
> > > > +	if (!vma)
> > > > +		goto out_unlock;
> > > > +
> > > > +	if (vma->vm_start < start)
> > > > +		prev = vma;
> > > > +	else
> > > > +		prev = vma_prev(&vmi);
> > > >
> > > >  	ret = 0;
> > > > -	for_each_vma_range(vmi, vma, end) {
> > > > +	do {
> > > >  		cond_resched();
> > > >
> > > >  		BUG_ON(!vma_can_userfault(vma, vm_flags));
> > > > @@ -1517,7 +1524,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > >  	skip:
> > > >  		prev = vma;
> > > >  		start = vma->vm_end;
> > > > -	}
> > > > +	} for_each_vma_range(vmi, vma, end);
> > > >
> > > >  out_unlock:
> > > >  	mmap_write_unlock(mm);
> > > > @@ -1624,9 +1631,17 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> > > >  	BUG_ON(!found);
> > > >
> > > >  	vma_iter_set(&vmi, start);
> > > > -	prev = vma_prev(&vmi);
> > > > +	vma = vma_find(&vmi, end);
> > > > +	if (!vma)
> > > > +		goto out_unlock;
> > > > +
> > > > +	if (vma->vm_start < start)
> > > > +		prev = vma;
> > > > +	else
> > > > +		prev = vma_prev(&vmi);
> > > > +
> > > >  	ret = 0;
> > > > -	for_each_vma_range(vmi, vma, end) {
> > > > +	do {
> > > >  		cond_resched();
> > > >
> > > >  		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
> > > > @@ -1692,7 +1707,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> > > >  	skip:
> > > >  		prev = vma;
> > > >  		start = vma->vm_end;
> > > > -	}
> > > > +	} for_each_vma_range(vmi, vma, end);
> > > >
> > > >  out_unlock:
> > > >  	mmap_write_unlock(mm);
> > > > --
> > > > 2.39.1
> > > >
> > > > From bf61f3c937e9e2ab96ab2bed0005cb7dc74db231 Mon Sep 17 00:00:00 2001
> > > > From: Peter Xu <peterx@redhat.com>
> > > > Date: Tue, 16 May 2023 09:39:38 -0400
> > > > Subject: [PATCH 2/2] mm/uffd: Allow vma to merge as much as possible
> > > >
> > > > We used to not pass in the pgoff correctly when register/unregister uffd
> > > > regions, it caused incorrect behavior on vma merging.
> > > >
> > > > For example, when we have:
> > > >
> > > >   vma1(range 0-9, with uffd), vma2(range 10-19, no uffd)
> > > >
> > > > Then someone unregisters uffd on range (5-9), it should become:
> > > >
> > > >   vma1(range 0-4, with uffd), vma2(range 5-19, no uffd)
> > > >
> > > > But with current code it's:
> > > >
> > > >   vma1(range 0-4, with uffd), vma3(range 5-9, no uffd), vma2(range 10-19, no uffd)
> > > >
> > > > This patch allows such merge to happen correctly.
> > > >
> > > > This behavior seems to have existed since the 1st day of uffd, keep it just
> > > > as a performance optmization and not copy stable.
> > > >
> > > > Cc: Andrea Arcangeli <aarcange@redhat.com>
> > > > Cc: Mike Rapoport (IBM) <rppt@kernel.org>
> > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > ---
> > > >  fs/userfaultfd.c | 8 ++++++--
> > > >  1 file changed, 6 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > > index 7eb88bc74d00..891048b4799f 100644
> > > > --- a/fs/userfaultfd.c
> > > > +++ b/fs/userfaultfd.c
> > > > @@ -1332,6 +1332,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > >  	bool basic_ioctls;
> > > >  	unsigned long start, end, vma_end;
> > > >  	struct vma_iterator vmi;
> > > > +	pgoff_t pgoff;
> > > >
> > > >  	user_uffdio_register = (struct uffdio_register __user *) arg;
> > > >
> > > > @@ -1489,8 +1490,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > >  		vma_end = min(end, vma->vm_end);
> > > >
> > > >  		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
> > > > +		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> > > >  		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
> > > > -				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> > > > +				 vma->anon_vma, vma->vm_file, pgoff,
> > > >  				 vma_policy(vma),
> > > >  				 ((struct vm_userfaultfd_ctx){ ctx }),
> > > >  				 anon_vma_name(vma));
> > > > @@ -1570,6 +1572,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> > > >  	unsigned long start, end, vma_end;
> > > >  	const void __user *buf = (void __user *)arg;
> > > >  	struct vma_iterator vmi;
> > > > +	pgoff_t pgoff;
> > > >
> > > >  	ret = -EFAULT;
> > > >  	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
> > > > @@ -1677,8 +1680,9 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> > > >  			uffd_wp_range(vma, start, vma_end - start, false);
> > > >
> > > >  		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
> > > > +		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> > > >  		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
> > > > -				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> > > > +				 vma->anon_vma, vma->vm_file, pgoff,
> > > >  				 vma_policy(vma),
> > > >  				 NULL_VM_UFFD_CTX, anon_vma_name(vma));
> > > >  		if (prev) {
> > > > --
> > > > 2.39.1
> > > > ---8<---
> > > >
> > > > --
> > > > Peter Xu
> > > >
> > >
> >
> > --
> > Peter Xu
> >
> >
>
  
Lorenzo Stoakes May 16, 2023, 10:15 p.m. UTC | #14
On Tue, May 16, 2023 at 05:39:53PM -0400, Peter Xu wrote:
> On Tue, May 16, 2023 at 10:01:06PM +0100, Lorenzo Stoakes wrote:
> > On Tue, May 16, 2023 at 04:30:11PM -0400, Peter Xu wrote:
> > > On Tue, May 16, 2023 at 08:25:13PM +0100, Lorenzo Stoakes wrote:
> > > > On Tue, May 16, 2023 at 11:06:40AM -0400, Peter Xu wrote:
> > > > > On Tue, May 16, 2023 at 12:07:11AM +0100, Lorenzo Stoakes wrote:
> > > > > > On Mon, May 15, 2023 at 11:04:27PM +0100, Lorenzo Stoakes wrote:
> > > > > > [snip]
> > > > > > > > Could you explain a bit why we don't need to merge in this case?
> > > > > > > >
> > > > > > > > I'm considering, for example, when we have:
> > > > > > > >
> > > > > > > >   vma1(range 0-9, with uffd), vma2(range 10-19, no uffd)
> > > > > > > >
> > > > > > > > Then someone unregisters uffd on range (5-9), iiuc it should become:
> > > > > > > >
> > > > > > > >   vma1(range 0-4, with uffd), vma2(range 5-19, no uffd)
> > > > > > > >
> > > > > > > > But if no merge here it's:
> > > > > > > >
> > > > > > > >   vma1(range 0-4, with uffd), vma3(range 5-9, no uffd), vma2(range 10-19, no uffd)
> > > > > > > >
> > > > > > > > Maybe I missed something?
> > > > > > > >
> > > > > > >
> > > > > > > There's something really, really wrong with this. It simply isn't valid to
> > > > > > > invoke vma_merge() over an existing VMA that != prev where you're not
> > > > > > > specifying addr = vma->vm_start, end == vma->vm_end.
> > > > > > >
> > > > > > > This seems like you're relying on:-
> > > > > > >
> > > > > > >   ***
> > > > > > > CCCCCNNNNN -> CCNNNNNNNN
> > > > >
> > > > > I had a closer look today, I still think this patch is not really the right
> > > > > one.  The split/merge order is something we use everywhere and I am not
> > > > > convinced it must change as drastic.  At least so far it still seems to me
> > > > > if we do with what current patch proposed we can have vma fragmentations.
> > > >
> > > > 'something we use everywhere' is not an argument (speak to Willy about
> > > > folios), vma_merge() expects valid input, relying on it _happening_ to be
> > >
> > > I still think it's an argument.
> > >
> > > I believe Matthew tried hard to justify he's correct in that aspect when
> > > changing "something we used everywhere". Blindly referencing it doesn't
> > > help much on a technical discussion.
> > >
> > > If we have similar code that handles similar things, IMHO you need a reason
> > > to modify one of them to not like the other.
> > >
> > > If you think what you proposed is better, please consider (1) justify why
> > > it's better, and then (2) also apply it to all the rest places where
> > > applicable.  Please refer to my reply to Liam on the other use cases of
> > > vma_merge().
> > >
> > > Said that, I apprecaite a lot on your assert patch that found this problem.
> > >
> > > > ok or to fail in ways that _happen_ not to cause big problems is not right.
> > > >
> > > > This is just further evidence that the vma_merge() interface is
> > > > fundamentally broken. Implicitly assuming you will only get a partial prev
> > > > overlap merge is far from intuitive.
> > > >
> > > > I am definitely going to try to do a series addressing vma_merge() horrors
> > > > because I feel like we need a generic means of doing this split/merge pattern.
> > >
> > > It'll be great if you can make it better.
> > >
> > > >
> > > > >
> > > > > I think I see what you meant, but here I think it's a legal case where we
> > > > > should have PPPP rather than CCCC (PPPPPNNNN --> PPNNNNNNNN).
> > > > >
> > > > > To be explicit, for register I think it _should_ be the case 0 where we
> > > > > cannot merge (note: the current code is indeed wrong though, see below):
> > > > >
> > > > >    ****
> > > > >   PPPPPPNNNNNN
> > > > >   cannot merge
> > > > >
> > > > > While for the unregister case here it's case 4:
> > > > >
> > > > >     ****
> > > > >   PPPPPPNNNNNN
> > > > >   might become
> > > > >   PPNNNNNNNNNN
> > > > >   case 4 below
> > > > >
> > > > > Here the problem is not that we need to do split then merge (I think it'll
> > > > > have the problem of vma defragmentation here), the problem is we simply
> > > > > passed in the wrong "prev" vma pointer, IMHO.  I've patches attached
> > > > > showing what I meant.
> > > >
> > > > Yeah if you do it with prev = vma this form should _probably_ work, that's
> > > > a good point. This _is_ a case (see https://ljs.io/vma_merge_cases.png for
> > > > nice diagram of cases btw :), like 5, where we actually do split and merge
> > > > at the same time.
> > >
> > > It's not something I came up with myself, it's just that I started looking
> > > back to see what people did and trying to understand why they did it.
> > > Normally people did things with good reasons.
> > >
> >
> > Yup, it's an iffy pattern in each case. I can see why you chose to do it,
> > it's not a criticism of that, it's just that the incorrect (but
> > accidentally ok to be incorrect seemingly) input is made more obvious by
> > recent changes.
> >
> > > So far it seems clearer at least to me to keep this pattern of "merge then
> > > split".  But I'm happy to be proven wrong anytime.
> >
> > But you're not, you're doing a vma_merge() and (without it being clear)
> > hoping it does a merge AND SPLIT in case in an instance where that might be
> > required followed by you doing any further splits required.
>
> I don't quite get the reason that you seem to keep saying this is
> "incorrect input" to vma_merge().  vma_merge() definitely includes splits
> too where it can, to be explicit, for case 4 & 5.

OK so I think there's some kind of confusion here. I'm saying the input
uffd code is sending is invalid. So does a VMA_WARN_ON(). I'm saying having
code that doesn't _very clearly_ send valid inputs is a bad idea.

You may disagree with both of these, that's fine. I'm not saying cases 4/5
are 'wrong' in any sense.

I'll try to address this in a later series, I don't think there's much use
in going round in circles on this. If you dislike that series, you're
welcome to provide negative feedback there, I don't think there's much use
in discussing further here.

>
> It seems to me what you're trying to explain is we shouldn't handle any
> split in vma_merge() so we should move cases 4 & 5 out of vma_merge().  If
> we split first then merge, cases 4 & 5 will become case 2 & 3 after split.
> My question would be: if it worked perfect in the past few years and it
> looks all good enough, why bother..
>

Nope. Not saying that at all.

You (appear to) say 'do a merge then split' is clearer then 'split to valid
input then try to merge'. I was just pointing out that 'merge' is not
always a merge. Sometimes it's a split too, and that it's not quite as
clear as you say.

My whole criticism of this interface is not _this code_, but rather that
the interface is unclear.

We've seen a regression on invalid input to vma_merge() (explicitly I mean
triggering a VM_WARN_ON()) and VMA fragmentation you were not aware of
here, that does not strike me as a great + clear interface.

However again, I don't think this is the place for _that_ disagreement, you
are welcome to respond negatively to the patch series which tries to tackle
this when I propose it.

> >
> > But it's _not your fault_ that this is the standard approach, it's the
> > interface that's wrong absolutely.
> >
> > To me doing this is quite a bit less clear than simply 'splitting so this
> > is mergable first then try to merge' but obviously this current patch does
> > not avoid the previously introduced fragmentation. However it does maintain
> > existing behaviour.
> >
> > >
> > > >
> > > > Liam's raised some issues with the safety of your patches, let me look at
> > > > them when I get a chance, nasty headcold = brain less functional atm.
> > >
> > > Sure, no need to rush.
> > >
> > > >
> > > > I would say for now this fix resolves the issue in a way that should
> > > > emphatically avoid invalid input to vma_merge(), the fragmentation existed
> > > > before so this is not a new issue, so for the time being I think it's ok to
> > > > stay as-is.
> > >
> > > So far I would still suggest having uffd code the same as the rest if
> > > possible.
> > >
> >
> > Absolutely, but in the immediate term, we are seeing VM_WARN_ON()'s here
> > and not from other callers (which is actually surprising).
>
> Not surprising anymore to me, simply because the uffd path was the only one
> got overlooked when converting to maple in commit 69dbe6daf104, as Liam
> rightfully pointed out.
>
> For example, mprotect() has similar handling when the start addr of the
> range can be in the middle of a vma, then in do_mprotect_pkey() there is:
>
> 	prev = vma_prev(&vmi);
> 	if (start > vma->vm_start)
> 		prev = vma;
>
> IMHO it's the same thing.

Yeah it probably is I agree.

>
> >
> > Again, we absolutely do need an abstracted solution for the whole. And
> > that's something I'll try to work on!
> >
> > > I think I'll wait for the other discussion to settle on patch 2 to see
> > > whether I should post a formal patchset.
> >
> > My suggestion is that it's ok to proceed as-is, not because this is the
> > final change that should be applied, but rather because it resolves the
> > issue while maintaining existing behaviour.
> >
> > I will be more than happy to see patches land after this one that replace
> > it if necessary but I think it's safe for this to land as a mainline fixup,
> > even if temporary, is all I am saying :)
>
> I'd hope we can replace this patch with my patch 1 if possible because I
> _think_ this patch is still in hotfixes-unstable (while patch 2 doesn't
> need to copy stable in all cases).  Andrew may know better on how to
> proceed.
>
> If this lands first, I'll probably propose a full revert otherwise as the
> 1st patch of the patchset to post, then make uffd the same as others.

Ah the thanks you get for contributing a regression fix _and_ a repro - a
nack :) will you at least give me some kind of a tag... or buy me a beer?
;)

Honestly it's fine to replace it with something more consistent with the
rest, as I think the _better_ solution is to fix them all at once with a
better abstraction. That can come later as mentioned above.

>
> Before that I'd like to know whether you agree that the new patch 1 (I'll
> fixup the vma_prev() side effect) could be a better solution than the
> current one, no matter whether we need a full revert or not.

In principle it looks fine actually (pending Liam's assessment), case 4/5
should handle it, but I feel like we need a comment (perhaps only in commit
msg) to make clear that we are ensuring that the inputs to vma_merge() are
either clamped to VMAs or case 4/5.

Let's see what Liam thinks, then let me check it locally to give a final
OK, if I may.

>
> Thanks,
>
> >
> > >
> > > >
> > > > >
> > > > > I checked the original commit from Andrea and I found that it _was_ correct:
> > > > >
> > > > > commit 86039bd3b4e6a1129318cbfed4e0a6e001656635
> > > > > Author: Andrea Arcangeli <aarcange@redhat.com>
> > > > > Date:   Fri Sep 4 15:46:31 2015 -0700
> > > > >
> > > > >     userfaultfd: add new syscall to provide memory externalization
> > > > >
> > > > > I had a feeling that it's broken during the recent rework on vma (or maybe
> > > > > even not that close), but I'm not yet sure and need to further check.
> > > > >
> > > > > > >
> > > > > > > By specifying parameters that are compatible with N even though you're only
> > > > > > > partially spanning C?
> > > > > > >
> > > > > > > This is crazy, and isn't how this should be used. vma_merge() is not
> > > > > > > supposed to do partial merges. If it works (presumably it does) this is not
> > > > > > > by design unless I've lost my mind and I (and others) have somehow not
> > > > > > > noticed this??
> > > > > > >
> > > > > > > I think you're right that now we'll end up with more fragmentation, but
> > > > > > > what you're suggesting is not how vma_merge() is supposed to work.
> > > > > > >
> > > > > > > As I said above, giving vma_merge() invalid parameters is very dangerous as
> > > > > > > you could end up merging over empty ranges in theory (and could otherwise
> > > > > > > have corruption).
> > > > > > >
> > > > > > > I guess we should probably be passing 0 to the last parameter in
> > > > > > > split_vma() here then to ensure we do a merge pass too. Will experiment
> > > > > > > with this.
> > > > > > >
> > > > > > > I'm confused as to how the remove from case 8 is not proceeding. I'll look
> > > > > > > into this some more...
> > > > > > >
> > > > > > > Happy to be corrected if I'm misconstruing this!
> > > > > > >
> > > > > >
> > > > > > OK, so I wrote a small program to do perform exactly this case [0] and it seems
> > > > > > that the outcome is the same before and after this patch - vma_merge() is
> > > > > > clearly rejecting the case 8 merge (phew!) and in both instances you end up with
> > > > > > 3 VMAs.
> > > > > >
> > > > > > So this patch doesn't change this behaviour and everything is as it was
> > > > > > before. Ideally we'd let it go for another pass, so maybe we should change the
> > > > > > split to add a new VMA _afterwards_. Will experiment with that, separately.
> > > > > >
> > > > > > But looks like the patch is good as it is.
> > > > > >
> > > > > > (if you notice something wrong with the repro, etc. do let me know!)
> > > > > >
> > > > > > [0]: https://gist.github.com/lorenzo-stoakes/a11a10f5f479e7a977fc456331266e0e
> > > > >
> > > > > I think your test case is fine, but... no, this is still not expected. The
> > > > > vma should just merge.
> > > > >
> > > > > So I have another closer look on this specific issue, it seems we have a
> > > > > long standing bug on pgoff calculation here when passing that to
> > > > > vma_merge().  I've got another patch attached to show what I meant.
> > > > >
> > > > > To summarize.. now I've got two patches attached:
> > > > >
> > > > > Patch 1 is something I'd like to propose to replace this patch that fixes
> > > > > incorrect use of vma_merge() so it should also eliminate the assertion
> > > > > being triggered (I still think this is a regression but I need to check
> > > > > which I will do later; I'm not super familiar with maple tree work, maybe
> > > > > you or Liam can quickly spot).
> > > > >
> > > > > Patch 2 fixes the long standing issue of vma not being able to merge in
> > > > > above case, and with patch 2 applied it should start merging all right.
> > > > >
> > > > > Please have a look, thanks.
> > > > >
> > > > > ---8<---
> > > > > From 6bc39028bba246394bb0bafdaeaab7b8dfd1694f Mon Sep 17 00:00:00 2001
> > > > > From: Peter Xu <peterx@redhat.com>
> > > > > Date: Tue, 16 May 2023 09:03:22 -0400
> > > > > Subject: [PATCH 1/2] mm/uffd: Fix vma operation where start addr cuts part of
> > > > >  vma
> > > > >
> > > > > It seems vma merging with uffd paths is broken with either
> > > > > register/unregister, where right now we can feed wrong parameters to
> > > > > vma_merge() and it's found by recent patch which moved asserts upwards in
> > > > > vma_merge():
> > > > >
> > > > > https://lore.kernel.org/all/ZFunF7DmMdK05MoF@FVFF77S0Q05N.cambridge.arm.com/
> > > > >
> > > > > The problem is in the current code base we didn't fixup "prev" for the case
> > > > > where "start" address can be within the "prev" vma section.  In that case
> > > > > we should have "prev" points to the current vma rather than the previous
> > > > > one when feeding to vma_merge().
> > > > >
> > > > > This will eliminate the report and make sure vma_merge() calls will become
> > > > > legal again.
> > > > >
> > > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > > ---
> > > > >  fs/userfaultfd.c | 27 +++++++++++++++++++++------
> > > > >  1 file changed, 21 insertions(+), 6 deletions(-)
> > > > >
> > > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > > > index 0fd96d6e39ce..7eb88bc74d00 100644
> > > > > --- a/fs/userfaultfd.c
> > > > > +++ b/fs/userfaultfd.c
> > > > > @@ -1458,10 +1458,17 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > > >  	BUG_ON(!found);
> > > > >
> > > > >  	vma_iter_set(&vmi, start);
> > > > > -	prev = vma_prev(&vmi);
> > > > > +	vma = vma_find(&vmi, end);
> > > > > +	if (!vma)
> > > > > +		goto out_unlock;
> > > > > +
> > > > > +	if (vma->vm_start < start)
> > > > > +		prev = vma;
> > > > > +	else
> > > > > +		prev = vma_prev(&vmi);
> > > > >
> > > > >  	ret = 0;
> > > > > -	for_each_vma_range(vmi, vma, end) {
> > > > > +	do {
> > > > >  		cond_resched();
> > > > >
> > > > >  		BUG_ON(!vma_can_userfault(vma, vm_flags));
> > > > > @@ -1517,7 +1524,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > > >  	skip:
> > > > >  		prev = vma;
> > > > >  		start = vma->vm_end;
> > > > > -	}
> > > > > +	} for_each_vma_range(vmi, vma, end);
> > > > >
> > > > >  out_unlock:
> > > > >  	mmap_write_unlock(mm);
> > > > > @@ -1624,9 +1631,17 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> > > > >  	BUG_ON(!found);
> > > > >
> > > > >  	vma_iter_set(&vmi, start);
> > > > > -	prev = vma_prev(&vmi);
> > > > > +	vma = vma_find(&vmi, end);
> > > > > +	if (!vma)
> > > > > +		goto out_unlock;
> > > > > +
> > > > > +	if (vma->vm_start < start)
> > > > > +		prev = vma;
> > > > > +	else
> > > > > +		prev = vma_prev(&vmi);
> > > > > +
> > > > >  	ret = 0;
> > > > > -	for_each_vma_range(vmi, vma, end) {
> > > > > +	do {
> > > > >  		cond_resched();
> > > > >
> > > > >  		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
> > > > > @@ -1692,7 +1707,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> > > > >  	skip:
> > > > >  		prev = vma;
> > > > >  		start = vma->vm_end;
> > > > > -	}
> > > > > +	} for_each_vma_range(vmi, vma, end);
> > > > >
> > > > >  out_unlock:
> > > > >  	mmap_write_unlock(mm);
> > > > > --
> > > > > 2.39.1
> > > > >
> > > > > From bf61f3c937e9e2ab96ab2bed0005cb7dc74db231 Mon Sep 17 00:00:00 2001
> > > > > From: Peter Xu <peterx@redhat.com>
> > > > > Date: Tue, 16 May 2023 09:39:38 -0400
> > > > > Subject: [PATCH 2/2] mm/uffd: Allow vma to merge as much as possible
> > > > >
> > > > > We used to not pass in the pgoff correctly when register/unregister uffd
> > > > > regions, it caused incorrect behavior on vma merging.
> > > > >
> > > > > For example, when we have:
> > > > >
> > > > >   vma1(range 0-9, with uffd), vma2(range 10-19, no uffd)
> > > > >
> > > > > Then someone unregisters uffd on range (5-9), it should become:
> > > > >
> > > > >   vma1(range 0-4, with uffd), vma2(range 5-19, no uffd)
> > > > >
> > > > > But with current code it's:
> > > > >
> > > > >   vma1(range 0-4, with uffd), vma3(range 5-9, no uffd), vma2(range 10-19, no uffd)
> > > > >
> > > > > This patch allows such merge to happen correctly.
> > > > >
> > > > > This behavior seems to have existed since the 1st day of uffd, keep it just
> > > > > as a performance optmization and not copy stable.
> > > > >
> > > > > Cc: Andrea Arcangeli <aarcange@redhat.com>
> > > > > Cc: Mike Rapoport (IBM) <rppt@kernel.org>
> > > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > > ---
> > > > >  fs/userfaultfd.c | 8 ++++++--
> > > > >  1 file changed, 6 insertions(+), 2 deletions(-)
> > > > >
> > > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > > > index 7eb88bc74d00..891048b4799f 100644
> > > > > --- a/fs/userfaultfd.c
> > > > > +++ b/fs/userfaultfd.c
> > > > > @@ -1332,6 +1332,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > > >  	bool basic_ioctls;
> > > > >  	unsigned long start, end, vma_end;
> > > > >  	struct vma_iterator vmi;
> > > > > +	pgoff_t pgoff;
> > > > >
> > > > >  	user_uffdio_register = (struct uffdio_register __user *) arg;
> > > > >
> > > > > @@ -1489,8 +1490,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > > >  		vma_end = min(end, vma->vm_end);
> > > > >
> > > > >  		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
> > > > > +		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> > > > >  		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
> > > > > -				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> > > > > +				 vma->anon_vma, vma->vm_file, pgoff,
> > > > >  				 vma_policy(vma),
> > > > >  				 ((struct vm_userfaultfd_ctx){ ctx }),
> > > > >  				 anon_vma_name(vma));
> > > > > @@ -1570,6 +1572,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> > > > >  	unsigned long start, end, vma_end;
> > > > >  	const void __user *buf = (void __user *)arg;
> > > > >  	struct vma_iterator vmi;
> > > > > +	pgoff_t pgoff;
> > > > >
> > > > >  	ret = -EFAULT;
> > > > >  	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
> > > > > @@ -1677,8 +1680,9 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> > > > >  			uffd_wp_range(vma, start, vma_end - start, false);
> > > > >
> > > > >  		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
> > > > > +		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> > > > >  		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
> > > > > -				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> > > > > +				 vma->anon_vma, vma->vm_file, pgoff,
> > > > >  				 vma_policy(vma),
> > > > >  				 NULL_VM_UFFD_CTX, anon_vma_name(vma));
> > > > >  		if (prev) {
> > > > > --
> > > > > 2.39.1
> > > > > ---8<---
> > > > >
> > > > > --
> > > > > Peter Xu
> > > > >
> > > >
> > >
> > > --
> > > Peter Xu
> > >
> > >
> >
>
> --
> Peter Xu
>
>
  
Peter Xu May 16, 2023, 10:32 p.m. UTC | #15
On Tue, May 16, 2023 at 11:15:54PM +0100, Lorenzo Stoakes wrote:
> I'll try to address this in a later series, I don't think there's much use
> in going round in circles on this. If you dislike that series, you're
> welcome to provide negative feedback there, I don't think there's much use
> in discussing further here.

I'm happy to read it, sorry if any of my wording was intruding, I didn't
mean so.

I think there's chance at least on generalizing vma flag change cases, even
though I'm not sure whether vma_merge() needs change.  Maybe it can be
another layer on top of it while keeping vma_merge() as is, but I can't tell.

> We've seen a regression on invalid input to vma_merge() (explicitly I mean
> triggering a VM_WARN_ON()) and VMA fragmentation you were not aware of
> here, that does not strike me as a great + clear interface.

Yes, the code needs time to read through, even the interface.  I don't
think I fully digested that myself.

[...]

> Ah the thanks you get for contributing a regression fix _and_ a repro - a
> nack :) will you at least give me some kind of a tag... or buy me a beer?
> ;)

I can. :)

We actually met on the conference, if I'll be able to meet you somewhere
that's what I can do.

I was probably hashing in the words, sorry about that if so, and thanks for
looking at this issue!  I appreciate both your assertion patch and the png
documentation file.

It's just that I feel irresponsible when we were talking about having vma
not merged correctly but then the discussion tried to end at there saying
it kept so so it's fine.  IMHO we should look into that problem or
something could be missing here.  Then when I was looking into that
not-merged issue I found that it's not uffd that's special.

> > Before that I'd like to know whether you agree that the new patch 1 (I'll
> > fixup the vma_prev() side effect) could be a better solution than the
> > current one, no matter whether we need a full revert or not.
> 
> In principle it looks fine actually (pending Liam's assessment), case 4/5
> should handle it, but I feel like we need a comment (perhaps only in commit
> msg) to make clear that we are ensuring that the inputs to vma_merge() are
> either clamped to VMAs or case 4/5.
> 
> Let's see what Liam thinks, then let me check it locally to give a final
> OK, if I may.

Sounds perfect here.  Thanks a lot.
  
Liam R. Howlett May 16, 2023, 10:38 p.m. UTC | #16
* Peter Xu <peterx@redhat.com> [230516 17:40]:
> On Tue, May 16, 2023 at 10:01:06PM +0100, Lorenzo Stoakes wrote:
> > On Tue, May 16, 2023 at 04:30:11PM -0400, Peter Xu wrote:
> > > On Tue, May 16, 2023 at 08:25:13PM +0100, Lorenzo Stoakes wrote:
> > > > On Tue, May 16, 2023 at 11:06:40AM -0400, Peter Xu wrote:
> > > > > On Tue, May 16, 2023 at 12:07:11AM +0100, Lorenzo Stoakes wrote:
> > > > > > On Mon, May 15, 2023 at 11:04:27PM +0100, Lorenzo Stoakes wrote:
> > > > > > [snip]
> > > > > > > > Could you explain a bit why we don't need to merge in this case?
> > > > > > > >
> > > > > > > > I'm considering, for example, when we have:
> > > > > > > >
> > > > > > > >   vma1(range 0-9, with uffd), vma2(range 10-19, no uffd)
> > > > > > > >
> > > > > > > > Then someone unregisters uffd on range (5-9), iiuc it should become:
> > > > > > > >
> > > > > > > >   vma1(range 0-4, with uffd), vma2(range 5-19, no uffd)
> > > > > > > >
> > > > > > > > But if no merge here it's:
> > > > > > > >
> > > > > > > >   vma1(range 0-4, with uffd), vma3(range 5-9, no uffd), vma2(range 10-19, no uffd)
> > > > > > > >
> > > > > > > > Maybe I missed something?
> > > > > > > >
> > > > > > >
> > > > > > > There's something really, really wrong with this. It simply isn't valid to
> > > > > > > invoke vma_merge() over an existing VMA that != prev where you're not
> > > > > > > specifying addr = vma->vm_start, end == vma->vm_end.
> > > > > > >
> > > > > > > This seems like you're relying on:-
> > > > > > >
> > > > > > >   ***
> > > > > > > CCCCCNNNNN -> CCNNNNNNNN
> > > > >
> > > > > I had a closer look today, I still think this patch is not really the right
> > > > > one.  The split/merge order is something we use everywhere and I am not
> > > > > convinced it must change as drastic.  At least so far it still seems to me
> > > > > if we do with what current patch proposed we can have vma fragmentations.
> > > >
> > > > 'something we use everywhere' is not an argument (speak to Willy about
> > > > folios), vma_merge() expects valid input, relying on it _happening_ to be
> > >
> > > I still think it's an argument.
> > >
> > > I believe Matthew tried hard to justify he's correct in that aspect when
> > > changing "something we used everywhere". Blindly referencing it doesn't
> > > help much on a technical discussion.
> > >
> > > If we have similar code that handles similar things, IMHO you need a reason
> > > to modify one of them to not like the other.
> > >
> > > If you think what you proposed is better, please consider (1) justify why
> > > it's better, and then (2) also apply it to all the rest places where
> > > applicable.  Please refer to my reply to Liam on the other use cases of
> > > vma_merge().
> > >
> > > Said that, I apprecaite a lot on your assert patch that found this problem.
> > >
> > > > ok or to fail in ways that _happen_ not to cause big problems is not right.
> > > >
> > > > This is just further evidence that the vma_merge() interface is
> > > > fundamentally broken. Implicitly assuming you will only get a partial prev
> > > > overlap merge is far from intuitive.
> > > >
> > > > I am definitely going to try to do a series addressing vma_merge() horrors
> > > > because I feel like we need a generic means of doing this split/merge pattern.
> > >
> > > It'll be great if you can make it better.
> > >
> > > >
> > > > >
> > > > > I think I see what you meant, but here I think it's a legal case where we
> > > > > should have PPPP rather than CCCC (PPPPPNNNN --> PPNNNNNNNN).
> > > > >
> > > > > To be explicit, for register I think it _should_ be the case 0 where we
> > > > > cannot merge (note: the current code is indeed wrong though, see below):
> > > > >
> > > > >    ****
> > > > >   PPPPPPNNNNNN
> > > > >   cannot merge
> > > > >
> > > > > While for the unregister case here it's case 4:
> > > > >
> > > > >     ****
> > > > >   PPPPPPNNNNNN
> > > > >   might become
> > > > >   PPNNNNNNNNNN
> > > > >   case 4 below
> > > > >
> > > > > Here the problem is not that we need to do split then merge (I think it'll
> > > > > have the problem of vma defragmentation here), the problem is we simply
> > > > > passed in the wrong "prev" vma pointer, IMHO.  I've patches attached
> > > > > showing what I meant.
> > > >
> > > > Yeah if you do it with prev = vma this form should _probably_ work, that's
> > > > a good point. This _is_ a case (see https://ljs.io/vma_merge_cases.png for
> > > > nice diagram of cases btw :), like 5, where we actually do split and merge
> > > > at the same time.
> > >
> > > It's not something I came up with myself, it's just that I started looking
> > > back to see what people did and trying to understand why they did it.
> > > Normally people did things with good reasons.
> > >
> > 
> > Yup, it's an iffy pattern in each case. I can see why you chose to do it,
> > it's not a criticism of that, it's just that the incorrect (but
> > accidentally ok to be incorrect seemingly) input is made more obvious by
> > recent changes.
> > 
> > > So far it seems clearer at least to me to keep this pattern of "merge then
> > > split".  But I'm happy to be proven wrong anytime.
> > 
> > But you're not, you're doing a vma_merge() and (without it being clear)
> > hoping it does a merge AND SPLIT in case in an instance where that might be
> > required followed by you doing any further splits required.
> 
> I don't quite get the reason that you seem to keep saying this is
> "incorrect input" to vma_merge().  vma_merge() definitely includes splits
> too where it can, to be explicit, for case 4 & 5.
> 
> It seems to me what you're trying to explain is we shouldn't handle any
> split in vma_merge() so we should move cases 4 & 5 out of vma_merge().  If
> we split first then merge, cases 4 & 5 will become case 2 & 3 after split.

We don't split in case 4 or 5 - we adjust the existing VMA limits.  We
don't actually handle any splits in vma_merge().  I think splitting
first would change 4 & 5 to 7 & 8?  2 & 3 would require a split and
munmap, right?

> My question would be: if it worked perfect in the past few years and it
> looks all good enough, why bother..

I suspect, but it's not clear (like all of this), that the other
arguments to vma_merge() is ruling out this potential hazard I thought
existed.

> 
> > 
> > But it's _not your fault_ that this is the standard approach, it's the
> > interface that's wrong absolutely.
> > 
> > To me doing this is quite a bit less clear than simply 'splitting so this
> > is mergable first then try to merge' but obviously this current patch does
> > not avoid the previously introduced fragmentation. However it does maintain
> > existing behaviour.
> > 
> > >
> > > >
> > > > Liam's raised some issues with the safety of your patches, let me look at
> > > > them when I get a chance, nasty headcold = brain less functional atm.
> > >
> > > Sure, no need to rush.
> > >
> > > >
> > > > I would say for now this fix resolves the issue in a way that should
> > > > emphatically avoid invalid input to vma_merge(), the fragmentation existed
> > > > before so this is not a new issue, so for the time being I think it's ok to
> > > > stay as-is.
> > >
> > > So far I would still suggest having uffd code the same as the rest if
> > > possible.
> > >
> > 
> > Absolutely, but in the immediate term, we are seeing VM_WARN_ON()'s here
> > and not from other callers (which is actually surprising).
> 
> Not surprising anymore to me, simply because the uffd path was the only one
> got overlooked when converting to maple in commit 69dbe6daf104, as Liam
> rightfully pointed out.
> 
> For example, mprotect() has similar handling when the start addr of the
> range can be in the middle of a vma, then in do_mprotect_pkey() there is:
> 
> 	prev = vma_prev(&vmi);
> 	if (start > vma->vm_start)
> 		prev = vma;
> 
> IMHO it's the same thing.
> 
> > 
> > Again, we absolutely do need an abstracted solution for the whole. And
> > that's something I'll try to work on!
> > 
> > > I think I'll wait for the other discussion to settle on patch 2 to see
> > > whether I should post a formal patchset.
> > 
> > My suggestion is that it's ok to proceed as-is, not because this is the
> > final change that should be applied, but rather because it resolves the
> > issue while maintaining existing behaviour.
> > 
> > I will be more than happy to see patches land after this one that replace
> > it if necessary but I think it's safe for this to land as a mainline fixup,
> > even if temporary, is all I am saying :)
> 
> I'd hope we can replace this patch with my patch 1 if possible because I
> _think_ this patch is still in hotfixes-unstable (while patch 2 doesn't
> need to copy stable in all cases).  Andrew may know better on how to
> proceed.
> 
> If this lands first, I'll probably propose a full revert otherwise as the
> 1st patch of the patchset to post, then make uffd the same as others.
> 
> Before that I'd like to know whether you agree that the new patch 1 (I'll
> fixup the vma_prev() side effect) could be a better solution than the
> current one, no matter whether we need a full revert or not.
> 
> Thanks,
> 
> > 
> > >
> > > >
> > > > >
> > > > > I checked the original commit from Andrea and I found that it _was_ correct:
> > > > >
> > > > > commit 86039bd3b4e6a1129318cbfed4e0a6e001656635
> > > > > Author: Andrea Arcangeli <aarcange@redhat.com>
> > > > > Date:   Fri Sep 4 15:46:31 2015 -0700
> > > > >
> > > > >     userfaultfd: add new syscall to provide memory externalization
> > > > >
> > > > > I had a feeling that it's broken during the recent rework on vma (or maybe
> > > > > even not that close), but I'm not yet sure and need to further check.
> > > > >
> > > > > > >
> > > > > > > By specifying parameters that are compatible with N even though you're only
> > > > > > > partially spanning C?
> > > > > > >
> > > > > > > This is crazy, and isn't how this should be used. vma_merge() is not
> > > > > > > supposed to do partial merges. If it works (presumably it does) this is not
> > > > > > > by design unless I've lost my mind and I (and others) have somehow not
> > > > > > > noticed this??
> > > > > > >
> > > > > > > I think you're right that now we'll end up with more fragmentation, but
> > > > > > > what you're suggesting is not how vma_merge() is supposed to work.
> > > > > > >
> > > > > > > As I said above, giving vma_merge() invalid parameters is very dangerous as
> > > > > > > you could end up merging over empty ranges in theory (and could otherwise
> > > > > > > have corruption).
> > > > > > >
> > > > > > > I guess we should probably be passing 0 to the last parameter in
> > > > > > > split_vma() here then to ensure we do a merge pass too. Will experiment
> > > > > > > with this.
> > > > > > >
> > > > > > > I'm confused as to how the remove from case 8 is not proceeding. I'll look
> > > > > > > into this some more...
> > > > > > >
> > > > > > > Happy to be corrected if I'm misconstruing this!
> > > > > > >
> > > > > >
> > > > > > OK, so I wrote a small program to do perform exactly this case [0] and it seems
> > > > > > that the outcome is the same before and after this patch - vma_merge() is
> > > > > > clearly rejecting the case 8 merge (phew!) and in both instances you end up with
> > > > > > 3 VMAs.
> > > > > >
> > > > > > So this patch doesn't change this behaviour and everything is as it was
> > > > > > before. Ideally we'd let it go for another pass, so maybe we should change the
> > > > > > split to add a new VMA _afterwards_. Will experiment with that, separately.
> > > > > >
> > > > > > But looks like the patch is good as it is.
> > > > > >
> > > > > > (if you notice something wrong with the repro, etc. do let me know!)
> > > > > >
> > > > > > [0]: https://gist.github.com/lorenzo-stoakes/a11a10f5f479e7a977fc456331266e0e
> > > > >
> > > > > I think your test case is fine, but... no, this is still not expected. The
> > > > > vma should just merge.
> > > > >
> > > > > So I have another closer look on this specific issue, it seems we have a
> > > > > long standing bug on pgoff calculation here when passing that to
> > > > > vma_merge().  I've got another patch attached to show what I meant.
> > > > >
> > > > > To summarize.. now I've got two patches attached:
> > > > >
> > > > > Patch 1 is something I'd like to propose to replace this patch that fixes
> > > > > incorrect use of vma_merge() so it should also eliminate the assertion
> > > > > being triggered (I still think this is a regression but I need to check
> > > > > which I will do later; I'm not super familiar with maple tree work, maybe
> > > > > you or Liam can quickly spot).
> > > > >
> > > > > Patch 2 fixes the long standing issue of vma not being able to merge in
> > > > > above case, and with patch 2 applied it should start merging all right.
> > > > >
> > > > > Please have a look, thanks.
> > > > >
> > > > > ---8<---
> > > > > From 6bc39028bba246394bb0bafdaeaab7b8dfd1694f Mon Sep 17 00:00:00 2001
> > > > > From: Peter Xu <peterx@redhat.com>
> > > > > Date: Tue, 16 May 2023 09:03:22 -0400
> > > > > Subject: [PATCH 1/2] mm/uffd: Fix vma operation where start addr cuts part of
> > > > >  vma
> > > > >
> > > > > It seems vma merging with uffd paths is broken with either
> > > > > register/unregister, where right now we can feed wrong parameters to
> > > > > vma_merge() and it's found by recent patch which moved asserts upwards in
> > > > > vma_merge():
> > > > >
> > > > > https://lore.kernel.org/all/ZFunF7DmMdK05MoF@FVFF77S0Q05N.cambridge.arm.com/
> > > > >
> > > > > The problem is in the current code base we didn't fixup "prev" for the case
> > > > > where "start" address can be within the "prev" vma section.  In that case
> > > > > we should have "prev" points to the current vma rather than the previous
> > > > > one when feeding to vma_merge().
> > > > >
> > > > > This will eliminate the report and make sure vma_merge() calls will become
> > > > > legal again.
> > > > >
> > > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > > ---
> > > > >  fs/userfaultfd.c | 27 +++++++++++++++++++++------
> > > > >  1 file changed, 21 insertions(+), 6 deletions(-)
> > > > >
> > > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > > > index 0fd96d6e39ce..7eb88bc74d00 100644
> > > > > --- a/fs/userfaultfd.c
> > > > > +++ b/fs/userfaultfd.c
> > > > > @@ -1458,10 +1458,17 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > > >  	BUG_ON(!found);
> > > > >
> > > > >  	vma_iter_set(&vmi, start);
> > > > > -	prev = vma_prev(&vmi);
> > > > > +	vma = vma_find(&vmi, end);
> > > > > +	if (!vma)
> > > > > +		goto out_unlock;
> > > > > +
> > > > > +	if (vma->vm_start < start)
> > > > > +		prev = vma;
> > > > > +	else
> > > > > +		prev = vma_prev(&vmi);
> > > > >
> > > > >  	ret = 0;
> > > > > -	for_each_vma_range(vmi, vma, end) {
> > > > > +	do {
> > > > >  		cond_resched();
> > > > >
> > > > >  		BUG_ON(!vma_can_userfault(vma, vm_flags));
> > > > > @@ -1517,7 +1524,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > > >  	skip:
> > > > >  		prev = vma;
> > > > >  		start = vma->vm_end;
> > > > > -	}
> > > > > +	} for_each_vma_range(vmi, vma, end);
> > > > >
> > > > >  out_unlock:
> > > > >  	mmap_write_unlock(mm);
> > > > > @@ -1624,9 +1631,17 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> > > > >  	BUG_ON(!found);
> > > > >
> > > > >  	vma_iter_set(&vmi, start);
> > > > > -	prev = vma_prev(&vmi);
> > > > > +	vma = vma_find(&vmi, end);
> > > > > +	if (!vma)
> > > > > +		goto out_unlock;
> > > > > +
> > > > > +	if (vma->vm_start < start)
> > > > > +		prev = vma;
> > > > > +	else
> > > > > +		prev = vma_prev(&vmi);
> > > > > +
> > > > >  	ret = 0;
> > > > > -	for_each_vma_range(vmi, vma, end) {
> > > > > +	do {
> > > > >  		cond_resched();
> > > > >
> > > > >  		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
> > > > > @@ -1692,7 +1707,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> > > > >  	skip:
> > > > >  		prev = vma;
> > > > >  		start = vma->vm_end;
> > > > > -	}
> > > > > +	} for_each_vma_range(vmi, vma, end);
> > > > >
> > > > >  out_unlock:
> > > > >  	mmap_write_unlock(mm);
> > > > > --
> > > > > 2.39.1
> > > > >
> > > > > From bf61f3c937e9e2ab96ab2bed0005cb7dc74db231 Mon Sep 17 00:00:00 2001
> > > > > From: Peter Xu <peterx@redhat.com>
> > > > > Date: Tue, 16 May 2023 09:39:38 -0400
> > > > > Subject: [PATCH 2/2] mm/uffd: Allow vma to merge as much as possible
> > > > >
> > > > > We used to not pass in the pgoff correctly when register/unregister uffd
> > > > > regions, it caused incorrect behavior on vma merging.
> > > > >
> > > > > For example, when we have:
> > > > >`
> > > > >   vma1(range 0-9, with uffd), vma2(range 10-19, no uffd)
> > > > >
> > > > > Then someone unregisters uffd on range (5-9), it should become:
> > > > >
> > > > >   vma1(range 0-4, with uffd), vma2(range 5-19, no uffd)
> > > > >
> > > > > But with current code it's:
> > > > >
> > > > >   vma1(range 0-4, with uffd), vma3(range 5-9, no uffd), vma2(range 10-19, no uffd)
> > > > >
> > > > > This patch allows such merge to happen correctly.
> > > > >
> > > > > This behavior seems to have existed since the 1st day of uffd, keep it just
> > > > > as a performance optmization and not copy stable.
> > > > >
> > > > > Cc: Andrea Arcangeli <aarcange@redhat.com>
> > > > > Cc: Mike Rapoport (IBM) <rppt@kernel.org>
> > > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > > ---
> > > > >  fs/userfaultfd.c | 8 ++++++--
> > > > >  1 file changed, 6 insertions(+), 2 deletions(-)
> > > > >
> > > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > > > index 7eb88bc74d00..891048b4799f 100644
> > > > > --- a/fs/userfaultfd.c
> > > > > +++ b/fs/userfaultfd.c
> > > > > @@ -1332,6 +1332,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > > >  	bool basic_ioctls;
> > > > >  	unsigned long start, end, vma_end;
> > > > >  	struct vma_iterator vmi;
> > > > > +	pgoff_t pgoff;
> > > > >
> > > > >  	user_uffdio_register = (struct uffdio_register __user *) arg;
> > > > >
> > > > > @@ -1489,8 +1490,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > > >  		vma_end = min(end, vma->vm_end);
> > > > >
> > > > >  		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
> > > > > +		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> > > > >  		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
> > > > > -				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> > > > > +				 vma->anon_vma, vma->vm_file, pgoff,
> > > > >  				 vma_policy(vma),
> > > > >  				 ((struct vm_userfaultfd_ctx){ ctx }),
> > > > >  				 anon_vma_name(vma));
> > > > > @@ -1570,6 +1572,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> > > > >  	unsigned long start, end, vma_end;
> > > > >  	const void __user *buf = (void __user *)arg;
> > > > >  	struct vma_iterator vmi;
> > > > > +	pgoff_t pgoff;
> > > > >
> > > > >  	ret = -EFAULT;
> > > > >  	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
> > > > > @@ -1677,8 +1680,9 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> > > > >  			uffd_wp_range(vma, start, vma_end - start, false);
> > > > >
> > > > >  		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
> > > > > +		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> > > > >  		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
> > > > > -				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> > > > > +				 vma->anon_vma, vma->vm_file, pgoff,
> > > > >  				 vma_policy(vma),
> > > > >  				 NULL_VM_UFFD_CTX, anon_vma_name(vma));
> > > > >  		if (prev) {
> > > > > --
> > > > > 2.39.1
> > > > > ---8<---
> > > > >
> > > > > --
> > > > > Peter Xu
> > > > >
> > > >
> > >
> > > --
> > > Peter Xu
> > >
> > >
> > 
> 
> -- 
> Peter Xu
>
  
Peter Xu May 16, 2023, 10:51 p.m. UTC | #17
On Tue, May 16, 2023 at 06:38:30PM -0400, Liam R. Howlett wrote:
> > It seems to me what you're trying to explain is we shouldn't handle any
> > split in vma_merge() so we should move cases 4 & 5 out of vma_merge().  If
> > we split first then merge, cases 4 & 5 will become case 2 & 3 after split.
> 
> We don't split in case 4 or 5 - we adjust the existing VMA limits.  We
> don't actually handle any splits in vma_merge().  I think splitting
> first would change 4 & 5 to 7 & 8?  2 & 3 would require a split and
> munmap, right?

Right, I referenced to the wrong numbers..  2 & 3 are when CUR (CCCC) is
empty and newly mapped in, if split happened it means CUR (CCCC) exists
which is 7 & 8 correspondingly.

> 
> > My question would be: if it worked perfect in the past few years and it
> > looks all good enough, why bother..
> 
> I suspect, but it's not clear (like all of this), that the other
> arguments to vma_merge() is ruling out this potential hazard I thought
> existed.

Some more elaborations on this one would be appreciated.

Thanks,
  
Liam R. Howlett May 16, 2023, 10:52 p.m. UTC | #18
* Peter Xu <peterx@redhat.com> [230516 16:12]:
...

> > > 
> > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > ---
> > >  fs/userfaultfd.c | 27 +++++++++++++++++++++------
> > >  1 file changed, 21 insertions(+), 6 deletions(-)
> > > 
> > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > index 0fd96d6e39ce..7eb88bc74d00 100644
> > > --- a/fs/userfaultfd.c
> > > +++ b/fs/userfaultfd.c
> > > @@ -1458,10 +1458,17 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > >  	BUG_ON(!found);
> > >  
> > >  	vma_iter_set(&vmi, start);
> > > -	prev = vma_prev(&vmi);
> > > +	vma = vma_find(&vmi, end);
> > > +	if (!vma)
> > > +		goto out_unlock;
> > > +
> > > +	if (vma->vm_start < start)
> > > +		prev = vma;
> > > +	else
> > > +		prev = vma_prev(&vmi);
> > >  
> > >  	ret = 0;
> > > -	for_each_vma_range(vmi, vma, end) {
> > > +	do {
> > 
> > The iterator may be off by one, depending on if vma_prev() is called or
> > not.
> > 
> > Perhaps:
> > 	prev = vma_prev(&vmi); /* could be wrong, or null */
> > 	vma = vma_find(&vmi, end);
> > 	if (!vma)
> > 		goto out_unlock;
> > 
> > 	if (vma->vm_start < start)
> > 		prev = vma;
> > 
> > now we know we are at vma with the iterator..
> > 	ret = 0
> > 	do{
> > 	...
> 
> Will do, thanks.
> 
> I think I got trapped similarly when I was looking at xarray months ago
> where xarray also had similar side effects to have off-by-one the iterator
> behavior.
> 
> Do you think it'll make sense to have something describing such side
> effects for maple tree (or the current vma api), or.. maybe there's already
> some but I just didn't really know?

Well, it's an iterator so I though a position was implied.  But I think
the documentation is lacking on the vma_iterator interface and I should
fix that.

...

> > > From: Peter Xu <peterx@redhat.com>
> > > Date: Tue, 16 May 2023 09:39:38 -0400
> > > Subject: [PATCH 2/2] mm/uffd: Allow vma to merge as much as possible
> > > 
> > > We used to not pass in the pgoff correctly when register/unregister uffd
> > > regions, it caused incorrect behavior on vma merging.
> > > 
> > > For example, when we have:
> > > 
> > >   vma1(range 0-9, with uffd), vma2(range 10-19, no uffd)
> > > 
> > > Then someone unregisters uffd on range (5-9), it should become:
> > > 
> > >   vma1(range 0-4, with uffd), vma2(range 5-19, no uffd)
> > > 
> > > But with current code it's:
> > > 
> > >   vma1(range 0-4, with uffd), vma3(range 5-9, no uffd), vma2(range 10-19, no uffd)
> > > 
> > > This patch allows such merge to happen correctly.
> > > 
> > > This behavior seems to have existed since the 1st day of uffd, keep it just
> > > as a performance optmization and not copy stable.
> > > 
> > > Cc: Andrea Arcangeli <aarcange@redhat.com>
> > > Cc: Mike Rapoport (IBM) <rppt@kernel.org>
> > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > ---
> > >  fs/userfaultfd.c | 8 ++++++--
> > >  1 file changed, 6 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > index 7eb88bc74d00..891048b4799f 100644
> > > --- a/fs/userfaultfd.c
> > > +++ b/fs/userfaultfd.c
> > > @@ -1332,6 +1332,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > >  	bool basic_ioctls;
> > >  	unsigned long start, end, vma_end;
> > >  	struct vma_iterator vmi;
> > > +	pgoff_t pgoff;
> > >  
> > >  	user_uffdio_register = (struct uffdio_register __user *) arg;
> > >  
> > > @@ -1489,8 +1490,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > >  		vma_end = min(end, vma->vm_end);
> > >  
> > >  		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
> > > +		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> > 
> > I don't think this is safe.  You are telling vma_merge() something that
> > is not true and will result in can_vma_merge_before() passing.  I mean,
> > sure it will become true after you split (unless you can't?), but I
> > don't know if you can just merge a VMA that doesn't pass
> > can_vma_merge_before(), even for a short period?
> 
> I must admit I'm not really that handy yet to vma codes, so I could miss
> something obvious.
> 
> My reasoning comes from two parts that this pgoff looks all fine:
> 
> 1) It's documented in vma_merge() in that:
> 
>  * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
>  * figure out ...
> 
>   So fundamentally this pgoff is part of the mapping request paired with
>   all the rest of the information.  AFAICT it means it must match with what
>   "addr" is describing in VA address space.  That's why I think offseting
>   it makes sense here.
> 
>   It also matches my understanding in vma_merge() code on how the pgoff is
>   used.
> 
> 2) Uffd is nothing special in this regard, namely:
> 
>    mbind_range():
> 
> 	pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT);
> 	merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags,
> 			 vma->anon_vma, vma->vm_file, pgoff, new_pol,
> 			 vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> 
>    mlock_fixup():
>    
> 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> 	*prev = vma_merge(vmi, mm, *prev, start, end, newflags,
> 			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
> 			vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> 
>    mprotect_fixup():
> 
> 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> 	*pprev = vma_merge(vmi, mm, *pprev, start, end, newflags,
> 			   vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
> 			   vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> 
> I had a feeling that it's just something overlooked in the initial proposal
> of uffd, but maybe I missed something important?

I think you are correct.  It's worth noting that all of these skip
splitting if merging succeeds.

We know it won't match case 1-4 (we have a current vma).  We then pass
in vma_end = min(end, vma->vm_end);

vma_lookup() will only be called if end == vma->vm_end, so next will not
be set (and found) unless it is adjacent to the current vma and the vma
in question does not need to be split anyways.

I also see that we use pgoff+pglen in the check, which avoids my concern
above.

> 
> > 
> > >  		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
> > > -				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> > > +				 vma->anon_vma, vma->vm_file, pgoff,
> > >  				 vma_policy(vma),
> > >  				 ((struct vm_userfaultfd_ctx){ ctx }),
> > >  				 anon_vma_name(vma));
> > > @@ -1570,6 +1572,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> > >  	unsigned long start, end, vma_end;
> > >  	const void __user *buf = (void __user *)arg;
> > >  	struct vma_iterator vmi;
> > > +	pgoff_t pgoff;
> > >  
> > >  	ret = -EFAULT;
> > >  	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
> > > @@ -1677,8 +1680,9 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> > >  			uffd_wp_range(vma, start, vma_end - start, false);
> > >  
> > >  		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
> > > +		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> > >  		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
> > > -				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> > > +				 vma->anon_vma, vma->vm_file, pgoff,
> > >  				 vma_policy(vma),
> > >  				 NULL_VM_UFFD_CTX, anon_vma_name(vma));
> > >  		if (prev) {
> > > -- 
> > > 2.39.1
> > > ---8<---
> > > 
> > > -- 
> > > Peter Xu
> > > 
> > 
> 
> -- 
> Peter Xu
>
  
Liam R. Howlett May 16, 2023, 10:53 p.m. UTC | #19
* Peter Xu <peterx@redhat.com> [230516 18:52]:
> On Tue, May 16, 2023 at 06:38:30PM -0400, Liam R. Howlett wrote:
> > > It seems to me what you're trying to explain is we shouldn't handle any
> > > split in vma_merge() so we should move cases 4 & 5 out of vma_merge().  If
> > > we split first then merge, cases 4 & 5 will become case 2 & 3 after split.
> > 
> > We don't split in case 4 or 5 - we adjust the existing VMA limits.  We
> > don't actually handle any splits in vma_merge().  I think splitting
> > first would change 4 & 5 to 7 & 8?  2 & 3 would require a split and
> > munmap, right?
> 
> Right, I referenced to the wrong numbers..  2 & 3 are when CUR (CCCC) is
> empty and newly mapped in, if split happened it means CUR (CCCC) exists
> which is 7 & 8 correspondingly.
> 
> > 
> > > My question would be: if it worked perfect in the past few years and it
> > > looks all good enough, why bother..
> > 
> > I suspect, but it's not clear (like all of this), that the other
> > arguments to vma_merge() is ruling out this potential hazard I thought
> > existed.
> 
> Some more elaborations on this one would be appreciated.

I just responded in the other thread, as the context is more complete
there.
  
Lorenzo Stoakes May 17, 2023, 6:21 a.m. UTC | #20
On Tue, May 16, 2023 at 06:32:38PM -0400, Peter Xu wrote:
> On Tue, May 16, 2023 at 11:15:54PM +0100, Lorenzo Stoakes wrote:
> > I'll try to address this in a later series, I don't think there's much use
> > in going round in circles on this. If you dislike that series, you're
> > welcome to provide negative feedback there, I don't think there's much use
> > in discussing further here.
>
> I'm happy to read it, sorry if any of my wording was intruding, I didn't
> mean so.

No it's fine, I've perhaps not been clear, but this has all inspired an
upcoming change so it's all good :)

>
> I think there's chance at least on generalizing vma flag change cases, even
> though I'm not sure whether vma_merge() needs change.  Maybe it can be
> another layer on top of it while keeping vma_merge() as is, but I can't tell.
>

Indeed, will dig in!

> > We've seen a regression on invalid input to vma_merge() (explicitly I mean
> > triggering a VM_WARN_ON()) and VMA fragmentation you were not aware of
> > here, that does not strike me as a great + clear interface.
>
> Yes, the code needs time to read through, even the interface.  I don't
> think I fully digested that myself.
>
> [...]
>
> > Ah the thanks you get for contributing a regression fix _and_ a repro - a
> > nack :) will you at least give me some kind of a tag... or buy me a beer?
> > ;)
>
> I can. :)
>
> We actually met on the conference, if I'll be able to meet you somewhere
> that's what I can do.

Yeah was nice to meet! And only being tongue in cheek, I actually at this
point do think your suggestion should replace my patch.

>
> I was probably hashing in the words, sorry about that if so, and thanks for
> looking at this issue!  I appreciate both your assertion patch and the png
> documentation file.
>
> It's just that I feel irresponsible when we were talking about having vma
> not merged correctly but then the discussion tried to end at there saying
> it kept so so it's fine.  IMHO we should look into that problem or
> something could be missing here.  Then when I was looking into that
> not-merged issue I found that it's not uffd that's special.

Sure, we should try to keep all invocations as close as possible if we
can. It does seem as you say that uffd got left behind on this.

>
> > > Before that I'd like to know whether you agree that the new patch 1 (I'll
> > > fixup the vma_prev() side effect) could be a better solution than the
> > > current one, no matter whether we need a full revert or not.
> >
> > In principle it looks fine actually (pending Liam's assessment), case 4/5
> > should handle it, but I feel like we need a comment (perhaps only in commit
> > msg) to make clear that we are ensuring that the inputs to vma_merge() are
> > either clamped to VMAs or case 4/5.
> >
> > Let's see what Liam thinks, then let me check it locally to give a final
> > OK, if I may.
>
> Sounds perfect here.  Thanks a lot.

Great!

>
> --
> Peter Xu
>
  
Peter Xu May 17, 2023, 1:50 p.m. UTC | #21
On Tue, May 16, 2023 at 06:52:51PM -0400, Liam R. Howlett wrote:
> * Peter Xu <peterx@redhat.com> [230516 16:12]:
> ...
> 
> > > > 
> > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > ---
> > > >  fs/userfaultfd.c | 27 +++++++++++++++++++++------
> > > >  1 file changed, 21 insertions(+), 6 deletions(-)
> > > > 
> > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > > index 0fd96d6e39ce..7eb88bc74d00 100644
> > > > --- a/fs/userfaultfd.c
> > > > +++ b/fs/userfaultfd.c
> > > > @@ -1458,10 +1458,17 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > >  	BUG_ON(!found);
> > > >  
> > > >  	vma_iter_set(&vmi, start);
> > > > -	prev = vma_prev(&vmi);
> > > > +	vma = vma_find(&vmi, end);
> > > > +	if (!vma)
> > > > +		goto out_unlock;
> > > > +
> > > > +	if (vma->vm_start < start)
> > > > +		prev = vma;
> > > > +	else
> > > > +		prev = vma_prev(&vmi);
> > > >  
> > > >  	ret = 0;
> > > > -	for_each_vma_range(vmi, vma, end) {
> > > > +	do {
> > > 
> > > The iterator may be off by one, depending on if vma_prev() is called or
> > > not.
> > > 
> > > Perhaps:
> > > 	prev = vma_prev(&vmi); /* could be wrong, or null */
> > > 	vma = vma_find(&vmi, end);
> > > 	if (!vma)
> > > 		goto out_unlock;
> > > 
> > > 	if (vma->vm_start < start)
> > > 		prev = vma;
> > > 
> > > now we know we are at vma with the iterator..
> > > 	ret = 0
> > > 	do{
> > > 	...
> > 
> > Will do, thanks.
> > 
> > I think I got trapped similarly when I was looking at xarray months ago
> > where xarray also had similar side effects to have off-by-one the iterator
> > behavior.
> > 
> > Do you think it'll make sense to have something describing such side
> > effects for maple tree (or the current vma api), or.. maybe there's already
> > some but I just didn't really know?
> 
> Well, it's an iterator so I though a position was implied.  But I think
> the documentation is lacking on the vma_iterator interface and I should
> fix that.

Thanks.

> 
> ...
> 
> > > > From: Peter Xu <peterx@redhat.com>
> > > > Date: Tue, 16 May 2023 09:39:38 -0400
> > > > Subject: [PATCH 2/2] mm/uffd: Allow vma to merge as much as possible
> > > > 
> > > > We used to not pass in the pgoff correctly when register/unregister uffd
> > > > regions, it caused incorrect behavior on vma merging.
> > > > 
> > > > For example, when we have:
> > > > 
> > > >   vma1(range 0-9, with uffd), vma2(range 10-19, no uffd)
> > > > 
> > > > Then someone unregisters uffd on range (5-9), it should become:
> > > > 
> > > >   vma1(range 0-4, with uffd), vma2(range 5-19, no uffd)
> > > > 
> > > > But with current code it's:
> > > > 
> > > >   vma1(range 0-4, with uffd), vma3(range 5-9, no uffd), vma2(range 10-19, no uffd)
> > > > 
> > > > This patch allows such merge to happen correctly.
> > > > 
> > > > This behavior seems to have existed since the 1st day of uffd, keep it just
> > > > as a performance optmization and not copy stable.
> > > > 
> > > > Cc: Andrea Arcangeli <aarcange@redhat.com>
> > > > Cc: Mike Rapoport (IBM) <rppt@kernel.org>
> > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > ---
> > > >  fs/userfaultfd.c | 8 ++++++--
> > > >  1 file changed, 6 insertions(+), 2 deletions(-)
> > > > 
> > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > > index 7eb88bc74d00..891048b4799f 100644
> > > > --- a/fs/userfaultfd.c
> > > > +++ b/fs/userfaultfd.c
> > > > @@ -1332,6 +1332,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > >  	bool basic_ioctls;
> > > >  	unsigned long start, end, vma_end;
> > > >  	struct vma_iterator vmi;
> > > > +	pgoff_t pgoff;
> > > >  
> > > >  	user_uffdio_register = (struct uffdio_register __user *) arg;
> > > >  
> > > > @@ -1489,8 +1490,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > >  		vma_end = min(end, vma->vm_end);
> > > >  
> > > >  		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
> > > > +		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> > > 
> > > I don't think this is safe.  You are telling vma_merge() something that
> > > is not true and will result in can_vma_merge_before() passing.  I mean,
> > > sure it will become true after you split (unless you can't?), but I
> > > don't know if you can just merge a VMA that doesn't pass
> > > can_vma_merge_before(), even for a short period?
> > 
> > I must admit I'm not really that handy yet to vma codes, so I could miss
> > something obvious.
> > 
> > My reasoning comes from two parts that this pgoff looks all fine:
> > 
> > 1) It's documented in vma_merge() in that:
> > 
> >  * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
> >  * figure out ...
> > 
> >   So fundamentally this pgoff is part of the mapping request paired with
> >   all the rest of the information.  AFAICT it means it must match with what
> >   "addr" is describing in VA address space.  That's why I think offseting
> >   it makes sense here.
> > 
> >   It also matches my understanding in vma_merge() code on how the pgoff is
> >   used.
> > 
> > 2) Uffd is nothing special in this regard, namely:
> > 
> >    mbind_range():
> > 
> > 	pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT);
> > 	merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags,
> > 			 vma->anon_vma, vma->vm_file, pgoff, new_pol,
> > 			 vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> > 
> >    mlock_fixup():
> >    
> > 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> > 	*prev = vma_merge(vmi, mm, *prev, start, end, newflags,
> > 			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
> > 			vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> > 
> >    mprotect_fixup():
> > 
> > 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> > 	*pprev = vma_merge(vmi, mm, *pprev, start, end, newflags,
> > 			   vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
> > 			   vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> > 
> > I had a feeling that it's just something overlooked in the initial proposal
> > of uffd, but maybe I missed something important?
> 
> I think you are correct.  It's worth noting that all of these skip
> splitting if merging succeeds.

Yes, IIUC that's what we want because vma_merge() just handles everything
there (including split, or say, vma range adjustments) if any !NULL
returned.

> 
> We know it won't match case 1-4 (we have a current vma).  We then pass
> in vma_end = min(end, vma->vm_end);

Case 4 seems still possible and should be the case that mentioned in the
patch 2, iiuc.  But yes I think vma_end calculation is needed, afaik it is
to cover the last iteration, where that's the only place possible that we
may operate on "end" (where < vma->vm_end) rather than "vma->vm_end".  It
actually pairs with the initial "start" adjustment to me.

> 
> vma_lookup() will only be called if end == vma->vm_end, so next will not
> be set (and found) unless it is adjacent to the current vma and the vma
> in question does not need to be split anyways.
> 
> I also see that we use pgoff+pglen in the check, which avoids my concern
> above.

Right.

It seems so far all concerns are more or less ruled out.  I'll prepare a
formal patchset, we can continue the discussion there.

Thanks,
  
Liam R. Howlett May 17, 2023, 10:51 p.m. UTC | #22
* Peter Xu <peterx@redhat.com> [230517 09:50]:

...
> > > > 
> > > > I don't think this is safe.  You are telling vma_merge() something that
> > > > is not true and will result in can_vma_merge_before() passing.  I mean,
> > > > sure it will become true after you split (unless you can't?), but I
> > > > don't know if you can just merge a VMA that doesn't pass
> > > > can_vma_merge_before(), even for a short period?
> > > 
> > > I must admit I'm not really that handy yet to vma codes, so I could miss
> > > something obvious.
> > > 
> > > My reasoning comes from two parts that this pgoff looks all fine:
> > > 
> > > 1) It's documented in vma_merge() in that:
> > > 
> > >  * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
> > >  * figure out ...
> > > 
> > >   So fundamentally this pgoff is part of the mapping request paired with
> > >   all the rest of the information.  AFAICT it means it must match with what
> > >   "addr" is describing in VA address space.  That's why I think offseting
> > >   it makes sense here.
> > > 
> > >   It also matches my understanding in vma_merge() code on how the pgoff is
> > >   used.
> > > 
> > > 2) Uffd is nothing special in this regard, namely:
> > > 
> > >    mbind_range():
> > > 
> > > 	pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT);
> > > 	merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags,
> > > 			 vma->anon_vma, vma->vm_file, pgoff, new_pol,
> > > 			 vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> > > 
> > >    mlock_fixup():
> > >    
> > > 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> > > 	*prev = vma_merge(vmi, mm, *prev, start, end, newflags,
> > > 			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
> > > 			vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> > > 
> > >    mprotect_fixup():
> > > 
> > > 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> > > 	*pprev = vma_merge(vmi, mm, *pprev, start, end, newflags,
> > > 			   vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
> > > 			   vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> > > 
> > > I had a feeling that it's just something overlooked in the initial proposal
> > > of uffd, but maybe I missed something important?
> > 
> > I think you are correct.  It's worth noting that all of these skip
> > splitting if merging succeeds.
> 
> Yes, IIUC that's what we want because vma_merge() just handles everything
> there (including split, or say, vma range adjustments) if any !NULL
> returned.

I don't get your use of split here. __vma_adjust() used to be used by
split, but it never split a VMA.  vma_merge() is not used by split at
all.

> 
> > 
> > We know it won't match case 1-4 (we have a current vma).  We then pass
> > in vma_end = min(end, vma->vm_end);
> 
> Case 4 seems still possible and should be the case that mentioned in the
> patch 2, iiuc.  But yes I think vma_end calculation is needed, afaik it is
> to cover the last iteration, where that's the only place possible that we
> may operate on "end" (where < vma->vm_end) rather than "vma->vm_end".  It
> actually pairs with the initial "start" adjustment to me.
> 
> > 
> > vma_lookup() will only be called if end == vma->vm_end, so next will not
> > be set (and found) unless it is adjacent to the current vma and the vma
> > in question does not need to be split anyways.
> > 
> > I also see that we use pgoff+pglen in the check, which avoids my concern
> > above.
> 
> Right.
> 
> It seems so far all concerns are more or less ruled out.  I'll prepare a
> formal patchset, we can continue the discussion there.
> 
> Thanks,
> 
> -- 
> Peter Xu
>
  
Peter Xu May 18, 2023, 12:38 a.m. UTC | #23
On Wed, May 17, 2023 at 06:51:42PM -0400, Liam R. Howlett wrote:
> > Yes, IIUC that's what we want because vma_merge() just handles everything
> > there (including split, or say, vma range adjustments) if any !NULL
> > returned.
> 
> I don't get your use of split here. __vma_adjust() used to be used by
> split, but it never split a VMA.  vma_merge() is not used by split at
> all.

I guess maybe I just made it confusing somehow when using "split" here..

I meant the vma range adjustments when vma_merge(), so indeed it's not
really a split, but it's still a logical split to me.

To be explicit, taking example of the uffd unregister case that was
essentially case 4, where part of PPPP will be "split" out and merged with
NNNN:

        ****
        PPPPPPNNNNNN
        might become
        PPNNNNNNNNNN
        case 4 below

        if (prev && addr < prev->vm_end) {	/* case 4 */
                vma_end = addr;
                adjust = next;
                adj_start = -(prev->vm_end - addr);
                err = dup_anon_vma(next, prev);
                ...
        } 

PPPP is adjusted here (where I was trying to refer as a "split", because
vma_end is less than before so part of PPPP is cut out):

	vma->vm_start = vma_start;
	vma->vm_end = vma_end;
	vma->vm_pgoff = vma_pgoff;

NNNN adjusted here (where the "split" part merged into NNNN):

	if (adj_start) {
		adjust->vm_start += adj_start;
		adjust->vm_pgoff += adj_start >> PAGE_SHIFT;
                ...
	}

I'll remember to stop using the word "split" in these cases in the future
to avoid confusion.

Thanks,
  

Patch

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 0fd96d6e39ce..ef5d667ea804 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1319,6 +1319,32 @@  static __always_inline int validate_range(struct mm_struct *mm,
 	return 0;
 }
 
+static int clamp_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
+		       unsigned long start, unsigned long end, bool *can_merge)
+{
+	int ret;
+	bool merge = true;
+
+	/* The range must always be clamped to the start of a VMA. */
+	if (vma->vm_start < start) {
+		ret = split_vma(vmi, vma, start, 1);
+		if (ret)
+			return ret;
+
+		merge = false;
+	}
+
+	/* It must also be clamped to the end of a VMA. */
+	if (vma->vm_end > end) {
+		ret = split_vma(vmi, vma, end, 0);
+		if (ret)
+			return ret;
+	}
+
+	*can_merge = merge;
+	return 0;
+}
+
 static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 				unsigned long arg)
 {
@@ -1330,7 +1356,7 @@  static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	unsigned long vm_flags, new_flags;
 	bool found;
 	bool basic_ioctls;
-	unsigned long start, end, vma_end;
+	unsigned long start, end;
 	struct vma_iterator vmi;
 
 	user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1462,6 +1488,8 @@  static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 
 	ret = 0;
 	for_each_vma_range(vmi, vma, end) {
+		bool can_merge;
+
 		cond_resched();
 
 		BUG_ON(!vma_can_userfault(vma, vm_flags));
@@ -1477,32 +1505,22 @@  static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 		    (vma->vm_flags & vm_flags) == vm_flags)
 			goto skip;
 
-		if (vma->vm_start > start)
-			start = vma->vm_start;
-		vma_end = min(end, vma->vm_end);
+		ret = clamp_range(&vmi, vma, start, end, &can_merge);
+		if (ret)
+			break;
 
 		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
-		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
-				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
-				 vma_policy(vma),
-				 ((struct vm_userfaultfd_ctx){ ctx }),
-				 anon_vma_name(vma));
-		if (prev) {
+		if (can_merge) {
+			prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, new_flags,
+					 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
+					 vma_policy(vma),
+					 ((struct vm_userfaultfd_ctx){ ctx }),
+					 anon_vma_name(vma));
+
 			/* vma_merge() invalidated the mas */
-			vma = prev;
-			goto next;
-		}
-		if (vma->vm_start < start) {
-			ret = split_vma(&vmi, vma, start, 1);
-			if (ret)
-				break;
-		}
-		if (vma->vm_end > end) {
-			ret = split_vma(&vmi, vma, end, 0);
-			if (ret)
-				break;
+			if (prev)
+				vma = prev;
 		}
-	next:
 		/*
 		 * In the vma_merge() successful mprotect-like case 8:
 		 * the next vma was merged into the current one and
@@ -1560,7 +1578,7 @@  static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 	struct uffdio_range uffdio_unregister;
 	unsigned long new_flags;
 	bool found;
-	unsigned long start, end, vma_end;
+	unsigned long start, end;
 	const void __user *buf = (void __user *)arg;
 	struct vma_iterator vmi;
 
@@ -1627,6 +1645,8 @@  static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 	prev = vma_prev(&vmi);
 	ret = 0;
 	for_each_vma_range(vmi, vma, end) {
+		bool can_merge;
+
 		cond_resched();
 
 		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
@@ -1640,9 +1660,9 @@  static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 
 		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
 
-		if (vma->vm_start > start)
-			start = vma->vm_start;
-		vma_end = min(end, vma->vm_end);
+		ret = clamp_range(&vmi, vma, start, end, &can_merge);
+		if (ret)
+			break;
 
 		if (userfaultfd_missing(vma)) {
 			/*
@@ -1652,35 +1672,27 @@  static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 			 * UFFDIO_WAKE explicitly.
 			 */
 			struct userfaultfd_wake_range range;
-			range.start = start;
-			range.len = vma_end - start;
+			range.start = vma->vm_start;
+			range.len = vma->vm_end - vma->vm_start;
 			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
 		}
 
 		/* Reset ptes for the whole vma range if wr-protected */
 		if (userfaultfd_wp(vma))
-			uffd_wp_range(vma, start, vma_end - start, false);
+			uffd_wp_range(vma, vma->vm_start,
+				      vma->vm_end - vma->vm_start, false);
 
 		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
-		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
-				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
-				 vma_policy(vma),
-				 NULL_VM_UFFD_CTX, anon_vma_name(vma));
-		if (prev) {
-			vma = prev;
-			goto next;
-		}
-		if (vma->vm_start < start) {
-			ret = split_vma(&vmi, vma, start, 1);
-			if (ret)
-				break;
-		}
-		if (vma->vm_end > end) {
-			ret = split_vma(&vmi, vma, end, 0);
-			if (ret)
-				break;
+		if (can_merge) {
+			prev = vma_merge(&vmi, mm, prev, vma->vm_start,
+					 vma->vm_end, new_flags, vma->anon_vma,
+					 vma->vm_file, vma->vm_pgoff,
+					 vma_policy(vma),
+					 NULL_VM_UFFD_CTX, anon_vma_name(vma));
+			/* vma_merge() invalidated the mas */
+			if (prev)
+				vma = prev;
 		}
-	next:
 		/*
 		 * In the vma_merge() successful mprotect-like case 8:
 		 * the next vma was merged into the current one and