[v1,1/2] mm: Implement memory-deny-write-execute as a prctl

Message ID 20221026150457.36957-2-joey.gouly@arm.com
State New
Series mm: In-kernel support for memory-deny-write-execute (MDWE)

Commit Message

Joey Gouly Oct. 26, 2022, 3:04 p.m. UTC
The aim of such a policy is to prevent a user task from creating an
executable mapping that is also writeable.

An example of mmap() returning -EACCES if the policy is enabled:

	mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, flags, 0, 0);

Similarly, mprotect() would return -EACCES below:

	addr = mmap(0, size, PROT_READ | PROT_EXEC, flags, 0, 0);
	mprotect(addr, size, PROT_READ | PROT_WRITE | PROT_EXEC);

The BPF filter that systemd's MDWE feature uses is stateless, so it disallows
mprotect() with PROT_EXEC completely. This new prctl allows PROT_EXEC to be
set on a mapping that is already PROT_EXEC, which allows the following case:

	addr = mmap(0, size, PROT_READ | PROT_EXEC, flags, 0, 0);
	mprotect(addr, size, PROT_READ | PROT_EXEC | PROT_BTI);

where PROT_BTI enables Branch Target Identification (BTI) on arm64.
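
For illustration only (not part of the patch), a minimal userspace sketch of
enabling the policy with the v1 interface, where the extra prctl() arguments
are ignored, might look like:

	#include <stdio.h>
	#include <sys/mman.h>
	#include <sys/prctl.h>

	#ifndef PR_SET_MDWE
	#define PR_SET_MDWE	65	/* value proposed by this series */
	#endif

	int main(void)
	{
		if (prctl(PR_SET_MDWE, 0, 0, 0, 0))
			perror("PR_SET_MDWE");

		/* With MDWE enabled, this mmap() is expected to fail with EACCES. */
		if (mmap(NULL, 4096, PROT_READ | PROT_WRITE | PROT_EXEC,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0) == MAP_FAILED)
			perror("mmap(PROT_READ|PROT_WRITE|PROT_EXEC)");

		return 0;
	}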

Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Co-developed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mman.h           | 15 +++++++++++++++
 include/linux/sched/coredump.h |  6 +++++-
 include/uapi/linux/prctl.h     |  6 ++++++
 kernel/sys.c                   | 18 ++++++++++++++++++
 mm/mmap.c                      |  3 +++
 mm/mprotect.c                  |  5 +++++
 6 files changed, 52 insertions(+), 1 deletion(-)
  

Comments

Kees Cook Oct. 28, 2022, 6:51 p.m. UTC | #1
On Wed, Oct 26, 2022 at 04:04:56PM +0100, Joey Gouly wrote:
> The aim of such policy is to prevent a user task from creating an
> executable mapping that is also writeable.
> 
> An example of mmap() returning -EACCESS if the policy is enabled:
> 
> 	mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, flags, 0, 0);
> 
> Similarly, mprotect() would return -EACCESS below:
> 
> 	addr = mmap(0, size, PROT_READ | PROT_EXEC, flags, 0, 0);
> 	mprotect(addr, size, PROT_READ | PROT_WRITE | PROT_EXEC);
> 
> The BPF filter that systemd MDWE uses is stateless, and disallows
> mprotect() with PROT_EXEC completely. This new prctl allows PROT_EXEC to
> be enabled if it was already PROT_EXEC, which allows the following case:
> 
> 	addr = mmap(0, size, PROT_READ | PROT_EXEC, flags, 0, 0);
> 	mprotect(addr, size, PROT_READ | PROT_EXEC | PROT_BTI);
> 
> where PROT_BTI enables branch tracking identification on arm64.
> 
> Signed-off-by: Joey Gouly <joey.gouly@arm.com>
> Co-developed-by: Catalin Marinas <catalin.marinas@arm.com>
> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> ---
>  include/linux/mman.h           | 15 +++++++++++++++
>  include/linux/sched/coredump.h |  6 +++++-
>  include/uapi/linux/prctl.h     |  6 ++++++
>  kernel/sys.c                   | 18 ++++++++++++++++++
>  mm/mmap.c                      |  3 +++
>  mm/mprotect.c                  |  5 +++++
>  6 files changed, 52 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/mman.h b/include/linux/mman.h
> index 58b3abd457a3..d84fdeab6b5e 100644
> --- a/include/linux/mman.h
> +++ b/include/linux/mman.h
> @@ -156,4 +156,19 @@ calc_vm_flag_bits(unsigned long flags)
>  }
>  
>  unsigned long vm_commit_limit(void);
> +
> +static inline bool map_deny_write_exec(struct vm_area_struct *vma,  unsigned long vm_flags)

Traditionally, it is easier to write these in the positive instead of
needing to parse a double-negative.

static inline bool allow_write_exec(...)

> +{
> +	if (!test_bit(MMF_HAS_MDWE, &current->mm->flags))
> +		return false;
> +
> +	if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE))
> +		return true;
> +
> +	if (vma && !(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC))
> +		return true;
> +
> +	return false;
> +}

Since this is implementation "2" from the earlier discussion[1], I think
some comments in here would be good to have (e.g. to explain to people
reading this code why there is a vma test, etc.). Perhaps even explicitly
restate the implementation expectations.

Restating from that thread:

  2. "is not already PROT_EXEC":

     a)	mmap(PROT_READ|PROT_WRITE|PROT_EXEC);	// fails

     b)	mmap(PROT_READ|PROT_EXEC);
	mprotect(PROT_READ|PROT_EXEC|PROT_BTI);	// passes

     c)	mmap(PROT_READ);
	mprotect(PROT_READ|PROT_EXEC);		// fails

     d)	mmap(PROT_READ|PROT_WRITE);
	mprotect(PROT_READ);
	mprotect(PROT_READ|PROT_EXEC);		// fails

[1] https://lore.kernel.org/linux-arm-kernel/YmGjYYlcSVz38rOe@arm.com/
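
For illustration, a commented version of the helper along those lines might
look like the following sketch (same logic as posted, comments added):

	static inline bool map_deny_write_exec(struct vm_area_struct *vma,
					       unsigned long vm_flags)
	{
		/* MDWE not enabled for this mm: nothing to deny. */
		if (!test_bit(MMF_HAS_MDWE, &current->mm->flags))
			return false;

		/* Never allow a mapping that is writable and executable (case a). */
		if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE))
			return true;

		/*
		 * mprotect() passes the existing vma: deny the change only if
		 * it would add VM_EXEC to a mapping that was not executable
		 * before (cases c and d). Keeping PROT_EXEC on an
		 * already-executable vma, e.g. to add PROT_BTI, is allowed
		 * (case b).
		 */
		if (vma && !(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC))
			return true;

		return false;
	}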

>  #endif /* _LINUX_MMAN_H */
> diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
> index 8270ad7ae14c..0e17ae7fbfd3 100644
> --- a/include/linux/sched/coredump.h
> +++ b/include/linux/sched/coredump.h
> @@ -81,9 +81,13 @@ static inline int get_dumpable(struct mm_struct *mm)
>   * lifecycle of this mm, just for simplicity.
>   */
>  #define MMF_HAS_PINNED		27	/* FOLL_PIN has run, never cleared */
> +
> +#define MMF_HAS_MDWE		28
> +#define MMF_HAS_MDWE_MASK	(1 << MMF_HAS_MDWE)
> +
>  #define MMF_DISABLE_THP_MASK	(1 << MMF_DISABLE_THP)
>  
>  #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
> -				 MMF_DISABLE_THP_MASK)
> +				 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK)

Good, yes, new "live forever" bit here. Perhaps bikeshedding over the
name, see below.

>  
>  #endif /* _LINUX_SCHED_COREDUMP_H */
> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> index a5e06dcbba13..ab9db1e86230 100644
> --- a/include/uapi/linux/prctl.h
> +++ b/include/uapi/linux/prctl.h
> @@ -281,6 +281,12 @@ struct prctl_mm_map {
>  # define PR_SME_VL_LEN_MASK		0xffff
>  # define PR_SME_VL_INHERIT		(1 << 17) /* inherit across exec */
>  
> +/* Memory deny write / execute */
> +#define PR_SET_MDWE			65
> +# define PR_MDWE_FLAG_MMAP		1
> +
> +#define PR_GET_MDWE			66
> +
>  #define PR_SET_VMA		0x53564d41
>  # define PR_SET_VMA_ANON_NAME		0
>  
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 5fd54bf0e886..08e1dd6d2533 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -2348,6 +2348,18 @@ static int prctl_set_vma(unsigned long opt, unsigned long start,
>  }
>  #endif /* CONFIG_ANON_VMA_NAME */
>  
> +static inline int prctl_set_mdwe(void)
> +{
> +	set_bit(MMF_HAS_MDWE, &current->mm->flags);
> +
> +	return 0;
> +}
> +
> +static inline int prctl_get_mdwe(void)
> +{
> +	return test_bit(MMF_HAS_MDWE, &current->mm->flags);
> +}

These will need to change -- they aren't constructed for future expansion
at all. At the very least, all the arguments need to be passed in and
checked to be zero, e.g.:

int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
		   unsigned long arg4, unsigned long arg5)
{
	if (arg3 || arg4 || arg5)
		return -EINVAL;

	...

	return 0;
}

Otherwise, there's no way to add arguments in the future because old
userspace may have been sending arbitrary junk on the stack, etc.

And regardless, I think we'll need some explicit flag bits here, since
we can see there has been a long history of various other desired
features that may end up living in here. For now, a single bit is fine.
The intended behavior is the inability to _add_ PROT_EXEC to an existing
vma, and to deny the creation of a W+X vma to begin with, so perhaps
this bit can be named MDWE_FLAG_REFUSE_EXEC_GAIN?

Then the above "..." becomes:

	if (bits & ~(MDWE_FLAG_REFUSE_EXEC_GAIN))
		return -EINVAL;

	if (bits & MDWE_FLAG_REFUSE_EXEC_GAIN)
		set_bit(MMF_HAS_MDWE, &current->mm->flags);
	else if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
		return -EPERM; /* Cannot unset the flag */

And prctl_get_mdwe() becomes:

int prctl_get_mdwe(unsigned long arg2, unsigned long arg3,
		   unsigned long arg4, unsigned long arg5)
{
	if (arg2 || arg3 || arg4 || arg5)
		return -EINVAL;
	return test_bit(MMF_HAS_MDWE, &current->mm->flags) ?
		MDWE_FLAG_REFUSE_EXEC_GAIN : 0;
}

> +
>  SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>  		unsigned long, arg4, unsigned long, arg5)
>  {
> @@ -2623,6 +2635,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>  		error = sched_core_share_pid(arg2, arg3, arg4, arg5);
>  		break;
>  #endif
> +	case PR_SET_MDWE:
> +		error = prctl_set_mdwe();
> +		break;
> +	case PR_GET_MDWE:
> +		error = prctl_get_mdwe();
> +		break;
>  	case PR_SET_VMA:
>  		error = prctl_set_vma(arg2, arg3, arg4, arg5);
>  		break;
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 099468aee4d8..42eaf6683216 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1409,6 +1409,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
>  			vm_flags |= VM_NORESERVE;
>  	}
>  
> +	if (map_deny_write_exec(NULL, vm_flags))
> +		return -EACCES;
> +

This seems like the wrong place to do the check -- that the vma argument
is a hard-coded "NULL" is evidence that something is wrong. Shouldn't
it live in mmap_region()? What happens with MAP_FIXED, when there is
an underlying vma? i.e. a MAP_FIXED mapping will, I think, bypass the intended
check. For example, we had "c" above:

     c)	mmap(PROT_READ);
	mprotect(PROT_READ|PROT_EXEC);		// fails

But this would allow another case:

     e)	addr = mmap(..., PROT_READ, ...);
	mmap(addr, ..., PROT_READ | PROT_EXEC, MAP_FIXED, ...);	// passes
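
Spelled out as a hypothetical userspace fragment (the concrete sizes and
mapping flags below are illustrative, filling in the "..." of the example):

	size_t len = 4096;

	/* Case "e": start with a readable, non-executable mapping. */
	void *addr = mmap(NULL, len, PROT_READ,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/*
	 * With the check in do_mmap() and no existing vma consulted, this
	 * MAP_FIXED mapping over the same range is expected to succeed,
	 * even though it turns a previously non-executable region into an
	 * executable one.
	 */
	mmap(addr, len, PROT_READ | PROT_EXEC,
	     MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);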


>  	addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
>  	if (!IS_ERR_VALUE(addr) &&
>  	    ((vm_flags & VM_LOCKED) ||
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 8d770855b591..af71ef0788fd 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -766,6 +766,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
>  			break;
>  		}
>  
> +		if (map_deny_write_exec(vma, newflags)) {
> +			error = -EACCES;
> +			goto out;
> +		}
> +

This looks like the right place. Any rationale for why it's before
arch_validate_flags()?

>  		/* Allow architectures to sanity-check the new flags */
>  		if (!arch_validate_flags(newflags)) {
>  			error = -EINVAL;

-Kees
  
Joey Gouly Nov. 10, 2022, 11:27 a.m. UTC | #2
Hi,

On Fri, Oct 28, 2022 at 11:51:00AM -0700, Kees Cook wrote:
> On Wed, Oct 26, 2022 at 04:04:56PM +0100, Joey Gouly wrote:
> > The aim of such policy is to prevent a user task from creating an
> > executable mapping that is also writeable.
> > 
> > An example of mmap() returning -EACCESS if the policy is enabled:
> > 
> > 	mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, flags, 0, 0);
> > 
> > Similarly, mprotect() would return -EACCESS below:
> > 
> > 	addr = mmap(0, size, PROT_READ | PROT_EXEC, flags, 0, 0);
> > 	mprotect(addr, size, PROT_READ | PROT_WRITE | PROT_EXEC);
> > 
> > The BPF filter that systemd MDWE uses is stateless, and disallows
> > mprotect() with PROT_EXEC completely. This new prctl allows PROT_EXEC to
> > be enabled if it was already PROT_EXEC, which allows the following case:
> > 
> > 	addr = mmap(0, size, PROT_READ | PROT_EXEC, flags, 0, 0);
> > 	mprotect(addr, size, PROT_READ | PROT_EXEC | PROT_BTI);
> > 
> > where PROT_BTI enables branch tracking identification on arm64.
> > 
> > Signed-off-by: Joey Gouly <joey.gouly@arm.com>
> > Co-developed-by: Catalin Marinas <catalin.marinas@arm.com>
> > Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
> > Cc: Andrew Morton <akpm@linux-foundation.org>
> > ---
> >  include/linux/mman.h           | 15 +++++++++++++++
> >  include/linux/sched/coredump.h |  6 +++++-
> >  include/uapi/linux/prctl.h     |  6 ++++++
> >  kernel/sys.c                   | 18 ++++++++++++++++++
> >  mm/mmap.c                      |  3 +++
> >  mm/mprotect.c                  |  5 +++++
> >  6 files changed, 52 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/linux/mman.h b/include/linux/mman.h
> > index 58b3abd457a3..d84fdeab6b5e 100644
> > --- a/include/linux/mman.h
> > +++ b/include/linux/mman.h
> > @@ -156,4 +156,19 @@ calc_vm_flag_bits(unsigned long flags)
> >  }
> >  
> >  unsigned long vm_commit_limit(void);
> > +
> > +static inline bool map_deny_write_exec(struct vm_area_struct *vma,  unsigned long vm_flags)
> 
> Traditionally, it is easier to write these in the positive instead of
> needing to parse a double-negative.
> 
> static inline bool allow_write_exec(...)

This doesn't feel like a double negative to me, and I think it would be better
to keep the name of the function similar to the name of the 'feature'.
However, I'm not too fussed either way.

> 
> > +{
> > +	if (!test_bit(MMF_HAS_MDWE, &current->mm->flags))
> > +		return false;
> > +
> > +	if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE))
> > +		return true;
> > +
> > +	if (vma && !(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC))
> > +		return true;
> > +
> > +	return false;
> > +}
> 
> Since this is implementation "2" from the earlier discussion[1], I think
> some comments in here are good to have. (e.g. to explain to people
> reading this code why there is a vma test, etc.) Perhaps even explicit
> repeat the implementation expectations.
> 
> Restating from that thread:
> 
>   2. "is not already PROT_EXEC":
> 
>      a)	mmap(PROT_READ|PROT_WRITE|PROT_EXEC);	// fails
> 
>      b)	mmap(PROT_READ|PROT_EXEC);
> 	mprotect(PROT_READ|PROT_EXEC|PROT_BTI);	// passes
> 
>      c)	mmap(PROT_READ);
> 	mprotect(PROT_READ|PROT_EXEC);		// fails
> 
>      d)	mmap(PROT_READ|PROT_WRITE);
> 	mprotect(PROT_READ);
> 	mprotect(PROT_READ|PROT_EXEC);		// fails

Good idea, I will add a comment.

> 
> [1] https://lore.kernel.org/linux-arm-kernel/YmGjYYlcSVz38rOe@arm.com/
> 
> >  #endif /* _LINUX_MMAN_H */
> > diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
> > index 8270ad7ae14c..0e17ae7fbfd3 100644
> > --- a/include/linux/sched/coredump.h
> > +++ b/include/linux/sched/coredump.h
> > @@ -81,9 +81,13 @@ static inline int get_dumpable(struct mm_struct *mm)
> >   * lifecycle of this mm, just for simplicity.
> >   */
> >  #define MMF_HAS_PINNED		27	/* FOLL_PIN has run, never cleared */
> > +
> > +#define MMF_HAS_MDWE		28
> > +#define MMF_HAS_MDWE_MASK	(1 << MMF_HAS_MDWE)
> > +
> >  #define MMF_DISABLE_THP_MASK	(1 << MMF_DISABLE_THP)
> >  
> >  #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
> > -				 MMF_DISABLE_THP_MASK)
> > +				 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK)
> 
> Good, yes, new "live forever" bit here. Perhaps bikeshedding over the
> name, see below.
> 
> >  
> >  #endif /* _LINUX_SCHED_COREDUMP_H */
> > diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> > index a5e06dcbba13..ab9db1e86230 100644
> > --- a/include/uapi/linux/prctl.h
> > +++ b/include/uapi/linux/prctl.h
> > @@ -281,6 +281,12 @@ struct prctl_mm_map {
> >  # define PR_SME_VL_LEN_MASK		0xffff
> >  # define PR_SME_VL_INHERIT		(1 << 17) /* inherit across exec */
> >  
> > +/* Memory deny write / execute */
> > +#define PR_SET_MDWE			65
> > +# define PR_MDWE_FLAG_MMAP		1
> > +
> > +#define PR_GET_MDWE			66
> > +
> >  #define PR_SET_VMA		0x53564d41
> >  # define PR_SET_VMA_ANON_NAME		0
> >  
> > diff --git a/kernel/sys.c b/kernel/sys.c
> > index 5fd54bf0e886..08e1dd6d2533 100644
> > --- a/kernel/sys.c
> > +++ b/kernel/sys.c
> > @@ -2348,6 +2348,18 @@ static int prctl_set_vma(unsigned long opt, unsigned long start,
> >  }
> >  #endif /* CONFIG_ANON_VMA_NAME */
> >  
> > +static inline int prctl_set_mdwe(void)
> > +{
> > +	set_bit(MMF_HAS_MDWE, &current->mm->flags);
> > +
> > +	return 0;
> > +}
> > +
> > +static inline int prctl_get_mdwe(void)
> > +{
> > +	return test_bit(MMF_HAS_MDWE, &current->mm->flags);
> > +}
> 
> These will need to change -- the aren't constructed for future expansion
> at all. At the very least, all the arguments need to passed to be
> checked that they are zero. e.g.:
> 
> int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
> 		   unsigned long arg4, unsigned long arg5)
> {
> 	if (arg3 || arg4 || arg5)
> 		return -EINVAL;
> 
> 	...
> 
> 	return 0;
> }
> 
> Otherwise, there's no way to add arguments in the future because old
> userspace may have been sending arbitrary junk on the stack, etc.
> 
> And regardless, I think we'll need some explicit flag bits here, since
> we can see there has been a long history of various other desired
> features that may end up living in here. For now, a single bit is fine.
> The intended behavior is the inability to _add_ PROT_EXEC to an existing
> vma, and to deny the creating of a W+X vma to begin with, so perhaps
> this bit can be named MDWE_FLAG_REFUSE_EXEC_GAIN?
> 
> Then the above "..." becomes:
> 
> 	if (bits & ~(MDWE_FLAG_REFUSE_EXEC_GAIN))
> 		return -EINVAL;
> 
> 	if (bits & MDWE_FLAG_REFUSE_EXEC_GAIN)
> 		set_bit(MMF_HAS_MDWE, &current->mm->flags);
> 	else if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
> 		return -EPERM; /* Cannot unset the flag */
> 
> And prctl_get_mdwe() becomes:
> 
> int prctl_get_mdwe(unsigned long arg2, unsigned long arg3,
> 		   unsigned long arg4, unsigned long arg5)
> {
> 	if (arg2 || arg3 || arg4 || arg5)
> 		return -EINVAL;
> 	return test_bit(MMF_HAS_MDWE, &current->mm->flags) ?
> 		MDWE_FLAG_REFUSE_EXEC_GAIN : 0;
> }

Thanks, makes sense, I have incorporated those changes.

> 
> > +
> >  SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
> >  		unsigned long, arg4, unsigned long, arg5)
> >  {
> > @@ -2623,6 +2635,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
> >  		error = sched_core_share_pid(arg2, arg3, arg4, arg5);
> >  		break;
> >  #endif
> > +	case PR_SET_MDWE:
> > +		error = prctl_set_mdwe();
> > +		break;
> > +	case PR_GET_MDWE:
> > +		error = prctl_get_mdwe();
> > +		break;
> >  	case PR_SET_VMA:
> >  		error = prctl_set_vma(arg2, arg3, arg4, arg5);
> >  		break;
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index 099468aee4d8..42eaf6683216 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -1409,6 +1409,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
> >  			vm_flags |= VM_NORESERVE;
> >  	}
> >  
> > +	if (map_deny_write_exec(NULL, vm_flags))
> > +		return -EACCES;
> > +
> 
> This seems like the wrong place to do the check -- that the vma argument
> is a hard-coded "NULL" is evidence that something is wrong. Shouldn't
> it live in mmap_region()? What happens with MAP_FIXED, when there is
> an underlying vma? i.e. an MAP_FIXED will, I think, bypass the intended
> check. For example, we had "c" above:
> 
>      c)	mmap(PROT_READ);
> 	mprotect(PROT_READ|PROT_EXEC);		// fails
> 
> But this would allow another case:
> 
>      e)	addr = mmap(..., PROT_READ, ...);
> 	mmap(addr, ..., PROT_READ | PROT_EXEC, MAP_FIXED, ...);	// passes

I can move the check into mmap_region() but it won't fix the MAP_FIXED
example that you showed here.

mmap_region() calls do_mas_munmap(), which will unmap overlapping regions.
However, the `vma` for the 'old' region is not kept around: a new vma is
allocated later by "vma = vm_area_alloc(mm);", and its vm_flags are simply set
to what is passed into mmap_region(), so map_deny_write_exec(vma, vm_flags)
there would be no better than passing NULL.

It's possible to save the vm_flags from the region that is unmapped, but Catalin
suggested it might be better if that is part of a later extension. What do you
think?
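
For concreteness, one way of saving those flags in do_mmap() could look
roughly like the sketch below. This is hypothetical and simplified: it assumes
the replaced range is covered by a single vma, and map_deny_write_exec_flags()
is a made-up, flags-based variant of the check that does not exist in this
series.

	/* Hypothetical sketch only; not part of this series. */
	unsigned long old_flags = 0;
	struct vm_area_struct *old_vma;

	old_vma = find_vma_intersection(mm, addr, addr + len);
	if (old_vma)
		old_flags = old_vma->vm_flags;

	if (map_deny_write_exec_flags(old_flags, vm_flags))
		return -EACCES;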

> 
> 
> >  	addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
> >  	if (!IS_ERR_VALUE(addr) &&
> >  	    ((vm_flags & VM_LOCKED) ||
> > diff --git a/mm/mprotect.c b/mm/mprotect.c
> > index 8d770855b591..af71ef0788fd 100644
> > --- a/mm/mprotect.c
> > +++ b/mm/mprotect.c
> > @@ -766,6 +766,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
> >  			break;
> >  		}
> >  
> > +		if (map_deny_write_exec(vma, newflags)) {
> > +			error = -EACCES;
> > +			goto out;
> > +		}
> > +
> 
> This looks like the right place. Any rationale for why it's before
> arch_validate_flags()?

No big justification; it's just after the VM_ACCESS_FLAGS check and is more generic
than the architecture-specific checks.

> 
> >  		/* Allow architectures to sanity-check the new flags */
> >  		if (!arch_validate_flags(newflags)) {
> >  			error = -EINVAL;
> 
> -Kees

Thanks for the review and for the rewritten test; I have replaced my commit with
the one that you sent.

Joey
  
Catalin Marinas Nov. 10, 2022, 12:03 p.m. UTC | #3
On Thu, Nov 10, 2022 at 11:27:14AM +0000, Joey Gouly wrote:
> On Fri, Oct 28, 2022 at 11:51:00AM -0700, Kees Cook wrote:
> > On Wed, Oct 26, 2022 at 04:04:56PM +0100, Joey Gouly wrote:
> > > diff --git a/mm/mmap.c b/mm/mmap.c
> > > index 099468aee4d8..42eaf6683216 100644
> > > --- a/mm/mmap.c
> > > +++ b/mm/mmap.c
> > > @@ -1409,6 +1409,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
> > >  			vm_flags |= VM_NORESERVE;
> > >  	}
> > >  
> > > +	if (map_deny_write_exec(NULL, vm_flags))
> > > +		return -EACCES;
> > > +
> > 
> > This seems like the wrong place to do the check -- that the vma argument
> > is a hard-coded "NULL" is evidence that something is wrong. Shouldn't
> > it live in mmap_region()? What happens with MAP_FIXED, when there is
> > an underlying vma? i.e. an MAP_FIXED will, I think, bypass the intended
> > check. For example, we had "c" above:
> > 
> >      c)	mmap(PROT_READ);
> > 	mprotect(PROT_READ|PROT_EXEC);		// fails
> > 
> > But this would allow another case:
> > 
> >      e)	addr = mmap(..., PROT_READ, ...);
> > 	mmap(addr, ..., PROT_READ | PROT_EXEC, MAP_FIXED, ...);	// passes
> 
> I can move the check into mmap_region() but it won't fix the MAP_FIXED
> example that you showed here.
> 
> mmap_region() calls do_mas_munmap(..) which will unmap overlapping regions.
> However the `vma` for the 'old' region is not kept around, and a new vma will
> be allocated later on "vma = vm_area_alloc(mm);", and the vm_flags are just set
> to what is passed into mmap_region(), so map_deny_write_exec(vma, vm_flags)
> will just be as good as passing NULL.
> 
> It's possible to save the vm_flags from the region that is unmapped, but Catalin
> suggested it might be better if that is part of a later extension, what do you
> think? 

I thought initially we should keep the behaviour close to what systemd
achieves via seccomp, while only relaxing an mprotect(PROT_EXEC) if the
vma is already executable (i.e. check the actual permission change, not
just the PROT_* flags).

We could pass the old vm_flags for that region (and maybe drop the vma
pointer entirely, just check the old and new vm_flags). But this feels like
slightly tightening systemd's MDWE approach. If user-space doesn't get
confused by this, I'm fine to go with it. Otherwise we can add a new
flag later for this behaviour.
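
As a rough sketch of that alternative (not code posted in this series), the
helper could compare old and new flags directly, with mprotect() passing
vma->vm_flags as the old value and mmap() passing the flags of any mapping
being replaced:

	static inline bool map_deny_write_exec(unsigned long old_flags,
					       unsigned long new_flags)
	{
		if (!test_bit(MMF_HAS_MDWE, &current->mm->flags))
			return false;

		/* Refuse a mapping that would be writable and executable. */
		if ((new_flags & VM_WRITE) && (new_flags & VM_EXEC))
			return true;

		/* Refuse gaining VM_EXEC on a region that was not executable. */
		if (!(old_flags & VM_EXEC) && (new_flags & VM_EXEC))
			return true;

		return false;
	}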

I guess that's more of a question for Topi on whether tightening
point (e) is feasible/desirable.
  
Topi Miettinen Nov. 12, 2022, 6:11 a.m. UTC | #4
On 10.11.2022 14.03, Catalin Marinas wrote:
> On Thu, Nov 10, 2022 at 11:27:14AM +0000, Joey Gouly wrote:
>> On Fri, Oct 28, 2022 at 11:51:00AM -0700, Kees Cook wrote:
>>> On Wed, Oct 26, 2022 at 04:04:56PM +0100, Joey Gouly wrote:
>>>> diff --git a/mm/mmap.c b/mm/mmap.c
>>>> index 099468aee4d8..42eaf6683216 100644
>>>> --- a/mm/mmap.c
>>>> +++ b/mm/mmap.c
>>>> @@ -1409,6 +1409,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
>>>>   			vm_flags |= VM_NORESERVE;
>>>>   	}
>>>>   
>>>> +	if (map_deny_write_exec(NULL, vm_flags))
>>>> +		return -EACCES;
>>>> +
>>>
>>> This seems like the wrong place to do the check -- that the vma argument
>>> is a hard-coded "NULL" is evidence that something is wrong. Shouldn't
>>> it live in mmap_region()? What happens with MAP_FIXED, when there is
>>> an underlying vma? i.e. an MAP_FIXED will, I think, bypass the intended
>>> check. For example, we had "c" above:
>>>
>>>       c)	mmap(PROT_READ);
>>> 	mprotect(PROT_READ|PROT_EXEC);		// fails
>>>
>>> But this would allow another case:
>>>
>>>       e)	addr = mmap(..., PROT_READ, ...);
>>> 	mmap(addr, ..., PROT_READ | PROT_EXEC, MAP_FIXED, ...);	// passes
>>
>> I can move the check into mmap_region() but it won't fix the MAP_FIXED
>> example that you showed here.
>>
>> mmap_region() calls do_mas_munmap(..) which will unmap overlapping regions.
>> However the `vma` for the 'old' region is not kept around, and a new vma will
>> be allocated later on "vma = vm_area_alloc(mm);", and the vm_flags are just set
>> to what is passed into mmap_region(), so map_deny_write_exec(vma, vm_flags)
>> will just be as good as passing NULL.
>>
>> It's possible to save the vm_flags from the region that is unmapped, but Catalin
>> suggested it might be better if that is part of a later extension, what do you
>> think?
> 
> I thought initially we should keep the behaviour close to what systemd
> achieves via SECCOMP while only relaxing an mprotect(PROT_EXEC) if the
> vma is already executable (i.e. check actual permission change not just
> the PROT_* flags).
> 
> We could pass the old vm_flags for that region (and maybe drop the vma
> pointer entirely, just check old and new vm_flags). But this feels like
> tightening slightly systemd's MDWE approach. If user-space doesn't get
> confused by this, I'm fine to go with it. Otherwise we can add a new
> flag later for this behaviour
> 
> I guess that's more of a question for Topi on whether point tightening
> point (e) is feasible/desirable.

I think we want 1:1 compatibility with seccomp() for the basic version, 
so MAP_FIXED shouldn't change the verdict. Later we can introduce more 
versions (perhaps even less strict, too) when it's requested by 
configuration, like MemoryDenyWriteExecute=[relaxed | strict].

-Topi
  
Catalin Marinas Nov. 15, 2022, 3:35 p.m. UTC | #5
On Sat, Nov 12, 2022 at 08:11:24AM +0200, Topi Miettinen wrote:
> On 10.11.2022 14.03, Catalin Marinas wrote:
> > On Thu, Nov 10, 2022 at 11:27:14AM +0000, Joey Gouly wrote:
> > > On Fri, Oct 28, 2022 at 11:51:00AM -0700, Kees Cook wrote:
> > > > On Wed, Oct 26, 2022 at 04:04:56PM +0100, Joey Gouly wrote:
> > > > > diff --git a/mm/mmap.c b/mm/mmap.c
> > > > > index 099468aee4d8..42eaf6683216 100644
> > > > > --- a/mm/mmap.c
> > > > > +++ b/mm/mmap.c
> > > > > @@ -1409,6 +1409,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
> > > > >   			vm_flags |= VM_NORESERVE;
> > > > >   	}
> > > > > +	if (map_deny_write_exec(NULL, vm_flags))
> > > > > +		return -EACCES;
> > > > > +
> > > > 
> > > > This seems like the wrong place to do the check -- that the vma argument
> > > > is a hard-coded "NULL" is evidence that something is wrong. Shouldn't
> > > > it live in mmap_region()? What happens with MAP_FIXED, when there is
> > > > an underlying vma? i.e. an MAP_FIXED will, I think, bypass the intended
> > > > check. For example, we had "c" above:
> > > > 
> > > >       c)	mmap(PROT_READ);
> > > > 	mprotect(PROT_READ|PROT_EXEC);		// fails
> > > > 
> > > > But this would allow another case:
> > > > 
> > > >       e)	addr = mmap(..., PROT_READ, ...);
> > > > 	mmap(addr, ..., PROT_READ | PROT_EXEC, MAP_FIXED, ...);	// passes
> > > 
> > > I can move the check into mmap_region() but it won't fix the MAP_FIXED
> > > example that you showed here.
> > > 
> > > mmap_region() calls do_mas_munmap(..) which will unmap overlapping regions.
> > > However the `vma` for the 'old' region is not kept around, and a new vma will
> > > be allocated later on "vma = vm_area_alloc(mm);", and the vm_flags are just set
> > > to what is passed into mmap_region(), so map_deny_write_exec(vma, vm_flags)
> > > will just be as good as passing NULL.
> > > 
> > > It's possible to save the vm_flags from the region that is unmapped, but Catalin
> > > suggested it might be better if that is part of a later extension, what do you
> > > think?
> > 
> > I thought initially we should keep the behaviour close to what systemd
> > achieves via SECCOMP while only relaxing an mprotect(PROT_EXEC) if the
> > vma is already executable (i.e. check actual permission change not just
> > the PROT_* flags).
> > 
> > We could pass the old vm_flags for that region (and maybe drop the vma
> > pointer entirely, just check old and new vm_flags). But this feels like
> > tightening slightly systemd's MDWE approach. If user-space doesn't get
> > confused by this, I'm fine to go with it. Otherwise we can add a new
> > flag later for this behaviour
> > 
> > I guess that's more of a question for Topi on whether point tightening
> > point (e) is feasible/desirable.
> 
> I think we want 1:1 compatibility with seccomp() for the basic version, so
> MAP_FIXED shouldn't change the verdict. Later we can introduce more versions
> (perhaps even less strict, too) when it's requested by configuration, like
> MemoryDenyWriteExecute=[relaxed | strict].

Are you ok with allowing mprotect(PROT_EXEC|PROT_BTI) if the mapping is
already PROT_EXEC? Or you'd rather reject that as well?
  
Topi Miettinen Nov. 15, 2022, 7:31 p.m. UTC | #6
On 15.11.2022 17.35, Catalin Marinas wrote:
> On Sat, Nov 12, 2022 at 08:11:24AM +0200, Topi Miettinen wrote:
>> On 10.11.2022 14.03, Catalin Marinas wrote:
>>> On Thu, Nov 10, 2022 at 11:27:14AM +0000, Joey Gouly wrote:
>>>> On Fri, Oct 28, 2022 at 11:51:00AM -0700, Kees Cook wrote:
>>>>> On Wed, Oct 26, 2022 at 04:04:56PM +0100, Joey Gouly wrote:
>>>>>> diff --git a/mm/mmap.c b/mm/mmap.c
>>>>>> index 099468aee4d8..42eaf6683216 100644
>>>>>> --- a/mm/mmap.c
>>>>>> +++ b/mm/mmap.c
>>>>>> @@ -1409,6 +1409,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
>>>>>>    			vm_flags |= VM_NORESERVE;
>>>>>>    	}
>>>>>> +	if (map_deny_write_exec(NULL, vm_flags))
>>>>>> +		return -EACCES;
>>>>>> +
>>>>>
>>>>> This seems like the wrong place to do the check -- that the vma argument
>>>>> is a hard-coded "NULL" is evidence that something is wrong. Shouldn't
>>>>> it live in mmap_region()? What happens with MAP_FIXED, when there is
>>>>> an underlying vma? i.e. an MAP_FIXED will, I think, bypass the intended
>>>>> check. For example, we had "c" above:
>>>>>
>>>>>        c)	mmap(PROT_READ);
>>>>> 	mprotect(PROT_READ|PROT_EXEC);		// fails
>>>>>
>>>>> But this would allow another case:
>>>>>
>>>>>        e)	addr = mmap(..., PROT_READ, ...);
>>>>> 	mmap(addr, ..., PROT_READ | PROT_EXEC, MAP_FIXED, ...);	// passes
>>>>
>>>> I can move the check into mmap_region() but it won't fix the MAP_FIXED
>>>> example that you showed here.
>>>>
>>>> mmap_region() calls do_mas_munmap(..) which will unmap overlapping regions.
>>>> However the `vma` for the 'old' region is not kept around, and a new vma will
>>>> be allocated later on "vma = vm_area_alloc(mm);", and the vm_flags are just set
>>>> to what is passed into mmap_region(), so map_deny_write_exec(vma, vm_flags)
>>>> will just be as good as passing NULL.
>>>>
>>>> It's possible to save the vm_flags from the region that is unmapped, but Catalin
>>>> suggested it might be better if that is part of a later extension, what do you
>>>> think?
>>>
>>> I thought initially we should keep the behaviour close to what systemd
>>> achieves via SECCOMP while only relaxing an mprotect(PROT_EXEC) if the
>>> vma is already executable (i.e. check actual permission change not just
>>> the PROT_* flags).
>>>
>>> We could pass the old vm_flags for that region (and maybe drop the vma
>>> pointer entirely, just check old and new vm_flags). But this feels like
>>> tightening slightly systemd's MDWE approach. If user-space doesn't get
>>> confused by this, I'm fine to go with it. Otherwise we can add a new
>>> flag later for this behaviour
>>>
>>> I guess that's more of a question for Topi on whether point tightening
>>> point (e) is feasible/desirable.
>>
>> I think we want 1:1 compatibility with seccomp() for the basic version, so
>> MAP_FIXED shouldn't change the verdict. Later we can introduce more versions
>> (perhaps even less strict, too) when it's requested by configuration, like
>> MemoryDenyWriteExecute=[relaxed | strict].
> 
> Are you ok with allowing mprotect(PROT_EXEC|PROT_BTI) if the mapping is
> already PROT_EXEC? Or you'd rather reject that as well?
> 

I think that it's OK to allow that. It's an incompatible change, but it 
shouldn't break anything.

-Topi
  

Patch

diff --git a/include/linux/mman.h b/include/linux/mman.h
index 58b3abd457a3..d84fdeab6b5e 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -156,4 +156,19 @@  calc_vm_flag_bits(unsigned long flags)
 }
 
 unsigned long vm_commit_limit(void);
+
+static inline bool map_deny_write_exec(struct vm_area_struct *vma,  unsigned long vm_flags)
+{
+	if (!test_bit(MMF_HAS_MDWE, &current->mm->flags))
+		return false;
+
+	if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE))
+		return true;
+
+	if (vma && !(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC))
+		return true;
+
+	return false;
+}
+
 #endif /* _LINUX_MMAN_H */
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 8270ad7ae14c..0e17ae7fbfd3 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -81,9 +81,13 @@  static inline int get_dumpable(struct mm_struct *mm)
  * lifecycle of this mm, just for simplicity.
  */
 #define MMF_HAS_PINNED		27	/* FOLL_PIN has run, never cleared */
+
+#define MMF_HAS_MDWE		28
+#define MMF_HAS_MDWE_MASK	(1 << MMF_HAS_MDWE)
+
 #define MMF_DISABLE_THP_MASK	(1 << MMF_DISABLE_THP)
 
 #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
-				 MMF_DISABLE_THP_MASK)
+				 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK)
 
 #endif /* _LINUX_SCHED_COREDUMP_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index a5e06dcbba13..ab9db1e86230 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -281,6 +281,12 @@  struct prctl_mm_map {
 # define PR_SME_VL_LEN_MASK		0xffff
 # define PR_SME_VL_INHERIT		(1 << 17) /* inherit across exec */
 
+/* Memory deny write / execute */
+#define PR_SET_MDWE			65
+# define PR_MDWE_FLAG_MMAP		1
+
+#define PR_GET_MDWE			66
+
 #define PR_SET_VMA		0x53564d41
 # define PR_SET_VMA_ANON_NAME		0
 
diff --git a/kernel/sys.c b/kernel/sys.c
index 5fd54bf0e886..08e1dd6d2533 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2348,6 +2348,18 @@  static int prctl_set_vma(unsigned long opt, unsigned long start,
 }
 #endif /* CONFIG_ANON_VMA_NAME */
 
+static inline int prctl_set_mdwe(void)
+{
+	set_bit(MMF_HAS_MDWE, &current->mm->flags);
+
+	return 0;
+}
+
+static inline int prctl_get_mdwe(void)
+{
+	return test_bit(MMF_HAS_MDWE, &current->mm->flags);
+}
+
 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		unsigned long, arg4, unsigned long, arg5)
 {
@@ -2623,6 +2635,12 @@  SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		error = sched_core_share_pid(arg2, arg3, arg4, arg5);
 		break;
 #endif
+	case PR_SET_MDWE:
+		error = prctl_set_mdwe();
+		break;
+	case PR_GET_MDWE:
+		error = prctl_get_mdwe();
+		break;
 	case PR_SET_VMA:
 		error = prctl_set_vma(arg2, arg3, arg4, arg5);
 		break;
diff --git a/mm/mmap.c b/mm/mmap.c
index 099468aee4d8..42eaf6683216 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1409,6 +1409,9 @@  unsigned long do_mmap(struct file *file, unsigned long addr,
 			vm_flags |= VM_NORESERVE;
 	}
 
+	if (map_deny_write_exec(NULL, vm_flags))
+		return -EACCES;
+
 	addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
 	if (!IS_ERR_VALUE(addr) &&
 	    ((vm_flags & VM_LOCKED) ||
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 8d770855b591..af71ef0788fd 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -766,6 +766,11 @@  static int do_mprotect_pkey(unsigned long start, size_t len,
 			break;
 		}
 
+		if (map_deny_write_exec(vma, newflags)) {
+			error = -EACCES;
+			goto out;
+		}
+
 		/* Allow architectures to sanity-check the new flags */
 		if (!arch_validate_flags(newflags)) {
 			error = -EINVAL;