[5/6] KEY: Apply PKEY_ENFORCE_API to munmap

Message ID 20230515130553.2311248-6-jeffxu@chromium.org
State New
Headers
Series Memory Mapping (VMA) protection using PKU - set 1 |

Commit Message

Jeff Xu May 15, 2023, 1:05 p.m. UTC
  From: Jeff Xu <jeffxu@google.com>

This patch enables PKEY_ENFORCE_API for the munmap
syscall.

Signed-off-by: Jeff Xu<jeffxu@google.com>
---
 include/linux/mm.h |  2 +-
 mm/mmap.c          | 34 ++++++++++++++++++++++++++--------
 mm/mremap.c        |  6 ++++--
 3 files changed, 31 insertions(+), 11 deletions(-)
  

Comments

Kees Cook May 16, 2023, 8:06 p.m. UTC | #1
On Mon, May 15, 2023 at 01:05:51PM +0000, jeffxu@chromium.org wrote:
> From: Jeff Xu <jeffxu@google.com>
> 
> This patch enables PKEY_ENFORCE_API for the munmap
> syscall.
> 
> Signed-off-by: Jeff Xu<jeffxu@google.com>
> ---
>  include/linux/mm.h |  2 +-
>  mm/mmap.c          | 34 ++++++++++++++++++++++++++--------
>  mm/mremap.c        |  6 ++++--
>  3 files changed, 31 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 27ce77080c79..48076e845d53 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3136,7 +3136,7 @@ extern unsigned long do_mmap(struct file *file, unsigned long addr,
>  	unsigned long pgoff, unsigned long *populate, struct list_head *uf);
>  extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
>  			 unsigned long start, size_t len, struct list_head *uf,
> -			 bool downgrade);
> +			 bool downgrade, bool syscall);

For type checking and readability, I suggest using an enum instead of
"bool". Perhaps something like:

enum caller_origin {
	ON_BEHALF_OF_KERNEL = 0,
	ON_BEHALF_OF_USERSPACE,
};

 extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
 			 unsigned long start, size_t len, struct list_head *uf,
-			 bool downgrade);
+			 bool downgrade, enum caller_origin called);

>  extern int do_munmap(struct mm_struct *, unsigned long, size_t,
>  		     struct list_head *uf);
>  extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 13678edaa22c..29329aa794a6 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -2498,6 +2498,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
>   * @uf: The userfaultfd list_head
>   * @downgrade: set to true if the user wants to attempt to write_downgrade the
>   * mmap_lock
> + * @syscall: set to true if this is called from syscall entry
>   *
>   * This function takes a @mas that is either pointing to the previous VMA or set
>   * to MA_START and sets it up to remove the mapping(s).  The @len will be
> @@ -2507,7 +2508,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
>   */
>  int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
>  		  unsigned long start, size_t len, struct list_head *uf,
> -		  bool downgrade)
> +		  bool downgrade, bool syscall)
>  {
>  	unsigned long end;
>  	struct vm_area_struct *vma;
> @@ -2519,6 +2520,19 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
>  	if (end == start)
>  		return -EINVAL;
>  
> +	/*
> +	 * When called by syscall from userspace, check if the calling
> +	 * thread has the PKEY permission to modify the memory mapping.
> +	 */
> +	if (syscall && arch_check_pkey_enforce_api(mm, start, end) < 0) {

	if (called == ON_BEHALF_OF_USERSPACE &&
	    arch_check_pkey_enforce_api(mm, start, end) < 0) {

> +		char comm[TASK_COMM_LEN];
> +
> +		pr_warn_ratelimited(
> +			"munmap was denied on PKEY_ENFORCE_API memory, pid=%d '%s'\n",
> +			task_pid_nr(current), get_task_comm(comm, current));
> +		return -EACCES;
> +	}
> +
>  	 /* arch_unmap() might do unmaps itself.  */
>  	arch_unmap(mm, start, end);
>  
> @@ -2541,7 +2555,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
>  {
>  	VMA_ITERATOR(vmi, mm, start);
>  
> -	return do_vmi_munmap(&vmi, mm, start, len, uf, false);
> +	return do_vmi_munmap(&vmi, mm, start, len, uf, false, false);

+	return do_vmi_munmap(&vmi, mm, start, len, uf, false, ON_BEHALF_OF_KERNEL);

> [...]
>  SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
>  {
>  	addr = untagged_addr(addr);
> -	return __vm_munmap(addr, len, true);
> +	return __vm_munmap(addr, len, true, true);

+	return __vm_munmap(addr, len, true, ON_BEHALF_OF_USERSPACE);

etc.
  
Jeff Xu May 16, 2023, 10:24 p.m. UTC | #2
On Tue, May 16, 2023 at 1:13 PM Kees Cook <keescook@chromium.org> wrote:
>
> On Mon, May 15, 2023 at 01:05:51PM +0000, jeffxu@chromium.org wrote:
> > From: Jeff Xu <jeffxu@google.com>
> >
> > This patch enables PKEY_ENFORCE_API for the munmap
> > syscall.
> >
> > Signed-off-by: Jeff Xu<jeffxu@google.com>
> > ---
> >  include/linux/mm.h |  2 +-
> >  mm/mmap.c          | 34 ++++++++++++++++++++++++++--------
> >  mm/mremap.c        |  6 ++++--
> >  3 files changed, 31 insertions(+), 11 deletions(-)
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 27ce77080c79..48076e845d53 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -3136,7 +3136,7 @@ extern unsigned long do_mmap(struct file *file, unsigned long addr,
> >       unsigned long pgoff, unsigned long *populate, struct list_head *uf);
> >  extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
> >                        unsigned long start, size_t len, struct list_head *uf,
> > -                      bool downgrade);
> > +                      bool downgrade, bool syscall);
>
> For type checking and readability, I suggest using an enum instead of
> "bool". Perhaps something like:
>
> enum caller_origin {
>         ON_BEHALF_OF_KERNEL = 0,
>         ON_BEHALF_OF_USERSPACE,
> };
>
Sure, it makes sense.


>  extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
>                          unsigned long start, size_t len, struct list_head *uf,
> -                        bool downgrade);
> +                        bool downgrade, enum caller_origin called);
>
> >  extern int do_munmap(struct mm_struct *, unsigned long, size_t,
> >                    struct list_head *uf);
> >  extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index 13678edaa22c..29329aa794a6 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -2498,6 +2498,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> >   * @uf: The userfaultfd list_head
> >   * @downgrade: set to true if the user wants to attempt to write_downgrade the
> >   * mmap_lock
> > + * @syscall: set to true if this is called from syscall entry
> >   *
> >   * This function takes a @mas that is either pointing to the previous VMA or set
> >   * to MA_START and sets it up to remove the mapping(s).  The @len will be
> > @@ -2507,7 +2508,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> >   */
> >  int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
> >                 unsigned long start, size_t len, struct list_head *uf,
> > -               bool downgrade)
> > +               bool downgrade, bool syscall)
> >  {
> >       unsigned long end;
> >       struct vm_area_struct *vma;
> > @@ -2519,6 +2520,19 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
> >       if (end == start)
> >               return -EINVAL;
> >
> > +     /*
> > +      * When called by syscall from userspace, check if the calling
> > +      * thread has the PKEY permission to modify the memory mapping.
> > +      */
> > +     if (syscall && arch_check_pkey_enforce_api(mm, start, end) < 0) {
>
>         if (called == ON_BEHALF_OF_USERSPACE &&
>             arch_check_pkey_enforce_api(mm, start, end) < 0) {
>
> > +             char comm[TASK_COMM_LEN];
> > +
> > +             pr_warn_ratelimited(
> > +                     "munmap was denied on PKEY_ENFORCE_API memory, pid=%d '%s'\n",
> > +                     task_pid_nr(current), get_task_comm(comm, current));
> > +             return -EACCES;
> > +     }
> > +
> >        /* arch_unmap() might do unmaps itself.  */
> >       arch_unmap(mm, start, end);
> >
> > @@ -2541,7 +2555,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
> >  {
> >       VMA_ITERATOR(vmi, mm, start);
> >
> > -     return do_vmi_munmap(&vmi, mm, start, len, uf, false);
> > +     return do_vmi_munmap(&vmi, mm, start, len, uf, false, false);
>
> +       return do_vmi_munmap(&vmi, mm, start, len, uf, false, ON_BEHALF_OF_KERNEL);
>
> > [...]
> >  SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
> >  {
> >       addr = untagged_addr(addr);
> > -     return __vm_munmap(addr, len, true);
> > +     return __vm_munmap(addr, len, true, true);
>
> +       return __vm_munmap(addr, len, true, ON_BEHALF_OF_USERSPACE);
>
> etc.
>
> --
> Kees Cook

Thanks!
-Jeff Xu
  
Dave Hansen May 16, 2023, 11:23 p.m. UTC | #3
On 5/15/23 06:05, jeffxu@chromium.org wrote:
> From: Jeff Xu <jeffxu@google.com>
> 
> This patch enables PKEY_ENFORCE_API for the munmap
> syscall.

The basic problem here is how we know when the set of syscalls that are
patched here is good enough and how we catch future functionality that
might need to be captured as well.

This mechanism really needs to be able to defend against *any* changes
to the address space.  I assume that folks are using syscall filtering
to prevent new syscalls from causing havoc, but is there anything that
can be done for, say, things like madvise()?  I bet it was harmless for
a long time until MADV_DONTNEED showed up and made it able to
effectively zero memory.
  
Jeff Xu May 17, 2023, 12:08 a.m. UTC | #4
On Tue, May 16, 2023 at 4:24 PM Dave Hansen <dave.hansen@intel.com> wrote:
>
> On 5/15/23 06:05, jeffxu@chromium.org wrote:
> > From: Jeff Xu <jeffxu@google.com>
> >
> > This patch enables PKEY_ENFORCE_API for the munmap
> > syscall.
>
> The basic problem here is how we know when the set of syscalls that are
> patched here is good enough and how we catch future functionality that
> might need to be captured as well.
>
> This mechanism really needs to be able to defend against *any* changes
> to the address space.  I assume that folks are using syscall filtering
> to prevent new syscalls from causing havoc, but is there anything that
> can be done for, say, things like madvise()?  I bet it was harmless for
> a long time until MADV_DONTNEED showed up and made it able to
> effectively zero memory.

Not any change, just a limited set of syscall from user space.
I think it is reasonable to hope that any kind of syscall ABI change that
affects VMA will get reviewed thoroughly from now on.

Also, if we continue to add mseal() to the kernel, we will have to pay more
attention to syscalls related to VMA.

Thanks
-Jeff Xu
  

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 27ce77080c79..48076e845d53 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3136,7 +3136,7 @@  extern unsigned long do_mmap(struct file *file, unsigned long addr,
 	unsigned long pgoff, unsigned long *populate, struct list_head *uf);
 extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
 			 unsigned long start, size_t len, struct list_head *uf,
-			 bool downgrade);
+			 bool downgrade, bool syscall);
 extern int do_munmap(struct mm_struct *, unsigned long, size_t,
 		     struct list_head *uf);
 extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);
diff --git a/mm/mmap.c b/mm/mmap.c
index 13678edaa22c..29329aa794a6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2498,6 +2498,7 @@  do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
  * @uf: The userfaultfd list_head
  * @downgrade: set to true if the user wants to attempt to write_downgrade the
  * mmap_lock
+ * @syscall: set to true if this is called from syscall entry
  *
  * This function takes a @mas that is either pointing to the previous VMA or set
  * to MA_START and sets it up to remove the mapping(s).  The @len will be
@@ -2507,7 +2508,7 @@  do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
  */
 int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
 		  unsigned long start, size_t len, struct list_head *uf,
-		  bool downgrade)
+		  bool downgrade, bool syscall)
 {
 	unsigned long end;
 	struct vm_area_struct *vma;
@@ -2519,6 +2520,19 @@  int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
 	if (end == start)
 		return -EINVAL;
 
+	/*
+	 * When called by syscall from userspace, check if the calling
+	 * thread has the PKEY permission to modify the memory mapping.
+	 */
+	if (syscall && arch_check_pkey_enforce_api(mm, start, end) < 0) {
+		char comm[TASK_COMM_LEN];
+
+		pr_warn_ratelimited(
+			"munmap was denied on PKEY_ENFORCE_API memory, pid=%d '%s'\n",
+			task_pid_nr(current), get_task_comm(comm, current));
+		return -EACCES;
+	}
+
 	 /* arch_unmap() might do unmaps itself.  */
 	arch_unmap(mm, start, end);
 
@@ -2541,7 +2555,7 @@  int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
 {
 	VMA_ITERATOR(vmi, mm, start);
 
-	return do_vmi_munmap(&vmi, mm, start, len, uf, false);
+	return do_vmi_munmap(&vmi, mm, start, len, uf, false, false);
 }
 
 unsigned long mmap_region(struct file *file, unsigned long addr,
@@ -2575,7 +2589,7 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 	}
 
 	/* Unmap any existing mapping in the area */
-	if (do_vmi_munmap(&vmi, mm, addr, len, uf, false))
+	if (do_vmi_munmap(&vmi, mm, addr, len, uf, false, false))
 		return -ENOMEM;
 
 	/*
@@ -2792,7 +2806,11 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 	return error;
 }
 
-static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
+/*
+ * @syscall: set to true if this is called from syscall entry
+ */
+static int __vm_munmap(unsigned long start, size_t len, bool downgrade,
+		       bool syscall)
 {
 	int ret;
 	struct mm_struct *mm = current->mm;
@@ -2802,7 +2820,7 @@  static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
 	if (mmap_write_lock_killable(mm))
 		return -EINTR;
 
-	ret = do_vmi_munmap(&vmi, mm, start, len, &uf, downgrade);
+	ret = do_vmi_munmap(&vmi, mm, start, len, &uf, downgrade, syscall);
 	/*
 	 * Returning 1 indicates mmap_lock is downgraded.
 	 * But 1 is not legal return value of vm_munmap() and munmap(), reset
@@ -2820,14 +2838,14 @@  static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
 
 int vm_munmap(unsigned long start, size_t len)
 {
-	return __vm_munmap(start, len, false);
+	return __vm_munmap(start, len, false, false);
 }
 EXPORT_SYMBOL(vm_munmap);
 
 SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
 {
 	addr = untagged_addr(addr);
-	return __vm_munmap(addr, len, true);
+	return __vm_munmap(addr, len, true, true);
 }
 
 
@@ -3055,7 +3073,7 @@  int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
 	if (ret)
 		goto limits_failed;
 
-	ret = do_vmi_munmap(&vmi, mm, addr, len, &uf, 0);
+	ret = do_vmi_munmap(&vmi, mm, addr, len, &uf, 0, false);
 	if (ret)
 		goto munmap_failed;
 
diff --git a/mm/mremap.c b/mm/mremap.c
index b11ce6c92099..768e5bd4e8b5 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -703,7 +703,8 @@  static unsigned long move_vma(struct vm_area_struct *vma,
 	}
 
 	vma_iter_init(&vmi, mm, old_addr);
-	if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) {
+	if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false, false) <
+	    0) {
 		/* OOM: unable to split vma, just get accounts right */
 		if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
 			vm_acct_memory(old_len >> PAGE_SHIFT);
@@ -993,7 +994,8 @@  SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 		VMA_ITERATOR(vmi, mm, addr + new_len);
 
 		retval = do_vmi_munmap(&vmi, mm, addr + new_len,
-				       old_len - new_len, &uf_unmap, true);
+				       old_len - new_len, &uf_unmap, true,
+				       false);
 		/* Returning 1 indicates mmap_lock is downgraded to read. */
 		if (retval == 1) {
 			downgraded = true;