[PATCHv13,06/16] x86/mm: Provide arch_prctl() interface for LAM

Message ID: 20221227030829.12508-7-kirill.shutemov@linux.intel.com
State: New
Series: Linear Address Masking enabling

Commit Message

Kirill A. Shutemov Dec. 27, 2022, 3:08 a.m. UTC
  Add a couple of arch_prctl() handlers:

 - ARCH_ENABLE_TAGGED_ADDR enables LAM. The argument is the required
   number of tag bits. It is rounded up to the nearest LAM mode that can
   provide it. For now only LAM_U57 is supported, with 6 tag bits.

 - ARCH_GET_UNTAG_MASK returns the untag mask. It indicates where the
   tag bits are located in the address.

 - ARCH_GET_MAX_TAG_BITS returns the maximum number of tag bits a user
   can request. Zero if LAM is not supported.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 arch/x86/include/asm/mmu.h        |  2 ++
 arch/x86/include/uapi/asm/prctl.h |  4 +++
 arch/x86/kernel/process.c         |  3 +++
 arch/x86/kernel/process_64.c      | 44 ++++++++++++++++++++++++++++++-
 4 files changed, 52 insertions(+), 1 deletion(-)
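
For context, here is a minimal userspace sketch of how the new interface would be exercised (untested, editorial illustration only; assumes a kernel with this series applied — the fallback #defines are needed only while the installed uapi headers predate it):

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Fallbacks matching arch/x86/include/uapi/asm/prctl.h from this series */
#ifndef ARCH_GET_UNTAG_MASK
#define ARCH_GET_UNTAG_MASK		0x4001
#define ARCH_ENABLE_TAGGED_ADDR		0x4002
#define ARCH_GET_MAX_TAG_BITS		0x4003
#endif

int main(void)
{
	unsigned long bits = 0, mask = 0;

	/* How many tag bits can we ask for? Zero means no LAM. */
	if (syscall(SYS_arch_prctl, ARCH_GET_MAX_TAG_BITS, &bits))
		return 1;
	printf("max tag bits: %lu\n", bits);
	if (!bits)
		return 0;

	/* Request tag bits; the kernel rounds up to a LAM mode (LAM_U57). */
	if (syscall(SYS_arch_prctl, ARCH_ENABLE_TAGGED_ADDR, bits))
		return 1;

	/* Which address bits are ignored? For LAM_U57 this is
	 * ~GENMASK(62, 57), so ptr & mask strips the tag bits. */
	if (syscall(SYS_arch_prctl, ARCH_GET_UNTAG_MASK, &mask))
		return 1;
	printf("untag mask: 0x%016lx\n", mask);

	return 0;
}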
  

Comments

Edgecombe, Rick P Jan. 4, 2023, 7:55 p.m. UTC | #1
On Tue, 2022-12-27 at 06:08 +0300, Kirill A. Shutemov wrote:
> Add a couple of arch_prctl() handlers:
> 
>  - ARCH_ENABLE_TAGGED_ADDR enables LAM. The argument is the required
>    number of tag bits. It is rounded up to the nearest LAM mode that
>    can provide it. For now only LAM_U57 is supported, with 6 tag bits.
> 
>  - ARCH_GET_UNTAG_MASK returns the untag mask. It indicates where the
>    tag bits are located in the address.
> 
>  - ARCH_GET_MAX_TAG_BITS returns the maximum number of tag bits a user
>    can request. Zero if LAM is not supported.
> 
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  arch/x86/include/asm/mmu.h        |  2 ++
>  arch/x86/include/uapi/asm/prctl.h |  4 +++
>  arch/x86/kernel/process.c         |  3 +++
>  arch/x86/kernel/process_64.c      | 44 ++++++++++++++++++++++++++++++-
>  4 files changed, 52 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
> index ed72fcd2292d..54e4a3e9b5c5 100644
> --- a/arch/x86/include/asm/mmu.h
> +++ b/arch/x86/include/asm/mmu.h
> @@ -12,6 +12,8 @@
>  #define MM_CONTEXT_UPROBE_IA32		0
>  /* vsyscall page is accessible on this MM */
>  #define MM_CONTEXT_HAS_VSYSCALL		1
> +/* Do not allow changing LAM mode */
> +#define MM_CONTEXT_LOCK_LAM		2
>  
>  /*
>   * x86 has arch-specific MMU state beyond what lives in mm_struct.
> diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
> index 500b96e71f18..a31e27b95b19 100644
> --- a/arch/x86/include/uapi/asm/prctl.h
> +++ b/arch/x86/include/uapi/asm/prctl.h
> @@ -20,4 +20,8 @@
>  #define ARCH_MAP_VDSO_32		0x2002
>  #define ARCH_MAP_VDSO_64		0x2003
>  
> +#define ARCH_GET_UNTAG_MASK		0x4001
> +#define ARCH_ENABLE_TAGGED_ADDR		0x4002
> +#define ARCH_GET_MAX_TAG_BITS		0x4003
> +
>  #endif /* _ASM_X86_PRCTL_H */
> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
> index ef6bde1d40d8..cc0677f58f42 100644
> --- a/arch/x86/kernel/process.c
> +++ b/arch/x86/kernel/process.c
> @@ -162,6 +162,9 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
>  
>  	savesegment(es, p->thread.es);
>  	savesegment(ds, p->thread.ds);
> +
> +	if (p->mm && (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM)
> +		set_bit(MM_CONTEXT_LOCK_LAM, &p->mm->context.flags);
>  #else
>  	p->thread.sp0 = (unsigned long) (childregs + 1);
>  	savesegment(gs, p->thread.gs);
> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
> index 8b06034e8c70..fef127ed79b6 100644
> --- a/arch/x86/kernel/process_64.c
> +++ b/arch/x86/kernel/process_64.c
> @@ -743,6 +743,39 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
>  }
>  #endif
>  
> +#define LAM_U57_BITS 6
> +
> +static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
> +{
> +	if (!cpu_feature_enabled(X86_FEATURE_LAM))
> +		return -ENODEV;
> +
> +	if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags))
> +		return -EBUSY;

Since this bit doesn't get set on vfork, you might be able to work
around this by enabling LAM in a vforked task, then enabling it again
after returning to the parent. But why do we need this check anyway?
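
To make the scenario concrete, a hypothetical userspace sequence (reusing the SYS_arch_prctl sketch above; illustration only, not code from this posting):

	/* vfork() shares the parent's mm, but copy_thread() above skips
	 * setting MM_CONTEXT_LOCK_LAM when CLONE_VFORK is set. */
	pid_t pid = vfork();
	if (pid == 0) {
		/* Child enables LAM on the shared mm. */
		syscall(SYS_arch_prctl, ARCH_ENABLE_TAGGED_ADDR, 6);
		_exit(0);
	}
	/* Parent resumes and attempts to enable LAM again on the same mm. */
	syscall(SYS_arch_prctl, ARCH_ENABLE_TAGGED_ADDR, 6);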

> +
> +	if (mmap_write_lock_killable(mm))
> +		return -EINTR;
> +
> +	if (!nr_bits) {
> +		mmap_write_unlock(mm);
> +		return -EINVAL;
> +	} else if (nr_bits <= LAM_U57_BITS) {
> +		mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
> +		mm->context.untag_mask =  ~GENMASK(62, 57);
> +	} else {
> +		mmap_write_unlock(mm);
> +		return -EINVAL;
> +	}
> +
> +	write_cr3(__read_cr3() | mm->context.lam_cr3_mask);

mm might not be from the current task if it came from
PTRACE_ARCH_PRCTL, so then this would write to the wrong CR3. Maybe for
simplicity just return an error if task != current.

> +	set_tlbstate_cr3_lam_mask(mm->context.lam_cr3_mask);
> +	set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
> +
> +	mmap_write_unlock(mm);
> +
> +	return 0;
> +}
> +
>  long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
>  {
>  	int ret = 0;
> @@ -830,7 +863,16 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
>  	case ARCH_MAP_VDSO_64:
>  		return prctl_map_vdso(&vdso_image_64, arg2);
>  #endif
> -
> +	case ARCH_GET_UNTAG_MASK:
> +		return put_user(task->mm->context.untag_mask,
> +				(unsigned long __user *)arg2);
> +	case ARCH_ENABLE_TAGGED_ADDR:
> +		return prctl_enable_tagged_addr(task->mm, arg2);
> +	case ARCH_GET_MAX_TAG_BITS:
> +		if (!cpu_feature_enabled(X86_FEATURE_LAM))
> +			return put_user(0, (unsigned long __user *)arg2);
> +		else
> +			return put_user(LAM_U57_BITS, (unsigned long __user *)arg2);
>  	default:
>  		ret = -EINVAL;
>  		break;
  
Kirill A. Shutemov Jan. 10, 2023, 6:17 a.m. UTC | #2
On Wed, Jan 04, 2023 at 07:55:45PM +0000, Edgecombe, Rick P wrote:
> On Tue, 2022-12-27 at 06:08 +0300, Kirill A. Shutemov wrote:
> > Add a couple of arch_prctl() handlers:
> > 
> >  - ARCH_ENABLE_TAGGED_ADDR enables LAM. The argument is the required
> >    number of tag bits. It is rounded up to the nearest LAM mode that
> >    can provide it. For now only LAM_U57 is supported, with 6 tag bits.
> > 
> >  - ARCH_GET_UNTAG_MASK returns the untag mask. It indicates where the
> >    tag bits are located in the address.
> > 
> >  - ARCH_GET_MAX_TAG_BITS returns the maximum number of tag bits a user
> >    can request. Zero if LAM is not supported.
> > 
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > ---
> >  arch/x86/include/asm/mmu.h        |  2 ++
> >  arch/x86/include/uapi/asm/prctl.h |  4 +++
> >  arch/x86/kernel/process.c         |  3 +++
> >  arch/x86/kernel/process_64.c      | 44 ++++++++++++++++++++++++++++++-
> >  4 files changed, 52 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
> > index ed72fcd2292d..54e4a3e9b5c5 100644
> > --- a/arch/x86/include/asm/mmu.h
> > +++ b/arch/x86/include/asm/mmu.h
> > @@ -12,6 +12,8 @@
> >  #define MM_CONTEXT_UPROBE_IA32		0
> >  /* vsyscall page is accessible on this MM */
> >  #define MM_CONTEXT_HAS_VSYSCALL		1
> > +/* Do not allow changing LAM mode */
> > +#define MM_CONTEXT_LOCK_LAM		2
> >  
> >  /*
> >   * x86 has arch-specific MMU state beyond what lives in mm_struct.
> > diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
> > index 500b96e71f18..a31e27b95b19 100644
> > --- a/arch/x86/include/uapi/asm/prctl.h
> > +++ b/arch/x86/include/uapi/asm/prctl.h
> > @@ -20,4 +20,8 @@
> >  #define ARCH_MAP_VDSO_32		0x2002
> >  #define ARCH_MAP_VDSO_64		0x2003
> >  
> > +#define ARCH_GET_UNTAG_MASK		0x4001
> > +#define ARCH_ENABLE_TAGGED_ADDR		0x4002
> > +#define ARCH_GET_MAX_TAG_BITS		0x4003
> > +
> >  #endif /* _ASM_X86_PRCTL_H */
> > diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
> > index ef6bde1d40d8..cc0677f58f42 100644
> > --- a/arch/x86/kernel/process.c
> > +++ b/arch/x86/kernel/process.c
> > @@ -162,6 +162,9 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
> >  
> >  	savesegment(es, p->thread.es);
> >  	savesegment(ds, p->thread.ds);
> > +
> > +	if (p->mm && (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM)
> > +		set_bit(MM_CONTEXT_LOCK_LAM, &p->mm->context.flags);
> >  #else
> >  	p->thread.sp0 = (unsigned long) (childregs + 1);
> >  	savesegment(gs, p->thread.gs);
> > diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
> > index 8b06034e8c70..fef127ed79b6 100644
> > --- a/arch/x86/kernel/process_64.c
> > +++ b/arch/x86/kernel/process_64.c
> > @@ -743,6 +743,39 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
> >  }
> >  #endif
> >  
> > +#define LAM_U57_BITS 6
> > +
> > +static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
> > +{
> > +	if (!cpu_feature_enabled(X86_FEATURE_LAM))
> > +		return -ENODEV;
> > +
> > +	if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags))
> > +		return -EBUSY;
> 
> Since this bit doesn't get set on vfork, you might be able to work
> around this by enabling LAM in a vforked task, then enabling it again
> after returning to the parent.

I don't think so.

Yes, the child can enable LAM after vfork(), but that will set the flag,
so the parent won't be able to enable it again. And there's no concurrency
between parent and child due to vfork() semantics.

Anyway, I will move the check under the mmap lock. It may clarify the
situation for a reader.
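
Presumably the reordered version would look something like this (an editorial sketch of the fix described here, not code from this series; v14 may differ):

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	/* Already enabled once, or locked by a CLONE_VM thread? */
	if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) {
		mmap_write_unlock(mm);
		return -EBUSY;
	}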

> > +
> > +	if (mmap_write_lock_killable(mm))
> > +		return -EINTR;
> > +
> > +	if (!nr_bits) {
> > +		mmap_write_unlock(mm);
> > +		return -EINVAL;
> > +	} else if (nr_bits <= LAM_U57_BITS) {
> > +		mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
> > +		mm->context.untag_mask =  ~GENMASK(62, 57);
> > +	} else {
> > +		mmap_write_unlock(mm);
> > +		return -EINVAL;
> > +	}
> > +
> > +	write_cr3(__read_cr3() | mm->context.lam_cr3_mask);
> 
> mm might not be from the current task if it came from
> PTRACE_ARCH_PRCTL, so then this would write to the wrong CR3. Maybe for
> simplicity just return an error if task != current.

Oh. Forgot about PTRACE_ARCH_PRCTL. Yes, will add the check.
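
For reference, a minimal form of that guard might be (an editorial sketch; the exact placement and form in the next revision may differ):

	/* LAM can only be enabled for the current process: mm may belong to
	 * another task when called via PTRACE_ARCH_PRCTL, and write_cr3()
	 * below would then update the tracer's CR3, not the tracee's. */
	if (current->mm != mm)
		return -EINVAL;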
  

Patch

diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index ed72fcd2292d..54e4a3e9b5c5 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -12,6 +12,8 @@ 
 #define MM_CONTEXT_UPROBE_IA32		0
 /* vsyscall page is accessible on this MM */
 #define MM_CONTEXT_HAS_VSYSCALL		1
+/* Do not allow changing LAM mode */
+#define MM_CONTEXT_LOCK_LAM		2
 
 /*
  * x86 has arch-specific MMU state beyond what lives in mm_struct.
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
index 500b96e71f18..a31e27b95b19 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -20,4 +20,8 @@ 
 #define ARCH_MAP_VDSO_32		0x2002
 #define ARCH_MAP_VDSO_64		0x2003
 
+#define ARCH_GET_UNTAG_MASK		0x4001
+#define ARCH_ENABLE_TAGGED_ADDR		0x4002
+#define ARCH_GET_MAX_TAG_BITS		0x4003
+
 #endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ef6bde1d40d8..cc0677f58f42 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -162,6 +162,9 @@  int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 
 	savesegment(es, p->thread.es);
 	savesegment(ds, p->thread.ds);
+
+	if (p->mm && (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM)
+		set_bit(MM_CONTEXT_LOCK_LAM, &p->mm->context.flags);
 #else
 	p->thread.sp0 = (unsigned long) (childregs + 1);
 	savesegment(gs, p->thread.gs);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 8b06034e8c70..fef127ed79b6 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -743,6 +743,39 @@  static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
 }
 #endif
 
+#define LAM_U57_BITS 6
+
+static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
+{
+	if (!cpu_feature_enabled(X86_FEATURE_LAM))
+		return -ENODEV;
+
+	if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags))
+		return -EBUSY;
+
+	if (mmap_write_lock_killable(mm))
+		return -EINTR;
+
+	if (!nr_bits) {
+		mmap_write_unlock(mm);
+		return -EINVAL;
+	} else if (nr_bits <= LAM_U57_BITS) {
+		mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
+		mm->context.untag_mask =  ~GENMASK(62, 57);
+	} else {
+		mmap_write_unlock(mm);
+		return -EINVAL;
+	}
+
+	write_cr3(__read_cr3() | mm->context.lam_cr3_mask);
+	set_tlbstate_cr3_lam_mask(mm->context.lam_cr3_mask);
+	set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
+
+	mmap_write_unlock(mm);
+
+	return 0;
+}
+
 long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
 {
 	int ret = 0;
@@ -830,7 +863,16 @@  long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
 	case ARCH_MAP_VDSO_64:
 		return prctl_map_vdso(&vdso_image_64, arg2);
 #endif
-
+	case ARCH_GET_UNTAG_MASK:
+		return put_user(task->mm->context.untag_mask,
+				(unsigned long __user *)arg2);
+	case ARCH_ENABLE_TAGGED_ADDR:
+		return prctl_enable_tagged_addr(task->mm, arg2);
+	case ARCH_GET_MAX_TAG_BITS:
+		if (!cpu_feature_enabled(X86_FEATURE_LAM))
+			return put_user(0, (unsigned long __user *)arg2);
+		else
+			return put_user(LAM_U57_BITS, (unsigned long __user *)arg2);
 	default:
 		ret = -EINVAL;
 		break;