[v9,28/42] x86/shstk: Add user-mode shadow stack support
Commit Message
Introduce basic shadow stack enabling/disabling/allocation routines.
A task's shadow stack is allocated from memory with the VM_SHADOW_STACK
flag and has a fixed size of min(RLIMIT_STACK, 4GB).
Keep the task's shadow stack address and size in thread_struct. This will
be copied when cloning new threads, but needs to be cleared during exec,
so add a function to do this.
A 32-bit shadow stack is not expected to have many users, and it would
complicate the signal implementation. So do not support IA32 emulation
or x32.
Co-developed-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
Tested-by: Pengfei Xu <pengfei.xu@intel.com>
Tested-by: John Allen <john.allen@amd.com>
Tested-by: Kees Cook <keescook@chromium.org>
---
arch/x86/include/asm/processor.h | 2 +
arch/x86/include/asm/shstk.h | 7 ++
arch/x86/include/uapi/asm/prctl.h | 3 +
arch/x86/kernel/shstk.c | 145 ++++++++++++++++++++++++++++++
4 files changed, 157 insertions(+)
Comments
On Mon, Jun 12, 2023 at 05:10:54PM -0700, Rick Edgecombe wrote:
> +static void unmap_shadow_stack(u64 base, u64 size)
> +{
> + while (1) {
> + int r;
> +
> + r = vm_munmap(base, size);
> +
> + /*
> + * vm_munmap() returns -EINTR when mmap_lock is held by
> + * something else, and that lock should not be held for a
> + * long time. Retry it for the case.
> + */
> + if (r == -EINTR) {
> + cond_resched();
> + continue;
> + }
This looks generic, not even shadow stack specific - was there any
discussion of making it a vm_munmap_retry() (that's not a great name...)
or similar? I didn't see any in old versions of the thread but I
might've missed something.
On 6/27/23 10:20, Mark Brown wrote:
> On Mon, Jun 12, 2023 at 05:10:54PM -0700, Rick Edgecombe wrote:
>
>> +static void unmap_shadow_stack(u64 base, u64 size)
>> +{
>> + while (1) {
>> + int r;
>> +
>> + r = vm_munmap(base, size);
>> +
>> + /*
>> + * vm_munmap() returns -EINTR when mmap_lock is held by
>> + * something else, and that lock should not be held for a
>> + * long time. Retry it for the case.
>> + */
>> + if (r == -EINTR) {
>> + cond_resched();
>> + continue;
>> + }
> This looks generic, not even shadow stack specific - was there any
> discussion of making it a vm_munmap_retry() (that's not a great name...)
> or similar? I didn't see any in old versions of the thread but I
> might've missed something.
Yeah, that looks odd. Also odd is that none of the other users of
vm_munmap() bother to check the return value (except for one passthrough
in the nommu code).
I don't think the EINTR happens during contention, though. It's really
there to be able to break out in the face of SIGKILL. I think that's why
nobody handles it: the task is dying anyway and nobody cares.
Rick, was this hunk here for a specific reason or were you just trying
to be diligent in handling errors?
On Tue, 2023-06-27 at 16:46 -0700, Dave Hansen wrote:
> > This looks generic, not even shadow stack specific - was there any
> > discussion of making it a vm_munmap_retry() (that's not a great
> > name...)
> > or similar? I didn't see any in old versions of the thread but I
> > might've missed something.
>
> Yeah, that looks odd. Also odd is that none of the other users of
> vm_munmap() bother to check the return value (except for one
> passthrough
> in the nommu code).
>
> I don't think the EINTR happens during contention, though. It's really
> there to be able to break out in the face of SIGKILL. I think that's why
> nobody handles it: the task is dying anyway and nobody cares.
>
> Rick, was this hunk here for a specific reason or were you just trying
> to be diligent in handling errors?
I'm not aware of any specific required cases. I think it is just pure
diligence, originating from this comment:
https://lore.kernel.org/lkml/9847845a-749d-47a3-2a1d-bcc7c35f1bdd@intel.com/#t
I didn't see a need to remove it.
The SIGKILL explanation certainly sounds more accurate than the
comment, but I can't do much of a dive into it until next week. I would
think we need to handle EINTR differently so that we at least don't WARN.
Yeah, some of it does seem like the kind of thing that could live
outside of x86 shstk.c. But I'm not sure about the WARN part. That
should probably live in the caller. I guess someday we might even find
some shadow stack specific logic that could be cross-arch.
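For illustration only, a minimal sketch of the generic helper discussed above might look like the following, keeping the placeholder name vm_munmap_retry() from this thread and leaving the WARN to the caller as suggested; this is not part of the patch:

/*
 * Retry vm_munmap() when it is interrupted (-EINTR) and hand any other
 * failure back to the caller rather than WARNing here.
 */
static int vm_munmap_retry(unsigned long addr, size_t len)
{
	int r;

	do {
		r = vm_munmap(addr, len);
		if (r == -EINTR)
			cond_resched();
	} while (r == -EINTR);

	return r;
}

unmap_shadow_stack() would then reduce to calling this helper and keeping its WARN_ON_ONCE() on the result.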
@@ -479,6 +479,8 @@ struct thread_struct {
#ifdef CONFIG_X86_USER_SHADOW_STACK
unsigned long features;
unsigned long features_locked;
+
+ struct thread_shstk shstk;
#endif
/* Floating point and extended processor state */
@@ -8,12 +8,19 @@
struct task_struct;
#ifdef CONFIG_X86_USER_SHADOW_STACK
+struct thread_shstk {
+ u64 base;
+ u64 size;
+};
+
long shstk_prctl(struct task_struct *task, int option, unsigned long features);
void reset_thread_features(void);
+void shstk_free(struct task_struct *p);
#else
static inline long shstk_prctl(struct task_struct *task, int option,
unsigned long arg2) { return -EINVAL; }
static inline void reset_thread_features(void) {}
+static inline void shstk_free(struct task_struct *p) {}
#endif /* CONFIG_X86_USER_SHADOW_STACK */
#endif /* __ASSEMBLY__ */
@@ -34,4 +34,7 @@
#define ARCH_SHSTK_DISABLE 0x5002
#define ARCH_SHSTK_LOCK 0x5003
+/* ARCH_SHSTK_ features bits */
+#define ARCH_SHSTK_SHSTK (1ULL << 0)
+
#endif /* _ASM_X86_PRCTL_H */
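For context, userspace would request enabling through arch_prctl() with the new feature bit. A rough usage sketch, assuming the ARCH_SHSTK_ENABLE option defined earlier in this header; glibc has no arch_prctl() wrapper, so a raw syscall() is used, and error handling is omitted:

#include <unistd.h>
#include <sys/syscall.h>
#include <asm/prctl.h>

/* Ask the kernel to allocate a shadow stack for this thread and turn it on. */
static long enable_user_shstk(void)
{
	return syscall(SYS_arch_prctl, ARCH_SHSTK_ENABLE, ARCH_SHSTK_SHSTK);
}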
@@ -8,14 +8,159 @@
#include <linux/sched.h>
#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/sched/signal.h>
+#include <linux/compat.h>
+#include <linux/sizes.h>
+#include <linux/user.h>
+#include <asm/msr.h>
+#include <asm/fpu/xstate.h>
+#include <asm/fpu/types.h>
+#include <asm/shstk.h>
+#include <asm/special_insns.h>
+#include <asm/fpu/api.h>
#include <asm/prctl.h>
+static bool features_enabled(unsigned long features)
+{
+ return current->thread.features & features;
+}
+
+static void features_set(unsigned long features)
+{
+ current->thread.features |= features;
+}
+
+static void features_clr(unsigned long features)
+{
+ current->thread.features &= ~features;
+}
+
+static unsigned long alloc_shstk(unsigned long size)
+{
+ int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G;
+ struct mm_struct *mm = current->mm;
+ unsigned long addr, unused;
+
+ mmap_write_lock(mm);
+ addr = do_mmap(NULL, 0, size, PROT_READ, flags,
+ VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL);
+
+ mmap_write_unlock(mm);
+
+ return addr;
+}
+
+static unsigned long adjust_shstk_size(unsigned long size)
+{
+ if (size)
+ return PAGE_ALIGN(size);
+
+ return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G));
+}
+
+static void unmap_shadow_stack(u64 base, u64 size)
+{
+ while (1) {
+ int r;
+
+ r = vm_munmap(base, size);
+
+ /*
+ * vm_munmap() returns -EINTR when mmap_lock is held by
+ * something else, and that lock should not be held for a
+ * long time. Retry it for the case.
+ */
+ if (r == -EINTR) {
+ cond_resched();
+ continue;
+ }
+
+ /*
+ * For all other types of vm_munmap() failure, either the
+ * system is out of memory or there is a bug.
+ */
+ WARN_ON_ONCE(r);
+ break;
+ }
+}
+
+static int shstk_setup(void)
+{
+ struct thread_shstk *shstk = &current->thread.shstk;
+ unsigned long addr, size;
+
+ /* Already enabled */
+ if (features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ /* Also not supported for 32 bit and x32 */
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_32bit_syscall())
+ return -EOPNOTSUPP;
+
+ size = adjust_shstk_size(0);
+ addr = alloc_shstk(size);
+ if (IS_ERR_VALUE(addr))
+ return PTR_ERR((void *)addr);
+
+ fpregs_lock_and_load();
+ wrmsrl(MSR_IA32_PL3_SSP, addr + size);
+ wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN);
+ fpregs_unlock();
+
+ shstk->base = addr;
+ shstk->size = size;
+ features_set(ARCH_SHSTK_SHSTK);
+
+ return 0;
+}
+
void reset_thread_features(void)
{
+ memset(&current->thread.shstk, 0, sizeof(struct thread_shstk));
current->thread.features = 0;
current->thread.features_locked = 0;
}
+void shstk_free(struct task_struct *tsk)
+{
+ struct thread_shstk *shstk = &tsk->thread.shstk;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
+ !features_enabled(ARCH_SHSTK_SHSTK))
+ return;
+
+ if (!tsk->mm)
+ return;
+
+ unmap_shadow_stack(shstk->base, shstk->size);
+}
+
+static int shstk_disable(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ return -EOPNOTSUPP;
+
+ /* Already disabled? */
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ fpregs_lock_and_load();
+ /* Disable WRSS too when disabling shadow stack */
+ wrmsrl(MSR_IA32_U_CET, 0);
+ wrmsrl(MSR_IA32_PL3_SSP, 0);
+ fpregs_unlock();
+
+ shstk_free(current);
+ features_clr(ARCH_SHSTK_SHSTK);
+
+ return 0;
+}
+
long shstk_prctl(struct task_struct *task, int option, unsigned long features)
{
if (option == ARCH_SHSTK_LOCK) {