[1/6] x86/bugs: Add asm helpers for executing VERW

Message ID 20231020-delay-verw-v1-1-cff54096326d@linux.intel.com
State New
Headers
Series Delay VERW |

Commit Message

Pawan Gupta Oct. 20, 2023, 8:44 p.m. UTC
  MDS mitigation requires clearing the CPU buffers before returning to
user. This needs to be done late in the exit-to-user path. Current
location of VERW leaves a possibility of kernel data ending up in CPU
buffers for memory accesses done after VERW such as:

  1. Kernel data accessed by an NMI between VERW and return-to-user can
     remain in CPU buffers ( since NMI returning to kernel does not
     execute VERW to clear CPU buffers.
  2. Alyssa reported that after VERW is executed,
     CONFIG_GCC_PLUGIN_STACKLEAK=y scrubs the stack used by a system
     call. Memory accesses during stack scrubbing can move kernel stack
     contents into CPU buffers.
  3. When caller saved registers are restored after a return from
     function executing VERW, the kernel stack accesses can remain in
     CPU buffers(since they occur after VERW).

To fix this VERW needs to be moved very late in exit-to-user path.

In preparation for moving VERW to entry/exit asm code, create macros
that can be used in asm. Also make them depend on a new feature flag
X86_FEATURE_USER_CLEAR_CPU_BUF.

Reported-by: Alyssa Milburn <alyssa.milburn@intel.com>
Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
---
 arch/x86/include/asm/cpufeatures.h   |  2 +-
 arch/x86/include/asm/nospec-branch.h | 25 +++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)
  

Comments

Sean Christopherson Oct. 20, 2023, 11:13 p.m. UTC | #1
On Fri, Oct 20, 2023, Pawan Gupta wrote:
> diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
> index c55cc243592e..e1b623a27e1b 100644
> --- a/arch/x86/include/asm/nospec-branch.h
> +++ b/arch/x86/include/asm/nospec-branch.h
> @@ -111,6 +111,24 @@
>  #define RESET_CALL_DEPTH_FROM_CALL
>  #endif
>  
> +/*
> + * Macro to execute VERW instruction to mitigate transient data sampling
> + * attacks such as MDS. On affected systems a microcode update overloaded VERW
> + * instruction to also clear the CPU buffers.
> + *
> + * Note: Only the memory operand variant of VERW clears the CPU buffers. To
> + * handle the case when VERW is executed after user registers are restored, use
> + * RIP to point the memory operand to a part NOPL instruction that contains
> + * __KERNEL_DS.
> + */
> +#define __EXEC_VERW(m)	verw _ASM_RIP(m)
> +
> +#define EXEC_VERW				\
> +	__EXEC_VERW(551f);			\
> +	/* nopl __KERNEL_DS(%rax) */		\
> +	.byte 0x0f, 0x1f, 0x80, 0x00, 0x00;	\
> +551:	.word __KERNEL_DS;			\

Why are there so many macro layers?  Nothing jumps out to justfying two layers,
let alone three.

> +
>  /*
>   * Fill the CPU return stack buffer.
>   *
> @@ -329,6 +347,13 @@
>  #endif
>  .endm
>  
> +/* Clear CPU buffers before returning to user */
> +.macro USER_CLEAR_CPU_BUFFERS
> +	ALTERNATIVE "jmp .Lskip_verw_\@;", "", X86_FEATURE_USER_CLEAR_CPU_BUF
> +	EXEC_VERW

Rather than a NOP after VERW, why not something like this?

/* Clear CPU buffers before returning to user */
.macro USER_CLEAR_CPU_BUFFERS
                ALTERNATIVE "jmp .Lskip_verw_\@;", "jmp .Ldo_verw_\@;", X86_FEATURE_USER_CLEAR_CPU_BUF
551:            .word __KERNEL_DS
.Ldo_verw_\@:   verw _ASM_RIP(551b)
.Lskip_verw_\@:
.endm

> +.Lskip_verw_\@:
> +.endm
  
Andrew Cooper Oct. 20, 2023, 11:55 p.m. UTC | #2
On 20/10/2023 9:44 pm, Pawan Gupta wrote:
> +#define EXEC_VERW				\
> +	__EXEC_VERW(551f);			\
> +	/* nopl __KERNEL_DS(%rax) */		\
> +	.byte 0x0f, 0x1f, 0x80, 0x00, 0x00;	\
> +551:	.word __KERNEL_DS;			\

Is this actually wise from a perf point of view?

You're causing a data access to the instruction stream, and not only
that, the immediate next instruction.  Some parts don't take kindly to
snoops hitting L1I.

A better option would be to simply have

.section .text.entry
.align CACHELINE
mds_verw_sel:
    .word __KERNEL_DS
    int3
.align CACHELINE


And then just have EXEC_VERW be

    verw mds_verw_sel(%rip)

in the fastpaths.  That keeps the memory operand in .text.entry it works
on Meltdown-vulnerable CPUs, but creates effectively a data cacheline
that isn't mixed into anywhere in the frontend, which also gets far
better locality of reference rather than having it duplicated in 9
different places.

Also it avoids playing games with hiding data inside an instruction.
It's a neat trick, but the neater trick is avoid it whenever possible.

~Andrew
  
Pawan Gupta Oct. 21, 2023, 1 a.m. UTC | #3
On Fri, Oct 20, 2023 at 04:13:13PM -0700, Sean Christopherson wrote:
> On Fri, Oct 20, 2023, Pawan Gupta wrote:
> > diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
> > index c55cc243592e..e1b623a27e1b 100644
> > --- a/arch/x86/include/asm/nospec-branch.h
> > +++ b/arch/x86/include/asm/nospec-branch.h
> > @@ -111,6 +111,24 @@
> >  #define RESET_CALL_DEPTH_FROM_CALL
> >  #endif
> >  
> > +/*
> > + * Macro to execute VERW instruction to mitigate transient data sampling
> > + * attacks such as MDS. On affected systems a microcode update overloaded VERW
> > + * instruction to also clear the CPU buffers.
> > + *
> > + * Note: Only the memory operand variant of VERW clears the CPU buffers. To
> > + * handle the case when VERW is executed after user registers are restored, use
> > + * RIP to point the memory operand to a part NOPL instruction that contains
> > + * __KERNEL_DS.
> > + */
> > +#define __EXEC_VERW(m)	verw _ASM_RIP(m)
> > +
> > +#define EXEC_VERW				\
> > +	__EXEC_VERW(551f);			\
> > +	/* nopl __KERNEL_DS(%rax) */		\
> > +	.byte 0x0f, 0x1f, 0x80, 0x00, 0x00;	\
> > +551:	.word __KERNEL_DS;			\
> 
> Why are there so many macro layers?  Nothing jumps out to justfying two layers,
> let alone three.

I can't remember the exact reason, but I think the reason I added
__EXEC_VERW() was because using EXEC_VERW() in a C function was leading
to build error in the internal draft version. This version is not
calling it from C, so yes I can get rid of the extra layer.

> >  /*
> >   * Fill the CPU return stack buffer.
> >   *
> > @@ -329,6 +347,13 @@
> >  #endif
> >  .endm
> >  
> > +/* Clear CPU buffers before returning to user */
> > +.macro USER_CLEAR_CPU_BUFFERS
> > +	ALTERNATIVE "jmp .Lskip_verw_\@;", "", X86_FEATURE_USER_CLEAR_CPU_BUF
> > +	EXEC_VERW
> 
> Rather than a NOP after VERW, why not something like this?
> 
> /* Clear CPU buffers before returning to user */
> .macro USER_CLEAR_CPU_BUFFERS
>                 ALTERNATIVE "jmp .Lskip_verw_\@;", "jmp .Ldo_verw_\@;", X86_FEATURE_USER_CLEAR_CPU_BUF
> 551:            .word __KERNEL_DS
> .Ldo_verw_\@:   verw _ASM_RIP(551b)
> .Lskip_verw_\@:
> .endm

I wasn't comfortable adding a variable directly in the instruction
stream because the CPU may interpret it wrongly. With NOP it is bound to
ignore the data part.
  
Pawan Gupta Oct. 21, 2023, 1:18 a.m. UTC | #4
On Sat, Oct 21, 2023 at 12:55:45AM +0100, Andrew Cooper wrote:
> On 20/10/2023 9:44 pm, Pawan Gupta wrote:
> > +#define EXEC_VERW				\
> > +	__EXEC_VERW(551f);			\
> > +	/* nopl __KERNEL_DS(%rax) */		\
> > +	.byte 0x0f, 0x1f, 0x80, 0x00, 0x00;	\
> > +551:	.word __KERNEL_DS;			\
> 
> Is this actually wise from a perf point of view?
> 
> You're causing a data access to the instruction stream, and not only
> that, the immediate next instruction.  Some parts don't take kindly to
> snoops hitting L1I.

I suspected the same and asked CPU architects, they did not anticipate
reads being interpreted as part of self modifying code. The perf numbers
do not indicate a problem, but they dont speak for all the parts. It
could be an issue with some parts.

> A better option would be to simply have
> 
> .section .text.entry
> .align CACHELINE
> mds_verw_sel:
>     .word __KERNEL_DS
>     int3
> .align CACHELINE
> 
> 
> And then just have EXEC_VERW be
> 
>     verw mds_verw_sel(%rip)
> 
> in the fastpaths.  That keeps the memory operand in .text.entry it works
> on Meltdown-vulnerable CPUs, but creates effectively a data cacheline
> that isn't mixed into anywhere in the frontend, which also gets far
> better locality of reference rather than having it duplicated in 9
> different places.

> Also it avoids playing games with hiding data inside an instruction.
> It's a neat trick, but the neater trick is avoid it whenever possible.

Thanks for the pointers. I think verw in 32-bit mode won't be able to
address the operand outside of 4GB range. Maybe this is fine or could it
be a problem addressing from e.g. KVM module?
  
Andrew Cooper Oct. 21, 2023, 1:33 a.m. UTC | #5
On 21/10/2023 2:18 am, Pawan Gupta wrote:
> On Sat, Oct 21, 2023 at 12:55:45AM +0100, Andrew Cooper wrote:
>> Also it avoids playing games with hiding data inside an instruction.
>> It's a neat trick, but the neater trick is avoid it whenever possible.
> Thanks for the pointers. I think verw in 32-bit mode won't be able to
> address the operand outside of 4GB range.

And?  In a 32bit kernel, what lives outside of a 4G range?

> Maybe this is fine or could it
> be a problem addressing from e.g. KVM module?

RIP-relative addressing is disp32.  Which is the same as it is for
direct calls.

So if your module is far enough away for VERW to have issues, you've got
far more basic problems to solve first.

~Andrew
  
Pawan Gupta Oct. 21, 2023, 2:21 a.m. UTC | #6
On Sat, Oct 21, 2023 at 02:33:47AM +0100, Andrew Cooper wrote:
> On 21/10/2023 2:18 am, Pawan Gupta wrote:
> > On Sat, Oct 21, 2023 at 12:55:45AM +0100, Andrew Cooper wrote:
> >> Also it avoids playing games with hiding data inside an instruction.
> >> It's a neat trick, but the neater trick is avoid it whenever possible.
> > Thanks for the pointers. I think verw in 32-bit mode won't be able to
> > address the operand outside of 4GB range.
> 
> And?  In a 32bit kernel, what lives outside of a 4G range?
> 
> > Maybe this is fine or could it
> > be a problem addressing from e.g. KVM module?
> 
> RIP-relative addressing is disp32.  Which is the same as it is for
> direct calls.
> 
> So if your module is far enough away for VERW to have issues, you've got
> far more basic problems to solve first.

Sorry, I raised the wrong problem. In 64-bit mode, verww only has 32-bit
of relative addressing, so memory operand has to be within 4GB of
callsite. That could be a constraint.
  
Peter Zijlstra Oct. 22, 2023, 4:16 p.m. UTC | #7
On Sat, Oct 21, 2023 at 12:50:37AM +0100, Andrew Cooper wrote:
> On 20/10/2023 9:44 pm, Pawan Gupta wrote:
> > diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
> > index c55cc243592e..e1b623a27e1b 100644
> > --- a/arch/x86/include/asm/nospec-branch.h
> > +++ b/arch/x86/include/asm/nospec-branch.h
> > @@ -111,6 +111,24 @@
> >  #define RESET_CALL_DEPTH_FROM_CALL
> >  #endif
> >  
> > +/*
> > + * Macro to execute VERW instruction to mitigate transient data sampling
> > + * attacks such as MDS. On affected systems a microcode update overloaded VERW
> > + * instruction to also clear the CPU buffers.
> > + *
> > + * Note: Only the memory operand variant of VERW clears the CPU buffers. To
> > + * handle the case when VERW is executed after user registers are restored, use
> > + * RIP to point the memory operand to a part NOPL instruction that contains
> > + * __KERNEL_DS.
> > + */
> > +#define __EXEC_VERW(m)	verw _ASM_RIP(m)
> > +
> > +#define EXEC_VERW				\
> > +	__EXEC_VERW(551f);			\
> > +	/* nopl __KERNEL_DS(%rax) */		\
> > +	.byte 0x0f, 0x1f, 0x80, 0x00, 0x00;	\
> > +551:	.word __KERNEL_DS;			\
> > +
> 
> Is this actually wise from a perf point of view?
> 
> You're causing a data access to the instruction stream, and not only
> that, the immediate next instruction.  Some parts don't take kindly to
> snoops hitting L1I.
> 
> A better option would be to simply have
> 
> .section .text.entry
> .align CACHELINE
> mds_verw_sel:
>     .word __KERNEL_DS
>     int3
> .align CACHELINE
> 
> 
> And then just have EXEC_VERW be
> 
>     verw mds_verw_sel(%rip)

	ALTERNATIVE "", "verw mds_verw_sel(%rip)", X86_FEATURE_USER_CLEAR_CPU_BUF

But yeah, his seems like the sanest form.
  
Josh Poimboeuf Oct. 23, 2023, 6:08 p.m. UTC | #8
On Fri, Oct 20, 2023 at 07:21:34PM -0700, Pawan Gupta wrote:
> On Sat, Oct 21, 2023 at 02:33:47AM +0100, Andrew Cooper wrote:
> > On 21/10/2023 2:18 am, Pawan Gupta wrote:
> > > On Sat, Oct 21, 2023 at 12:55:45AM +0100, Andrew Cooper wrote:
> > >> Also it avoids playing games with hiding data inside an instruction.
> > >> It's a neat trick, but the neater trick is avoid it whenever possible.
> > > Thanks for the pointers. I think verw in 32-bit mode won't be able to
> > > address the operand outside of 4GB range.
> > 
> > And?  In a 32bit kernel, what lives outside of a 4G range?
> > 
> > > Maybe this is fine or could it
> > > be a problem addressing from e.g. KVM module?
> > 
> > RIP-relative addressing is disp32.  Which is the same as it is for
> > direct calls.
> > 
> > So if your module is far enough away for VERW to have issues, you've got
> > far more basic problems to solve first.
> 
> Sorry, I raised the wrong problem. In 64-bit mode, verww only has 32-bit
> of relative addressing, so memory operand has to be within 4GB of
> callsite. That could be a constraint.

Even on x86-64, modules are mapped within 4GB of the kernel, so I don't
think that's a concern.
  
Pawan Gupta Oct. 23, 2023, 7:09 p.m. UTC | #9
On Mon, Oct 23, 2023 at 11:08:06AM -0700, Josh Poimboeuf wrote:
> On Fri, Oct 20, 2023 at 07:21:34PM -0700, Pawan Gupta wrote:
> > On Sat, Oct 21, 2023 at 02:33:47AM +0100, Andrew Cooper wrote:
> > > On 21/10/2023 2:18 am, Pawan Gupta wrote:
> > > > On Sat, Oct 21, 2023 at 12:55:45AM +0100, Andrew Cooper wrote:
> > > >> Also it avoids playing games with hiding data inside an instruction.
> > > >> It's a neat trick, but the neater trick is avoid it whenever possible.
> > > > Thanks for the pointers. I think verw in 32-bit mode won't be able to
> > > > address the operand outside of 4GB range.
> > > 
> > > And?  In a 32bit kernel, what lives outside of a 4G range?
> > > 
> > > > Maybe this is fine or could it
> > > > be a problem addressing from e.g. KVM module?
> > > 
> > > RIP-relative addressing is disp32.  Which is the same as it is for
> > > direct calls.
> > > 
> > > So if your module is far enough away for VERW to have issues, you've got
> > > far more basic problems to solve first.
> > 
> > Sorry, I raised the wrong problem. In 64-bit mode, verww only has 32-bit
> > of relative addressing, so memory operand has to be within 4GB of
> > callsite. That could be a constraint.
> 
> Even on x86-64, modules are mapped within 4GB of the kernel, so I don't
> think that's a concern.

You are correct, modules are indeed mapped within 4GB of the kernel. So
what Andrew suggested is feasible. Is that your preference?
  
Pawan Gupta Oct. 25, 2023, 6:28 a.m. UTC | #10
On Sat, Oct 21, 2023 at 12:55:45AM +0100, Andrew Cooper wrote:
> On 20/10/2023 9:44 pm, Pawan Gupta wrote:
> > +#define EXEC_VERW				\
> > +	__EXEC_VERW(551f);			\
> > +	/* nopl __KERNEL_DS(%rax) */		\
> > +	.byte 0x0f, 0x1f, 0x80, 0x00, 0x00;	\
> > +551:	.word __KERNEL_DS;			\
> 
> Is this actually wise from a perf point of view?
> 
> You're causing a data access to the instruction stream, and not only
> that, the immediate next instruction.  Some parts don't take kindly to
> snoops hitting L1I.
> 
> A better option would be to simply have
> 
> .section .text.entry
> .align CACHELINE
> mds_verw_sel:
>     .word __KERNEL_DS
>     int3
> .align CACHELINE
> 
> 
> And then just have EXEC_VERW be
> 
>     verw mds_verw_sel(%rip)
> 
> in the fastpaths.  That keeps the memory operand in .text.entry it works
> on Meltdown-vulnerable CPUs, but creates effectively a data cacheline
> that isn't mixed into anywhere in the frontend, which also gets far
> better locality of reference.

With .text.entry section I am getting getting below warnings and an
error:

-----------------------------------------------------------------
    LD      vmlinux.o
  vmlinux.o: warning: objtool: .text.entry+0x0: unreachable instruction
  vmlinux.o: warning: objtool: .text.entry+0x40: unreachable instruction
  vmlinux.o: warning: objtool: .text.entry+0x80: unreachable instruction
  vmlinux.o: warning: objtool: .text.entry+0xc0: unreachable instruction
  vmlinux.o: warning: objtool: .text.entry+0x100: unreachable instruction
  vmlinux.o: warning: objtool: .text.entry+0x140: unreachable instruction
  vmlinux.o: warning: objtool: .text.entry+0x180: unreachable instruction
  vmlinux.o: warning: objtool: .text.entry+0x1c0: unreachable instruction
  vmlinux.o: warning: objtool: .text.entry+0x200: unreachable instruction
  vmlinux.o: warning: objtool: .text.entry+0x240: unreachable instruction
  vmlinux.o: warning: objtool: .text.entry+0x280: unreachable instruction
  vmlinux.o: warning: objtool: .text.entry+0x2c0: unreachable instruction
  vmlinux.o: warning: objtool: .text.entry+0x300: unreachable instruction
  vmlinux.o: warning: objtool: .altinstr_replacement+0x2c: relocation to !ENDBR: .text.entry+0x0
  vmlinux.o: warning: objtool: .altinstr_replacement+0x1c4: relocation to !ENDBR: .text.entry+0x0
  vmlinux.o: warning: objtool: .altinstr_replacement+0x1d0: relocation to !ENDBR: .text.entry+0x0
  vmlinux.o: warning: objtool: .altinstr_replacement+0x2d2: relocation to !ENDBR: .text.entry+0x80
  vmlinux.o: warning: objtool: .altinstr_replacement+0x5d5: relocation to !ENDBR: .text.entry+0xc0
    OBJCOPY modules.builtin.modinfo
    GEN     modules.builtin
    MODPOST vmlinux.symvers
    UPD     include/generated/utsversion.h
    CC      init/version-timestamp.o
    LD      .tmp_vmlinux.kallsyms1
  ld: error: unplaced orphan section `.text.entry' from `vmlinux.o'
  make[2]: *** [scripts/Makefile.vmlinux:36: vmlinux] Error 1
-----------------------------------------------------------------

... because my config has CONFIG_LD_ORPHAN_WARN_LEVEL="error" and
objtool needs to be told about this entry.

Do you think its worth fighting these warnings and error, or simply use
.rodata section for verw memory operand?
  
Peter Zijlstra Oct. 25, 2023, 7:22 a.m. UTC | #11
On Tue, Oct 24, 2023 at 11:28:18PM -0700, Pawan Gupta wrote:

> With .text.entry section I am getting getting below warnings and an
> error:
> 
> -----------------------------------------------------------------
>     LD      vmlinux.o
>   vmlinux.o: warning: objtool: .text.entry+0x0: unreachable instruction
>   vmlinux.o: warning: objtool: .text.entry+0x40: unreachable instruction
>   vmlinux.o: warning: objtool: .text.entry+0x80: unreachable instruction
>   vmlinux.o: warning: objtool: .text.entry+0xc0: unreachable instruction
>   vmlinux.o: warning: objtool: .text.entry+0x100: unreachable instruction
>   vmlinux.o: warning: objtool: .text.entry+0x140: unreachable instruction
>   vmlinux.o: warning: objtool: .text.entry+0x180: unreachable instruction
>   vmlinux.o: warning: objtool: .text.entry+0x1c0: unreachable instruction
>   vmlinux.o: warning: objtool: .text.entry+0x200: unreachable instruction
>   vmlinux.o: warning: objtool: .text.entry+0x240: unreachable instruction
>   vmlinux.o: warning: objtool: .text.entry+0x280: unreachable instruction
>   vmlinux.o: warning: objtool: .text.entry+0x2c0: unreachable instruction
>   vmlinux.o: warning: objtool: .text.entry+0x300: unreachable instruction
>   vmlinux.o: warning: objtool: .altinstr_replacement+0x2c: relocation to !ENDBR: .text.entry+0x0
>   vmlinux.o: warning: objtool: .altinstr_replacement+0x1c4: relocation to !ENDBR: .text.entry+0x0
>   vmlinux.o: warning: objtool: .altinstr_replacement+0x1d0: relocation to !ENDBR: .text.entry+0x0
>   vmlinux.o: warning: objtool: .altinstr_replacement+0x2d2: relocation to !ENDBR: .text.entry+0x80
>   vmlinux.o: warning: objtool: .altinstr_replacement+0x5d5: relocation to !ENDBR: .text.entry+0xc0
>     OBJCOPY modules.builtin.modinfo
>     GEN     modules.builtin
>     MODPOST vmlinux.symvers
>     UPD     include/generated/utsversion.h
>     CC      init/version-timestamp.o
>     LD      .tmp_vmlinux.kallsyms1
>   ld: error: unplaced orphan section `.text.entry' from `vmlinux.o'
>   make[2]: *** [scripts/Makefile.vmlinux:36: vmlinux] Error 1
> -----------------------------------------------------------------
> 
> ... because my config has CONFIG_LD_ORPHAN_WARN_LEVEL="error" and
> objtool needs to be told about this entry.
> 
> Do you think its worth fighting these warnings and error, or simply use
> .rodata section for verw memory operand?

I'm thinking you need to at the very least stay in a section that's
actually still mapped with PTI :-)

.entry.text is the only thing that is reliably mapped with PTI (IIRC),
some configs also get a chunk of the kernel image, but not all.

Something like the below should do I suppose.

---
 arch/x86/entry/entry.S | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S
index bfb7bcb362bc..9eb2b532c92a 100644
--- a/arch/x86/entry/entry.S
+++ b/arch/x86/entry/entry.S
@@ -6,6 +6,8 @@
 #include <linux/linkage.h>
 #include <asm/export.h>
 #include <asm/msr-index.h>
+#include <asm/unwind_hints.h>
+#include <asm/segment.h>
 
 .pushsection .noinstr.text, "ax"
 
@@ -20,3 +22,16 @@ SYM_FUNC_END(entry_ibpb)
 EXPORT_SYMBOL_GPL(entry_ibpb);
 
 .popsection
+
+.pushsection .entry.text, "ax"
+
+.align 64
+SYM_CODE_START_NOALIGN(mds_verw_sel)
+	UNWIND_HINT_UNDEFINED
+	ANNOTATE_NOENDBR
+1:
+	.word __KERNEL_DS
+	.skip 64 - (. - 1b), 0xcc
+SYM_CODE_END(mds_verw_sel);
+
+.popsection
  
Andrew Cooper Oct. 25, 2023, 7:52 a.m. UTC | #12
On 25/10/2023 8:22 am, Peter Zijlstra wrote:
> On Tue, Oct 24, 2023 at 11:28:18PM -0700, Pawan Gupta wrote:
>
>> With .text.entry section I am getting getting below warnings and an
>> error:
>>
>> -----------------------------------------------------------------
>>     LD      vmlinux.o
>>   vmlinux.o: warning: objtool: .text.entry+0x0: unreachable instruction
>>   vmlinux.o: warning: objtool: .text.entry+0x40: unreachable instruction
>>   vmlinux.o: warning: objtool: .text.entry+0x80: unreachable instruction
>>   vmlinux.o: warning: objtool: .text.entry+0xc0: unreachable instruction
>>   vmlinux.o: warning: objtool: .text.entry+0x100: unreachable instruction
>>   vmlinux.o: warning: objtool: .text.entry+0x140: unreachable instruction
>>   vmlinux.o: warning: objtool: .text.entry+0x180: unreachable instruction
>>   vmlinux.o: warning: objtool: .text.entry+0x1c0: unreachable instruction
>>   vmlinux.o: warning: objtool: .text.entry+0x200: unreachable instruction
>>   vmlinux.o: warning: objtool: .text.entry+0x240: unreachable instruction
>>   vmlinux.o: warning: objtool: .text.entry+0x280: unreachable instruction
>>   vmlinux.o: warning: objtool: .text.entry+0x2c0: unreachable instruction
>>   vmlinux.o: warning: objtool: .text.entry+0x300: unreachable instruction
>>   vmlinux.o: warning: objtool: .altinstr_replacement+0x2c: relocation to !ENDBR: .text.entry+0x0
>>   vmlinux.o: warning: objtool: .altinstr_replacement+0x1c4: relocation to !ENDBR: .text.entry+0x0
>>   vmlinux.o: warning: objtool: .altinstr_replacement+0x1d0: relocation to !ENDBR: .text.entry+0x0
>>   vmlinux.o: warning: objtool: .altinstr_replacement+0x2d2: relocation to !ENDBR: .text.entry+0x80
>>   vmlinux.o: warning: objtool: .altinstr_replacement+0x5d5: relocation to !ENDBR: .text.entry+0xc0
>>     OBJCOPY modules.builtin.modinfo
>>     GEN     modules.builtin
>>     MODPOST vmlinux.symvers
>>     UPD     include/generated/utsversion.h
>>     CC      init/version-timestamp.o
>>     LD      .tmp_vmlinux.kallsyms1
>>   ld: error: unplaced orphan section `.text.entry' from `vmlinux.o'
>>   make[2]: *** [scripts/Makefile.vmlinux:36: vmlinux] Error 1
>> -----------------------------------------------------------------
>>
>> ... because my config has CONFIG_LD_ORPHAN_WARN_LEVEL="error" and
>> objtool needs to be told about this entry.
>>
>> Do you think its worth fighting these warnings and error, or simply use
>> .rodata section for verw memory operand?
> I'm thinking you need to at the very least stay in a section that's
> actually still mapped with PTI :-)

Sorry.  Xen and Linux have this section named opposite ways around.

> diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S
> index bfb7bcb362bc..9eb2b532c92a 100644
> --- a/arch/x86/entry/entry.S
> +++ b/arch/x86/entry/entry.S
> @@ -20,3 +22,16 @@ SYM_FUNC_END(entry_ibpb)
>  EXPORT_SYMBOL_GPL(entry_ibpb);
>  
>  .popsection
> +
> +.pushsection .entry.text, "ax"
> +
> +.align 64
> +SYM_CODE_START_NOALIGN(mds_verw_sel)
> +	UNWIND_HINT_UNDEFINED
> +	ANNOTATE_NOENDBR
> +1:
> +	.word __KERNEL_DS
> +	.skip 64 - (. - 1b), 0xcc

The 1 label aliases mds_verw_sel and this must remain like this for the
construct to work.

So instead of .skip, why not simply .align 64, 0xcc and get rid of the
1: label?

Do we have a suitably named constant cacheline size, rather than
opencoding 64?

> +SYM_CODE_END(mds_verw_sel);

Given that KVM needs it, this probably needs an EXPORT_SYMBOL_GPL() on it.

~Andrew
  
Peter Zijlstra Oct. 25, 2023, 8:02 a.m. UTC | #13
On Wed, Oct 25, 2023 at 08:52:50AM +0100, Andrew Cooper wrote:

> > +.pushsection .entry.text, "ax"
> > +
> > +.align 64
> > +SYM_CODE_START_NOALIGN(mds_verw_sel)
> > +	UNWIND_HINT_UNDEFINED
> > +	ANNOTATE_NOENDBR
> > +1:
> > +	.word __KERNEL_DS
> > +	.skip 64 - (. - 1b), 0xcc
> 
> The 1 label aliases mds_verw_sel and this must remain like this for the
> construct to work.
> 
> So instead of .skip, why not simply .align 64, 0xcc and get rid of the
> 1: label?

Because I forgot you can add a filler byte to .align :/ Yes, that's much
saner.

> Do we have a suitably named constant cacheline size, rather than
> opencoding 64?

L1_CACHE_BYTES probably.

> 
> > +SYM_CODE_END(mds_verw_sel);
> 
> Given that KVM needs it, this probably needs an EXPORT_SYMBOL_GPL() on it.

localyesconfig ftw ;-)

/me runs
  
Pawan Gupta Oct. 25, 2023, 3:27 p.m. UTC | #14
On Wed, Oct 25, 2023 at 09:22:55AM +0200, Peter Zijlstra wrote:
 
> I'm thinking you need to at the very least stay in a section that's
> actually still mapped with PTI :-)
> 
> .entry.text is the only thing that is reliably mapped with PTI (IIRC),
> some configs also get a chunk of the kernel image, but not all.
> 
> Something like the below should do I suppose.

Thanks, will do this with Andrew's improvements.

> ---
>  arch/x86/entry/entry.S | 15 +++++++++++++++
  

Patch

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 58cb9495e40f..3f018dfedd5f 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -308,10 +308,10 @@ 
 #define X86_FEATURE_SMBA		(11*32+21) /* "" Slow Memory Bandwidth Allocation */
 #define X86_FEATURE_BMEC		(11*32+22) /* "" Bandwidth Monitoring Event Configuration */
 #define X86_FEATURE_USER_SHSTK		(11*32+23) /* Shadow stack support for user mode applications */
-
 #define X86_FEATURE_SRSO		(11*32+24) /* "" AMD BTB untrain RETs */
 #define X86_FEATURE_SRSO_ALIAS		(11*32+25) /* "" AMD BTB untrain RETs through aliasing */
 #define X86_FEATURE_IBPB_ON_VMEXIT	(11*32+26) /* "" Issue an IBPB only on VMEXIT */
+#define X86_FEATURE_USER_CLEAR_CPU_BUF	(11*32+27) /* "" Clear CPU buffers before returning to user */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
 #define X86_FEATURE_AVX_VNNI		(12*32+ 4) /* AVX VNNI instructions */
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index c55cc243592e..e1b623a27e1b 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -111,6 +111,24 @@ 
 #define RESET_CALL_DEPTH_FROM_CALL
 #endif
 
+/*
+ * Macro to execute VERW instruction to mitigate transient data sampling
+ * attacks such as MDS. On affected systems a microcode update overloaded VERW
+ * instruction to also clear the CPU buffers.
+ *
+ * Note: Only the memory operand variant of VERW clears the CPU buffers. To
+ * handle the case when VERW is executed after user registers are restored, use
+ * RIP to point the memory operand to a part NOPL instruction that contains
+ * __KERNEL_DS.
+ */
+#define __EXEC_VERW(m)	verw _ASM_RIP(m)
+
+#define EXEC_VERW				\
+	__EXEC_VERW(551f);			\
+	/* nopl __KERNEL_DS(%rax) */		\
+	.byte 0x0f, 0x1f, 0x80, 0x00, 0x00;	\
+551:	.word __KERNEL_DS;			\
+
 /*
  * Fill the CPU return stack buffer.
  *
@@ -329,6 +347,13 @@ 
 #endif
 .endm
 
+/* Clear CPU buffers before returning to user */
+.macro USER_CLEAR_CPU_BUFFERS
+	ALTERNATIVE "jmp .Lskip_verw_\@;", "", X86_FEATURE_USER_CLEAR_CPU_BUF
+	EXEC_VERW
+.Lskip_verw_\@:
+.endm
+
 #else /* __ASSEMBLY__ */
 
 #define ANNOTATE_RETPOLINE_SAFE					\