[2/3] x86/callthunks: Handle %rip-relative relocations in call thunk template

Message ID 20231102112850.3448745-3-ubizjak@gmail.com
State New
Headers
Series Fix and unify call thunks assembly snippets |

Commit Message

Uros Bizjak Nov. 2, 2023, 11:25 a.m. UTC
  Contrary to alternatives, relocations are currently not supported in
call thunk templates.  Implement minimal support for relocations
when copying template from its storage location to handle %rip-relative
addresses.

Support for relocations will be needed when PER_CPU_VAR macro switches
to %rip-relative addressing.

The patch allows unification of ASM_INCREMENT_CALL_DEPTH, which already
uses PER_CPU_VAR macro, with INCREMENT_CALL_DEPTH, used in call thunk
template, which is currently limited to use absolute address.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
---
 arch/x86/kernel/callthunks.c | 63 ++++++++++++++++++++++++++++++++----
 1 file changed, 56 insertions(+), 7 deletions(-)
  

Comments

Peter Zijlstra Nov. 2, 2023, 11:44 a.m. UTC | #1
On Thu, Nov 02, 2023 at 12:25:47PM +0100, Uros Bizjak wrote:

> @@ -166,13 +168,51 @@ static const u8 nops[] = {
>  	0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
>  };
>  
> +#define apply_reloc_n(n_, p_, d_)				\
> +	do {							\
> +		s32 v = *(s##n_ *)(p_);				\
> +		v += (d_);					\
> +		BUG_ON((v >> 31) != (v >> (n_-1)));		\
> +		*(s##n_ *)(p_) = (s##n_)v;			\
> +	} while (0)
> +
> +static __always_inline
> +void apply_reloc(int n, void *ptr, uintptr_t diff)
> +{
> +	switch (n) {
> +	case 4: apply_reloc_n(32, ptr, diff); break;
> +	default: BUG();
> +	}
> +}
> +
> +static void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src)
> +{
> +	for (int next, i = 0; i < len; i = next) {
> +		struct insn insn;
> +
> +		if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
> +			return;
> +
> +		next = i + insn.length;
> +
> +		if (insn_rip_relative(&insn))
> +			apply_reloc(insn.displacement.nbytes,
> +				    buf + i + insn_offset_displacement(&insn),
> +				    src - dest);
> +	}
> +}

Isn't it simpler to use apply_relocation() from alternative.c?

Remove static, add decl, stuff like that?
  
Uros Bizjak Nov. 2, 2023, 11:50 a.m. UTC | #2
On Thu, Nov 2, 2023 at 12:44 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Nov 02, 2023 at 12:25:47PM +0100, Uros Bizjak wrote:
>
> > @@ -166,13 +168,51 @@ static const u8 nops[] = {
> >       0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
> >  };
> >
> > +#define apply_reloc_n(n_, p_, d_)                            \
> > +     do {                                                    \
> > +             s32 v = *(s##n_ *)(p_);                         \
> > +             v += (d_);                                      \
> > +             BUG_ON((v >> 31) != (v >> (n_-1)));             \
> > +             *(s##n_ *)(p_) = (s##n_)v;                      \
> > +     } while (0)
> > +
> > +static __always_inline
> > +void apply_reloc(int n, void *ptr, uintptr_t diff)
> > +{
> > +     switch (n) {
> > +     case 4: apply_reloc_n(32, ptr, diff); break;
> > +     default: BUG();
> > +     }
> > +}
> > +
> > +static void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src)
> > +{
> > +     for (int next, i = 0; i < len; i = next) {
> > +             struct insn insn;
> > +
> > +             if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
> > +                     return;
> > +
> > +             next = i + insn.length;
> > +
> > +             if (insn_rip_relative(&insn))
> > +                     apply_reloc(insn.displacement.nbytes,
> > +                                 buf + i + insn_offset_displacement(&insn),
> > +                                 src - dest);
> > +     }
> > +}
>
> Isn't it simpler to use apply_relocation() from alternative.c?

Yes, I was looking at that function, but somehow thought that it is a
bit overkill here, since we just need a %rip-relative reloc.

> Remove static, add decl, stuff like that?

On second thought, you are right. Should I move the above function
somewhere (reloc.c?) , or can I just use it from alternative.c and add
decl (where?) ?

Thanks,
Uros.
  
Peter Zijlstra Nov. 2, 2023, 11:56 a.m. UTC | #3
On Thu, Nov 02, 2023 at 12:50:01PM +0100, Uros Bizjak wrote:

> > Remove static, add decl, stuff like that?
> 
> On second thought, you are right. Should I move the above function
> somewhere (reloc.c?) , or can I just use it from alternative.c and add
> decl (where?) ?

Yeah, leave it there for now, perhaps asm/text-patching.h ?
  
Uros Bizjak Nov. 2, 2023, 12:34 p.m. UTC | #4
On Thu, Nov 2, 2023 at 12:56 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Nov 02, 2023 at 12:50:01PM +0100, Uros Bizjak wrote:
>
> > > Remove static, add decl, stuff like that?
> >
> > On second thought, you are right. Should I move the above function
> > somewhere (reloc.c?) , or can I just use it from alternative.c and add
> > decl (where?) ?
>
> Yeah, leave it there for now, perhaps asm/text-patching.h ?

The new version boots OK and looks like the attached patch.

Uros.
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index 29832c338cdc..ba8d900f3ebe 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -18,6 +18,8 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
 #define __parainstructions_end	NULL
 #endif
 
+void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len);
+
 /*
  * Currently, the max observed size in the kernel code is
  * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5.
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 73be3931e4f0..66140c54d4f6 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -325,8 +325,7 @@ bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
 	return (target < src || target > src + src_len);
 }
 
-static void __init_or_module noinline
-apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
+void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
 {
 	int prev, target = 0;
 
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
index d0922cf94c90..832eaec36e2b 100644
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -24,6 +24,8 @@
 
 static int __initdata_or_module debug_callthunks;
 
+#define MAX_PATCH_LEN (255-1)
+
 #define prdbg(fmt, args...)					\
 do {								\
 	if (debug_callthunks)					\
@@ -169,10 +171,15 @@ static const u8 nops[] = {
 static void *patch_dest(void *dest, bool direct)
 {
 	unsigned int tsize = SKL_TMPL_SIZE;
+	u8 insn_buff[MAX_PATCH_LEN];
 	u8 *pad = dest - tsize;
 
+	memcpy(insn_buff, skl_call_thunk_template, tsize);
+	apply_relocation(insn_buff, tsize, pad,
+			 skl_call_thunk_template, tsize);
+
 	/* Already patched? */
-	if (!bcmp(pad, skl_call_thunk_template, tsize))
+	if (!bcmp(pad, insn_buff, tsize))
 		return pad;
 
 	/* Ensure there are nops */
@@ -182,9 +189,9 @@ static void *patch_dest(void *dest, bool direct)
 	}
 
 	if (direct)
-		memcpy(pad, skl_call_thunk_template, tsize);
+		memcpy(pad, insn_buff, tsize);
 	else
-		text_poke_copy_locked(pad, skl_call_thunk_template, tsize, true);
+		text_poke_copy_locked(pad, insn_buff, tsize, true);
 	return pad;
 }
 
@@ -281,20 +288,27 @@ void *callthunks_translate_call_dest(void *dest)
 static bool is_callthunk(void *addr)
 {
 	unsigned int tmpl_size = SKL_TMPL_SIZE;
-	void *tmpl = skl_call_thunk_template;
+	u8 insn_buff[MAX_PATCH_LEN];
 	unsigned long dest;
+	u8 *pad;
 
 	dest = roundup((unsigned long)addr, CONFIG_FUNCTION_ALIGNMENT);
 	if (!thunks_initialized || skip_addr((void *)dest))
 		return false;
 
-	return !bcmp((void *)(dest - tmpl_size), tmpl, tmpl_size);
+	*pad = dest - tmpl_size;
+
+	memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
+	apply_relocation(insn_buff, tmpl_size, pad,
+			 skl_call_thunk_template, tmpl_size);
+
+	return !bcmp(pad, insn_buff, tmpl_size);
 }
 
 int x86_call_depth_emit_accounting(u8 **pprog, void *func)
 {
 	unsigned int tmpl_size = SKL_TMPL_SIZE;
-	void *tmpl = skl_call_thunk_template;
+	u8 insn_buff[MAX_PATCH_LEN];
 
 	if (!thunks_initialized)
 		return 0;
@@ -303,7 +317,11 @@ int x86_call_depth_emit_accounting(u8 **pprog, void *func)
 	if (func && is_callthunk(func))
 		return 0;
 
-	memcpy(*pprog, tmpl, tmpl_size);
+	memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
+	apply_relocation(insn_buff, tmpl_size, *pprog,
+			 skl_call_thunk_template, tmpl_size);
+
+	memcpy(*pprog, insn_buff, tmpl_size);
 	*pprog += tmpl_size;
 	return tmpl_size;
 }
  

Patch

diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
index d0922cf94c90..bda09d82bff7 100644
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -24,6 +24,8 @@ 
 
 static int __initdata_or_module debug_callthunks;
 
+#define MAX_PATCH_LEN (255-1)
+
 #define prdbg(fmt, args...)					\
 do {								\
 	if (debug_callthunks)					\
@@ -166,13 +168,51 @@  static const u8 nops[] = {
 	0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
 };
 
+#define apply_reloc_n(n_, p_, d_)				\
+	do {							\
+		s32 v = *(s##n_ *)(p_);				\
+		v += (d_);					\
+		BUG_ON((v >> 31) != (v >> (n_-1)));		\
+		*(s##n_ *)(p_) = (s##n_)v;			\
+	} while (0)
+
+static __always_inline
+void apply_reloc(int n, void *ptr, uintptr_t diff)
+{
+	switch (n) {
+	case 4: apply_reloc_n(32, ptr, diff); break;
+	default: BUG();
+	}
+}
+
+static void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src)
+{
+	for (int next, i = 0; i < len; i = next) {
+		struct insn insn;
+
+		if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
+			return;
+
+		next = i + insn.length;
+
+		if (insn_rip_relative(&insn))
+			apply_reloc(insn.displacement.nbytes,
+				    buf + i + insn_offset_displacement(&insn),
+				    src - dest);
+	}
+}
+
 static void *patch_dest(void *dest, bool direct)
 {
 	unsigned int tsize = SKL_TMPL_SIZE;
+	u8 insn_buff[MAX_PATCH_LEN];
 	u8 *pad = dest - tsize;
 
+	memcpy(insn_buff, skl_call_thunk_template, tsize);
+	apply_relocation(insn_buff, tsize, pad, skl_call_thunk_template);
+
 	/* Already patched? */
-	if (!bcmp(pad, skl_call_thunk_template, tsize))
+	if (!bcmp(pad, insn_buff, tsize))
 		return pad;
 
 	/* Ensure there are nops */
@@ -182,9 +222,9 @@  static void *patch_dest(void *dest, bool direct)
 	}
 
 	if (direct)
-		memcpy(pad, skl_call_thunk_template, tsize);
+		memcpy(pad, insn_buff, tsize);
 	else
-		text_poke_copy_locked(pad, skl_call_thunk_template, tsize, true);
+		text_poke_copy_locked(pad, insn_buff, tsize, true);
 	return pad;
 }
 
@@ -281,20 +321,26 @@  void *callthunks_translate_call_dest(void *dest)
 static bool is_callthunk(void *addr)
 {
 	unsigned int tmpl_size = SKL_TMPL_SIZE;
-	void *tmpl = skl_call_thunk_template;
+	u8 insn_buff[MAX_PATCH_LEN];
 	unsigned long dest;
+	u8 *pad;
 
 	dest = roundup((unsigned long)addr, CONFIG_FUNCTION_ALIGNMENT);
 	if (!thunks_initialized || skip_addr((void *)dest))
 		return false;
 
-	return !bcmp((void *)(dest - tmpl_size), tmpl, tmpl_size);
+	*pad = dest - tmpl_size;
+
+	memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
+	apply_relocation(insn_buff, tmpl_size, pad, skl_call_thunk_template);
+
+	return !bcmp(pad, insn_buff, tmpl_size);
 }
 
 int x86_call_depth_emit_accounting(u8 **pprog, void *func)
 {
 	unsigned int tmpl_size = SKL_TMPL_SIZE;
-	void *tmpl = skl_call_thunk_template;
+	u8 insn_buff[MAX_PATCH_LEN];
 
 	if (!thunks_initialized)
 		return 0;
@@ -303,7 +349,10 @@  int x86_call_depth_emit_accounting(u8 **pprog, void *func)
 	if (func && is_callthunk(func))
 		return 0;
 
-	memcpy(*pprog, tmpl, tmpl_size);
+	memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
+	apply_relocation(insn_buff, tmpl_size, *pprog, skl_call_thunk_template);
+
+	memcpy(*pprog, insn_buff, tmpl_size);
 	*pprog += tmpl_size;
 	return tmpl_size;
 }