[7/8] arm64: ftrace: Add direct call support

Message ID 20230201163420.1579014-8-revest@chromium.org
State New
Headers
Series Add ftrace direct call for arm64 |

Commit Message

Florent Revest Feb. 1, 2023, 4:34 p.m. UTC
  This builds up on the CALL_OPS work which extends the ftrace patchsite
on arm64 with an ops pointer usable by the ftrace trampoline.

This ops pointer is valid at all time. Indeed, it is either pointing to
ftrace_list_ops or to the single ops which should be called from that
patchsite.

There are a few cases to distinguish:
- If a direct call ops is the only one tracing a function:
  - If the direct called trampoline is within the reach of a BL
    instruction
     -> the ftrace patchsite jumps to the trampoline
  - Else
     -> the ftrace patchsite jumps to the ftrace_caller trampoline which
        reads the ops pointer in the patchsite and jumps to the direct
        call address stored in the ops
- Else
  -> the ftrace patchsite jumps to the ftrace_caller trampoline and its
     ops literal points to ftrace_list_ops so it iterates over all
     registered ftrace ops, including the direct call ops and calls its
     call_direct_funcs handler which stores the direct called
     trampoline's address in the ftrace_regs and the ftrace_caller
     trampoline will return to that address instead of returning to the
     traced function

Signed-off-by: Florent Revest <revest@chromium.org>
Suggested-by: Mark Rutland <mark.rutland@arm.com>
---
 arch/arm64/Kconfig               |  2 ++
 arch/arm64/include/asm/ftrace.h  | 17 +++++++++
 arch/arm64/kernel/asm-offsets.c  |  6 ++++
 arch/arm64/kernel/entry-ftrace.S | 60 +++++++++++++++++++++++---------
 arch/arm64/kernel/ftrace.c       | 36 ++++++++++++++++---
 5 files changed, 100 insertions(+), 21 deletions(-)
  

Comments

Mark Rutland Feb. 3, 2023, 3:34 p.m. UTC | #1
On Wed, Feb 01, 2023 at 05:34:19PM +0100, Florent Revest wrote:
> This builds up on the CALL_OPS work which extends the ftrace patchsite
> on arm64 with an ops pointer usable by the ftrace trampoline.
> 
> This ops pointer is valid at all time. Indeed, it is either pointing to
> ftrace_list_ops or to the single ops which should be called from that
> patchsite.
> 
> There are a few cases to distinguish:
> - If a direct call ops is the only one tracing a function:
>   - If the direct called trampoline is within the reach of a BL
>     instruction
>      -> the ftrace patchsite jumps to the trampoline
>   - Else
>      -> the ftrace patchsite jumps to the ftrace_caller trampoline which
>         reads the ops pointer in the patchsite and jumps to the direct
>         call address stored in the ops
> - Else
>   -> the ftrace patchsite jumps to the ftrace_caller trampoline and its
>      ops literal points to ftrace_list_ops so it iterates over all
>      registered ftrace ops, including the direct call ops and calls its
>      call_direct_funcs handler which stores the direct called
>      trampoline's address in the ftrace_regs and the ftrace_caller
>      trampoline will return to that address instead of returning to the
>      traced function

This looks pretty good!

Overall I think this is the right shape, but I have a few minor comments that
lead to a bit of churn. I've noted those below, and I've also pushed out a
branch with suggested fixups (as discrete patches) to my
arm64/ftrace/direct-call-testing branch, which you can find at:

  https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git
  git://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git

Note that's based on a merge of the arm64 tree's for-next/ftrace branch and the
trace tree's trace/for-next branch, and there were a couple of trivial
conflicts I had to fix up when first picking this series (which I've noted in
the affected patches)

Those trees are at:

  # arm64
  https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git
  git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git

  # trace
  https://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git
  git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git

> Signed-off-by: Florent Revest <revest@chromium.org>
> Suggested-by: Mark Rutland <mark.rutland@arm.com>
> ---
>  arch/arm64/Kconfig               |  2 ++
>  arch/arm64/include/asm/ftrace.h  | 17 +++++++++
>  arch/arm64/kernel/asm-offsets.c  |  6 ++++
>  arch/arm64/kernel/entry-ftrace.S | 60 +++++++++++++++++++++++---------
>  arch/arm64/kernel/ftrace.c       | 36 ++++++++++++++++---
>  5 files changed, 100 insertions(+), 21 deletions(-)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 6f6f37161cf6..7deafd653c42 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -188,6 +188,8 @@ config ARM64
>  	select HAVE_DYNAMIC_FTRACE
>  	select HAVE_DYNAMIC_FTRACE_WITH_ARGS \
>  		if $(cc-option,-fpatchable-function-entry=2)
> +	select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS \
> +		if DYNAMIC_FTRACE_WITH_ARGS && DYNAMIC_FTRACE_WITH_CALL_OPS
>  	select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS \
>  		if (DYNAMIC_FTRACE_WITH_ARGS && !CFI_CLANG)
>  	select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY \
> diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h
> index cf6d9c42ff36..9fb95966b6d5 100644
> --- a/arch/arm64/include/asm/ftrace.h
> +++ b/arch/arm64/include/asm/ftrace.h
> @@ -80,6 +80,10 @@ struct ftrace_regs {
>  
>  	unsigned long sp;
>  	unsigned long pc;
> +
> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
> +	unsigned long custom_tramp;

Minor nit, but could we please s/custom_tramp/direct_tramp/?

> +#endif
>  };

I forgot to add a comment here, but we require that sizeof(struct ftrace_regs)
is a multiple of 16, as the AAPCS requires that the stack is 16-byte aligned at
function call boundaries (and on arm64 we currently assume that it's *always*
aligned).

I'd suggest we make this:

| /*
|  * Note: sizeof(struct ftrace_regs) must be a multiple of 16 to ensure correct
|  * stack alignment
|  */
| struct ftrace_regs {
| 	/* x0 - x8 */
| 	unsigned long regs[9];
|
| #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
| 	unsigned long direct_tramp;
| #else
| 	unsigned long __unused;
| #endif
|  
| 	unsigned long fp;
| 	unsigned long lr;
| 
| 	unsigned long sp;
| 	unsigned long pc;
| };

>  
>  static __always_inline unsigned long
> @@ -136,6 +140,19 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec);
>  void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
>  		       struct ftrace_ops *op, struct ftrace_regs *fregs);
>  #define ftrace_graph_func ftrace_graph_func
> +
> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
> +static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs,
> +						 unsigned long addr)
> +{
> +	/*
> +	 * The ftrace trampoline will return to this address instead of the
> +	 * instrumented function.
> +	 */
> +	fregs->custom_tramp = addr;
> +}
> +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
> +
>  #endif
>  
>  #define ftrace_return_address(n) return_address(n)
> diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
> index ae345b06e9f7..c67f0373b88c 100644
> --- a/arch/arm64/kernel/asm-offsets.c
> +++ b/arch/arm64/kernel/asm-offsets.c
> @@ -93,6 +93,9 @@ int main(void)
>    DEFINE(FREGS_LR,		offsetof(struct ftrace_regs, lr));
>    DEFINE(FREGS_SP,		offsetof(struct ftrace_regs, sp));
>    DEFINE(FREGS_PC,		offsetof(struct ftrace_regs, pc));
> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
> +  DEFINE(FREGS_CUSTOM_TRAMP,	offsetof(struct ftrace_regs, custom_tramp));
> +#endif
>    DEFINE(FREGS_SIZE,		sizeof(struct ftrace_regs));
>    BLANK();
>  #endif
> @@ -197,6 +200,9 @@ int main(void)
>  #endif
>  #ifdef CONFIG_FUNCTION_TRACER
>    DEFINE(FTRACE_OPS_FUNC,		offsetof(struct ftrace_ops, func));
> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
> +  DEFINE(FTRACE_OPS_DIRECT_CALL,	offsetof(struct ftrace_ops, direct_call));
> +#endif
>  #endif
>    return 0;
>  }
> diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S
> index 9869debd22fb..0576f38e6362 100644
> --- a/arch/arm64/kernel/entry-ftrace.S
> +++ b/arch/arm64/kernel/entry-ftrace.S
> @@ -36,6 +36,30 @@
>  SYM_CODE_START(ftrace_caller)
>  	bti	c
>  
> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
> +	/*
> +	 * The literal pointer to the ops is at an 8-byte aligned boundary
> +	 * which is either 12 or 16 bytes before the BL instruction in the call
> +	 * site. See ftrace_call_adjust() for details.
> +	 *
> +	 * Therefore here the LR points at `literal + 16` or `literal + 20`,
> +	 * and we can find the address of the literal in either case by
> +	 * aligning to an 8-byte boundary and subtracting 16. We do the
> +	 * alignment first as this allows us to fold the subtraction into the
> +	 * LDR.
> +	 */
> +	bic	x11, x30, 0x7
> +	ldr	x11, [x11, #-(4 * AARCH64_INSN_SIZE)]		// op
> +
> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
> +	/* If the op has a direct call address, return to it */
> +	ldr	x12, [x11, #FTRACE_OPS_DIRECT_CALL]		// op->direct_call
> +	cbz	x12, 1f
> +	ret	x12
> +1:
> +#endif

For branching to the direct call trampoline, it would be better to use a
forward branch (i.e. BR), as that won't unbalance return stack predictors.
That'll need to use x16 or x17 to be compatible with a `BTI C` landing pad.

I'd also prefer if we could move the direct call invocation to a stub at the
end of ftrace_caller, e.g.

| SYM_CODE_START(ftrace_caller)
| 	... discover ops pointer here ...
| 
| 	ldr	w17, [x11, #FTRACE_OPS_DIRECT_CALL]
| 	cbnz	ftrace_caller_direct
| 
| 	... usual save and handling logic here ...
| 
| 	// just prior to return 
| 	ldr	x17, [sp, #FREGS_DIRECT_CALL]
| 	cbnz	ftrace_caller_direct_late
| 
| 	...  usual restore logic here ...
| 
| 	ret	x9
| 
| SYM_INNER_LABEL(ftrace_caller_direct_late, SYM_L_LOCAL)
| 	
| 	... shuffle registers for the trampoline here ...
| 
|	// fallthrough to the usual invocation
| SYM_INNER_LABEL(ftrace_caller_direct, SYM_L_LOCAL)
| 	br	x17
| SYM_CODE_END(ftrace_caller)

> +#endif
> +
>  	/* Save original SP */
>  	mov	x10, sp
>  
> @@ -57,6 +81,11 @@ SYM_CODE_START(ftrace_caller)
>  	/* Save the PC after the ftrace callsite */
>  	str	x30, [sp, #FREGS_PC]
>  
> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
> +	/* Set custom_tramp to zero  */
> +	str	xzr, [sp, #FREGS_CUSTOM_TRAMP]
> +#endif
> +
>  	/* Create a frame record for the callsite above the ftrace regs */
>  	stp	x29, x9, [sp, #FREGS_SIZE + 16]
>  	add	x29, sp, #FREGS_SIZE + 16
> @@ -71,20 +100,7 @@ SYM_CODE_START(ftrace_caller)
>  	mov	x3, sp					// regs
>  
>  #ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
> -	/*
> -	 * The literal pointer to the ops is at an 8-byte aligned boundary
> -	 * which is either 12 or 16 bytes before the BL instruction in the call
> -	 * site. See ftrace_call_adjust() for details.
> -	 *
> -	 * Therefore here the LR points at `literal + 16` or `literal + 20`,
> -	 * and we can find the address of the literal in either case by
> -	 * aligning to an 8-byte boundary and subtracting 16. We do the
> -	 * alignment first as this allows us to fold the subtraction into the
> -	 * LDR.
> -	 */
> -	bic	x2, x30, 0x7
> -	ldr	x2, [x2, #-16]				// op
> -
> +	mov	x2, x11					// op
>  	ldr	x4, [x2, #FTRACE_OPS_FUNC]		// op->func
>  	blr	x4					// op->func(ip, parent_ip, op, regs)
>  
> @@ -110,12 +126,24 @@ SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
>  	/* Restore the callsite's FP, LR, PC */
>  	ldr	x29, [sp, #FREGS_FP]
>  	ldr	x30, [sp, #FREGS_LR]
> -	ldr	x9,  [sp, #FREGS_PC]
> +	ldr	x10, [sp, #FREGS_PC]
> +
> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
> +	ldr	x11, [sp, #FREGS_CUSTOM_TRAMP]
> +	cbz	x11, 1f
> +	/* Set x9 to parent ip before jump to custom trampoline */
> +	mov	x9,  x30
> +	/* Set lr to self ip */
> +	ldr	x30, [sp, #FREGS_PC]
> +	/* Set x10 (used for return address) to custom trampoline */
> +	mov	x10, x11
> +1:
> +#endif
>  
>  	/* Restore the callsite's SP */
>  	add	sp, sp, #FREGS_SIZE + 32
>  
> -	ret	x9
> +	ret	x10
>  SYM_CODE_END(ftrace_caller)
>  
>  #if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) && \
> diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
> index 5545fe1a9012..758436727fba 100644
> --- a/arch/arm64/kernel/ftrace.c
> +++ b/arch/arm64/kernel/ftrace.c
> @@ -206,6 +206,13 @@ static struct plt_entry *get_ftrace_plt(struct module *mod, unsigned long addr)
>  	return NULL;
>  }
>  
> +static bool reachable_by_bl(unsigned long addr, unsigned long pc)
> +{
> +	long offset = (long)addr - (long)pc;
> +
> +	return offset >= -SZ_128M && offset < SZ_128M;
> +}
> +
>  /*
>   * Find the address the callsite must branch to in order to reach '*addr'.
>   *
> @@ -220,14 +227,21 @@ static bool ftrace_find_callable_addr(struct dyn_ftrace *rec,
>  				      unsigned long *addr)
>  {
>  	unsigned long pc = rec->ip;
> -	long offset = (long)*addr - (long)pc;
>  	struct plt_entry *plt;
>  
> +	/*
> +	 * If a custom trampoline is unreachable, rely on the ftrace_caller
> +	 * trampoline which knows how to indirectly reach that trampoline
> +	 * through ops->direct_call.
> +	 */
> +	if (*addr != FTRACE_ADDR && !reachable_by_bl(*addr, pc))
> +		*addr = FTRACE_ADDR;
> +

With this, the check in get_ftrace_plt() is now redundant, since that's always
called with FTRACE_ADDR as the 'addr' argument. We could probably clean that
up, but I'm happy to leave that for now and handle that as a follow-up, since
I think that'll imply making some other structural changes, which qould obscure
the bulk of this patch.

>  	/*
>  	 * When the target is within range of the 'BL' instruction, use 'addr'
>  	 * as-is and branch to that directly.
>  	 */
> -	if (offset >= -SZ_128M && offset < SZ_128M)
> +	if (reachable_by_bl(*addr, pc))
>  		return true;
>  
>  	/*
> @@ -330,12 +344,24 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
>  int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
>  		       unsigned long addr)
>  {
> -	if (WARN_ON_ONCE(old_addr != (unsigned long)ftrace_caller))
> +	unsigned long pc = rec->ip;
> +	u32 old, new;
> +	int ret;
> +
> +	ret = ftrace_rec_set_ops(rec, arm64_rec_get_ops(rec));
> +	if (ret)
> +		return ret;
> +
> +	if (!ftrace_find_callable_addr(rec, NULL, &old_addr))
>  		return -EINVAL;
> -	if (WARN_ON_ONCE(addr != (unsigned long)ftrace_caller))
> +	if (!ftrace_find_callable_addr(rec, NULL, &addr))
>  		return -EINVAL;
>  
> -	return ftrace_rec_update_ops(rec);
> +	old = aarch64_insn_gen_branch_imm(pc, old_addr,
> +					  AARCH64_INSN_BRANCH_LINK);
> +	new = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK);
> +
> +	return ftrace_modify_code(pc, old, new, true);
>  }
>  #endif

Aside from the above, this looks good to me.

Thanks,
Mark.
  
Florent Revest Feb. 6, 2023, 4:25 p.m. UTC | #2
On Fri, Feb 3, 2023 at 4:34 PM Mark Rutland <mark.rutland@arm.com> wrote:
>
> On Wed, Feb 01, 2023 at 05:34:19PM +0100, Florent Revest wrote:
> > This builds up on the CALL_OPS work which extends the ftrace patchsite
> > on arm64 with an ops pointer usable by the ftrace trampoline.
> >
> > This ops pointer is valid at all time. Indeed, it is either pointing to
> > ftrace_list_ops or to the single ops which should be called from that
> > patchsite.
> >
> > There are a few cases to distinguish:
> > - If a direct call ops is the only one tracing a function:
> >   - If the direct called trampoline is within the reach of a BL
> >     instruction
> >      -> the ftrace patchsite jumps to the trampoline
> >   - Else
> >      -> the ftrace patchsite jumps to the ftrace_caller trampoline which
> >         reads the ops pointer in the patchsite and jumps to the direct
> >         call address stored in the ops
> > - Else
> >   -> the ftrace patchsite jumps to the ftrace_caller trampoline and its
> >      ops literal points to ftrace_list_ops so it iterates over all
> >      registered ftrace ops, including the direct call ops and calls its
> >      call_direct_funcs handler which stores the direct called
> >      trampoline's address in the ftrace_regs and the ftrace_caller
> >      trampoline will return to that address instead of returning to the
> >      traced function
>
> This looks pretty good!
>
> Overall I think this is the right shape, but I have a few minor comments that
> lead to a bit of churn. I've noted those below, and I've also pushed out a
> branch with suggested fixups (as discrete patches) to my
> arm64/ftrace/direct-call-testing branch, which you can find at:
>
>   https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git
>   git://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git
>
> Note that's based on a merge of the arm64 tree's for-next/ftrace branch and the
> trace tree's trace/for-next branch, and there were a couple of trivial
> conflicts I had to fix up when first picking this series (which I've noted in
> the affected patches)
>
> Those trees are at:
>
>   # arm64
>   https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git
>   git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git
>
>   # trace
>   https://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git
>   git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git
>

Thanks for taking the time to publish these, that helps a lot.

> > @@ -80,6 +80,10 @@ struct ftrace_regs {
> >
> >       unsigned long sp;
> >       unsigned long pc;
> > +
> > +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
> > +     unsigned long custom_tramp;
>
> Minor nit, but could we please s/custom_tramp/direct_tramp/?

Yes, that's better

> I forgot to add a comment here, but we require that sizeof(struct ftrace_regs)
> is a multiple of 16, as the AAPCS requires that the stack is 16-byte aligned at
> function call boundaries (and on arm64 we currently assume that it's *always*
> aligned).
>
> I'd suggest we make this:
>
> | /*
> |  * Note: sizeof(struct ftrace_regs) must be a multiple of 16 to ensure correct
> |  * stack alignment
> |  */
> | struct ftrace_regs {
> |       /* x0 - x8 */
> |       unsigned long regs[9];
> |
> | #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
> |       unsigned long direct_tramp;
> | #else
> |       unsigned long __unused;
> | #endif
> |
> |       unsigned long fp;
> |       unsigned long lr;
> |
> |       unsigned long sp;
> |       unsigned long pc;
> | };

Oh good catch, that was easy to miss. I'm surprised I didn't hit
horrible bugs when testing this in qemu.

> For branching to the direct call trampoline, it would be better to use a
> forward branch (i.e. BR), as that won't unbalance return stack predictors.
> That'll need to use x16 or x17 to be compatible with a `BTI C` landing pad.
>
> I'd also prefer if we could move the direct call invocation to a stub at the
> end of ftrace_caller, e.g.
>
> | SYM_CODE_START(ftrace_caller)
> |       ... discover ops pointer here ...
> |
> |       ldr     w17, [x11, #FTRACE_OPS_DIRECT_CALL]
> |       cbnz    ftrace_caller_direct
> |
> |       ... usual save and handling logic here ...
> |
> |       // just prior to return
> |       ldr     x17, [sp, #FREGS_DIRECT_CALL]
> |       cbnz    ftrace_caller_direct_late
> |
> |       ...  usual restore logic here ...
> |
> |       ret     x9
> |
> | SYM_INNER_LABEL(ftrace_caller_direct_late, SYM_L_LOCAL)
> |
> |       ... shuffle registers for the trampoline here ...
> |
> |       // fallthrough to the usual invocation
> | SYM_INNER_LABEL(ftrace_caller_direct, SYM_L_LOCAL)
> |       br      x17
> | SYM_CODE_END(ftrace_caller)

Agreed, it makes things a lot easier to read and your branch makes
restoring LR and PC more sane too. I squashed your change in and will
send it as part of v2.

> > @@ -220,14 +227,21 @@ static bool ftrace_find_callable_addr(struct dyn_ftrace *rec,
> >                                     unsigned long *addr)
> >  {
> >       unsigned long pc = rec->ip;
> > -     long offset = (long)*addr - (long)pc;
> >       struct plt_entry *plt;
> >
> > +     /*
> > +      * If a custom trampoline is unreachable, rely on the ftrace_caller
> > +      * trampoline which knows how to indirectly reach that trampoline
> > +      * through ops->direct_call.
> > +      */
> > +     if (*addr != FTRACE_ADDR && !reachable_by_bl(*addr, pc))
> > +             *addr = FTRACE_ADDR;
> > +
>
> With this, the check in get_ftrace_plt() is now redundant, since that's always
> called with FTRACE_ADDR as the 'addr' argument. We could probably clean that
> up, but I'm happy to leave that for now and handle that as a follow-up, since
> I think that'll imply making some other structural changes, which qould obscure
> the bulk of this patch.

Fair enough, I'll add an extra patch for this in v2. It's fairly
simple and if other maintainers think it's too much for the scope of
the series, we can always pick it up later and merge it separately.
  

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 6f6f37161cf6..7deafd653c42 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -188,6 +188,8 @@  config ARM64
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_DYNAMIC_FTRACE_WITH_ARGS \
 		if $(cc-option,-fpatchable-function-entry=2)
+	select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS \
+		if DYNAMIC_FTRACE_WITH_ARGS && DYNAMIC_FTRACE_WITH_CALL_OPS
 	select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS \
 		if (DYNAMIC_FTRACE_WITH_ARGS && !CFI_CLANG)
 	select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY \
diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h
index cf6d9c42ff36..9fb95966b6d5 100644
--- a/arch/arm64/include/asm/ftrace.h
+++ b/arch/arm64/include/asm/ftrace.h
@@ -80,6 +80,10 @@  struct ftrace_regs {
 
 	unsigned long sp;
 	unsigned long pc;
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+	unsigned long custom_tramp;
+#endif
 };
 
 static __always_inline unsigned long
@@ -136,6 +140,19 @@  int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec);
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
 		       struct ftrace_ops *op, struct ftrace_regs *fregs);
 #define ftrace_graph_func ftrace_graph_func
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs,
+						 unsigned long addr)
+{
+	/*
+	 * The ftrace trampoline will return to this address instead of the
+	 * instrumented function.
+	 */
+	fregs->custom_tramp = addr;
+}
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
 #endif
 
 #define ftrace_return_address(n) return_address(n)
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index ae345b06e9f7..c67f0373b88c 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -93,6 +93,9 @@  int main(void)
   DEFINE(FREGS_LR,		offsetof(struct ftrace_regs, lr));
   DEFINE(FREGS_SP,		offsetof(struct ftrace_regs, sp));
   DEFINE(FREGS_PC,		offsetof(struct ftrace_regs, pc));
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+  DEFINE(FREGS_CUSTOM_TRAMP,	offsetof(struct ftrace_regs, custom_tramp));
+#endif
   DEFINE(FREGS_SIZE,		sizeof(struct ftrace_regs));
   BLANK();
 #endif
@@ -197,6 +200,9 @@  int main(void)
 #endif
 #ifdef CONFIG_FUNCTION_TRACER
   DEFINE(FTRACE_OPS_FUNC,		offsetof(struct ftrace_ops, func));
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+  DEFINE(FTRACE_OPS_DIRECT_CALL,	offsetof(struct ftrace_ops, direct_call));
+#endif
 #endif
   return 0;
 }
diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S
index 9869debd22fb..0576f38e6362 100644
--- a/arch/arm64/kernel/entry-ftrace.S
+++ b/arch/arm64/kernel/entry-ftrace.S
@@ -36,6 +36,30 @@ 
 SYM_CODE_START(ftrace_caller)
 	bti	c
 
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
+	/*
+	 * The literal pointer to the ops is at an 8-byte aligned boundary
+	 * which is either 12 or 16 bytes before the BL instruction in the call
+	 * site. See ftrace_call_adjust() for details.
+	 *
+	 * Therefore here the LR points at `literal + 16` or `literal + 20`,
+	 * and we can find the address of the literal in either case by
+	 * aligning to an 8-byte boundary and subtracting 16. We do the
+	 * alignment first as this allows us to fold the subtraction into the
+	 * LDR.
+	 */
+	bic	x11, x30, 0x7
+	ldr	x11, [x11, #-(4 * AARCH64_INSN_SIZE)]		// op
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+	/* If the op has a direct call address, return to it */
+	ldr	x12, [x11, #FTRACE_OPS_DIRECT_CALL]		// op->direct_call
+	cbz	x12, 1f
+	ret	x12
+1:
+#endif
+#endif
+
 	/* Save original SP */
 	mov	x10, sp
 
@@ -57,6 +81,11 @@  SYM_CODE_START(ftrace_caller)
 	/* Save the PC after the ftrace callsite */
 	str	x30, [sp, #FREGS_PC]
 
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+	/* Set custom_tramp to zero  */
+	str	xzr, [sp, #FREGS_CUSTOM_TRAMP]
+#endif
+
 	/* Create a frame record for the callsite above the ftrace regs */
 	stp	x29, x9, [sp, #FREGS_SIZE + 16]
 	add	x29, sp, #FREGS_SIZE + 16
@@ -71,20 +100,7 @@  SYM_CODE_START(ftrace_caller)
 	mov	x3, sp					// regs
 
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
-	/*
-	 * The literal pointer to the ops is at an 8-byte aligned boundary
-	 * which is either 12 or 16 bytes before the BL instruction in the call
-	 * site. See ftrace_call_adjust() for details.
-	 *
-	 * Therefore here the LR points at `literal + 16` or `literal + 20`,
-	 * and we can find the address of the literal in either case by
-	 * aligning to an 8-byte boundary and subtracting 16. We do the
-	 * alignment first as this allows us to fold the subtraction into the
-	 * LDR.
-	 */
-	bic	x2, x30, 0x7
-	ldr	x2, [x2, #-16]				// op
-
+	mov	x2, x11					// op
 	ldr	x4, [x2, #FTRACE_OPS_FUNC]		// op->func
 	blr	x4					// op->func(ip, parent_ip, op, regs)
 
@@ -110,12 +126,24 @@  SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
 	/* Restore the callsite's FP, LR, PC */
 	ldr	x29, [sp, #FREGS_FP]
 	ldr	x30, [sp, #FREGS_LR]
-	ldr	x9,  [sp, #FREGS_PC]
+	ldr	x10, [sp, #FREGS_PC]
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+	ldr	x11, [sp, #FREGS_CUSTOM_TRAMP]
+	cbz	x11, 1f
+	/* Set x9 to parent ip before jump to custom trampoline */
+	mov	x9,  x30
+	/* Set lr to self ip */
+	ldr	x30, [sp, #FREGS_PC]
+	/* Set x10 (used for return address) to custom trampoline */
+	mov	x10, x11
+1:
+#endif
 
 	/* Restore the callsite's SP */
 	add	sp, sp, #FREGS_SIZE + 32
 
-	ret	x9
+	ret	x10
 SYM_CODE_END(ftrace_caller)
 
 #if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) && \
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
index 5545fe1a9012..758436727fba 100644
--- a/arch/arm64/kernel/ftrace.c
+++ b/arch/arm64/kernel/ftrace.c
@@ -206,6 +206,13 @@  static struct plt_entry *get_ftrace_plt(struct module *mod, unsigned long addr)
 	return NULL;
 }
 
+static bool reachable_by_bl(unsigned long addr, unsigned long pc)
+{
+	long offset = (long)addr - (long)pc;
+
+	return offset >= -SZ_128M && offset < SZ_128M;
+}
+
 /*
  * Find the address the callsite must branch to in order to reach '*addr'.
  *
@@ -220,14 +227,21 @@  static bool ftrace_find_callable_addr(struct dyn_ftrace *rec,
 				      unsigned long *addr)
 {
 	unsigned long pc = rec->ip;
-	long offset = (long)*addr - (long)pc;
 	struct plt_entry *plt;
 
+	/*
+	 * If a custom trampoline is unreachable, rely on the ftrace_caller
+	 * trampoline which knows how to indirectly reach that trampoline
+	 * through ops->direct_call.
+	 */
+	if (*addr != FTRACE_ADDR && !reachable_by_bl(*addr, pc))
+		*addr = FTRACE_ADDR;
+
 	/*
 	 * When the target is within range of the 'BL' instruction, use 'addr'
 	 * as-is and branch to that directly.
 	 */
-	if (offset >= -SZ_128M && offset < SZ_128M)
+	if (reachable_by_bl(*addr, pc))
 		return true;
 
 	/*
@@ -330,12 +344,24 @@  int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
 		       unsigned long addr)
 {
-	if (WARN_ON_ONCE(old_addr != (unsigned long)ftrace_caller))
+	unsigned long pc = rec->ip;
+	u32 old, new;
+	int ret;
+
+	ret = ftrace_rec_set_ops(rec, arm64_rec_get_ops(rec));
+	if (ret)
+		return ret;
+
+	if (!ftrace_find_callable_addr(rec, NULL, &old_addr))
 		return -EINVAL;
-	if (WARN_ON_ONCE(addr != (unsigned long)ftrace_caller))
+	if (!ftrace_find_callable_addr(rec, NULL, &addr))
 		return -EINVAL;
 
-	return ftrace_rec_update_ops(rec);
+	old = aarch64_insn_gen_branch_imm(pc, old_addr,
+					  AARCH64_INSN_BRANCH_LINK);
+	new = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK);
+
+	return ftrace_modify_code(pc, old, new, true);
 }
 #endif