[2/2] x86: Rewrite ret_from_fork() in C

Message ID 20230622120750.5549-3-brgerst@gmail.com
State New
Headers
Series x86: Rewrite ret_from_fork() in C

Commit Message

Brian Gerst June 22, 2023, 12:07 p.m. UTC
  When kCFI is enabled, special handling is needed for the indirect call
to the kernel thread function.  Rewrite the ret_from_fork() function in
C so that the compiler can properly handle the indirect call.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Brian Gerst <brgerst@gmail.com>
---
 arch/x86/entry/entry_32.S        | 30 +++++++--------------------
 arch/x86/entry/entry_64.S        | 35 +++++++++-----------------------
 arch/x86/include/asm/switch_to.h |  4 +++-
 arch/x86/kernel/process.c        | 22 +++++++++++++++++++-
 4 files changed, 41 insertions(+), 50 deletions(-)
  

Comments

Peter Zijlstra June 22, 2023, 1:29 p.m. UTC | #1
On Thu, Jun 22, 2023 at 08:07:50AM -0400, Brian Gerst wrote:
> When kCFI is enabled, special handling is needed for the indirect call
> to the kernel thread function.  Rewrite the ret_from_fork() function in
> C so that the compiler can properly handle the indirect call.
> 
> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Brian Gerst <brgerst@gmail.com>

This is much nicer indeed. I'll take these patches into my series and
repost later today if you don't mind.

One little niggle below..

> ---

> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index f31e286c2977..5ee32e7e29e8 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -284,36 +284,21 @@ SYM_FUNC_END(__switch_to_asm)
>   * r12: kernel thread arg
>   */
>  .pushsection .text, "ax"
> +SYM_CODE_START(ret_from_fork_asm)
>  	UNWIND_HINT_END_OF_STACK
>  	ANNOTATE_NOENDBR // copy_thread
>  	CALL_DEPTH_ACCOUNT
>  
> +	/* return address for the stack unwinder */
> +	pushq	$swapgs_restore_regs_and_return_to_usermode
> +	UNWIND_HINT_FUNC
>  
> +	movq	%rax, %rdi		/* prev */
> +	movq	%rsp, %rsi		/* regs */
> +	movq	%rbx, %rdx		/* fn */
> +	movq	%r12, %rcx		/* fn_arg */
> +	jmp	ret_from_fork
> +SYM_CODE_END(ret_from_fork_asm)
>  .popsection
>  
>  .macro DEBUG_ENTRY_ASSERT_IRQS_OFF

> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
> index dac41a0072ea..f5dbfebac076 100644
> --- a/arch/x86/kernel/process.c
> +++ b/arch/x86/kernel/process.c
> @@ -28,6 +28,7 @@
>  #include <linux/static_call.h>
>  #include <trace/events/power.h>
>  #include <linux/hw_breakpoint.h>
> +#include <linux/entry-common.h>
>  #include <asm/cpu.h>
>  #include <asm/apic.h>
>  #include <linux/uaccess.h>
> @@ -134,6 +135,25 @@ static int set_new_tls(struct task_struct *p, unsigned long tls)
>  		return do_set_thread_area_64(p, ARCH_SET_FS, tls);
>  }
>  
> +__visible noinstr void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
> +				     int (*fn)(void *), void *fn_arg)

So I had noinstr in my initial patch, but it leads to objtool
complaints. I suppose we can actually handle tracing and all the other
gunk at this point, so I've removed it.

The alternative is to use __noinstr_section(".text") if we really want
to suppress all the funnies.

> +{
> +	schedule_tail(prev);
> +
> +	/* Is this a kernel thread? */
> +	if (unlikely(fn)) {
> +		fn(fn_arg);
> +		/*
> +		 * A kernel thread is allowed to return here after successfully
> +		 * calling kernel_execve().  Exit to userspace to complete the
> +		 * execve() syscall.
> +		 */
> +		regs->ax = 0;
> +	}
> +
> +	syscall_exit_to_user_mode(regs);
> +}
  
Brian Gerst June 22, 2023, 4:04 p.m. UTC | #2
On Thu, Jun 22, 2023 at 9:29 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Jun 22, 2023 at 08:07:50AM -0400, Brian Gerst wrote:
> > When kCFI is enabled, special handling is needed for the indirect call
> > to the kernel thread function.  Rewrite the ret_from_fork() function in
> > C so that the compiler can properly handle the indirect call.
> >
> > Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > Signed-off-by: Brian Gerst <brgerst@gmail.com>
>
> This is much nicer indeed. I'll take these patches into my series and
> repost later today if you don't mind.

Yes, that's fine.

> One little niggle below..
>
> > ---
>
> > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> > index f31e286c2977..5ee32e7e29e8 100644
> > --- a/arch/x86/entry/entry_64.S
> > +++ b/arch/x86/entry/entry_64.S
> > @@ -284,36 +284,21 @@ SYM_FUNC_END(__switch_to_asm)
> >   * r12: kernel thread arg
> >   */
> >  .pushsection .text, "ax"
> > +SYM_CODE_START(ret_from_fork_asm)
> >       UNWIND_HINT_END_OF_STACK
> >       ANNOTATE_NOENDBR // copy_thread
> >       CALL_DEPTH_ACCOUNT
> >
> > +     /* return address for the stack unwinder */
> > +     pushq   $swapgs_restore_regs_and_return_to_usermode
> > +     UNWIND_HINT_FUNC
> >
> > +     movq    %rax, %rdi              /* prev */
> > +     movq    %rsp, %rsi              /* regs */
> > +     movq    %rbx, %rdx              /* fn */
> > +     movq    %r12, %rcx              /* fn_arg */
> > +     jmp     ret_from_fork
> > +SYM_CODE_END(ret_from_fork_asm)
> >  .popsection
> >
> >  .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
>
> > diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
> > index dac41a0072ea..f5dbfebac076 100644
> > --- a/arch/x86/kernel/process.c
> > +++ b/arch/x86/kernel/process.c
> > @@ -28,6 +28,7 @@
> >  #include <linux/static_call.h>
> >  #include <trace/events/power.h>
> >  #include <linux/hw_breakpoint.h>
> > +#include <linux/entry-common.h>
> >  #include <asm/cpu.h>
> >  #include <asm/apic.h>
> >  #include <linux/uaccess.h>
> > @@ -134,6 +135,25 @@ static int set_new_tls(struct task_struct *p, unsigned long tls)
> >               return do_set_thread_area_64(p, ARCH_SET_FS, tls);
> >  }
> >
> > +__visible noinstr void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
> > +                                  int (*fn)(void *), void *fn_arg)
>
> So I had noinstr in my initial patch, but it leads to objtool
> complaints. I suppose we can actually handle tracing and all the other
> gunk at this point, so I've removed it.

I'm not an expert on noinstr usage, but looking at the other syscall
functions, instrumentation needs to be disabled before
syscall_exit_to_user_mode() is called.  Perhaps adding an
instrumentation_begin()/instrumentation_end() pair to this function is
needed?

Brian Gerst
  
H. Peter Anvin June 22, 2023, 4:33 p.m. UTC | #3
On June 22, 2023 9:04:03 AM PDT, Brian Gerst <brgerst@gmail.com> wrote:
>On Thu, Jun 22, 2023 at 9:29 AM Peter Zijlstra <peterz@infradead.org> wrote:
>>
>> On Thu, Jun 22, 2023 at 08:07:50AM -0400, Brian Gerst wrote:
>> > When kCFI is enabled, special handling is needed for the indirect call
>> > to the kernel thread function.  Rewrite the ret_from_fork() function in
>> > C so that the compiler can properly handle the indirect call.
>> >
>> > Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>> > Signed-off-by: Brian Gerst <brgerst@gmail.com>
>>
>> This is much nicer indeed. I'll take these patches into my series and
>> repost later today if you don't mind.
>
>Yes, that's fine.
>
>> One little niggle below..
>>
>> > ---
>>
>> > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
>> > index f31e286c2977..5ee32e7e29e8 100644
>> > --- a/arch/x86/entry/entry_64.S
>> > +++ b/arch/x86/entry/entry_64.S
>> > @@ -284,36 +284,21 @@ SYM_FUNC_END(__switch_to_asm)
>> >   * r12: kernel thread arg
>> >   */
>> >  .pushsection .text, "ax"
>> > +SYM_CODE_START(ret_from_fork_asm)
>> >       UNWIND_HINT_END_OF_STACK
>> >       ANNOTATE_NOENDBR // copy_thread
>> >       CALL_DEPTH_ACCOUNT
>> >
>> > +     /* return address for the stack unwinder */
>> > +     pushq   $swapgs_restore_regs_and_return_to_usermode
>> > +     UNWIND_HINT_FUNC
>> >
>> > +     movq    %rax, %rdi              /* prev */
>> > +     movq    %rsp, %rsi              /* regs */
>> > +     movq    %rbx, %rdx              /* fn */
>> > +     movq    %r12, %rcx              /* fn_arg */
>> > +     jmp     ret_from_fork
>> > +SYM_CODE_END(ret_from_fork_asm)
>> >  .popsection
>> >
>> >  .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
>>
>> > diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
>> > index dac41a0072ea..f5dbfebac076 100644
>> > --- a/arch/x86/kernel/process.c
>> > +++ b/arch/x86/kernel/process.c
>> > @@ -28,6 +28,7 @@
>> >  #include <linux/static_call.h>
>> >  #include <trace/events/power.h>
>> >  #include <linux/hw_breakpoint.h>
>> > +#include <linux/entry-common.h>
>> >  #include <asm/cpu.h>
>> >  #include <asm/apic.h>
>> >  #include <linux/uaccess.h>
>> > @@ -134,6 +135,25 @@ static int set_new_tls(struct task_struct *p, unsigned long tls)
>> >               return do_set_thread_area_64(p, ARCH_SET_FS, tls);
>> >  }
>> >
>> > +__visible noinstr void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
>> > +                                  int (*fn)(void *), void *fn_arg)
>>
>> So I had noinstr in my initial patch, but it leads to objtool
>> complaints. I suppose we can actually handle tracing and all the other
>> gunk at this point, so I've removed it.
>
>I'm not an expert on noinstr usage, but looking at the other syscall
>functions, instrumentation needs to be disabled before
>syscall_exit_to_user_mode() is called.  Perhaps adding an
>instrumentation_begin()/instrumentation_end() pair to this function is
>needed?
>
>Brian Gerst
>

I don't have the code in front of me right now, but how does this affect FRED enabling? In the case of FRED, the exit path is much simpler; in the FRED enabling patchset we simply deal with it by alternatives-patching the terminal jump (the one that follows the stack pointer reset) so that it targets the standard FRED user space exit stub (which simply pops the user space registers and executes ERETU).

I'm assuming this is still valid/possible after your patches, since resetting the stack pointer isn't possible in C, but I wanted to double check.
  
Brian Gerst June 22, 2023, 5:33 p.m. UTC | #4
On Thu, Jun 22, 2023 at 12:33 PM H. Peter Anvin <hpa@zytor.com> wrote:
>
> On June 22, 2023 9:04:03 AM PDT, Brian Gerst <brgerst@gmail.com> wrote:
> >On Thu, Jun 22, 2023 at 9:29 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >>
> >> On Thu, Jun 22, 2023 at 08:07:50AM -0400, Brian Gerst wrote:
> >> > When kCFI is enabled, special handling is needed for the indirect call
> >> > to the kernel thread function.  Rewrite the ret_from_fork() function in
> >> > C so that the compiler can properly handle the indirect call.
> >> >
> >> > Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> >> > Signed-off-by: Brian Gerst <brgerst@gmail.com>
> >>
> >> This is much nicer indeed. I'll take these patches into my series and
> >> repost later today if you don't mind.
> >
> >Yes, that's fine.
> >
> >> One little niggle below..
> >>
> >> > ---
> >>
> >> > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> >> > index f31e286c2977..5ee32e7e29e8 100644
> >> > --- a/arch/x86/entry/entry_64.S
> >> > +++ b/arch/x86/entry/entry_64.S
> >> > @@ -284,36 +284,21 @@ SYM_FUNC_END(__switch_to_asm)
> >> >   * r12: kernel thread arg
> >> >   */
> >> >  .pushsection .text, "ax"
> >> > +SYM_CODE_START(ret_from_fork_asm)
> >> >       UNWIND_HINT_END_OF_STACK
> >> >       ANNOTATE_NOENDBR // copy_thread
> >> >       CALL_DEPTH_ACCOUNT
> >> >
> >> > +     /* return address for the stack unwinder */
> >> > +     pushq   $swapgs_restore_regs_and_return_to_usermode
> >> > +     UNWIND_HINT_FUNC
> >> >
> >> > +     movq    %rax, %rdi              /* prev */
> >> > +     movq    %rsp, %rsi              /* regs */
> >> > +     movq    %rbx, %rdx              /* fn */
> >> > +     movq    %r12, %rcx              /* fn_arg */
> >> > +     jmp     ret_from_fork
> >> > +SYM_CODE_END(ret_from_fork_asm)
> >> >  .popsection
> >> >
> >> >  .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
> >>
> >> > diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
> >> > index dac41a0072ea..f5dbfebac076 100644
> >> > --- a/arch/x86/kernel/process.c
> >> > +++ b/arch/x86/kernel/process.c
> >> > @@ -28,6 +28,7 @@
> >> >  #include <linux/static_call.h>
> >> >  #include <trace/events/power.h>
> >> >  #include <linux/hw_breakpoint.h>
> >> > +#include <linux/entry-common.h>
> >> >  #include <asm/cpu.h>
> >> >  #include <asm/apic.h>
> >> >  #include <linux/uaccess.h>
> >> > @@ -134,6 +135,25 @@ static int set_new_tls(struct task_struct *p, unsigned long tls)
> >> >               return do_set_thread_area_64(p, ARCH_SET_FS, tls);
> >> >  }
> >> >
> >> > +__visible noinstr void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
> >> > +                                  int (*fn)(void *), void *fn_arg)
> >>
> >> So I had noinstr in my initial patch, but it leads to objtool
> >> complaints. I suppose we can actually handle tracing and all the other
> >> gunk at this point, so I've removed it.
> >
> >I'm not an expert on noinstr usage, but looking at the other syscall
> >functions, instrumentation needs to be disabled before
> >syscall_exit_to_user_mode() is called.  Perhaps adding an
> >instrumentation_begin()/instrumentation_end() pair to this function is
> >needed?
> >
> >Brian Gerst
> >
>
> I don't have the code in front of me right now, but how does this affect FRED enabling? In the case of FRED, the exit path is much simpler; in the FRED enabling patchset we simply deal with it by alternatives-patching the terminal jump after resetting the stack pointer to the standard FRED user space exit stub (which simply pops the user space registers and executes ERETU.)
>
> I'm assuming this is still valid/possible after your patches, since resetting the stack pointer isn't possible in C, but I wanted to double check.

That should just require changing the address pushed by "pushq
$swapgs_restore_regs_and_return_to_usermode" to some other address via
an alternative.

Brian Gerst
  
Brian Gerst June 23, 2023, 6:12 p.m. UTC | #5
On Thu, Jun 22, 2023 at 8:08 AM Brian Gerst <brgerst@gmail.com> wrote:
>
> When kCFI is enabled, special handling is needed for the indirect call
> to the kernel thread function.  Rewrite the ret_from_fork() function in
> C so that the compiler can properly handle the indirect call.
>
> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Brian Gerst <brgerst@gmail.com>
> ---
>  arch/x86/entry/entry_32.S        | 30 +++++++--------------------
>  arch/x86/entry/entry_64.S        | 35 +++++++++-----------------------
>  arch/x86/include/asm/switch_to.h |  4 +++-
>  arch/x86/kernel/process.c        | 22 +++++++++++++++++++-
>  4 files changed, 41 insertions(+), 50 deletions(-)
>
> diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
> index 6c1ee76adc11..7932c14199fb 100644
> --- a/arch/x86/entry/entry_32.S
> +++ b/arch/x86/entry/entry_32.S
> @@ -727,37 +727,21 @@ SYM_CODE_END(__switch_to_asm)
>   * edi: kernel thread arg
>   */
>  .pushsection .text, "ax"
> -SYM_CODE_START(ret_from_fork)
> +SYM_CODE_START(ret_from_fork_asm)
>         /* return address for the stack unwinder */
>         pushl   $.Lsyscall_32_done
>         FRAME_BEGIN
>
> -       pushl   %eax
> -       call    schedule_tail
> +       /* prev already in EAX */
> +       movl    %esp, %edx      /* regs */
> +       movl    %ebx, %ecx      /* fn */
> +       pushl   %edi            /* fn_arg */
> +       call    ret_from_fork
>         addl    $4, %esp
>
> -       testl   %ebx, %ebx
> -       jnz     1f              /* kernel threads are uncommon */
> -
> -2:
> -       /* When we fork, we trace the syscall return in the child, too. */
> -       movl    %esp, %eax
> -       call    syscall_exit_to_user_mode
> -
>         FRAME_END
>         RET
> -
> -       /* kernel thread */
> -1:     movl    %edi, %eax
> -       CALL_NOSPEC ebx
> -       /*
> -        * A kernel thread is allowed to return here after successfully
> -        * calling kernel_execve().  Exit to userspace to complete the execve()
> -        * syscall.
> -        */
> -       movl    $0, PT_EAX(%esp)
> -       jmp     2b
> -SYM_CODE_END(ret_from_fork)
> +SYM_CODE_END(ret_from_fork_asm)
>  .popsection
>
>  SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index f31e286c2977..5ee32e7e29e8 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -284,36 +284,21 @@ SYM_FUNC_END(__switch_to_asm)
>   * r12: kernel thread arg
>   */
>  .pushsection .text, "ax"
> -       __FUNC_ALIGN
> -SYM_CODE_START_NOALIGN(ret_from_fork)
> +SYM_CODE_START(ret_from_fork_asm)
>         UNWIND_HINT_END_OF_STACK
>         ANNOTATE_NOENDBR // copy_thread
>         CALL_DEPTH_ACCOUNT
> -       movq    %rax, %rdi
> -       call    schedule_tail                   /* rdi: 'prev' task parameter */
> -
> -       testq   %rbx, %rbx                      /* from kernel_thread? */
> -       jnz     1f                              /* kernel threads are uncommon */
>
> -2:
> -       UNWIND_HINT_REGS
> -       movq    %rsp, %rdi
> -       call    syscall_exit_to_user_mode       /* returns with IRQs disabled */
> -       jmp     swapgs_restore_regs_and_return_to_usermode
> +       /* return address for the stack unwinder */
> +       pushq   $swapgs_restore_regs_and_return_to_usermode
> +       UNWIND_HINT_FUNC
>
> -1:
> -       /* kernel thread */
> -       UNWIND_HINT_END_OF_STACK
> -       movq    %r12, %rdi
> -       CALL_NOSPEC rbx
> -       /*
> -        * A kernel thread is allowed to return here after successfully
> -        * calling kernel_execve().  Exit to userspace to complete the execve()
> -        * syscall.
> -        */
> -       movq    $0, RAX(%rsp)
> -       jmp     2b
> -SYM_CODE_END(ret_from_fork)
> +       movq    %rax, %rdi              /* prev */
> +       movq    %rsp, %rsi              /* regs */

The pushq above moves %rsp down by 8 bytes, so %rsp no longer points at pt_regs and this passes the wrong address for regs.  New version coming.

Brian Gerst
  

Patch

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 6c1ee76adc11..7932c14199fb 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -727,37 +727,21 @@  SYM_CODE_END(__switch_to_asm)
  * edi: kernel thread arg
  */
 .pushsection .text, "ax"
-SYM_CODE_START(ret_from_fork)
+SYM_CODE_START(ret_from_fork_asm)
 	/* return address for the stack unwinder */
 	pushl	$.Lsyscall_32_done
 	FRAME_BEGIN
 
-	pushl	%eax
-	call	schedule_tail
+	/* prev already in EAX */
+	movl	%esp, %edx	/* regs */
+	movl	%ebx, %ecx	/* fn */
+	pushl	%edi		/* fn_arg */
+	call	ret_from_fork
 	addl	$4, %esp
 
-	testl	%ebx, %ebx
-	jnz	1f		/* kernel threads are uncommon */
-
-2:
-	/* When we fork, we trace the syscall return in the child, too. */
-	movl    %esp, %eax
-	call    syscall_exit_to_user_mode
-
 	FRAME_END
 	RET
-
-	/* kernel thread */
-1:	movl	%edi, %eax
-	CALL_NOSPEC ebx
-	/*
-	 * A kernel thread is allowed to return here after successfully
-	 * calling kernel_execve().  Exit to userspace to complete the execve()
-	 * syscall.
-	 */
-	movl	$0, PT_EAX(%esp)
-	jmp	2b
-SYM_CODE_END(ret_from_fork)
+SYM_CODE_END(ret_from_fork_asm)
 .popsection
 
 SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index f31e286c2977..5ee32e7e29e8 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -284,36 +284,21 @@  SYM_FUNC_END(__switch_to_asm)
  * r12: kernel thread arg
  */
 .pushsection .text, "ax"
-	__FUNC_ALIGN
-SYM_CODE_START_NOALIGN(ret_from_fork)
+SYM_CODE_START(ret_from_fork_asm)
 	UNWIND_HINT_END_OF_STACK
 	ANNOTATE_NOENDBR // copy_thread
 	CALL_DEPTH_ACCOUNT
-	movq	%rax, %rdi
-	call	schedule_tail			/* rdi: 'prev' task parameter */
-
-	testq	%rbx, %rbx			/* from kernel_thread? */
-	jnz	1f				/* kernel threads are uncommon */
 
-2:
-	UNWIND_HINT_REGS
-	movq	%rsp, %rdi
-	call	syscall_exit_to_user_mode	/* returns with IRQs disabled */
-	jmp	swapgs_restore_regs_and_return_to_usermode
+	/* return address for the stack unwinder */
+	pushq	$swapgs_restore_regs_and_return_to_usermode
+	UNWIND_HINT_FUNC
 
-1:
-	/* kernel thread */
-	UNWIND_HINT_END_OF_STACK
-	movq	%r12, %rdi
-	CALL_NOSPEC rbx
-	/*
-	 * A kernel thread is allowed to return here after successfully
-	 * calling kernel_execve().  Exit to userspace to complete the execve()
-	 * syscall.
-	 */
-	movq	$0, RAX(%rsp)
-	jmp	2b
-SYM_CODE_END(ret_from_fork)
+	movq	%rax, %rdi		/* prev */
+	movq	%rsp, %rsi		/* regs */
+	movq	%rbx, %rdx		/* fn */
+	movq	%r12, %rcx		/* fn_arg */
+	jmp	ret_from_fork
+SYM_CODE_END(ret_from_fork_asm)
 .popsection
 
 .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 5c91305d09d2..f42dbf17f52b 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -12,7 +12,9 @@  struct task_struct *__switch_to_asm(struct task_struct *prev,
 __visible struct task_struct *__switch_to(struct task_struct *prev,
 					  struct task_struct *next);
 
-asmlinkage void ret_from_fork(void);
+asmlinkage void ret_from_fork_asm(void);
+__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
+			     int (*fn)(void *), void *fn_arg);
 
 /*
  * This is the structure pointed to by thread.sp for an inactive task.  The
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index dac41a0072ea..f5dbfebac076 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@ 
 #include <linux/static_call.h>
 #include <trace/events/power.h>
 #include <linux/hw_breakpoint.h>
+#include <linux/entry-common.h>
 #include <asm/cpu.h>
 #include <asm/apic.h>
 #include <linux/uaccess.h>
@@ -134,6 +135,25 @@  static int set_new_tls(struct task_struct *p, unsigned long tls)
 		return do_set_thread_area_64(p, ARCH_SET_FS, tls);
 }
 
+__visible noinstr void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
+				     int (*fn)(void *), void *fn_arg)
+{
+	schedule_tail(prev);
+
+	/* Is this a kernel thread? */
+	if (unlikely(fn)) {
+		fn(fn_arg);
+		/*
+		 * A kernel thread is allowed to return here after successfully
+		 * calling kernel_execve().  Exit to userspace to complete the
+		 * execve() syscall.
+		 */
+		regs->ax = 0;
+	}
+
+	syscall_exit_to_user_mode(regs);
+}
+
 int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 {
 	unsigned long clone_flags = args->flags;
@@ -149,7 +169,7 @@  int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 	frame = &fork_frame->frame;
 
 	frame->bp = encode_frame_pointer(childregs);
-	frame->ret_addr = (unsigned long) ret_from_fork;
+	frame->ret_addr = (unsigned long) ret_from_fork_asm;
 	p->thread.sp = (unsigned long) fork_frame;
 	p->thread.io_bitmap = NULL;
 	p->thread.iopl_warn = 0;