[v2,2/2] x86: Rewrite ret_from_fork() in C

Message ID 20230623225529.34590-3-brgerst@gmail.com
State New
Headers
Series Rewrite ret_from_fork() in C |

Commit Message

Brian Gerst June 23, 2023, 10:55 p.m. UTC
  When kCFI is enabled, special handling is needed for the indirect call
to the kernel thread function.  Rewrite the ret_from_fork() function in
C so that the compiler can properly handle the indirect call.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Brian Gerst <brgerst@gmail.com>
---
 arch/x86/entry/entry_32.S        | 30 ++++++++---------------------
 arch/x86/entry/entry_64.S        | 33 ++++++++------------------------
 arch/x86/include/asm/switch_to.h |  4 +++-
 arch/x86/kernel/process.c        | 22 ++++++++++++++++++++-
 4 files changed, 40 insertions(+), 49 deletions(-)
  

Comments

Petr Mladek July 19, 2023, 3:21 p.m. UTC | #1
On Fri 2023-06-23 18:55:29, Brian Gerst wrote:
> When kCFI is enabled, special handling is needed for the indirect call
> to the kernel thread function.  Rewrite the ret_from_fork() function in
> C so that the compiler can properly handle the indirect call.

This patch broke livepatching. Kthreads never have a reliable stack.
It works when I revert it.

See also below.

> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -284,36 +284,19 @@ SYM_FUNC_END(__switch_to_asm)
>   * r12: kernel thread arg
>   */
>  .pushsection .text, "ax"
> -	__FUNC_ALIGN
> -SYM_CODE_START_NOALIGN(ret_from_fork)
> -	UNWIND_HINT_END_OF_STACK
> +SYM_CODE_START(ret_from_fork_asm)
> +	UNWIND_HINT_REGS
>  	ANNOTATE_NOENDBR // copy_thread
>  	CALL_DEPTH_ACCOUNT
> -	movq	%rax, %rdi
> -	call	schedule_tail			/* rdi: 'prev' task parameter */
>  
> -	testq	%rbx, %rbx			/* from kernel_thread? */
> -	jnz	1f				/* kernel threads are uncommon */
> +	movq	%rax, %rdi		/* prev */
> +	movq	%rsp, %rsi		/* regs */
> +	movq	%rbx, %rdx		/* fn */
> +	movq	%r12, %rcx		/* fn_arg */
> +	call	ret_from_fork
>  
> -2:
> -	UNWIND_HINT_REGS
> -	movq	%rsp, %rdi
> -	call	syscall_exit_to_user_mode	/* returns with IRQs disabled */
>  	jmp	swapgs_restore_regs_and_return_to_usermode
> -
> -1:
> -	/* kernel thread */
> -	UNWIND_HINT_END_OF_STACK

I think that it might be related to removal of this line.
The following instructions are going to call fn(fn_arg).
See below.

> -	movq	%r12, %rdi
> -	CALL_NOSPEC rbx
> -	/*
> -	 * A kernel thread is allowed to return here after successfully
> -	 * calling kernel_execve().  Exit to userspace to complete the execve()
> -	 * syscall.
> -	 */
> -	movq	$0, RAX(%rsp)
> -	jmp	2b
> -SYM_CODE_END(ret_from_fork)
> +SYM_CODE_END(ret_from_fork_asm)
>  .popsection
>  
>  .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
> diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
> index 5c91305d09d2..f42dbf17f52b 100644
> --- a/arch/x86/include/asm/switch_to.h
> +++ b/arch/x86/include/asm/switch_to.h
> @@ -12,7 +12,9 @@ struct task_struct *__switch_to_asm(struct task_struct *prev,
>  __visible struct task_struct *__switch_to(struct task_struct *prev,
>  					  struct task_struct *next);
>  
> -asmlinkage void ret_from_fork(void);
> +asmlinkage void ret_from_fork_asm(void);
> +__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
> +			     int (*fn)(void *), void *fn_arg);
>  
>  /*
>   * This is the structure pointed to by thread.sp for an inactive task.  The
> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
> index cc7a642f8c9d..001e6dad9a48 100644
> --- a/arch/x86/kernel/process.c
> +++ b/arch/x86/kernel/process.c
> @@ -136,6 +137,25 @@ static int set_new_tls(struct task_struct *p, unsigned long tls)
>  		return do_set_thread_area_64(p, ARCH_SET_FS, tls);
>  }
>  
> +__visible noinstr void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
> +				     int (*fn)(void *), void *fn_arg)
> +{
> +	schedule_tail(prev);
> +
> +	/* Is this a kernel thread? */
> +	if (unlikely(fn)) {
> +		fn(fn_arg);

This is the related code but it does not include the annotation
about the end of the stack.

Honestly, I am not familiar with the stack unwinder and how this is
supposed to work.

I hope that Josh or anyone else might know better.

> +		/*
> +		 * A kernel thread is allowed to return here after successfully
> +		 * calling kernel_execve().  Exit to userspace to complete the
> +		 * execve() syscall.
> +		 */
> +		regs->ax = 0;
> +	}
> +
> +	syscall_exit_to_user_mode(regs);
> +}
> +
>  int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
>  {
>  	unsigned long clone_flags = args->flags;

Best Regards,
Petr
  
Peter Zijlstra July 19, 2023, 8:02 p.m. UTC | #2
On Wed, Jul 19, 2023 at 05:21:11PM +0200, Petr Mladek wrote:

> This patch broke livepatching. Kthreads never have a reliable stack.
> It works when I revert it.

> > +SYM_CODE_START(ret_from_fork_asm)
> > +	UNWIND_HINT_REGS

It works again when I change the above hint to UNWIND_HINT_END_OF_STACK,
so yeah. Doing this makes objtool unhappy with something else though,
so I'll go prod at things with something sharp...

Thanks!

> >  	ANNOTATE_NOENDBR // copy_thread
> >  	CALL_DEPTH_ACCOUNT
> >  
> > +	movq	%rax, %rdi		/* prev */
> > +	movq	%rsp, %rsi		/* regs */
> > +	movq	%rbx, %rdx		/* fn */
> > +	movq	%r12, %rcx		/* fn_arg */
> > +	call	ret_from_fork
> >  
> > +SYM_CODE_END(ret_from_fork_asm)
  
Peter Zijlstra July 19, 2023, 8:15 p.m. UTC | #3
On Wed, Jul 19, 2023 at 10:02:22PM +0200, Peter Zijlstra wrote:
> On Wed, Jul 19, 2023 at 05:21:11PM +0200, Petr Mladek wrote:
> 
> > This patch broke livepatching. Kthreads never have a reliable stack.
> > It works when I revert it.
> 
> > > +SYM_CODE_START(ret_from_fork_asm)
> > > +	UNWIND_HINT_REGS
> 
> It works again when I change the above hint to UNWIND_HINT_END_OF_STACK,
> so yeah. Doing this makes objtool unhappy with something else though,
> so I'll go prod at things with something sharp...


The below cures things; Josh, did I miss anything?

---
 arch/x86/entry/entry_64.S | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 91f6818884fa..cfe7882ea9ae 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -285,7 +285,14 @@ SYM_FUNC_END(__switch_to_asm)
  */
 .pushsection .text, "ax"
 SYM_CODE_START(ret_from_fork_asm)
-	UNWIND_HINT_REGS
+	/*
+	 * This is the start of the kernel stack; even though there's a regs
+	 * set at the top, there is no real exception frame and one cannot
+	 * unwind further. This is the end.
+	 *
+	 * This ensures stack unwinds of kernel threads hit a known good state.
+	 */
+	UNWIND_HINT_END_OF_STACK
 	ANNOTATE_NOENDBR // copy_thread
 	CALL_DEPTH_ACCOUNT
 
@@ -295,6 +302,11 @@ SYM_CODE_START(ret_from_fork_asm)
 	movq	%r12, %rcx		/* fn_arg */
 	call	ret_from_fork
 
+	/*
+	 * Set the stack state to what is expected for the target function
+	 * -- also it is not wrong.
+	 */
+	UNWIND_HINT_REGS
 	jmp	swapgs_restore_regs_and_return_to_usermode
 SYM_CODE_END(ret_from_fork_asm)
 .popsection
  
Joe Lawrence July 19, 2023, 8:33 p.m. UTC | #4
On 7/19/23 11:21, Petr Mladek wrote:
> On Fri 2023-06-23 18:55:29, Brian Gerst wrote:
>> When kCFI is enabled, special handling is needed for the indirect call
>> to the kernel thread function.  Rewrite the ret_from_fork() function in
>> C so that the compiler can properly handle the indirect call.
> 
> This patch broke livepatching. Kthreads never have a reliable stack.
> It works when I revert it.
> 

Just curious -- did the selftests catch this anywhere?  I'm not 100%
clear on what trees / frequency they all run, so maybe Petr you found
this by code inspection or other means?
  
Peter Zijlstra July 19, 2023, 8:41 p.m. UTC | #5
On Wed, Jul 19, 2023 at 04:33:26PM -0400, Joe Lawrence wrote:
> On 7/19/23 11:21, Petr Mladek wrote:
> > On Fri 2023-06-23 18:55:29, Brian Gerst wrote:
> >> When kCFI is enabled, special handling is needed for the indirect call
> >> to the kernel thread function.  Rewrite the ret_from_fork() function in
> >> C so that the compiler can properly handle the indirect call.
> > 
> > This patch broke livepatching. Kthreads never have a reliable stack.
> > It works when I revert it.
> > 
> 
> Just curious -- did the selftests catch this anywhere?  I'm not 100%
> clear on what trees / frequency they all run, so maybe Petr you found
> this by code inspection or other means?

I suspect Petr ran the selftests himself, they're fairly easy to run
(once you figure out the magic incantation) and insta fail.

I'm not sure the robots consistently run this stuff -- I've had these
patches exposed to 0day for weeks...
  
Peter Zijlstra July 19, 2023, 8:50 p.m. UTC | #6
On Wed, Jul 19, 2023 at 10:15:38PM +0200, Peter Zijlstra wrote:
> On Wed, Jul 19, 2023 at 10:02:22PM +0200, Peter Zijlstra wrote:
> > On Wed, Jul 19, 2023 at 05:21:11PM +0200, Petr Mladek wrote:
> > 
> > > This patch broke livepatching. Kthreads never have a reliable stack.
> > > It works when I revert it.
> > 
> > > > +SYM_CODE_START(ret_from_fork_asm)
> > > > +	UNWIND_HINT_REGS
> > 
> > It works again when I change the above hint to UNWIND_HINT_END_OF_STACK,
> > so yeah. Doing this makes objtool unhappy with something else though,
> > so I'll go prod at things with something sharp...
> 
> 
> The below cures things; Josh, did I miss anything?
> 
> ---
>  arch/x86/entry/entry_64.S | 14 +++++++++++++-
>  1 file changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index 91f6818884fa..cfe7882ea9ae 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -285,7 +285,14 @@ SYM_FUNC_END(__switch_to_asm)
>   */
>  .pushsection .text, "ax"
>  SYM_CODE_START(ret_from_fork_asm)
> -	UNWIND_HINT_REGS
> +	/*
> +	 * This is the start of the kernel stack; even through there's a regs
> +	 * set at the top, there is no real exception frame and one cannot
> +	 * unwind further. This is the end.
> +	 *
> +	 * This ensures stack unwinds of kernel threads hit a known good state.
> +	 */
> +	UNWIND_HINT_END_OF_STACK

So unwind_orc.c:unwind_next_frame() will terminate on this hint *or* on
user_mode(state->regs).

AFAICT the way things are set up in copy_thread(), user_mode() will not be
true -- after all there is no usermode, the kthread would first have to
exec() something to create a usermode.

Yet I'm wondering if perhaps we should spoof the regs to make
user_mode() true and auto-terminate without this explicit hint.

Josh, do you remember the rationale for all this?
  
Josh Poimboeuf July 19, 2023, 11:31 p.m. UTC | #7
On Wed, Jul 19, 2023 at 10:50:50PM +0200, Peter Zijlstra wrote:
> > The below cures things; Josh, did I miss anything?
> > 
> > ---
> >  arch/x86/entry/entry_64.S | 14 +++++++++++++-
> >  1 file changed, 13 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> > index 91f6818884fa..cfe7882ea9ae 100644
> > --- a/arch/x86/entry/entry_64.S
> > +++ b/arch/x86/entry/entry_64.S
> > @@ -285,7 +285,14 @@ SYM_FUNC_END(__switch_to_asm)
> >   */
> >  .pushsection .text, "ax"
> >  SYM_CODE_START(ret_from_fork_asm)
> > -	UNWIND_HINT_REGS
> > +	/*
> > +	 * This is the start of the kernel stack; even through there's a regs
> > +	 * set at the top, there is no real exception frame and one cannot
> > +	 * unwind further. This is the end.
> > +	 *
> > +	 * This ensures stack unwinds of kernel threads hit a known good state.
> > +	 */
> > +	UNWIND_HINT_END_OF_STACK

The comments may be a bit superfluous (to me at least) but the patch
looks fine.

> So unwind_orc.c:unwind_next_frame() will terminate on this hint *or* on
> user_mode(state->regs).
> 
> AFAICT way things are set up in copy_thread(), user_mode() will not be
> true -- after all there is no usermode, the kthread would first have to
> exec() something to create a usermode.
> 
> Yet I'm wondering if perhaps we should spoof the regs to make
> user_mode() true and auto-terminate without this explicit hint.

I'm not sure that would be worth the trouble / cleverness.  The hint is
straightforward IMO.

> Josh, do you remember the rationale for all this?

For what exactly :-)
  
Peter Zijlstra July 20, 2023, 5:22 a.m. UTC | #8
On Wed, Jul 19, 2023 at 04:31:11PM -0700, Josh Poimboeuf wrote:
> On Wed, Jul 19, 2023 at 10:50:50PM +0200, Peter Zijlstra wrote:
> > > The below cures things; Josh, did I miss anything?
> > > 
> > > ---
> > >  arch/x86/entry/entry_64.S | 14 +++++++++++++-
> > >  1 file changed, 13 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> > > index 91f6818884fa..cfe7882ea9ae 100644
> > > --- a/arch/x86/entry/entry_64.S
> > > +++ b/arch/x86/entry/entry_64.S
> > > @@ -285,7 +285,14 @@ SYM_FUNC_END(__switch_to_asm)
> > >   */
> > >  .pushsection .text, "ax"
> > >  SYM_CODE_START(ret_from_fork_asm)
> > > -	UNWIND_HINT_REGS
> > > +	/*
> > > +	 * This is the start of the kernel stack; even through there's a regs
> > > +	 * set at the top, there is no real exception frame and one cannot
> > > +	 * unwind further. This is the end.
> > > +	 *
> > > +	 * This ensures stack unwinds of kernel threads hit a known good state.
> > > +	 */
> > > +	UNWIND_HINT_END_OF_STACK
> 
> The comments may be a bit superfluous (to me at least) but the patch
> looks fine.

Right, well, it took me a minute to figure out how it was all supposed
to work, I figured I'd stick a comment on it.

The bit I missed is that if you reach the return-to-user part, you will
actually have user_mode() true on the regset.

> > So unwind_orc.c:unwind_next_frame() will terminate on this hint *or* on
> > user_mode(state->regs).
> > 
> > AFAICT way things are set up in copy_thread(), user_mode() will not be
> > true -- after all there is no usermode, the kthread would first have to
> > exec() something to create a usermode.
> > 
> > Yet I'm wondering if perhaps we should spoof the regs to make
> > user_mode() true and auto-terminate without this explicit hint.
> 
> I'm not sure that would be worth the trouble / cleverness.  The hint is
> straightforward IMO.

I tried, it doesn't work, clearly I missed something.
  
Petr Mladek July 20, 2023, 8:18 a.m. UTC | #9
On Wed 2023-07-19 22:15:38, Peter Zijlstra wrote:
> On Wed, Jul 19, 2023 at 10:02:22PM +0200, Peter Zijlstra wrote:
> > On Wed, Jul 19, 2023 at 05:21:11PM +0200, Petr Mladek wrote:
> > 
> > > This patch broke livepatching. Kthreads never have a reliable stack.
> > > It works when I revert it.
> > 
> > > > +SYM_CODE_START(ret_from_fork_asm)
> > > > +	UNWIND_HINT_REGS
> > 
> > It works again when I change the above hint to UNWIND_HINT_END_OF_STACK,
> > so yeah. Doing this makes objtool unhappy with something else though,
> > so I'll go prod at things with something sharp...
> 
> 
> The below cures things; Josh, did I miss anything?

I can confirm that it solved the problem. Feel free to use:

Tested-by: Petr Mladek <pmladek@suse.com>

Thanks a lot for the quick fix.

Best Regards,
Petr

> ---
>  arch/x86/entry/entry_64.S | 14 +++++++++++++-
>  1 file changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index 91f6818884fa..cfe7882ea9ae 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -285,7 +285,14 @@ SYM_FUNC_END(__switch_to_asm)
>   */
>  .pushsection .text, "ax"
>  SYM_CODE_START(ret_from_fork_asm)
> -	UNWIND_HINT_REGS
> +	/*
> +	 * This is the start of the kernel stack; even through there's a regs
> +	 * set at the top, there is no real exception frame and one cannot
> +	 * unwind further. This is the end.
> +	 *
> +	 * This ensures stack unwinds of kernel threads hit a known good state.
> +	 */
> +	UNWIND_HINT_END_OF_STACK
>  	ANNOTATE_NOENDBR // copy_thread
>  	CALL_DEPTH_ACCOUNT
>  
> @@ -295,6 +302,11 @@ SYM_CODE_START(ret_from_fork_asm)
>  	movq	%r12, %rcx		/* fn_arg */
>  	call	ret_from_fork
>  
> +	/*
> +	 * Set the stack state to what is expected for the target function
> +	 * -- also it is not wrong.
> +	 */
> +	UNWIND_HINT_REGS
>  	jmp	swapgs_restore_regs_and_return_to_usermode
>  SYM_CODE_END(ret_from_fork_asm)
>  .popsection
  
Peter Zijlstra July 20, 2023, 9:28 a.m. UTC | #10
On Thu, Jul 20, 2023 at 07:22:08AM +0200, Peter Zijlstra wrote:

> > I'm not sure that would be worth the trouble / cleverness.  The hint is
> > straightforward IMO.
> 
> I tried, it doesn't work, clearly I missed something.

FWIW, I tried the below. That should make user_mode() true for the
kernel thread regset, and while the kernel did boot, it still fails the
livepatch self-test.

The difference seems to be that END_OF_STACK terminates it right there,
while REGS thinks it's a valid frame and only terminates on user_mode()
when unwinding one more frame. The frame at REGS clearly isn't very
sane.


diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 72015dba72ab..45a400b16b80 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -232,6 +232,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 		 * It does the same kernel frame setup to return to a kernel
 		 * function that a kernel thread does.
 		 */
+		childregs->cs = 3;
 		childregs->sp = 0;
 		childregs->ip = 0;
 		kthread_frame_init(frame, args->fn, args->fn_arg);
  

Patch

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index e56123f03a79..6e6af42e044a 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -727,36 +727,22 @@  SYM_CODE_END(__switch_to_asm)
  * edi: kernel thread arg
  */
 .pushsection .text, "ax"
-SYM_CODE_START(ret_from_fork)
+SYM_CODE_START(ret_from_fork_asm)
+	movl	%esp, %edx	/* regs */
+
 	/* return address for the stack unwinder */
 	pushl	$.Lsyscall_32_done
 
 	FRAME_BEGIN
-	pushl	%eax
-	call	schedule_tail
+	/* prev already in EAX */
+	movl	%ebx, %ecx	/* fn */
+	pushl	%edi		/* fn_arg */
+	call	ret_from_fork
 	addl	$4, %esp
 	FRAME_END
 
-	testl	%ebx, %ebx
-	jnz	1f		/* kernel threads are uncommon */
-
-2:
-	/* When we fork, we trace the syscall return in the child, too. */
-	leal    4(%esp), %eax
-	call    syscall_exit_to_user_mode
 	RET
-
-	/* kernel thread */
-1:	movl	%edi, %eax
-	CALL_NOSPEC ebx
-	/*
-	 * A kernel thread is allowed to return here after successfully
-	 * calling kernel_execve().  Exit to userspace to complete the execve()
-	 * syscall.
-	 */
-	movl	$0, PT_EAX(%esp)
-	jmp	2b
-SYM_CODE_END(ret_from_fork)
+SYM_CODE_END(ret_from_fork_asm)
 .popsection
 
 SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index f31e286c2977..91f6818884fa 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -284,36 +284,19 @@  SYM_FUNC_END(__switch_to_asm)
  * r12: kernel thread arg
  */
 .pushsection .text, "ax"
-	__FUNC_ALIGN
-SYM_CODE_START_NOALIGN(ret_from_fork)
-	UNWIND_HINT_END_OF_STACK
+SYM_CODE_START(ret_from_fork_asm)
+	UNWIND_HINT_REGS
 	ANNOTATE_NOENDBR // copy_thread
 	CALL_DEPTH_ACCOUNT
-	movq	%rax, %rdi
-	call	schedule_tail			/* rdi: 'prev' task parameter */
 
-	testq	%rbx, %rbx			/* from kernel_thread? */
-	jnz	1f				/* kernel threads are uncommon */
+	movq	%rax, %rdi		/* prev */
+	movq	%rsp, %rsi		/* regs */
+	movq	%rbx, %rdx		/* fn */
+	movq	%r12, %rcx		/* fn_arg */
+	call	ret_from_fork
 
-2:
-	UNWIND_HINT_REGS
-	movq	%rsp, %rdi
-	call	syscall_exit_to_user_mode	/* returns with IRQs disabled */
 	jmp	swapgs_restore_regs_and_return_to_usermode
-
-1:
-	/* kernel thread */
-	UNWIND_HINT_END_OF_STACK
-	movq	%r12, %rdi
-	CALL_NOSPEC rbx
-	/*
-	 * A kernel thread is allowed to return here after successfully
-	 * calling kernel_execve().  Exit to userspace to complete the execve()
-	 * syscall.
-	 */
-	movq	$0, RAX(%rsp)
-	jmp	2b
-SYM_CODE_END(ret_from_fork)
+SYM_CODE_END(ret_from_fork_asm)
 .popsection
 
 .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 5c91305d09d2..f42dbf17f52b 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -12,7 +12,9 @@  struct task_struct *__switch_to_asm(struct task_struct *prev,
 __visible struct task_struct *__switch_to(struct task_struct *prev,
 					  struct task_struct *next);
 
-asmlinkage void ret_from_fork(void);
+asmlinkage void ret_from_fork_asm(void);
+__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
+			     int (*fn)(void *), void *fn_arg);
 
 /*
  * This is the structure pointed to by thread.sp for an inactive task.  The
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index cc7a642f8c9d..001e6dad9a48 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@ 
 #include <linux/static_call.h>
 #include <trace/events/power.h>
 #include <linux/hw_breakpoint.h>
+#include <linux/entry-common.h>
 #include <asm/cpu.h>
 #include <asm/apic.h>
 #include <linux/uaccess.h>
@@ -136,6 +137,25 @@  static int set_new_tls(struct task_struct *p, unsigned long tls)
 		return do_set_thread_area_64(p, ARCH_SET_FS, tls);
 }
 
+__visible noinstr void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
+				     int (*fn)(void *), void *fn_arg)
+{
+	schedule_tail(prev);
+
+	/* Is this a kernel thread? */
+	if (unlikely(fn)) {
+		fn(fn_arg);
+		/*
+		 * A kernel thread is allowed to return here after successfully
+		 * calling kernel_execve().  Exit to userspace to complete the
+		 * execve() syscall.
+		 */
+		regs->ax = 0;
+	}
+
+	syscall_exit_to_user_mode(regs);
+}
+
 int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 {
 	unsigned long clone_flags = args->flags;
@@ -152,7 +172,7 @@  int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 	frame = &fork_frame->frame;
 
 	frame->bp = encode_frame_pointer(childregs);
-	frame->ret_addr = (unsigned long) ret_from_fork;
+	frame->ret_addr = (unsigned long) ret_from_fork_asm;
 	p->thread.sp = (unsigned long) fork_frame;
 	p->thread.io_bitmap = NULL;
 	p->thread.iopl_warn = 0;