[7/8] KVM: SVM: move MSR_IA32_SPEC_CTRL save/restore to assembly

Message ID 20221107145436.276079-8-pbonzini@redhat.com
State New
Headers
Series KVM: SVM: fixes for vmentry code |

Commit Message

Paolo Bonzini Nov. 7, 2022, 2:54 p.m. UTC
  Restoration of the host IA32_SPEC_CTRL value is probably too late
with respect to the return thunk training sequence.

With respect to the user/kernel boundary, AMD says, "If software chooses
to toggle STIBP (e.g., set STIBP on kernel entry, and clear it on kernel
exit), software should set STIBP to 1 before executing the return thunk
training sequence." I assume the same requirements apply to the guest/host
boundary. The return thunk training sequence is in vmenter.S, quite close
to the VM-exit. On hosts without V_SPEC_CTRL, however, the host's
IA32_SPEC_CTRL value is not restored until much later.

To avoid this, move the restoration of host SPEC_CTRL to assembly and,
for consistency, move the restoration of the guest SPEC_CTRL as well.
This is not particularly difficult, apart from some care to cover both
32- and 64-bit, and to share code between SEV-ES and normal vmentry.

Cc: stable@vger.kernel.org
Fixes: a149180fbcf3 ("x86: Add magic AMD return-thunk")
Suggested-by: Jim Mattson <jmattson@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kernel/asm-offsets.c |  1 +
 arch/x86/kernel/cpu/bugs.c    | 13 ++---
 arch/x86/kvm/svm/svm.c        | 38 ++++++---------
 arch/x86/kvm/svm/svm.h        |  4 +-
 arch/x86/kvm/svm/vmenter.S    | 92 ++++++++++++++++++++++++++++++++++-
 5 files changed, 111 insertions(+), 37 deletions(-)
  

Comments

Jim Mattson Nov. 7, 2022, 6:45 p.m. UTC | #1
On Mon, Nov 7, 2022 at 6:54 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> Restoration of the host IA32_SPEC_CTRL value is probably too late
> with respect to the return thunk training sequence.
>
> With respect to the user/kernel boundary, AMD says, "If software chooses
> to toggle STIBP (e.g., set STIBP on kernel entry, and clear it on kernel
> exit), software should set STIBP to 1 before executing the return thunk
> training sequence." I assume the same requirements apply to the guest/host
> boundary. The return thunk training sequence is in vmenter.S, quite close
> to the VM-exit. On hosts without V_SPEC_CTRL, however, the host's
> IA32_SPEC_CTRL value is not restored until much later.
>
> To avoid this, move the restoration of host SPEC_CTRL to assembly and,
> for consistency, move the restoration of the guest SPEC_CTRL as well.
> This is not particularly difficult, apart from some care to cover both
> 32- and 64-bit, and to share code between SEV-ES and normal vmentry.
>
> Cc: stable@vger.kernel.org
> Fixes: a149180fbcf3 ("x86: Add magic AMD return-thunk")
> Suggested-by: Jim Mattson <jmattson@google.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/kernel/asm-offsets.c |  1 +
>  arch/x86/kernel/cpu/bugs.c    | 13 ++---
>  arch/x86/kvm/svm/svm.c        | 38 ++++++---------
>  arch/x86/kvm/svm/svm.h        |  4 +-
>  arch/x86/kvm/svm/vmenter.S    | 92 ++++++++++++++++++++++++++++++++++-
>  5 files changed, 111 insertions(+), 37 deletions(-)
>
> diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
> index 69d1fed51086..d0bd68af0a5a 100644
> --- a/arch/x86/kernel/asm-offsets.c
> +++ b/arch/x86/kernel/asm-offsets.c
> @@ -115,6 +115,7 @@ static void __used common(void)
>                 OFFSET(SVM_vcpu_arch_regs, vcpu_svm, vcpu.arch.regs);
>                 OFFSET(SVM_vmcb01, vcpu_svm, vmcb01);
>                 OFFSET(SVM_current_vmcb, vcpu_svm, current_vmcb);
> +               OFFSET(SVM_spec_ctrl, vcpu_svm, spec_ctrl);
>                 OFFSET(KVM_VMCB_pa, kvm_vmcb_info, pa);
>         }
>
> diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
> index da7c361f47e0..6ec0b7ce7453 100644
> --- a/arch/x86/kernel/cpu/bugs.c
> +++ b/arch/x86/kernel/cpu/bugs.c
> @@ -196,22 +196,15 @@ void __init check_bugs(void)
>  }
>
>  /*
> - * NOTE: This function is *only* called for SVM.  VMX spec_ctrl handling is
> - * done in vmenter.S.
> + * NOTE: This function is *only* called for SVM, since Intel uses
> + * MSR_IA32_SPEC_CTRL for SSBD.
>   */
>  void
>  x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
>  {
> -       u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current();
> +       u64 guestval, hostval;
>         struct thread_info *ti = current_thread_info();
>
> -       if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) {
> -               if (hostval != guestval) {
> -                       msrval = setguest ? guestval : hostval;
> -                       wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
> -               }
> -       }
> -
>         /*
>          * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update
>          * MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported.
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 381c7dcffe25..31aa158a2e10 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -731,6 +731,15 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
>         u32 offset;
>         u32 *msrpm;
>
> +       /*
> +        * For non-nested case:
> +        * If the L01 MSR bitmap does not intercept the MSR, then we need to
> +        * save it.
> +        *
> +        * For nested case:
> +        * If the L02 MSR bitmap does not intercept the MSR, then we need to
> +        * save it.
> +        */
>         msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
>                                       to_svm(vcpu)->msrpm;
>
> @@ -3912,18 +3921,19 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
>         return EXIT_FASTPATH_NONE;
>  }
>
> -static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
> +static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
>  {
>         struct vcpu_svm *svm = to_svm(vcpu);
>
>         guest_state_enter_irqoff();
>
>         if (sev_es_guest(vcpu->kvm)) {
> -               __svm_sev_es_vcpu_run(svm);
> +               __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
>         } else {
>                 struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
>
> -               __svm_vcpu_run(svm, __sme_page_pa(sd->save_area));
> +               __svm_vcpu_run(svm, __sme_page_pa(sd->save_area),
> +                              spec_ctrl_intercepted);
>         }
>
>         guest_state_exit_irqoff();
> @@ -3932,6 +3942,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
>  static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
>  {
>         struct vcpu_svm *svm = to_svm(vcpu);
> +       bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
>
>         trace_kvm_entry(vcpu);
>
> @@ -3990,26 +4001,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
>         if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
>                 x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
>
> -       svm_vcpu_enter_exit(vcpu);
> -
> -       /*
> -        * We do not use IBRS in the kernel. If this vCPU has used the
> -        * SPEC_CTRL MSR it may have left it on; save the value and
> -        * turn it off. This is much more efficient than blindly adding
> -        * it to the atomic save/restore list. Especially as the former
> -        * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
> -        *
> -        * For non-nested case:
> -        * If the L01 MSR bitmap does not intercept the MSR, then we need to
> -        * save it.
> -        *
> -        * For nested case:
> -        * If the L02 MSR bitmap does not intercept the MSR, then we need to
> -        * save it.
> -        */
> -       if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
> -           unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
> -               svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
> +       svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);
>
>         if (!sev_es_guest(vcpu->kvm))
>                 reload_tss(vcpu);
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 99410651f2a5..9d940d8736f0 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -483,7 +483,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm);
>
>  /* vmenter.S */
>
> -void __svm_sev_es_vcpu_run(struct vcpu_svm *svm);
> -void __svm_vcpu_run(struct vcpu_svm *svm, unsigned long hsave_pa);
> +void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
> +void __svm_vcpu_run(struct vcpu_svm *svm, unsigned long hsave_pa, bool spec_ctrl_intercepted);
>
>  #endif
> diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
> index 45a4bd002494..9e381386ffdc 100644
> --- a/arch/x86/kvm/svm/vmenter.S
> +++ b/arch/x86/kvm/svm/vmenter.S
> @@ -32,10 +32,64 @@
>
>  .section .noinstr.text, "ax"
>
> +.macro RESTORE_GUEST_SPEC_CTRL
> +       /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
> +       ALTERNATIVE_2 "jmp 999f", \
> +               "", X86_FEATURE_MSR_SPEC_CTRL, \
> +               "jmp 999f", X86_FEATURE_V_SPEC_CTRL
> +
> +       /*
> +        * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the
> +        * host's, write the MSR.
> +        *
> +        * IMPORTANT: To avoid RSB underflow attacks and any other nastiness,
> +        * there must not be any returns or indirect branches between this code
> +        * and vmentry.
> +        */
> +       movl SVM_spec_ctrl(%_ASM_DI), %eax
> +       cmp PER_CPU_VAR(x86_spec_ctrl_current), %eax
> +       je 999f
> +       mov $MSR_IA32_SPEC_CTRL, %ecx
> +       xor %edx, %edx
> +       wrmsr
> +999:
> +
> +.endm
> +
> +.macro RESTORE_HOST_SPEC_CTRL
> +       /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
> +       ALTERNATIVE_2 "jmp 999f", \
> +               "", X86_FEATURE_MSR_SPEC_CTRL, \
> +               "jmp 999f", X86_FEATURE_V_SPEC_CTRL
> +
> +       mov $MSR_IA32_SPEC_CTRL, %ecx
> +
> +       /*
> +        * Load the value that the guest had written into MSR_IA32_SPEC_CTRL,
> +        * if it was not intercepted during guest execution.
> +        */
> +       cmpb $0, (%_ASM_SP)
> +       jnz 998f
> +       rdmsr
> +       movl %eax, SVM_spec_ctrl(%_ASM_DI)
> +998:
> +
> +       /* Now restore the host value of the MSR if different from the guest's.  */
> +       movl PER_CPU_VAR(x86_spec_ctrl_current), %eax
> +       cmp SVM_spec_ctrl(%_ASM_DI), %eax
> +       je 999f
> +       xor %edx, %edx
> +       wrmsr
> +999:
> +
> +.endm
> +
> +

It seems unfortunate to have the unconditional branches in the more
common cases.
  
Paolo Bonzini Nov. 7, 2022, 7:08 p.m. UTC | #2
On 11/7/22 19:45, Jim Mattson wrote:
>> +.macro RESTORE_GUEST_SPEC_CTRL
>> +       /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
>> +       ALTERNATIVE_2 "jmp 999f", \
>> +               "", X86_FEATURE_MSR_SPEC_CTRL, \
>> +               "jmp 999f", X86_FEATURE_V_SPEC_CTRL
>> +
>> +       /*
>> +        * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the
>> +        * host's, write the MSR.
>> +        *
>> +        * IMPORTANT: To avoid RSB underflow attacks and any other nastiness,
>> +        * there must not be any returns or indirect branches between this code
>> +        * and vmentry.
>> +        */
>> +       movl SVM_spec_ctrl(%_ASM_DI), %eax
>> +       cmp PER_CPU_VAR(x86_spec_ctrl_current), %eax
>> +       je 999f
>> +       mov $MSR_IA32_SPEC_CTRL, %ecx
>> +       xor %edx, %edx
>> +       wrmsr
>> +999:
>> +
>> +.endm
>> +
>> +.macro RESTORE_HOST_SPEC_CTRL
>> +       /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
>> +       ALTERNATIVE_2 "jmp 999f", \
>> +               "", X86_FEATURE_MSR_SPEC_CTRL, \
>> +               "jmp 999f", X86_FEATURE_V_SPEC_CTRL
>> +
>> +       mov $MSR_IA32_SPEC_CTRL, %ecx
>> +
>> +       /*
>> +        * Load the value that the guest had written into MSR_IA32_SPEC_CTRL,
>> +        * if it was not intercepted during guest execution.
>> +        */
>> +       cmpb $0, (%_ASM_SP)
>> +       jnz 998f
>> +       rdmsr
>> +       movl %eax, SVM_spec_ctrl(%_ASM_DI)
>> +998:
>> +
>> +       /* Now restore the host value of the MSR if different from the guest's.  */
>> +       movl PER_CPU_VAR(x86_spec_ctrl_current), %eax
>> +       cmp SVM_spec_ctrl(%_ASM_DI), %eax
>> +       je 999f
>> +       xor %edx, %edx
>> +       wrmsr
>> +999:
>> +
>> +.endm
>> +
>> +
> 
> It seems unfortunate to have the unconditional branches in the more
> common cases.

One way to do it could be something like

.macro RESTORE_HOST_SPEC_CTRL
        ALTERNATIVE_2 "", \
                "jmp 900f", X86_FEATURE_MSR_SPEC_CTRL, \
                "", X86_FEATURE_V_SPEC_CTRL \
901:
.endm

.macro RESTORE_SPEC_CTRL_BODY \
800:
	/* restore guest spec ctrl ... */
	jmp 801b

900:
	/* save guest spec ctrl + restore host ... */
	jmp 901b
.endm

The cmp/je pair can also jump back to 801b/901b.

What do you think?  I'll check if objtool is happy and if so include it 
in v2.

Paolo
  
Jim Mattson Nov. 9, 2022, 9:23 p.m. UTC | #3
On Mon, Nov 7, 2022 at 11:08 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 11/7/22 19:45, Jim Mattson wrote:
> >> +.macro RESTORE_GUEST_SPEC_CTRL
> >> +       /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
> >> +       ALTERNATIVE_2 "jmp 999f", \
> >> +               "", X86_FEATURE_MSR_SPEC_CTRL, \
> >> +               "jmp 999f", X86_FEATURE_V_SPEC_CTRL
> >> +
> >> +       /*
> >> +        * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the
> >> +        * host's, write the MSR.
> >> +        *
> >> +        * IMPORTANT: To avoid RSB underflow attacks and any other nastiness,
> >> +        * there must not be any returns or indirect branches between this code
> >> +        * and vmentry.
> >> +        */
> >> +       movl SVM_spec_ctrl(%_ASM_DI), %eax
> >> +       cmp PER_CPU_VAR(x86_spec_ctrl_current), %eax
> >> +       je 999f
> >> +       mov $MSR_IA32_SPEC_CTRL, %ecx
> >> +       xor %edx, %edx
> >> +       wrmsr
> >> +999:
> >> +
> >> +.endm
> >> +
> >> +.macro RESTORE_HOST_SPEC_CTRL
> >> +       /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
> >> +       ALTERNATIVE_2 "jmp 999f", \
> >> +               "", X86_FEATURE_MSR_SPEC_CTRL, \
> >> +               "jmp 999f", X86_FEATURE_V_SPEC_CTRL
> >> +
> >> +       mov $MSR_IA32_SPEC_CTRL, %ecx
> >> +
> >> +       /*
> >> +        * Load the value that the guest had written into MSR_IA32_SPEC_CTRL,
> >> +        * if it was not intercepted during guest execution.
> >> +        */
> >> +       cmpb $0, (%_ASM_SP)
> >> +       jnz 998f
> >> +       rdmsr
> >> +       movl %eax, SVM_spec_ctrl(%_ASM_DI)
> >> +998:
> >> +
> >> +       /* Now restore the host value of the MSR if different from the guest's.  */
> >> +       movl PER_CPU_VAR(x86_spec_ctrl_current), %eax
> >> +       cmp SVM_spec_ctrl(%_ASM_DI), %eax
> >> +       je 999f
> >> +       xor %edx, %edx
> >> +       wrmsr
> >> +999:
> >> +
> >> +.endm
> >> +
> >> +
> >
> > It seems unfortunate to have the unconditional branches in the more
> > common cases.
>
> One way to do it could be something like
>
> .macro RESTORE_HOST_SPEC_CTRL
>         ALTERNATIVE_2 "", \
>                 "jmp 900f", X86_FEATURE_MSR_SPEC_CTRL, \
>                 "", X86_FEATURE_V_SPEC_CTRL \
> 901:
> .endm
>
> .macro RESTORE_SPEC_CTRL_BODY \
> 800:
>         /* restore guest spec ctrl ... */
>         jmp 801b
>
> 900:
>         /* save guest spec ctrl + restore host ... */
>         jmp 901b
> .endm
>
> The cmp/je pair can also jump back to 801b/901b.
>
> What do you think?  I'll check if objtool is happy and if so include it
> in v2.
>
> Paolo
>
This seems reasonable, if you trust a direct branch prior to the IBPB.
  

Patch

diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 69d1fed51086..d0bd68af0a5a 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -115,6 +115,7 @@  static void __used common(void)
 		OFFSET(SVM_vcpu_arch_regs, vcpu_svm, vcpu.arch.regs);
 		OFFSET(SVM_vmcb01, vcpu_svm, vmcb01);
 		OFFSET(SVM_current_vmcb, vcpu_svm, current_vmcb);
+		OFFSET(SVM_spec_ctrl, vcpu_svm, spec_ctrl);
 		OFFSET(KVM_VMCB_pa, kvm_vmcb_info, pa);
 	}
 
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index da7c361f47e0..6ec0b7ce7453 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -196,22 +196,15 @@  void __init check_bugs(void)
 }
 
 /*
- * NOTE: This function is *only* called for SVM.  VMX spec_ctrl handling is
- * done in vmenter.S.
+ * NOTE: This function is *only* called for SVM, since Intel uses
+ * MSR_IA32_SPEC_CTRL for SSBD.
  */
 void
 x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
 {
-	u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current();
+	u64 guestval, hostval;
 	struct thread_info *ti = current_thread_info();
 
-	if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) {
-		if (hostval != guestval) {
-			msrval = setguest ? guestval : hostval;
-			wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
-		}
-	}
-
 	/*
 	 * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update
 	 * MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported.
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 381c7dcffe25..31aa158a2e10 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -731,6 +731,15 @@  static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
 	u32 offset;
 	u32 *msrpm;
 
+	/*
+	 * For non-nested case:
+	 * If the L01 MSR bitmap does not intercept the MSR, then we need to
+	 * save it.
+	 *
+	 * For nested case:
+	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
+	 * save it.
+	 */
 	msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
 				      to_svm(vcpu)->msrpm;
 
@@ -3912,18 +3921,19 @@  static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
 	return EXIT_FASTPATH_NONE;
 }
 
-static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
+static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	guest_state_enter_irqoff();
 
 	if (sev_es_guest(vcpu->kvm)) {
-		__svm_sev_es_vcpu_run(svm);
+		__svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
 	} else {
 		struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
 
-		__svm_vcpu_run(svm, __sme_page_pa(sd->save_area));
+		__svm_vcpu_run(svm, __sme_page_pa(sd->save_area),
+			       spec_ctrl_intercepted);
 	}
 
 	guest_state_exit_irqoff();
@@ -3932,6 +3942,7 @@  static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
 static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
+	bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
 
 	trace_kvm_entry(vcpu);
 
@@ -3990,26 +4001,7 @@  static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
 	if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
 		x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
 
-	svm_vcpu_enter_exit(vcpu);
-
-	/*
-	 * We do not use IBRS in the kernel. If this vCPU has used the
-	 * SPEC_CTRL MSR it may have left it on; save the value and
-	 * turn it off. This is much more efficient than blindly adding
-	 * it to the atomic save/restore list. Especially as the former
-	 * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
-	 *
-	 * For non-nested case:
-	 * If the L01 MSR bitmap does not intercept the MSR, then we need to
-	 * save it.
-	 *
-	 * For nested case:
-	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
-	 * save it.
-	 */
-	if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
-	    unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
-		svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
+	svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);
 
 	if (!sev_es_guest(vcpu->kvm))
 		reload_tss(vcpu);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 99410651f2a5..9d940d8736f0 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -483,7 +483,7 @@  void sev_es_unmap_ghcb(struct vcpu_svm *svm);
 
 /* vmenter.S */
 
-void __svm_sev_es_vcpu_run(struct vcpu_svm *svm);
-void __svm_vcpu_run(struct vcpu_svm *svm, unsigned long hsave_pa);
+void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
+void __svm_vcpu_run(struct vcpu_svm *svm, unsigned long hsave_pa, bool spec_ctrl_intercepted);
 
 #endif
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 45a4bd002494..9e381386ffdc 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -32,10 +32,64 @@ 
 
 .section .noinstr.text, "ax"
 
+.macro RESTORE_GUEST_SPEC_CTRL
+	/* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
+	ALTERNATIVE_2 "jmp 999f", \
+		"", X86_FEATURE_MSR_SPEC_CTRL, \
+		"jmp 999f", X86_FEATURE_V_SPEC_CTRL
+
+	/*
+	 * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the
+	 * host's, write the MSR.
+	 *
+	 * IMPORTANT: To avoid RSB underflow attacks and any other nastiness,
+	 * there must not be any returns or indirect branches between this code
+	 * and vmentry.
+	 */
+	movl SVM_spec_ctrl(%_ASM_DI), %eax
+	cmp PER_CPU_VAR(x86_spec_ctrl_current), %eax
+	je 999f
+	mov $MSR_IA32_SPEC_CTRL, %ecx
+	xor %edx, %edx
+	wrmsr
+999:
+
+.endm
+
+.macro RESTORE_HOST_SPEC_CTRL
+	/* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
+	ALTERNATIVE_2 "jmp 999f", \
+		"", X86_FEATURE_MSR_SPEC_CTRL, \
+		"jmp 999f", X86_FEATURE_V_SPEC_CTRL
+
+	mov $MSR_IA32_SPEC_CTRL, %ecx
+
+	/*
+	 * Load the value that the guest had written into MSR_IA32_SPEC_CTRL,
+	 * if it was not intercepted during guest execution.
+	 */
+	cmpb $0, (%_ASM_SP)
+	jnz 998f
+	rdmsr
+	movl %eax, SVM_spec_ctrl(%_ASM_DI)
+998:
+
+	/* Now restore the host value of the MSR if different from the guest's.  */
+	movl PER_CPU_VAR(x86_spec_ctrl_current), %eax
+	cmp SVM_spec_ctrl(%_ASM_DI), %eax
+	je 999f
+	xor %edx, %edx
+	wrmsr
+999:
+
+.endm
+
+
 /**
  * __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode
  * @svm:	struct vcpu_svm *
  * @hsave_pa:	unsigned long
+ * @spec_ctrl_intercepted: bool
  */
 SYM_FUNC_START(__svm_vcpu_run)
 	push %_ASM_BP
@@ -50,7 +104,12 @@  SYM_FUNC_START(__svm_vcpu_run)
 #endif
 	push %_ASM_BX
 
-	/* @hsave_pa is needed last after vmexit, save it first.  */
+	/*
+	 * Both @spec_ctrl_intercepted and @hsave_pa are used only after vmexit.
+	 * @spec_ctrl_intercepted is needed later and accessed directly from
+	 * the stack in RESTORE_HOST_SPEC_CTRL, so save it first.
+	 */
+	push %_ASM_ARG3
 	push %_ASM_ARG2
 
 	/* Save @svm. */
@@ -61,6 +120,8 @@  SYM_FUNC_START(__svm_vcpu_run)
 	mov %_ASM_ARG1, %_ASM_DI
 .endif
 
+	RESTORE_GUEST_SPEC_CTRL
+
 	/*
 	 * Use a single vmcb (vmcb01 because it's always valid) for
 	 * context switching guest state via VMLOAD/VMSAVE, that way
@@ -138,6 +199,8 @@  SYM_FUNC_START(__svm_vcpu_run)
 	FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
 #endif
 
+	RESTORE_HOST_SPEC_CTRL
+
 	/*
 	 * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
 	 * untrained as soon as we exit the VM and are back to the
@@ -173,6 +236,9 @@  SYM_FUNC_START(__svm_vcpu_run)
 	xor %r15d, %r15d
 #endif
 
+	/* "Pop" @spec_ctrl_intercepted.  */
+	pop %_ASM_BX
+
 	pop %_ASM_BX
 
 #ifdef CONFIG_X86_64
@@ -210,6 +276,7 @@  SYM_FUNC_END(__svm_vcpu_run)
 /**
  * __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode
  * @svm:	struct vcpu_svm *
+ * @spec_ctrl_intercepted: bool
  */
 SYM_FUNC_START(__svm_sev_es_vcpu_run)
 	push %_ASM_BP
@@ -224,8 +291,21 @@  SYM_FUNC_START(__svm_sev_es_vcpu_run)
 #endif
 	push %_ASM_BX
 
+	/* Save @spec_ctrl_intercepted for RESTORE_HOST_SPEC_CTRL. */
+	push %_ASM_ARG2
+
+	/* Save @svm. */
+	push %_ASM_ARG1
+
+.ifnc _ASM_ARG1, _ASM_DI
+	/* Move @svm to RDI for RESTORE_GUEST_SPEC_CTRL. */
+	mov %_ASM_ARG1, %_ASM_DI
+.endif
+
+	RESTORE_GUEST_SPEC_CTRL
+
 	/* Get svm->current_vmcb->pa into RAX. */
-	mov SVM_current_vmcb(%_ASM_ARG1), %_ASM_AX
+	mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX
 	mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX
 
 	/* Enter guest mode */
@@ -235,11 +315,16 @@  SYM_FUNC_START(__svm_sev_es_vcpu_run)
 
 2:	cli
 
+	/* Pop @svm to RDI, guest registers have been saved already. */
+	pop %_ASM_DI
+
 #ifdef CONFIG_RETPOLINE
 	/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
 	FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
 #endif
 
+	RESTORE_HOST_SPEC_CTRL
+
 	/*
 	 * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
 	 * untrained as soon as we exit the VM and are back to the
@@ -249,6 +334,9 @@  SYM_FUNC_START(__svm_sev_es_vcpu_run)
 	 */
 	UNTRAIN_RET
 
+	/* "Pop" @spec_ctrl_intercepted.  */
+	pop %_ASM_BX
+
 	pop %_ASM_BX
 
 #ifdef CONFIG_X86_64