[v2,4/4] x86: KVM: SVM: workaround for AVIC's errata #1235

Message ID 20230928173354.217464-5-mlevitsk@redhat.com
State New
Headers
Series AVIC bugfixes and workarounds |

Commit Message

Maxim Levitsky Sept. 28, 2023, 5:33 p.m. UTC
  On Zen2 (and likely on Zen1 as well), AVIC doesn't reliably detect a change
in the 'is_running' bit during ICR write emulation and might skip a
VM exit, if that bit was recently cleared.

The absence of the VM exit leads to KVM not waking up / triggering a
nested VM exit on the target(s) of the IPI, which can, in some cases,
lead to unbounded delays in guest execution.

As I recently discovered, a reasonable workaround exists: make KVM
never set the is_running bit.

This workaround ensures that (*) all ICR writes always cause a VM exit
and are therefore correctly emulated, at the expense of never enjoying
VM exit-less ICR emulation.

This workaround does carry a performance penalty but according to my
benchmarks is still much better than not using AVIC at all,
because AVIC is still used for the receiving end of the IPIs, and for the
posted interrupts.

If the user is aware of the erratum and it doesn't affect their workload,
the user can disable the workaround with 'avic_zen2_errata_workaround=0'.

(*) More precisely, all ICR writes except when the 'Self' shorthand is used:

In this case AVIC skips reading the physical ID table and just sets bits in
the IRR of the local APIC. Thankfully, the erratum cannot occur in this case,
so an extra workaround for it is not needed.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 arch/x86/kvm/svm/avic.c | 63 +++++++++++++++++++++++++++++------------
 arch/x86/kvm/svm/svm.h  |  1 +
 2 files changed, 46 insertions(+), 18 deletions(-)
  

Comments

Sean Christopherson Sept. 29, 2023, 2:06 a.m. UTC | #1
On Thu, Sep 28, 2023, Maxim Levitsky wrote:
> diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
> index 4b74ea91f4e6bb6..28bb0e6b321660d 100644
> --- a/arch/x86/kvm/svm/avic.c
> +++ b/arch/x86/kvm/svm/avic.c
> @@ -62,6 +62,9 @@ static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u);
>  static bool force_avic;
>  module_param_unsafe(force_avic, bool, 0444);
>  
> +static int avic_zen2_errata_workaround = -1;
> +module_param(avic_zen2_errata_workaround, int, 0444);
> +
>  /* Note:
>   * This hash table is used to map VM_ID to a struct kvm_svm,
>   * when handling AMD IOMMU GALOG notification to schedule in
> @@ -276,7 +279,7 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
>  
>  static int avic_init_backing_page(struct kvm_vcpu *vcpu)
>  {
> -	u64 *entry, new_entry;
> +	u64 *entry;
>  	int id = vcpu->vcpu_id;
>  	struct vcpu_svm *svm = to_svm(vcpu);
>  
> @@ -308,10 +311,10 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
>  	if (!entry)
>  		return -EINVAL;
>  
> -	new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
> -			      AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
> -			      AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
> -	WRITE_ONCE(*entry, new_entry);
> +	svm->avic_physical_id_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
> +						 AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
> +						 AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
> +	WRITE_ONCE(*entry, svm->avic_physical_id_entry);

Aha!  Rather than deal with the dummy entry at runtime, simply point the pointer
at the dummy entry during setup.

And instead of adding a dedicated erratum param, let's piggyback VMX's enable_ipiv.
It's not a true disable, but IMO it's close enough.  That will make the param
much more self-documenting, and won't feel so awkward if someone wants to disable
IPI virtualization for other reasons.

Then we can do this in three steps:

  1. Move enable_ipiv to common code
  2. Let userspace disable enable_ipiv for SVM+AVIC
  3. Disable enable_ipiv for affected CPUs

The biggest downside to using enable_ipiv is that the "auto" behavior for the
erratum will be a bit ugly, but that's a solvable problem.

If you've no objection to the above approach, I'll post the attached patches along
with a massaged version of this patch.

The attached patches apply on top of an AVIC cleanup[*], which (shameless plug)
could use a review ;-)

[*] https://lore.kernel.org/all/20230815213533.548732-1-seanjc@google.com
  

Patch

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 4b74ea91f4e6bb6..28bb0e6b321660d 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -62,6 +62,9 @@  static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u);
 static bool force_avic;
 module_param_unsafe(force_avic, bool, 0444);
 
+static int avic_zen2_errata_workaround = -1;
+module_param(avic_zen2_errata_workaround, int, 0444);
+
 /* Note:
  * This hash table is used to map VM_ID to a struct kvm_svm,
  * when handling AMD IOMMU GALOG notification to schedule in
@@ -276,7 +279,7 @@  static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
 
 static int avic_init_backing_page(struct kvm_vcpu *vcpu)
 {
-	u64 *entry, new_entry;
+	u64 *entry;
 	int id = vcpu->vcpu_id;
 	struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -308,10 +311,10 @@  static int avic_init_backing_page(struct kvm_vcpu *vcpu)
 	if (!entry)
 		return -EINVAL;
 
-	new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
-			      AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
-			      AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
-	WRITE_ONCE(*entry, new_entry);
+	svm->avic_physical_id_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
+						 AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
+						 AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
+	WRITE_ONCE(*entry, svm->avic_physical_id_entry);
 
 	svm->avic_physical_id_cache = entry;
 
@@ -835,7 +838,7 @@  static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
 	 * will update the pCPU info when the vCPU awkened and/or scheduled in.
 	 * See also avic_vcpu_load().
 	 */
-	entry = READ_ONCE(*(svm->avic_physical_id_cache));
+	entry = READ_ONCE(svm->avic_physical_id_entry);
 	if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
 		amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK,
 				    true, pi->ir_data);
@@ -1027,7 +1030,6 @@  avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
 
 void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-	u64 entry;
 	int h_physical_id = kvm_cpu_get_apicid(cpu);
 	struct vcpu_svm *svm = to_svm(vcpu);
 	unsigned long flags;
@@ -1056,14 +1058,23 @@  void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	 */
 	spin_lock_irqsave(&svm->ir_list_lock, flags);
 
-	entry = READ_ONCE(*(svm->avic_physical_id_cache));
-	WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
 
-	entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
-	entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
-	entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+	WARN_ON_ONCE(svm->avic_physical_id_entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+
+	svm->avic_physical_id_entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+	svm->avic_physical_id_entry |=
+		(h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
+
+	svm->avic_physical_id_entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+	/*
+	 * Do not update the actual physical ID table entry if the workaround
+	 * for erratum #1235 is enabled: with the workaround active, is_running
+	 * is never set in the physical ID table entry.
+	 */
+	if (!avic_zen2_errata_workaround)
+		WRITE_ONCE(*(svm->avic_physical_id_cache), svm->avic_physical_id_entry);
 
-	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
 	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
 
 	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
@@ -1071,7 +1082,6 @@  void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 void avic_vcpu_put(struct kvm_vcpu *vcpu)
 {
-	u64 entry;
 	struct vcpu_svm *svm = to_svm(vcpu);
 	unsigned long flags;
 
@@ -1084,10 +1094,9 @@  void avic_vcpu_put(struct kvm_vcpu *vcpu)
 	 * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
 	 * recursively.
 	 */
-	entry = READ_ONCE(*(svm->avic_physical_id_cache));
 
 	/* Nothing to do if IsRunning == '0' due to vCPU blocking. */
-	if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
+	if (!(svm->avic_physical_id_entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
 		return;
 
 	/*
@@ -1102,8 +1111,14 @@  void avic_vcpu_put(struct kvm_vcpu *vcpu)
 
 	avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
 
-	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
-	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+	svm->avic_physical_id_entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+	/*
+	 * Do not update the actual physical id table entry
+	 * See explanation in avic_vcpu_load
+	 */
+	if (!avic_zen2_errata_workaround)
+		WRITE_ONCE(*(svm->avic_physical_id_cache), svm->avic_physical_id_entry);
 
 	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
 
@@ -1217,5 +1232,17 @@  bool avic_hardware_setup(void)
 
 	amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
 
+	if (avic_zen2_errata_workaround == -1) {
+
+		/* Assume that Zen1 and Zen2 have errata #1235 */
+		if (boot_cpu_data.x86 == 0x17)
+			avic_zen2_errata_workaround = 1;
+		else
+			avic_zen2_errata_workaround = 0;
+	}
+
+	if (avic_zen2_errata_workaround)
+		pr_info("Workaround for AVIC errata #1235 is enabled\n");
+
 	return true;
 }
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index be67ab7fdd104e3..98dc45b9c194d2e 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -265,6 +265,7 @@  struct vcpu_svm {
 	u32 ldr_reg;
 	u32 dfr_reg;
 	struct page *avic_backing_page;
+	u64 avic_physical_id_entry;
 	u64 *avic_physical_id_cache;
 
 	/*