[RFC,v6,5/6] KVM: x86: add vCPU scoped toggling for disabled exits
Commit Message
Introduce support for the vCPU-scoped KVM_ENABLE_CAP ioctl with the
KVM_CAP_X86_DISABLE_EXITS capability, so that exits can be disabled at
per-vCPU granularity instead of only for the whole guest. This patch
enables vCPU-scoped toggling of the exit controls while keeping the
existing restrictions on the VM-scoped controls unchanged.
For use cases such as a Windows guest running heavy CPU-bound workloads,
disabling HLT VM-exits can mitigate host scheduler context-switch
overhead. Simply disabling HLT exits on all vCPUs can bring performance
benefits, but if no pCPUs are reserved for host threads, the guest may be
forcibly preempted because the host no longer knows when it can schedule
other host threads that want to run. With this patch, HLT exits can be
disabled on only a subset of a guest's vCPUs, which keeps the performance
benefit while remaining resilient to host stressing workloads running at
the same time.
In an experiment running a host stressing workload alongside a Windows
guest with heavy CPU-bound workloads, this approach shows good resiliency
and roughly a 3% performance improvement. For example, with this patch
disabling HLT exits on only half of the vCPUs, Passmark running in the
Windows guest still shows a 2.4% higher main score versus the baseline.
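For illustration, a minimal userspace sketch of the per-vCPU flow is below
(hedged: vcpu_fd and the helper name are placeholders, only the HLT bit is
shown, and KVM_X86_DISABLE_EXITS_OVERRIDE plus the exact accepted flag
combinations are defined by earlier patches in this series):

        #include <linux/kvm.h>
        #include <sys/ioctl.h>

        /* Hypothetical helper: disable HLT exits for one vCPU only. */
        static int disable_hlt_exits_on_vcpu(int vcpu_fd)
        {
                struct kvm_enable_cap cap = {
                        .cap = KVM_CAP_X86_DISABLE_EXITS,
                        /* Only this vCPU stops exiting on HLT; the other
                         * vCPUs keep their current exit settings. */
                        .args[0] = KVM_X86_DISABLE_EXITS_HLT,
                };

                /* vCPU-scoped KVM_ENABLE_CAP, enabled by this patch. */
                return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
        }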
Suggested-by: Sean Christopherson <seanjc@google.com>
Suggested-by: Chao Gao <chao.gao@intel.com>
Signed-off-by: Kechen Lu <kechenl@nvidia.com>
---
Documentation/virt/kvm/api.rst | 2 +-
arch/x86/include/asm/kvm-x86-ops.h | 1 +
arch/x86/include/asm/kvm_host.h | 2 ++
arch/x86/kvm/svm/svm.c | 30 ++++++++++++++++++++++++
arch/x86/kvm/vmx/vmx.c | 37 ++++++++++++++++++++++++++++++
arch/x86/kvm/x86.c | 7 ++++++
6 files changed, 78 insertions(+), 1 deletion(-)
Comments
On Sat, Jan 21, 2023 at 02:07:37AM +0000, Kechen Lu wrote:
>+static void svm_update_disabled_exits(struct kvm_vcpu *vcpu)
Is it possible to call this function on vCPU creation, i.e., consolidate
initialization and runtime toggling?
>+{
>+ struct vcpu_svm *svm = to_svm(vcpu);
>+ struct vmcb_control_area *control = &svm->vmcb->control;
>+
>+ if (kvm_hlt_in_guest(vcpu))
>+ svm_clr_intercept(svm, INTERCEPT_HLT);
>+ else
>+ svm_set_intercept(svm, INTERCEPT_HLT);
>+
>+ if (kvm_mwait_in_guest(vcpu)) {
>+ svm_clr_intercept(svm, INTERCEPT_MONITOR);
>+ svm_clr_intercept(svm, INTERCEPT_MWAIT);
>+ } else {
>+ svm_set_intercept(svm, INTERCEPT_MONITOR);
>+ svm_set_intercept(svm, INTERCEPT_MWAIT);
>+ }
>+
>+ if (kvm_pause_in_guest(vcpu)) {
>+ svm_clr_intercept(svm, INTERCEPT_PAUSE);
>+ } else {
>+ control->pause_filter_count = pause_filter_count;
>+ if (pause_filter_thresh)
>+ control->pause_filter_thresh = pause_filter_thresh;
>+ }
>+}
>+
> static void svm_vm_destroy(struct kvm *kvm)
> {
> avic_vm_destroy(kvm);
>@@ -4825,7 +4852,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
> .complete_emulated_msr = svm_complete_emulated_msr,
>
> .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
>+
> .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
>+
>+ .update_disabled_exits = svm_update_disabled_exits,
> };
>
> /*
>diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>index 019a20029878..f5137afdd424 100644
>--- a/arch/x86/kvm/vmx/vmx.c
>+++ b/arch/x86/kvm/vmx/vmx.c
>@@ -8070,6 +8070,41 @@ static void vmx_vm_destroy(struct kvm *kvm)
> free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
> }
>
>+static void vmx_update_disabled_exits(struct kvm_vcpu *vcpu)
ditto.
Hi Chao,
> -----Original Message-----
> From: Chao Gao <chao.gao@intel.com>
> Sent: Sunday, January 29, 2023 10:42 PM
> To: Kechen Lu <kechenl@nvidia.com>
> Cc: kvm@vger.kernel.org; seanjc@google.com; pbonzini@redhat.com;
> zhi.wang.linux@gmail.com; shaoqin.huang@intel.com;
> vkuznets@redhat.com; linux-kernel@vger.kernel.org
> Subject: Re: [RFC PATCH v6 5/6] KVM: x86: add vCPU scoped toggling for
> disabled exits
>
> On Sat, Jan 21, 2023 at 02:07:37AM +0000, Kechen Lu wrote:
> >+static void svm_update_disabled_exits(struct kvm_vcpu *vcpu)
>
> Is it possible to call this function on vCPU creation, i.e., consolidate
> initialization and runtime toggling?
>
Chao, can you elaborate on this? If I understand correctly,
you mean replacing the current redundant code on vCPU creation
that checks the xxx_in_guest helpers and sets the intercepts, and
instead calling svm/vmx_update_disabled_exits()? Yeah, I think this
makes sense to me.
BR,
Kechen
> >+{
> >+ struct vcpu_svm *svm = to_svm(vcpu);
> >+ struct vmcb_control_area *control = &svm->vmcb->control;
> >+
> >+ if (kvm_hlt_in_guest(vcpu))
> >+ svm_clr_intercept(svm, INTERCEPT_HLT);
> >+ else
> >+ svm_set_intercept(svm, INTERCEPT_HLT);
> >+
> >+ if (kvm_mwait_in_guest(vcpu)) {
> >+ svm_clr_intercept(svm, INTERCEPT_MONITOR);
> >+ svm_clr_intercept(svm, INTERCEPT_MWAIT);
> >+ } else {
> >+ svm_set_intercept(svm, INTERCEPT_MONITOR);
> >+ svm_set_intercept(svm, INTERCEPT_MWAIT);
> >+ }
> >+
> >+ if (kvm_pause_in_guest(vcpu)) {
> >+ svm_clr_intercept(svm, INTERCEPT_PAUSE);
> >+ } else {
> >+ control->pause_filter_count = pause_filter_count;
> >+ if (pause_filter_thresh)
> >+ control->pause_filter_thresh = pause_filter_thresh;
> >+ }
> >+}
> >+
> > static void svm_vm_destroy(struct kvm *kvm)
> > {
> > avic_vm_destroy(kvm);
> >@@ -4825,7 +4852,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
> > .complete_emulated_msr = svm_complete_emulated_msr,
> >
> > .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
> >+
> > .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
> >+
> >+ .update_disabled_exits = svm_update_disabled_exits,
> > };
> >
> > /*
> >diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> >index 019a20029878..f5137afdd424 100644
> >--- a/arch/x86/kvm/vmx/vmx.c
> >+++ b/arch/x86/kvm/vmx/vmx.c
> >@@ -8070,6 +8070,41 @@ static void vmx_vm_destroy(struct kvm *kvm)
> > free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
> > }
> >
> >+static void vmx_update_disabled_exits(struct kvm_vcpu *vcpu)
>
> ditto.
On Mon, Jan 30, 2023 at 08:57:16PM +0000, Kechen Lu wrote:
>> On Sat, Jan 21, 2023 at 02:07:37AM +0000, Kechen Lu wrote:
>> >+static void svm_update_disabled_exits(struct kvm_vcpu *vcpu)
>>
>> Is it possible to call this function on vCPU creation, i.e., consolidate
>> initialization and runtime toggling?
>>
>
>Chao, can you elaborate on this? If I understand correctly,
>you mean replacing the current redundant code on vCPU creation
>that checks the xxx_in_guest helpers and sets the intercepts, and
>instead calling svm/vmx_update_disabled_exits()?
That's exactly what I mean.
>Yeah, I think this makes sense to
>me.
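For reference, a rough sketch of the consolidation being discussed (the
wrapper name is hypothetical and the call site would be the SVM vCPU
creation / init_vmcb path; this is illustrative, not code from the tree):

/*
 * Hypothetical consolidation: let vCPU creation reuse the same helper the
 * vCPU-scoped KVM_CAP_X86_DISABLE_EXITS path uses, instead of open-coding
 * the kvm_hlt_in_guest()/kvm_mwait_in_guest()/kvm_pause_in_guest() checks
 * a second time at creation.
 */
static void svm_init_disabled_exits(struct kvm_vcpu *vcpu)
{
        /* One code path for both creation-time init and runtime toggling. */
        svm_update_disabled_exits(vcpu);
}

The VMX side would call vmx_update_disabled_exits() in the same way.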
@@ -7102,7 +7102,7 @@ longer intercept some instructions for improved latency in some
workloads, and is suggested when vCPUs are associated to dedicated
physical CPUs. More bits can be added in the future; userspace can
just pass the KVM_CHECK_EXTENSION result to KVM_ENABLE_CAP to disable
-all such vmexits.
+all such vmexits. The capability is supported at both VM scope and vCPU scope.
By default, this capability only disables exits. To re-enable an exit, or to
override previous settings, userspace can set KVM_X86_DISABLE_EXITS_OVERRIDE,
@@ -131,6 +131,7 @@ KVM_X86_OP(msr_filter_changed)
KVM_X86_OP(complete_emulated_msr)
KVM_X86_OP(vcpu_deliver_sipi_vector)
KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
+KVM_X86_OP(update_disabled_exits)
#undef KVM_X86_OP
#undef KVM_X86_OP_OPTIONAL
@@ -1711,6 +1711,8 @@ struct kvm_x86_ops {
* Returns vCPU specific APICv inhibit reasons
*/
unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu);
+
+ void (*update_disabled_exits)(struct kvm_vcpu *vcpu);
};
struct kvm_x86_nested_ops {
@@ -4680,6 +4680,33 @@ static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
sev_vcpu_deliver_sipi_vector(vcpu, vector);
}
+static void svm_update_disabled_exits(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+
+ if (kvm_hlt_in_guest(vcpu))
+ svm_clr_intercept(svm, INTERCEPT_HLT);
+ else
+ svm_set_intercept(svm, INTERCEPT_HLT);
+
+ if (kvm_mwait_in_guest(vcpu)) {
+ svm_clr_intercept(svm, INTERCEPT_MONITOR);
+ svm_clr_intercept(svm, INTERCEPT_MWAIT);
+ } else {
+ svm_set_intercept(svm, INTERCEPT_MONITOR);
+ svm_set_intercept(svm, INTERCEPT_MWAIT);
+ }
+
+ if (kvm_pause_in_guest(vcpu)) {
+ svm_clr_intercept(svm, INTERCEPT_PAUSE);
+ } else {
+ control->pause_filter_count = pause_filter_count;
+ if (pause_filter_thresh)
+ control->pause_filter_thresh = pause_filter_thresh;
+ }
+}
+
static void svm_vm_destroy(struct kvm *kvm)
{
avic_vm_destroy(kvm);
@@ -4825,7 +4852,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.complete_emulated_msr = svm_complete_emulated_msr,
.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
+
.vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
+
+ .update_disabled_exits = svm_update_disabled_exits,
};
/*
@@ -8070,6 +8070,41 @@ static void vmx_vm_destroy(struct kvm *kvm)
free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
}
+static void vmx_update_disabled_exits(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (kvm_hlt_in_guest(vcpu))
+ exec_controls_clearbit(vmx, CPU_BASED_HLT_EXITING);
+ else
+ exec_controls_setbit(vmx, CPU_BASED_HLT_EXITING);
+
+ if (kvm_mwait_in_guest(vcpu))
+ exec_controls_clearbit(vmx, CPU_BASED_MWAIT_EXITING |
+ CPU_BASED_MONITOR_EXITING);
+ else
+ exec_controls_setbit(vmx, CPU_BASED_MWAIT_EXITING |
+ CPU_BASED_MONITOR_EXITING);
+
+ if (!kvm_pause_in_guest(vcpu)) {
+ vmcs_write32(PLE_GAP, ple_gap);
+ vmx->ple_window = ple_window;
+ vmx->ple_window_dirty = true;
+ }
+
+ if (kvm_cstate_in_guest(vcpu)) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
+ } else {
+ vmx_enable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
+ vmx_enable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
+ vmx_enable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
+ vmx_enable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
+ }
+}
+
static struct kvm_x86_ops vmx_x86_ops __initdata = {
.name = "kvm_intel",
@@ -8207,6 +8242,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.complete_emulated_msr = kvm_complete_insn_gp,
.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
+
+ .update_disabled_exits = vmx_update_disabled_exits,
};
static unsigned int vmx_handle_intel_pt_intr(void)
@@ -5552,6 +5552,13 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
if (vcpu->arch.pv_cpuid.enforce)
kvm_update_pv_runtime(vcpu);
+ return 0;
+ case KVM_CAP_X86_DISABLE_EXITS:
+ if (cap->args[0] & ~kvm_get_allowed_disable_exits())
+ return -EINVAL;
+
+ kvm_ioctl_disable_exits(vcpu->arch, cap->args[0]);
+ static_call(kvm_x86_update_disabled_exits)(vcpu);
return 0;
default:
return -EINVAL;
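Tying the api.rst change and the vCPU-scoped handler above together, a
hedged userspace sketch of the documented pattern of passing the
KVM_CHECK_EXTENSION result back through KVM_ENABLE_CAP, here against a
vCPU fd (vm_fd/vcpu_fd and the helper name are placeholders, and this
assumes the extension result only contains bits the handler accepts):

        #include <linux/kvm.h>
        #include <sys/ioctl.h>

        /* Hypothetical helper: disable every allowed exit for one vCPU. */
        static int disable_all_allowed_exits(int vm_fd, int vcpu_fd)
        {
                struct kvm_enable_cap cap = {
                        .cap = KVM_CAP_X86_DISABLE_EXITS,
                };
                int allowed;

                /* KVM_CHECK_EXTENSION reports the supported disable bits. */
                allowed = ioctl(vm_fd, KVM_CHECK_EXTENSION,
                                KVM_CAP_X86_DISABLE_EXITS);
                if (allowed <= 0)
                        return -1; /* unsupported or query failed */

                /* Per the docs, the result can be passed straight back. */
                cap.args[0] = allowed;
                return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
        }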