[v5,08/13] KVM: selftests: Test Intel PMU architectural events on gp counters

Message ID 20231024002633.2540714-9-seanjc@google.com
State New
Series KVM: x86/pmu: selftests: Fixes and new tests

Commit Message

Sean Christopherson Oct. 24, 2023, 12:26 a.m. UTC
  From: Jinrong Liang <cloudliang@tencent.com>

Add test cases to check whether each architectural event is still available
after it is marked as unavailable via CPUID. This covers the vPMU event
filtering logic based on Intel CPUID, and complements pmu_event_filter.

According to the Intel SDM, the number of architectural events is reported
through CPUID.0AH:EAX[31:24], and architectural event x is supported if
EBX[x]=0 && EAX[31:24]>x.
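
For reference, the guest-visible check boils down to something like the
following (an illustrative sketch only, not part of the patch; the helper
name is made up):

static bool arch_event_is_supported(uint32_t eax, uint32_t ebx, uint8_t idx)
{
	/* CPUID.0AH:EAX[31:24] = length of the architectural events bit vector. */
	uint8_t nr_arch_events = (eax >> 24) & 0xff;

	/* Supported iff the event is within the length and EBX[idx] is clear. */
	return idx < nr_arch_events && !((ebx >> idx) & 1);
}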

Co-developed-by: Like Xu <likexu@tencent.com>
Signed-off-by: Like Xu <likexu@tencent.com>
Signed-off-by: Jinrong Liang <cloudliang@tencent.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/Makefile          |   1 +
 .../selftests/kvm/x86_64/pmu_counters_test.c  | 189 ++++++++++++++++++
 2 files changed, 190 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
  

Comments

Sean Christopherson Oct. 24, 2023, 7:49 p.m. UTC | #1
On Mon, Oct 23, 2023, Sean Christopherson wrote:
> +static void guest_measure_pmu_v1(struct kvm_x86_pmu_feature event,
> +				 uint32_t counter_msr, uint32_t nr_gp_counters)
> +{
> +	uint8_t idx = event.f.bit;
> +	unsigned int i;
> +
> +	for (i = 0; i < nr_gp_counters; i++) {
> +		wrmsr(counter_msr + i, 0);
> +		wrmsr(MSR_P6_EVNTSEL0 + i, ARCH_PERFMON_EVENTSEL_OS |
> +		      ARCH_PERFMON_EVENTSEL_ENABLE | intel_pmu_arch_events[idx]);
> +		__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
> +
> +		if (pmu_is_intel_event_stable(idx))
> +			GUEST_ASSERT_EQ(this_pmu_has(event), !!_rdpmc(i));
> +
> +		wrmsr(MSR_P6_EVNTSEL0 + i, ARCH_PERFMON_EVENTSEL_OS |
> +		      !ARCH_PERFMON_EVENTSEL_ENABLE |
> +		      intel_pmu_arch_events[idx]);
> +		wrmsr(counter_msr + i, 0);
> +		__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
> +
> +		if (pmu_is_intel_event_stable(idx))
> +			GUEST_ASSERT(!_rdpmc(i));
> +	}
> +
> +	GUEST_DONE();
> +}
> +
> +static void guest_measure_loop(uint8_t idx)
> +{
> +	const struct {
> +		struct kvm_x86_pmu_feature gp_event;
> +	} intel_event_to_feature[] = {
> +		[INTEL_ARCH_CPU_CYCLES]		   = { X86_PMU_FEATURE_CPU_CYCLES },
> +		[INTEL_ARCH_INSTRUCTIONS_RETIRED]  = { X86_PMU_FEATURE_INSNS_RETIRED },
> +		[INTEL_ARCH_REFERENCE_CYCLES]	   = { X86_PMU_FEATURE_REFERENCE_CYCLES },
> +		[INTEL_ARCH_LLC_REFERENCES]	   = { X86_PMU_FEATURE_LLC_REFERENCES },
> +		[INTEL_ARCH_LLC_MISSES]		   = { X86_PMU_FEATURE_LLC_MISSES },
> +		[INTEL_ARCH_BRANCHES_RETIRED]	   = { X86_PMU_FEATURE_BRANCH_INSNS_RETIRED },
> +		[INTEL_ARCH_BRANCHES_MISPREDICTED] = { X86_PMU_FEATURE_BRANCHES_MISPREDICTED },
> +	};
> +
> +	uint32_t nr_gp_counters = this_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS);
> +	uint32_t pmu_version = this_cpu_property(X86_PROPERTY_PMU_VERSION);
> +	struct kvm_x86_pmu_feature gp_event;
> +	uint32_t counter_msr;
> +	unsigned int i;
> +
> +	if (rdmsr(MSR_IA32_PERF_CAPABILITIES) & PMU_CAP_FW_WRITES)
> +		counter_msr = MSR_IA32_PMC0;
> +	else
> +		counter_msr = MSR_IA32_PERFCTR0;
> +
> +	gp_event = intel_event_to_feature[idx].gp_event;
> +	TEST_ASSERT_EQ(idx, gp_event.f.bit);
> +
> +	if (pmu_version < 2) {
> +		guest_measure_pmu_v1(gp_event, counter_msr, nr_gp_counters);

Looking at this again, testing guest PMU version 1 is practically impossible
because this testcase doesn't force the guest PMU version.  I.e. unless I'm
missing something, this requires old hardware or running in a VM with its PMU
forced to '1'.

And if all subtests use similar inputs, the common configuration can be shoved
into pmu_vm_create_with_one_vcpu().

It's easy enough to fold test_intel_arch_events() into test_intel_counters(),
which will also provide coverage for running with full-width writes enabled.  The
only downside is that the total runtime will be longer.
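
E.g. the v1 path could be exercised by explicitly forcing the guest PMU
version, roughly like so (just a sketch; assumes vcpu_set_cpuid_property()
can also stuff X86_PROPERTY_PMU_VERSION):

	/* Force a v1 vPMU so that guest_measure_pmu_v1() actually runs. */
	vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_VERSION, 1);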

> +static void test_arch_events_cpuid(uint8_t i, uint8_t j, uint8_t idx)
> +{
> +	uint8_t arch_events_unavailable_mask = BIT_ULL(j);
> +	uint8_t arch_events_bitmap_size = BIT_ULL(i);
> +	struct kvm_vcpu *vcpu;
> +	struct kvm_vm *vm;
> +
> +	vm = pmu_vm_create_with_one_vcpu(&vcpu, guest_measure_loop);
> +
> +	vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH,
> +				arch_events_bitmap_size);
> +	vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_EVENTS_MASK,
> +				arch_events_unavailable_mask);
> +
> +	vcpu_args_set(vcpu, 1, idx);
> +
> +	run_vcpu(vcpu);
> +
> +	kvm_vm_free(vm);
> +}
> +
> +static void test_intel_arch_events(void)
> +{
> +	uint8_t idx, i, j;
> +
> +	for (idx = 0; idx < NR_INTEL_ARCH_EVENTS; idx++) {

There's no need to iterate over each event in the host; we can simply add a wrapper
for guest_measure_loop() in the guest.  That'll be slightly faster since it won't
require creating and destroying a VM for every event.

> +		/*
> +		 * A brute force iteration of all combinations of values is
> +		 * likely to exhaust a single process's file descriptor limit,
> +		 * so the test instead iterates through all valid
> +		 * single-bit values.
> +		 */
> +		for (i = 0; i < NR_INTEL_ARCH_EVENTS; i++) {

This is flawed/odd.  'i' becomes arch_events_bitmap_size, i.e. it's a length,
but the length is computed by BIT(i).  That's nonsensical and will eventually
result in undefined behavior.  Oof, that'll actually happen sooner than later
because arch_events_bitmap_size is only a single byte, i.e. when the number of
events hits 9, this will try to shove 256 into an 8-bit variable.

The more correct approach would be to pass in 0..NR_INTEL_ARCH_EVENTS inclusive
as the size.  But I think we should actually test 0..length+1, where "length" is
the max of the native length and NR_INTEL_ARCH_EVENTS, i.e. we should verify that
KVM handles a size larger than the native length.

> +			for (j = 0; j < NR_INTEL_ARCH_EVENTS; j++)
> +				test_arch_events_cpuid(i, j, idx);

And here, I think it makes sense to brute force all possible values for at least
one configuration.  There aren't actually _that_ many values, e.g. currently it's
64 (I think).  E.g. test the native PMU version with the "full" length, and then
test single bits with varying lengths.
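
Roughly something like this (sketch only; assumes test_arch_events_cpuid() is
reworked to take the CPUID length and unavailable-events mask directly, with
the per-event loop moved into the guest as suggested above):

static void test_intel_arch_events(void)
{
	uint8_t native_len = kvm_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH);
	uint8_t max_len = native_len > NR_INTEL_ARCH_EVENTS ? native_len
							    : NR_INTEL_ARCH_EVENTS;
	uint32_t mask;
	uint8_t len, bit;

	for (len = 0; len <= max_len + 1; len++) {
		if (len == max_len) {
			/* Brute force every unavailable-events mask for the "full" length. */
			for (mask = 0; mask < BIT(NR_INTEL_ARCH_EVENTS); mask++)
				test_arch_events_cpuid(len, mask);
		} else {
			/* For all other lengths, cover the single-bit masks (and zero). */
			test_arch_events_cpuid(len, 0);
			for (bit = 0; bit < NR_INTEL_ARCH_EVENTS; bit++)
				test_arch_events_cpuid(len, BIT(bit));
		}
	}
}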

I'll send a v6 later this week.
  
Sean Christopherson Oct. 24, 2023, 9 p.m. UTC | #2
On Tue, Oct 24, 2023, Sean Christopherson wrote:
> On Mon, Oct 23, 2023, Sean Christopherson wrote:
> > +static void test_intel_arch_events(void)
> > +{
> > +	uint8_t idx, i, j;
> > +
> > +	for (idx = 0; idx < NR_INTEL_ARCH_EVENTS; idx++) {

*sigh*

Yet another KVM bug that this test _should_ catch, but doesn't because too many
things are hardcoded.  KVM _still_ advertises Top Down Slots to userspace because
KVM doesn't sanitize the bit vector or its length that comes out of perf.
  
Jinrong Liang Oct. 25, 2023, 3:17 a.m. UTC | #3
On 2023/10/25 03:49, Sean Christopherson wrote:
> On Mon, Oct 23, 2023, Sean Christopherson wrote:
>> +static void guest_measure_pmu_v1(struct kvm_x86_pmu_feature event,
>> +				 uint32_t counter_msr, uint32_t nr_gp_counters)
>> +{
>> +	uint8_t idx = event.f.bit;
>> +	unsigned int i;
>> +
>> +	for (i = 0; i < nr_gp_counters; i++) {
>> +		wrmsr(counter_msr + i, 0);
>> +		wrmsr(MSR_P6_EVNTSEL0 + i, ARCH_PERFMON_EVENTSEL_OS |
>> +		      ARCH_PERFMON_EVENTSEL_ENABLE | intel_pmu_arch_events[idx]);
>> +		__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
>> +
>> +		if (pmu_is_intel_event_stable(idx))
>> +			GUEST_ASSERT_EQ(this_pmu_has(event), !!_rdpmc(i));
>> +
>> +		wrmsr(MSR_P6_EVNTSEL0 + i, ARCH_PERFMON_EVENTSEL_OS |
>> +		      !ARCH_PERFMON_EVENTSEL_ENABLE |
>> +		      intel_pmu_arch_events[idx]);
>> +		wrmsr(counter_msr + i, 0);
>> +		__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
>> +
>> +		if (pmu_is_intel_event_stable(idx))
>> +			GUEST_ASSERT(!_rdpmc(i));
>> +	}
>> +
>> +	GUEST_DONE();
>> +}
>> +
>> +static void guest_measure_loop(uint8_t idx)
>> +{
>> +	const struct {
>> +		struct kvm_x86_pmu_feature gp_event;
>> +	} intel_event_to_feature[] = {
>> +		[INTEL_ARCH_CPU_CYCLES]		   = { X86_PMU_FEATURE_CPU_CYCLES },
>> +		[INTEL_ARCH_INSTRUCTIONS_RETIRED]  = { X86_PMU_FEATURE_INSNS_RETIRED },
>> +		[INTEL_ARCH_REFERENCE_CYCLES]	   = { X86_PMU_FEATURE_REFERENCE_CYCLES },
>> +		[INTEL_ARCH_LLC_REFERENCES]	   = { X86_PMU_FEATURE_LLC_REFERENCES },
>> +		[INTEL_ARCH_LLC_MISSES]		   = { X86_PMU_FEATURE_LLC_MISSES },
>> +		[INTEL_ARCH_BRANCHES_RETIRED]	   = { X86_PMU_FEATURE_BRANCH_INSNS_RETIRED },
>> +		[INTEL_ARCH_BRANCHES_MISPREDICTED] = { X86_PMU_FEATURE_BRANCHES_MISPREDICTED },
>> +	};
>> +
>> +	uint32_t nr_gp_counters = this_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS);
>> +	uint32_t pmu_version = this_cpu_property(X86_PROPERTY_PMU_VERSION);
>> +	struct kvm_x86_pmu_feature gp_event;
>> +	uint32_t counter_msr;
>> +	unsigned int i;
>> +
>> +	if (rdmsr(MSR_IA32_PERF_CAPABILITIES) & PMU_CAP_FW_WRITES)
>> +		counter_msr = MSR_IA32_PMC0;
>> +	else
>> +		counter_msr = MSR_IA32_PERFCTR0;
>> +
>> +	gp_event = intel_event_to_feature[idx].gp_event;
>> +	TEST_ASSERT_EQ(idx, gp_event.f.bit);
>> +
>> +	if (pmu_version < 2) {
>> +		guest_measure_pmu_v1(gp_event, counter_msr, nr_gp_counters);
> 
> Looking at this again, testing guest PMU version 1 is practically impossible
> because this testcase doesn't force the guest PMU version.  I.e. unless I'm
> missing something, this requires old hardware or running in a VM with its PMU
> forced to '1'.
> 
> And if all subtests use similar inputs, the common configuration can be shoved
> into pmu_vm_create_with_one_vcpu().
> 
> It's easy enough to fold test_intel_arch_events() into test_intel_counters(),
> which will also provide coverage for running with full-width writes enabled.  The
> only downside is that the total runtime will be longer.
> 
>> +static void test_arch_events_cpuid(uint8_t i, uint8_t j, uint8_t idx)
>> +{
>> +	uint8_t arch_events_unavailable_mask = BIT_ULL(j);
>> +	uint8_t arch_events_bitmap_size = BIT_ULL(i);
>> +	struct kvm_vcpu *vcpu;
>> +	struct kvm_vm *vm;
>> +
>> +	vm = pmu_vm_create_with_one_vcpu(&vcpu, guest_measure_loop);
>> +
>> +	vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH,
>> +				arch_events_bitmap_size);
>> +	vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_EVENTS_MASK,
>> +				arch_events_unavailable_mask);
>> +
>> +	vcpu_args_set(vcpu, 1, idx);
>> +
>> +	run_vcpu(vcpu);
>> +
>> +	kvm_vm_free(vm);
>> +}
>> +
>> +static void test_intel_arch_events(void)
>> +{
>> +	uint8_t idx, i, j;
>> +
>> +	for (idx = 0; idx < NR_INTEL_ARCH_EVENTS; idx++) {
> 
> There's no need to iterate over each event in the host; we can simply add a wrapper
> for guest_measure_loop() in the guest.  That'll be slightly faster since it won't
> require creating and destroying a VM for every event.
> 
>> +		/*
>> +		 * A brute force iteration of all combinations of values is
>> +		 * likely to exhaust a single process's file descriptor limit,
>> +		 * so the test instead iterates through all valid
>> +		 * single-bit values.
>> +		 */
>> +		for (i = 0; i < NR_INTEL_ARCH_EVENTS; i++) {
> 
> This is flawed/odd.  'i' becomes arch_events_bitmap_size, i.e. it's a length,
> but the length is computed by BIT(i).  That's nonsensical and will eventually
> result in undefined behavior.  Oof, that'll actually happen sooner than later
> because arch_events_bitmap_size is only a single byte, i.e. when the number of
> events hits 9, this will try to shove 256 into an 8-bit variable.
> 
> The more correct approach would be to pass in 0..NR_INTEL_ARCH_EVENTS inclusive
> as the size.  But I think we should actually test 0..length+1, where "length" is
> the max of the native length and NR_INTEL_ARCH_EVENTS, i.e. we should verify that
> KVM handles a size larger than the native length.
> 
>> +			for (j = 0; j < NR_INTEL_ARCH_EVENTS; j++)
>> +				test_arch_events_cpuid(i, j, idx);
> 
> And here, I think it makes sense to brute force all possible values for at least
> one configuration.  There aren't actually _that_ many values, e.g. currently it's
> 64 (I think).  E.g. test the native PMU version with the "full" length, and then
> test single bits with varying lengths.
> 
> I'll send a v6 later this week.

Got it, thanks.

Please feel free to let me know if there's anything you'd like me to do.
  
Mingwei Zhang Oct. 26, 2023, 8:38 p.m. UTC | #4
On Mon, Oct 23, 2023, Sean Christopherson wrote:
> From: Jinrong Liang <cloudliang@tencent.com>
> 
> Add test cases to check whether each architectural event is still available
> after it is marked as unavailable via CPUID. This covers the vPMU event
> filtering logic based on Intel CPUID, and complements pmu_event_filter.
> 
> According to the Intel SDM, the number of architectural events is reported
> through CPUID.0AH:EAX[31:24], and architectural event x is supported if
> EBX[x]=0 && EAX[31:24]>x.
> 
> Co-developed-by: Like Xu <likexu@tencent.com>
> Signed-off-by: Like Xu <likexu@tencent.com>
> Signed-off-by: Jinrong Liang <cloudliang@tencent.com>
> Co-developed-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  tools/testing/selftests/kvm/Makefile          |   1 +
>  .../selftests/kvm/x86_64/pmu_counters_test.c  | 189 ++++++++++++++++++
>  2 files changed, 190 insertions(+)
>  create mode 100644 tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
> 
> diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
> index ed1c17cabc07..4c024fb845b4 100644
> --- a/tools/testing/selftests/kvm/Makefile
> +++ b/tools/testing/selftests/kvm/Makefile
> @@ -82,6 +82,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
>  TEST_GEN_PROGS_x86_64 += x86_64/monitor_mwait_test
>  TEST_GEN_PROGS_x86_64 += x86_64/nested_exceptions_test
>  TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
> +TEST_GEN_PROGS_x86_64 += x86_64/pmu_counters_test
>  TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test
>  TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id
>  TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
> diff --git a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
> new file mode 100644
> index 000000000000..2a6336b994d5
> --- /dev/null
> +++ b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
> @@ -0,0 +1,189 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2023, Tencent, Inc.
> + */
> +
> +#define _GNU_SOURCE /* for program_invocation_short_name */
> +#include <x86intrin.h>
> +
> +#include "pmu.h"
> +#include "processor.h"
> +
> +/* Guest payload for any performance counter counting */
> +#define NUM_BRANCHES		10
> +
> +static struct kvm_vm *pmu_vm_create_with_one_vcpu(struct kvm_vcpu **vcpu,
> +						  void *guest_code)
> +{
> +	struct kvm_vm *vm;
> +
> +	vm = vm_create_with_one_vcpu(vcpu, guest_code);
> +	vm_init_descriptor_tables(vm);
> +	vcpu_init_descriptor_tables(*vcpu);
> +
> +	return vm;
> +}
> +
> +static void run_vcpu(struct kvm_vcpu *vcpu)
> +{
> +	struct ucall uc;
> +
> +	do {
> +		vcpu_run(vcpu);
> +		switch (get_ucall(vcpu, &uc)) {
> +		case UCALL_SYNC:
> +			break;
> +		case UCALL_ABORT:
> +			REPORT_GUEST_ASSERT(uc);
> +			break;
> +		case UCALL_DONE:
> +			break;
> +		default:
> +			TEST_FAIL("Unexpected ucall: %lu", uc.cmd);
> +		}
> +	} while (uc.cmd != UCALL_DONE);
> +}
> +
> +static bool pmu_is_intel_event_stable(uint8_t idx)
> +{
> +	switch (idx) {
> +	case INTEL_ARCH_CPU_CYCLES:
> +	case INTEL_ARCH_INSTRUCTIONS_RETIRED:
> +	case INTEL_ARCH_REFERENCE_CYCLES:
> +	case INTEL_ARCH_BRANCHES_RETIRED:
> +		return true;
> +	default:
> +		return false;
> +	}
> +}

A brief explanation of why the other events are not stable, please. Since
there are only a few architectural events, maybe listing all of them with an
explanation in comments would work better. Let out-of-bounds indices return
false in the default case.
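
E.g. something along these lines (just a sketch; the per-event reasoning is my
guess and would need to be double-checked):

static bool pmu_is_intel_event_stable(uint8_t idx)
{
	switch (idx) {
	case INTEL_ARCH_CPU_CYCLES:		/* the measured loop always burns cycles */
	case INTEL_ARCH_INSTRUCTIONS_RETIRED:	/* ...and always retires instructions */
	case INTEL_ARCH_REFERENCE_CYCLES:	/* reference cycles always tick */
	case INTEL_ARCH_BRANCHES_RETIRED:	/* LOOP guarantees taken branches */
		return true;
	case INTEL_ARCH_LLC_REFERENCES:		/* not guaranteed, the loop may never touch the LLC */
	case INTEL_ARCH_LLC_MISSES:		/* not guaranteed without forcing an eviction */
	case INTEL_ARCH_BRANCHES_MISPREDICTED:	/* a fixed-count loop is trivially predicted */
		return false;
	default:				/* treat out-of-bounds indices as unstable */
		return false;
	}
}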
> +
> +static void guest_measure_pmu_v1(struct kvm_x86_pmu_feature event,
> +				 uint32_t counter_msr, uint32_t nr_gp_counters)
> +{
> +	uint8_t idx = event.f.bit;
> +	unsigned int i;
> +
> +	for (i = 0; i < nr_gp_counters; i++) {
> +		wrmsr(counter_msr + i, 0);
> +		wrmsr(MSR_P6_EVNTSEL0 + i, ARCH_PERFMON_EVENTSEL_OS |
> +		      ARCH_PERFMON_EVENTSEL_ENABLE | intel_pmu_arch_events[idx]);
> +		__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));

Some comment might be needed for readability. Abruptly inserting inline
assembly code in C destroys the readability.

I wonder, do we need to add a 'clobber' here for the above line, since it
takes away ECX?

Also, I wonder if we need to disable IRQ here? This code might be
intercepted and resumed. If so, then the test will get a different
number?
> +
> +		if (pmu_is_intel_event_stable(idx))
> +			GUEST_ASSERT_EQ(this_pmu_has(event), !!_rdpmc(i));

Okay, so the counter value being non-zero means we pass the test?!

hmm, I wonder other than IRQ stuff, what else may affect the result? NMI
watchdog or what?
> +
> +		wrmsr(MSR_P6_EVNTSEL0 + i, ARCH_PERFMON_EVENTSEL_OS |
> +		      !ARCH_PERFMON_EVENTSEL_ENABLE |
> +		      intel_pmu_arch_events[idx]);
> +		wrmsr(counter_msr + i, 0);
> +		__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
ditto for readability. Please consider using a macro to avoid repeated
explanation.

> +
> +		if (pmu_is_intel_event_stable(idx))
> +			GUEST_ASSERT(!_rdpmc(i));
> +	}
> +
> +	GUEST_DONE();
> +}
> +
> +static void guest_measure_loop(uint8_t idx)
> +{
> +	const struct {
> +		struct kvm_x86_pmu_feature gp_event;
> +	} intel_event_to_feature[] = {
> +		[INTEL_ARCH_CPU_CYCLES]		   = { X86_PMU_FEATURE_CPU_CYCLES },
> +		[INTEL_ARCH_INSTRUCTIONS_RETIRED]  = { X86_PMU_FEATURE_INSNS_RETIRED },
> +		[INTEL_ARCH_REFERENCE_CYCLES]	   = { X86_PMU_FEATURE_REFERENCE_CYCLES },
> +		[INTEL_ARCH_LLC_REFERENCES]	   = { X86_PMU_FEATURE_LLC_REFERENCES },
> +		[INTEL_ARCH_LLC_MISSES]		   = { X86_PMU_FEATURE_LLC_MISSES },
> +		[INTEL_ARCH_BRANCHES_RETIRED]	   = { X86_PMU_FEATURE_BRANCH_INSNS_RETIRED },
> +		[INTEL_ARCH_BRANCHES_MISPREDICTED] = { X86_PMU_FEATURE_BRANCHES_MISPREDICTED },
> +	};
> +
> +	uint32_t nr_gp_counters = this_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS);
> +	uint32_t pmu_version = this_cpu_property(X86_PROPERTY_PMU_VERSION);
> +	struct kvm_x86_pmu_feature gp_event;
> +	uint32_t counter_msr;
> +	unsigned int i;
> +
> +	if (rdmsr(MSR_IA32_PERF_CAPABILITIES) & PMU_CAP_FW_WRITES)
> +		counter_msr = MSR_IA32_PMC0;
> +	else
> +		counter_msr = MSR_IA32_PERFCTR0;
> +
> +	gp_event = intel_event_to_feature[idx].gp_event;
> +	TEST_ASSERT_EQ(idx, gp_event.f.bit);
> +
> +	if (pmu_version < 2) {
> +		guest_measure_pmu_v1(gp_event, counter_msr, nr_gp_counters);
> +		return;
> +	}
> +
> +	for (i = 0; i < nr_gp_counters; i++) {
> +		wrmsr(counter_msr + i, 0);
> +		wrmsr(MSR_P6_EVNTSEL0 + i, ARCH_PERFMON_EVENTSEL_OS |
> +		      ARCH_PERFMON_EVENTSEL_ENABLE |
> +		      intel_pmu_arch_events[idx]);
> +
> +		wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, BIT_ULL(i));
> +		__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
> +		wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
> +
> +		if (pmu_is_intel_event_stable(idx))
> +			GUEST_ASSERT_EQ(this_pmu_has(gp_event), !!_rdpmc(i));
> +	}
> +
> +	GUEST_DONE();
> +}
> +
> +static void test_arch_events_cpuid(uint8_t i, uint8_t j, uint8_t idx)
> +{
> +	uint8_t arch_events_unavailable_mask = BIT_ULL(j);
> +	uint8_t arch_events_bitmap_size = BIT_ULL(i);
> +	struct kvm_vcpu *vcpu;
> +	struct kvm_vm *vm;
> +
> +	vm = pmu_vm_create_with_one_vcpu(&vcpu, guest_measure_loop);
> +
> +	vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH,
> +				arch_events_bitmap_size);
> +	vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_EVENTS_MASK,
> +				arch_events_unavailable_mask);
> +
> +	vcpu_args_set(vcpu, 1, idx);
> +
> +	run_vcpu(vcpu);
> +
> +	kvm_vm_free(vm);
> +}
> +
> +static void test_intel_arch_events(void)
> +{
> +	uint8_t idx, i, j;
> +
> +	for (idx = 0; idx < NR_INTEL_ARCH_EVENTS; idx++) {
> +		/*
> +		 * A brute force iteration of all combinations of values is
> +		 * likely to exhaust a single process's file descriptor limit,
> +		 * so the test instead iterates through all valid
> +		 * single-bit values.
> +		 */
> +		for (i = 0; i < NR_INTEL_ARCH_EVENTS; i++) {
> +			for (j = 0; j < NR_INTEL_ARCH_EVENTS; j++)
> +				test_arch_events_cpuid(i, j, idx);
> +		}
> +	}
> +}
> +
> +int main(int argc, char *argv[])
> +{
> +	TEST_REQUIRE(get_kvm_param_bool("enable_pmu"));
> +
> +	TEST_REQUIRE(host_cpu_is_intel);
> +	TEST_REQUIRE(kvm_cpu_has_p(X86_PROPERTY_PMU_VERSION));
> +	TEST_REQUIRE(kvm_cpu_property(X86_PROPERTY_PMU_VERSION) > 0);
> +	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_PDCM));

hmm, this means we cannot run this in nested if X86_FEATURE_PDCM is
missing. It only affects full-width counter, right?

> +
> +	test_intel_arch_events();
> +
> +	return 0;
> +}
> -- 
> 2.42.0.758.gaed0368e0e-goog
>
  
Sean Christopherson Oct. 26, 2023, 8:54 p.m. UTC | #5
On Thu, Oct 26, 2023, Mingwei Zhang wrote:
> > +static bool pmu_is_intel_event_stable(uint8_t idx)
> > +{
> > +	switch (idx) {
> > +	case INTEL_ARCH_CPU_CYCLES:
> > +	case INTEL_ARCH_INSTRUCTIONS_RETIRED:
> > +	case INTEL_ARCH_REFERENCE_CYCLES:
> > +	case INTEL_ARCH_BRANCHES_RETIRED:
> > +		return true;
> > +	default:
> > +		return false;
> > +	}
> > +}
> 
> A brief explanation of why the other events are not stable, please. Since
> there are only a few architectural events, maybe listing all of them with
> an explanation in comments would work better.

Heh, I've already rewritten this logic to make 


> > +
> > +static void guest_measure_pmu_v1(struct kvm_x86_pmu_feature event,
> > +				 uint32_t counter_msr, uint32_t nr_gp_counters)
> > +{
> > +	uint8_t idx = event.f.bit;
> > +	unsigned int i;
> > +
> > +	for (i = 0; i < nr_gp_counters; i++) {
> > +		wrmsr(counter_msr + i, 0);
> > +		wrmsr(MSR_P6_EVNTSEL0 + i, ARCH_PERFMON_EVENTSEL_OS |
> > +		      ARCH_PERFMON_EVENTSEL_ENABLE | intel_pmu_arch_events[idx]);
> > +		__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
> 
> Some comment might be needed for readability. Abruptly inserting inline
> assembly code in C destroys the readability.
> 
> I wonder, do we need to add a 'clobber' here for the above line, since it
> takes away ECX?

It's already there.  You can't directly clobber a register that is used as an
input constraint.  The workaround is to make the register both an input and an
output, hence the "+c" in the outputs section instead of just "c" in the inputs
section.  The extra bit of cleverness is to use an intermediate anonymous variable
so that NUM_BRANCHES can effectively be passed in (#defines won't work as output
constraints).
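
Stripped down to just the trick, purely for illustration (not the actual test
code):

#define NUM_BRANCHES 10

static inline void count_branches(void)
{
	/*
	 * "+c" makes ECX both an input and an output, i.e. fair game for the
	 * asm to clobber.  The anonymous compound literal (int){NUM_BRANCHES}
	 * provides the writable lvalue that an output constraint requires,
	 * which is why the #define can't be used directly.
	 */
	__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
}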

> Also, I wonder if we need to disable IRQ here? This code might be
> intercepted and resumed. If so, then the test will get a different
> number?

This is guest code; disabling IRQs is pointless.  There are no guest virtual IRQs,
guarding against host IRQs is impossible, unnecessary, and actually undesirable,
i.e. the guest vPMU shouldn't be counting host instructions and whatnot.

> > +
> > +		if (pmu_is_intel_event_stable(idx))
> > +			GUEST_ASSERT_EQ(this_pmu_has(event), !!_rdpmc(i));
> 
> Okay, so the counter value being non-zero means we pass the test?!

FWIW, I've updated 

> hmm, I wonder other than IRQ stuff, what else may affect the result? NMI
> watchdog or what?

This is the beauty of selftests.  They're _so_ simple that there are very few
surprises.  E.g. there are no events of any kind unless the test explicitly
generates them.  The downside is that doing anything complex in selftests requires
writing a fair bit of code.

> > +
> > +		wrmsr(MSR_P6_EVNTSEL0 + i, ARCH_PERFMON_EVENTSEL_OS |
> > +		      !ARCH_PERFMON_EVENTSEL_ENABLE |
> > +		      intel_pmu_arch_events[idx]);
> > +		wrmsr(counter_msr + i, 0);
> > +		__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
> ditto for readability. Please consider using a macro to avoid repeated
> explanation.

Heh, already did this too.  Though I'm not entirely sure it's more readable.  It's
definitely more precise and featured :-)

#define GUEST_MEASURE_EVENT(_msr, _value, clflush, FEP)				\
do {										\
	__asm__ __volatile__("wrmsr\n\t"					\
			     clflush "\n\t"					\
			     "mfence\n\t"					\
			     "1: mov $" __stringify(NUM_BRANCHES) ", %%ecx\n\t"	\
			     FEP "loop .\n\t"					\
			     FEP "mov %%edi, %%ecx\n\t"				\
			     FEP "xor %%eax, %%eax\n\t"				\
			     FEP "xor %%edx, %%edx\n\t"				\
			     "wrmsr\n\t"					\
			     : "+c"((int){_msr})				\
			     : "a"((uint32_t)_value), "d"(_value >> 32),	\
			       "D"(_msr)					\
	);									\
} while (0)


> > +int main(int argc, char *argv[])
> > +{
> > +	TEST_REQUIRE(get_kvm_param_bool("enable_pmu"));
> > +
> > +	TEST_REQUIRE(host_cpu_is_intel);
> > +	TEST_REQUIRE(kvm_cpu_has_p(X86_PROPERTY_PMU_VERSION));
> > +	TEST_REQUIRE(kvm_cpu_property(X86_PROPERTY_PMU_VERSION) > 0);
> > +	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_PDCM));
> 
> hmm, this means we cannot run this in nested if X86_FEATURE_PDCM is
> missing. It only affects full-width counter, right?

Ah, yeah, good call.  It won't be too much trouble to have the test play nice
with !PDCM.
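
E.g. on the guest side, something like this (sketch), plus dropping the PDCM
TEST_REQUIRE() in main():

	/* Use full-width writes iff PDCM, and thus PERF_CAPABILITIES, exists. */
	if (this_cpu_has(X86_FEATURE_PDCM) &&
	    rdmsr(MSR_IA32_PERF_CAPABILITIES) & PMU_CAP_FW_WRITES)
		counter_msr = MSR_IA32_PMC0;
	else
		counter_msr = MSR_IA32_PERFCTR0;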
  
Mingwei Zhang Oct. 26, 2023, 10:10 p.m. UTC | #6
On Thu, Oct 26, 2023, Sean Christopherson wrote:
> On Thu, Oct 26, 2023, Mingwei Zhang wrote:
> > > +static bool pmu_is_intel_event_stable(uint8_t idx)
> > > +{
> > > +	switch (idx) {
> > > +	case INTEL_ARCH_CPU_CYCLES:
> > > +	case INTEL_ARCH_INSTRUCTIONS_RETIRED:
> > > +	case INTEL_ARCH_REFERENCE_CYCLES:
> > > +	case INTEL_ARCH_BRANCHES_RETIRED:
> > > +		return true;
> > > +	default:
> > > +		return false;
> > > +	}
> > > +}
> > 
> > A brief explanation of why the other events are not stable, please. Since
> > there are only a few architectural events, maybe listing all of them with
> > an explanation in comments would work better.
> 
> Heh, I've already rewritten this logic to make 
> 
> 
> > > +
> > > +static void guest_measure_pmu_v1(struct kvm_x86_pmu_feature event,
> > > +				 uint32_t counter_msr, uint32_t nr_gp_counters)
> > > +{
> > > +	uint8_t idx = event.f.bit;
> > > +	unsigned int i;
> > > +
> > > +	for (i = 0; i < nr_gp_counters; i++) {
> > > +		wrmsr(counter_msr + i, 0);
> > > +		wrmsr(MSR_P6_EVNTSEL0 + i, ARCH_PERFMON_EVENTSEL_OS |
> > > +		      ARCH_PERFMON_EVENTSEL_ENABLE | intel_pmu_arch_events[idx]);
> > > +		__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
> > 
> > Some comment might be needed for readability. Abruptly inserting inline
> > assembly code in C destroys the readability.
> > 
> > I wonder, do we need to add a 'clobber' here for the above line, since it
> > takes away ECX?
> 
> It's already there.  You can't directly clobber a register that is used as an
> input constraint.  The workaround is to make the register both an input and an
> output, hence the "+c" in the outputs section instead of just "c" in the inputs
> section.  The extra bit of cleverness is to use an intermediate anonymous variable
> so that NUM_BRANCHES can effectively be passed in (#defines won't work as output
> constraints).
> 
> > Also, I wonder if we need to disable IRQ here? This code might be
> > intercepted and resumed. If so, then the test will get a different
> > number?
> 
> This is guest code; disabling IRQs is pointless.  There are no guest virtual IRQs,
> guarding against host IRQs is impossible, unnecessary, and actually undesirable,
> i.e. the guest vPMU shouldn't be counting host instructions and whatnot.
> 
> > > +
> > > +		if (pmu_is_intel_event_stable(idx))
> > > +			GUEST_ASSERT_EQ(this_pmu_has(event), !!_rdpmc(i));
> > 
> > Okay, so the counter value being non-zero means we pass the test?!
> 
> FWIW, I've updated 
> 
> > hmm, I wonder other than IRQ stuff, what else may affect the result? NMI
> > watchdog or what?
> 
> This is the beauty of selftests.  They're _so_ simple that there are very few
> surprises.  E.g. there are no events of any kind unless the test explicitly
> generates them.  The downside is that doing anything complex in selftests requires
> writing a fair bit of code.

Understood, so we could support precise matching.
>
> > > +
> > > +		wrmsr(MSR_P6_EVNTSEL0 + i, ARCH_PERFMON_EVENTSEL_OS |
> > > +		      !ARCH_PERFMON_EVENTSEL_ENABLE |
> > > +		      intel_pmu_arch_events[idx]);
> > > +		wrmsr(counter_msr + i, 0);
> > > +		__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
> > ditto for readability. Please consider using a macro to avoid repeated
> > explanation.
> 
> Heh, already did this too.  Though I'm not entirely sure it's more readable.  It's
> definitely more precise and featured :-)
> 
Oh dear, this is challenging to my rusty inline assembly skills :)

> #define GUEST_MEASURE_EVENT(_msr, _value, clflush, FEP)				\
> do {										\
> 	__asm__ __volatile__("wrmsr\n\t"					\
> 			     clflush "\n\t"					\
> 			     "mfence\n\t"					\
> 			     "1: mov $" __stringify(NUM_BRANCHES) ", %%ecx\n\t"	\
> 			     FEP "loop .\n\t"					\
> 			     FEP "mov %%edi, %%ecx\n\t"				\
> 			     FEP "xor %%eax, %%eax\n\t"				\
> 			     FEP "xor %%edx, %%edx\n\t"				\
> 			     "wrmsr\n\t"					\
> 			     : "+c"((int){_msr})				\
isn't it NUM_BRANCHES?
> 			     : "a"((uint32_t)_value), "d"(_value >> 32),	\
> 			       "D"(_msr)					\
> 	);									\
> } while (0)
>

do we need this label '1:' in the above code? It does not seem to be
used anywhere within the code.

why is clflush needed here?
> 
> > > +int main(int argc, char *argv[])
> > > +{
> > > +	TEST_REQUIRE(get_kvm_param_bool("enable_pmu"));
> > > +
> > > +	TEST_REQUIRE(host_cpu_is_intel);
> > > +	TEST_REQUIRE(kvm_cpu_has_p(X86_PROPERTY_PMU_VERSION));
> > > +	TEST_REQUIRE(kvm_cpu_property(X86_PROPERTY_PMU_VERSION) > 0);
> > > +	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_PDCM));
> > 
> > hmm, this means we cannot run this in nested if X86_FEATURE_PDCM is
> > missing. It only affects full-width counter, right?
> 
> Ah, yeah, good call.  It won't be too much trouble to have the test play nice
> with !PDCM.
  
Sean Christopherson Oct. 26, 2023, 10:54 p.m. UTC | #7
On Thu, Oct 26, 2023, Mingwei Zhang wrote:
> On Thu, Oct 26, 2023, Sean Christopherson wrote:
> > Heh, already did this too.  Though I'm not entirely sure it's more readable.  It's
> > definitely more precise and featured :-)
> > 
> Oh dear, this is challenging to my rusty inline assembly skills :)
> 
> > #define GUEST_MEASURE_EVENT(_msr, _value, clflush, FEP)				\
> > do {										\
> > 	__asm__ __volatile__("wrmsr\n\t"					\
> > 			     clflush "\n\t"					\
> > 			     "mfence\n\t"					\
> > 			     "1: mov $" __stringify(NUM_BRANCHES) ", %%ecx\n\t"	\
> > 			     FEP "loop .\n\t"					\
> > 			     FEP "mov %%edi, %%ecx\n\t"				\
> > 			     FEP "xor %%eax, %%eax\n\t"				\
> > 			     FEP "xor %%edx, %%edx\n\t"				\
> > 			     "wrmsr\n\t"					\
> > 			     : "+c"((int){_msr})				\
> isn't it NUM_BRANCHES?

Nope.  It's hard to see because this doesn't provide the usage, but @_msr is an
MSR index that is consumed by the first "wrmsr", i.e. this blob relies on the
compiler to preload ECX, EAX, and EDX for WRMSR.  NUM_BRANCHES is manually loaded
into ECX after WRMSR (WRMSR and LOOP both hardcode consuming ECX).

Ha!  I can actually drop the "+c" clobbering trick since ECX is restored to its
input value before the asm blob finishes.  EDI is also loaded with @_msr so that
it can be quickly reloaded into ECX for the WRMSR to disable the event.

The purpose of doing both WRMSRs in assembly is to ensure the compiler doesn't
insert _any_ code into the measured sequence, e.g. so that a random Jcc doesn't
throw off instructions retired.

> > 			     : "a"((uint32_t)_value), "d"(_value >> 32),	\
> > 			       "D"(_msr)					\
> > 	);									\
> > } while (0)
> >
> 
> do we need this label '1:' in the above code? It does not seem to be
> used anywhere within the code.

It's used by the caller as the target for CLFLUSH{,OPT}.

	if (this_cpu_has(X86_FEATURE_CLFLUSHOPT))				\
		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflushopt 1f", FEP);	\
	else if (this_cpu_has(X86_FEATURE_CLFLUSH))				\
		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflush 1f", FEP);	\
	else									\
		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "nop", FEP);		\
> 
> why is clflush needed here?

As suggested by Jim, it allows verifying LLC references and misses by forcing
the CPU to evict the cache line that holds the MOV at 1: (and likely holds most
of the asm blob).
  

Patch

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index ed1c17cabc07..4c024fb845b4 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -82,6 +82,7 @@  TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
 TEST_GEN_PROGS_x86_64 += x86_64/monitor_mwait_test
 TEST_GEN_PROGS_x86_64 += x86_64/nested_exceptions_test
 TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
+TEST_GEN_PROGS_x86_64 += x86_64/pmu_counters_test
 TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test
 TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id
 TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
diff --git a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
new file mode 100644
index 000000000000..2a6336b994d5
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
@@ -0,0 +1,189 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023, Tencent, Inc.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <x86intrin.h>
+
+#include "pmu.h"
+#include "processor.h"
+
+/* Guest payload for any performance counter counting */
+#define NUM_BRANCHES		10
+
+static struct kvm_vm *pmu_vm_create_with_one_vcpu(struct kvm_vcpu **vcpu,
+						  void *guest_code)
+{
+	struct kvm_vm *vm;
+
+	vm = vm_create_with_one_vcpu(vcpu, guest_code);
+	vm_init_descriptor_tables(vm);
+	vcpu_init_descriptor_tables(*vcpu);
+
+	return vm;
+}
+
+static void run_vcpu(struct kvm_vcpu *vcpu)
+{
+	struct ucall uc;
+
+	do {
+		vcpu_run(vcpu);
+		switch (get_ucall(vcpu, &uc)) {
+		case UCALL_SYNC:
+			break;
+		case UCALL_ABORT:
+			REPORT_GUEST_ASSERT(uc);
+			break;
+		case UCALL_DONE:
+			break;
+		default:
+			TEST_FAIL("Unexpected ucall: %lu", uc.cmd);
+		}
+	} while (uc.cmd != UCALL_DONE);
+}
+
+static bool pmu_is_intel_event_stable(uint8_t idx)
+{
+	switch (idx) {
+	case INTEL_ARCH_CPU_CYCLES:
+	case INTEL_ARCH_INSTRUCTIONS_RETIRED:
+	case INTEL_ARCH_REFERENCE_CYCLES:
+	case INTEL_ARCH_BRANCHES_RETIRED:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static void guest_measure_pmu_v1(struct kvm_x86_pmu_feature event,
+				 uint32_t counter_msr, uint32_t nr_gp_counters)
+{
+	uint8_t idx = event.f.bit;
+	unsigned int i;
+
+	for (i = 0; i < nr_gp_counters; i++) {
+		wrmsr(counter_msr + i, 0);
+		wrmsr(MSR_P6_EVNTSEL0 + i, ARCH_PERFMON_EVENTSEL_OS |
+		      ARCH_PERFMON_EVENTSEL_ENABLE | intel_pmu_arch_events[idx]);
+		__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
+
+		if (pmu_is_intel_event_stable(idx))
+			GUEST_ASSERT_EQ(this_pmu_has(event), !!_rdpmc(i));
+
+		wrmsr(MSR_P6_EVNTSEL0 + i, ARCH_PERFMON_EVENTSEL_OS |
+		      !ARCH_PERFMON_EVENTSEL_ENABLE |
+		      intel_pmu_arch_events[idx]);
+		wrmsr(counter_msr + i, 0);
+		__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
+
+		if (pmu_is_intel_event_stable(idx))
+			GUEST_ASSERT(!_rdpmc(i));
+	}
+
+	GUEST_DONE();
+}
+
+static void guest_measure_loop(uint8_t idx)
+{
+	const struct {
+		struct kvm_x86_pmu_feature gp_event;
+	} intel_event_to_feature[] = {
+		[INTEL_ARCH_CPU_CYCLES]		   = { X86_PMU_FEATURE_CPU_CYCLES },
+		[INTEL_ARCH_INSTRUCTIONS_RETIRED]  = { X86_PMU_FEATURE_INSNS_RETIRED },
+		[INTEL_ARCH_REFERENCE_CYCLES]	   = { X86_PMU_FEATURE_REFERENCE_CYCLES },
+		[INTEL_ARCH_LLC_REFERENCES]	   = { X86_PMU_FEATURE_LLC_REFERENCES },
+		[INTEL_ARCH_LLC_MISSES]		   = { X86_PMU_FEATURE_LLC_MISSES },
+		[INTEL_ARCH_BRANCHES_RETIRED]	   = { X86_PMU_FEATURE_BRANCH_INSNS_RETIRED },
+		[INTEL_ARCH_BRANCHES_MISPREDICTED] = { X86_PMU_FEATURE_BRANCHES_MISPREDICTED },
+	};
+
+	uint32_t nr_gp_counters = this_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS);
+	uint32_t pmu_version = this_cpu_property(X86_PROPERTY_PMU_VERSION);
+	struct kvm_x86_pmu_feature gp_event;
+	uint32_t counter_msr;
+	unsigned int i;
+
+	if (rdmsr(MSR_IA32_PERF_CAPABILITIES) & PMU_CAP_FW_WRITES)
+		counter_msr = MSR_IA32_PMC0;
+	else
+		counter_msr = MSR_IA32_PERFCTR0;
+
+	gp_event = intel_event_to_feature[idx].gp_event;
+	TEST_ASSERT_EQ(idx, gp_event.f.bit);
+
+	if (pmu_version < 2) {
+		guest_measure_pmu_v1(gp_event, counter_msr, nr_gp_counters);
+		return;
+	}
+
+	for (i = 0; i < nr_gp_counters; i++) {
+		wrmsr(counter_msr + i, 0);
+		wrmsr(MSR_P6_EVNTSEL0 + i, ARCH_PERFMON_EVENTSEL_OS |
+		      ARCH_PERFMON_EVENTSEL_ENABLE |
+		      intel_pmu_arch_events[idx]);
+
+		wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, BIT_ULL(i));
+		__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
+		wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+
+		if (pmu_is_intel_event_stable(idx))
+			GUEST_ASSERT_EQ(this_pmu_has(gp_event), !!_rdpmc(i));
+	}
+
+	GUEST_DONE();
+}
+
+static void test_arch_events_cpuid(uint8_t i, uint8_t j, uint8_t idx)
+{
+	uint8_t arch_events_unavailable_mask = BIT_ULL(j);
+	uint8_t arch_events_bitmap_size = BIT_ULL(i);
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+
+	vm = pmu_vm_create_with_one_vcpu(&vcpu, guest_measure_loop);
+
+	vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH,
+				arch_events_bitmap_size);
+	vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_EVENTS_MASK,
+				arch_events_unavailable_mask);
+
+	vcpu_args_set(vcpu, 1, idx);
+
+	run_vcpu(vcpu);
+
+	kvm_vm_free(vm);
+}
+
+static void test_intel_arch_events(void)
+{
+	uint8_t idx, i, j;
+
+	for (idx = 0; idx < NR_INTEL_ARCH_EVENTS; idx++) {
+		/*
+		 * A brute force iteration of all combinations of values is
+		 * likely to exhaust a single process's file descriptor limit,
+		 * so the test instead iterates through all valid
+		 * single-bit values.
+		 */
+		for (i = 0; i < NR_INTEL_ARCH_EVENTS; i++) {
+			for (j = 0; j < NR_INTEL_ARCH_EVENTS; j++)
+				test_arch_events_cpuid(i, j, idx);
+		}
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	TEST_REQUIRE(get_kvm_param_bool("enable_pmu"));
+
+	TEST_REQUIRE(host_cpu_is_intel);
+	TEST_REQUIRE(kvm_cpu_has_p(X86_PROPERTY_PMU_VERSION));
+	TEST_REQUIRE(kvm_cpu_property(X86_PROPERTY_PMU_VERSION) > 0);
+	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_PDCM));
+
+	test_intel_arch_events();
+
+	return 0;
+}