[v4,11/12] KVM: x86/svm/pmu: Add AMD PerfMonV2 support

Message ID 20230214050757.9623-12-likexu@tencent.com
State New
Series KVM: x86: Add AMD Guest PerfMonV2 PMU support

Commit Message

Like Xu Feb. 14, 2023, 5:07 a.m. UTC
  From: Like Xu <likexu@tencent.com>

If AMD Performance Monitoring Version 2 (PerfMonV2) is detected by
the guest, it can use a new scheme to manage the Core PMCs using the
new global control and status registers.

In addition to benefiting from the same PerfMonV2 functionality as the host
(higher precision), the guest can also reduce the number of VM-exits by
lowering the total number of MSR accesses.

In terms of implementation details, amd_is_valid_msr() is resurrected
since the three newly added MSRs cannot be mapped to a single vPMC.
The possibility of emulating PerfMonV2 on hosts that lack it has also
been ruled out for precision reasons.

Co-developed-by: Sandipan Das <sandipan.das@amd.com>
Signed-off-by: Sandipan Das <sandipan.das@amd.com>
Signed-off-by: Like Xu <likexu@tencent.com>
---
 arch/x86/kvm/pmu.c     | 14 +++++++++--
 arch/x86/kvm/svm/pmu.c | 53 ++++++++++++++++++++++++++++++++++--------
 arch/x86/kvm/x86.c     | 10 ++++++++
 3 files changed, 65 insertions(+), 12 deletions(-)
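
For context, a rough guest-side sketch of why this cuts down on MSR traffic.
This is not part of the patch; wrmsrl() and the MSR_* names are simply the
usual Linux helpers/constants from <asm/msr.h> and <asm/msr-index.h>:

	/*
	 * Illustration only: with PerfMonV2 a guest kernel can gate all of
	 * its core PMCs with a single MSR write instead of touching every
	 * PERF_CTLx individually, i.e. one WRMSR (and one VM-exit) instead
	 * of one per counter.
	 */
	static void guest_enable_all_core_pmcs(unsigned int nr_counters)
	{
		/* Legacy flow: wrmsrl(MSR_F15H_PERF_CTL0 + 2 * i, ...) per PMC. */

		/* PerfMonV2 flow: one global enable covers every counter. */
		wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
		       (1ULL << nr_counters) - 1);
	}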
  

Comments

Sean Christopherson April 7, 2023, 1:35 a.m. UTC | #1
On Tue, Feb 14, 2023, Like Xu wrote:
> +	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
> +		if (!msr_info->host_initiated)
> +			return 0; /* Writes are ignored */

Where is the "writes ignored" behavior documented?  I can't find anything in the
APM that defines write behavior. 

>  
>  		pmu->global_status = data;
>  		return 0;
>  	case MSR_CORE_PERF_GLOBAL_CTRL:
>  		if (!kvm_valid_perf_global_ctrl(pmu, data))
>  			return 1;
> -
> +		fallthrough;

This _definitely_ needs a comment.  Hmm, and I would prefer to reverse these, i.e.

	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
		data &= ~pmu->global_ctrl_mask;
		fallthrough;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (!kvm_valid_perf_global_ctrl(pmu, data))
			return 1;

It's a bit arbitrary, but either Intel or AMD is going to end up with extra code,
and IMO skipping a validity check is more alarming than skipping clearing of
reserved bits, i.e. will look like a bug to future readers.

> +	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
> +		data &= ~pmu->global_ctrl_mask;
>  		if (pmu->global_ctrl != data) {
>  			diff = pmu->global_ctrl ^ data;
>  			pmu->global_ctrl = data;
> @@ -616,7 +625,8 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>  	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
>  		if (data & pmu->global_ovf_ctrl_mask)
>  			return 1;
> -
> +		fallthrough;

Here too.  Argh, the APM doesn't actually define what happens on reserved bits,
it just says "WO".  I vote to be conservative and ignore writes to reserved bits.
And then we can have one comment for the whole block, e.g.

	/*
	 * Note, AMD ignores writes to read-only PMU MSRs/bits, whereas Intel
	 * generates #GP on attempts to write reserved bits or RO MSRs.
	 */
	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
		if (!msr_info->host_initiated)
			return 1; /* RO MSR */
		fallthrough;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
		if (!msr_info->host_initiated)
			break;

		pmu->global_status = data;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
		data &= ~pmu->global_ctrl_mask;
		fallthrough;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (!kvm_valid_perf_global_ctrl(pmu, data))
			return 1;

		if (pmu->global_ctrl != data) {
			diff = pmu->global_ctrl ^ data;
			pmu->global_ctrl = data;
			reprogram_counters(pmu, diff);
		}
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
		fallthrough;
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		if (data & pmu->global_ovf_ctrl_mask)
			return 1;

		if (!msr_info->host_initiated)
			pmu->global_status &= ~data;
		break;
	default:
		kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
		return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
	}

	return 0;	

> @@ -164,20 +181,34 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>  static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
>  {
>  	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> +	struct kvm_cpuid_entry2 *entry;
> +	union cpuid_0x80000022_ebx ebx;
>  
> -	if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE))
> +	pmu->version = 1;
> +	if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) {
> +		pmu->version = 2;
> +		entry = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0);

No need for the intermediate "entry".
> +		ebx.full = entry->ebx;

Oof, at first glance this looks like a potential null-pointer deref bug.  I
believe we can do

		/*
		 * Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. the guest
		 * CPUID entry is guaranteed to be non-NULL.
		 */
		BUILD_BUG_ON(x86_feature_cpuid(X86_FEATURE_PERFMON_V2).function != 0x80000022 ||
			     x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index != 0x80000022);
		ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx;

> +		pmu->nr_arch_gp_counters = min_t(unsigned int,
> +						 ebx.split.num_core_pmc,
> +						 kvm_pmu_cap.num_counters_gp);
> +	} else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
>  		pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;

This needs to be sanitized, no?  E.g. if KVM only has access to 4 counters, but
userspace sets X86_FEATURE_PERFCTR_CORE anyways.  Hrm, unless I'm missing something,
that's a pre-existing bug.

If I'm right, can you add a patch to cap nr_arch_gp_counters at
kvm_pmu_cap.num_counters_gp in the common flow, i.e. after this if-else block?
Then there is no change needed in this patch, e.g. we'll naturally end up with:

	union cpuid_0x80000022_ebx ebx;

	pmu->version = 1;
	if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) {
		pmu->version = 2;
		/*
		 * Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. the guest
		 * CPUID entry is guaranteed to be non-NULL.
		 */
		BUILD_BUG_ON(x86_feature_cpuid(X86_FEATURE_PERFMON_V2).function != 0x80000022 ||
			     x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index);
		ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx;
		pmu->nr_arch_gp_counters = ebx.split.num_core_pmc;
	} else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
		pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;
	} else {
		pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
	}

	pmu->nr_arch_gp_counters = min_t(unsigned int,
					 pmu->nr_arch_gp_counters,
				       	 kvm_pmu_cap.num_counters_gp);
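
As an aside on the global_ctrl handling proposed above: the XOR yields a
bitmask of exactly those counters whose enable bit changed, so only they need
to be reprogrammed. A self-contained toy illustration of that bit-diff idea
(not KVM code; the function name is made up):

	#include <stdint.h>
	#include <stdio.h>

	/* Toy stand-in for reprogram_counters(): visit only the toggled PMCs. */
	static void reprogram_toggled(uint64_t diff)
	{
		int bit;

		for (bit = 0; diff; bit++, diff >>= 1) {
			if (diff & 1)
				printf("reprogram PMC %d\n", bit);
		}
	}

	int main(void)
	{
		uint64_t old_ctrl = 0x3;	/* PMC0 and PMC1 enabled */
		uint64_t new_ctrl = 0x6;	/* PMC1 and PMC2 enabled */

		/* Prints PMC0 and PMC2; PMC1 did not change and is skipped. */
		reprogram_toggled(old_ctrl ^ new_ctrl);
		return 0;
	}
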
  
Like Xu April 7, 2023, 7:08 a.m. UTC | #2
On 7/4/2023 9:35 am, Sean Christopherson wrote:
> On Tue, Feb 14, 2023, Like Xu wrote:
>> +	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
>> +		if (!msr_info->host_initiated)
>> +			return 0; /* Writes are ignored */
> 
> Where is the "writes ignored" behavior documented?  I can't find anything in the
> APM that defines write behavior.

KVM follows the real hardware behavior whenever the specification stays silent
on the details or keeps them secret.

> 
>>   
>>   		pmu->global_status = data;
>>   		return 0;
>>   	case MSR_CORE_PERF_GLOBAL_CTRL:
>>   		if (!kvm_valid_perf_global_ctrl(pmu, data))
>>   			return 1;
>> -
>> +		fallthrough;
> 
> This _definitely_ needs a comment.  Hmm, and I would prefer to reverse these, i.e.
> 
> 	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
> 		data &= ~pmu->global_ctrl_mask;
> 		fallthrough;
> 	case MSR_CORE_PERF_GLOBAL_CTRL:
> 		if (!kvm_valid_perf_global_ctrl(pmu, data))
> 			return 1;
> 
> It's a bit arbitrary, but either Intel or AMD is going to end up with extra code,
> and IMO skipping a validity check is more alarming than skipping clearing of
> reserved bits, i.e. will look like a bug to future readers.
> 
>> +	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
>> +		data &= ~pmu->global_ctrl_mask;
>>   		if (pmu->global_ctrl != data) {
>>   			diff = pmu->global_ctrl ^ data;
>>   			pmu->global_ctrl = data;
>> @@ -616,7 +625,8 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>>   	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
>>   		if (data & pmu->global_ovf_ctrl_mask)
>>   			return 1;
>> -
>> +		fallthrough;
> 
> Here too.  Argh, the APM doesn't actually define what happens on reserved bits,
> it just says "WO".  I vote to be conservative and ignore writes to reserved bits.
> And then we can have one comment for the whole block, e.g.
> 
> 	/*
> 	 * Note, AMD ignores writes to read-only PMU MSRs/bits, whereas Intel
> 	 * generates #GP on attempts to write reserved bits or RO MSRs.
> 	 */
> 	switch (msr) {
> 	case MSR_CORE_PERF_GLOBAL_STATUS:
> 		if (!msr_info->host_initiated)
> 			return 1; /* RO MSR */
> 		fallthrough;
> 	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
> 		if (!msr_info->host_initiated)
> 			break;
> 
> 		pmu->global_status = data;
> 		break;
> 	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
> 		data &= ~pmu->global_ctrl_mask;
> 		fallthrough;
> 	case MSR_CORE_PERF_GLOBAL_CTRL:
> 		if (!kvm_valid_perf_global_ctrl(pmu, data))
> 			return 1;
> 
> 		if (pmu->global_ctrl != data) {
> 			diff = pmu->global_ctrl ^ data;
> 			pmu->global_ctrl = data;
> 			reprogram_counters(pmu, diff);
> 		}
> 		break;
> 	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
> 		fallthrough;
> 	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
> 		if (data & pmu->global_ovf_ctrl_mask)
> 			return 1;
> 
> 		if (!msr_info->host_initiated)
> 			pmu->global_status &= ~data;
> 		break;
> 	default:
> 		kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
> 		return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
> 	}
> 
> 	return 0;	

AMD doesn't generate #GP on attempts to write PMU RO MSRs or reserved bits.

How about this:

	/*
	 * Note, AMD ignores writes to reserved bits and read-only PMU MSRs,
	 * whereas Intel generates #GP on attempts to write reserved/RO MSRs.
	 */
	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
		if (!msr_info->host_initiated || (data & pmu->global_ovf_ctrl_mask))
			return 1; /* RO MSR */
		fallthrough;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
		if (!msr_info->host_initiated)
			break;

		pmu->global_status = data;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
		data &= ~pmu->global_ctrl_mask;
		fallthrough;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (!kvm_valid_perf_global_ctrl(pmu, data))
			return 1;

		if (pmu->global_ctrl != data) {
			diff = pmu->global_ctrl ^ data;
			pmu->global_ctrl = data;
			reprogram_counters(pmu, diff);
		}
		break;
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		if (data & pmu->global_ovf_ctrl_mask)
			return 1;
		fallthrough;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
		if (!msr_info->host_initiated)
			pmu->global_status &= ~data;
		break;
	default:
		kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
		return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
	}

	return 0;

> 
>> @@ -164,20 +181,34 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>>   static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
>>   {
>>   	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
>> +	struct kvm_cpuid_entry2 *entry;
>> +	union cpuid_0x80000022_ebx ebx;
>>   
>> -	if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE))
>> +	pmu->version = 1;
>> +	if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) {
>> +		pmu->version = 2;
>> +		entry = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0);
> 
> No need for the intermediate "entry".
>> +		ebx.full = entry->ebx;
> 
> Oof, at first glance this looks like a potential null-pointer deref bug.  I
> believe we can do
> 
> 		/*
> 		 * Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. the guest
> 		 * CPUID entry is guaranteed to be non-NULL.
> 		 */
> 		BUILD_BUG_ON(x86_feature_cpuid(X86_FEATURE_PERFMON_V2).function != 0x80000022 ||
> 			     x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index != 0x80000022);

The second condition should just be:

  x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index);

> 		ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx;
> 
>> +		pmu->nr_arch_gp_counters = min_t(unsigned int,
>> +						 ebx.split.num_core_pmc,
>> +						 kvm_pmu_cap.num_counters_gp);
>> +	} else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
>>   		pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;
> 
> This needs to be sanitized, no?  E.g. if KVM only has access to 4 counters, but
> userspace sets X86_FEATURE_PERFCTR_CORE anyways.  Hrm, unless I'm missing something,
> that's a pre-existing bug.

Now your point is that if userspace advertises more capability than KVM can
support, KVM should constrain it.
Your previous preference was that userspace can set capabilities even if KVM
doesn't support them, as long as it doesn't break KVM or the host, and the
guest eats its own consequences.

> 
> If I'm right, can you add a patch to cap nr_arch_gp_counters at
> kvm_pmu_cap.num_counters_gp in the common flow, i.e. after this if-else block?
> Then there is no change needed in this patch, e.g. we'll naturally end up with:
> 
> 	union cpuid_0x80000022_ebx ebx;
> 
> 	pmu->version = 1;
> 	if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) {
> 		pmu->version = 2;
> 		/*
> 		 * Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. the guest
> 		 * CPUID entry is guaranteed to be non-NULL.
> 		 */
> 		BUILD_BUG_ON(x86_feature_cpuid(X86_FEATURE_PERFMON_V2).function != 0x80000022 ||
> 			     x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index);
> 		ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx;
> 		pmu->nr_arch_gp_counters = ebx.split.num_core_pmc;
> 	} else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
> 		pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;
> 	} else {
> 		pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
> 	}
> 
> 	pmu->nr_arch_gp_counters = min_t(unsigned int,
> 					 pmu->nr_arch_gp_counters,
> 				       	 kvm_pmu_cap.num_counters_gp);
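
For readers less familiar with the host_initiated distinction being argued
over: writes that arrive via the KVM_SET_MSRS ioctl (e.g. when userspace
restores state during live migration) are host-initiated and may load
guest-read-only registers, whereas the same WRMSR from the guest is ignored
(AMD) or #GPs (Intel). A hedged userspace sketch; the MSR index value is the
one I believe arch/x86/include/asm/msr-index.h defines:

	#include <linux/kvm.h>
	#include <string.h>
	#include <sys/ioctl.h>

	#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS 0xc0000300 /* per msr-index.h */

	/* Restore a saved global status value into a vCPU via the host path. */
	static int restore_global_status(int vcpu_fd, __u64 saved)
	{
		/* Header plus one entry, the idiom the KVM selftests use. */
		struct {
			struct kvm_msrs header;
			struct kvm_msr_entry entry;
		} buffer;

		memset(&buffer, 0, sizeof(buffer));
		buffer.header.nmsrs = 1;
		buffer.entry.index = MSR_AMD64_PERF_CNTR_GLOBAL_STATUS;
		buffer.entry.data = saved;

		/* KVM_SET_MSRS returns the number of MSRs actually processed. */
		return ioctl(vcpu_fd, KVM_SET_MSRS, &buffer.header);
	}
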
  
Sean Christopherson April 7, 2023, 2:44 p.m. UTC | #3
On Fri, Apr 07, 2023, Like Xu wrote:
> On 7/4/2023 9:35 am, Sean Christopherson wrote:
> > On Tue, Feb 14, 2023, Like Xu wrote:
> > > +	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
> > > +		if (!msr_info->host_initiated)
> > > +			return 0; /* Writes are ignored */
> > 
> > Where is the "writes ignored" behavior documented?  I can't find anything in the
> > APM that defines write behavior.
> 
> KVM would follow the real hardware behavior once specifications stay silent
> on details or secret.

So is that a "this isn't actually documented anywhere" answer?  It's not your
responsibility to get AMD to document their CPUs, but I want to clearly document
when KVM's behavior is based solely off of observed hardware behavior, versus an
actual specification.

> How about this:
> 
> 	/*
> 	 * Note, AMD ignores writes to reserved bits and read-only PMU MSRs,
> 	 * whereas Intel generates #GP on attempts to write reserved/RO MSRs.
> 	 */

Looks good.

> > > +		pmu->nr_arch_gp_counters = min_t(unsigned int,
> > > +						 ebx.split.num_core_pmc,
> > > +						 kvm_pmu_cap.num_counters_gp);
> > > +	} else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
> > >   		pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;
> > 
> > This needs to be sanitized, no?  E.g. if KVM only has access to 4 counters, but
> > userspace sets X86_FEATURE_PERFCTR_CORE anyways.  Hrm, unless I'm missing something,
> > that's a pre-existing bug.
> 
> Now your point is that if userspace advertises more capability than KVM can
> support, KVM should constrain it.
> Your previous preference was that userspace can set capabilities even if KVM
> doesn't support them, as long as it doesn't break KVM or the host, and the
> guest eats its own consequences.

Letting userspace define a "bad" configuration is perfectly ok, but KVM needs to
be careful not to endanger itself by consuming the bad state.  A good example is
the handling of nested SVM features in svm_vcpu_after_set_cpuid().  KVM lets
userspace define anything and everything, but KVM only actually tries to utilize
a feature if the feature is actually supported in hardware.

In this case, it's not clear to me that putting a bogus value into "nr_arch_gp_counters"
is safe (for KVM).  And AIUI, the guest can't actually use more than
kvm_pmu_cap.num_counters_gp counters, i.e. KVM isn't arbitrarily restricting the
setup.
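
Tying these points together, the direction being suggested is roughly the
following shape, where the clamped counter count then also bounds the
reserved-bit masks consumed by the global MSR handlers (a sketch pieced
together from this thread and the patch below, not a quote of the merged
code):

	/* Clamp after the CPUID-derived if-else so no path can overshoot. */
	pmu->nr_arch_gp_counters = min_t(unsigned int, pmu->nr_arch_gp_counters,
					 kvm_pmu_cap.num_counters_gp);

	if (pmu->version > 1) {
		/* Bits above the last implemented PMC are reserved. */
		pmu->global_ctrl_mask = ~((1ull << pmu->nr_arch_gp_counters) - 1);
		pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask;
	}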
  
Like Xu April 10, 2023, 11:34 a.m. UTC | #4
On 7/4/2023 10:44 pm, Sean Christopherson wrote:
> On Fri, Apr 07, 2023, Like Xu wrote:
>> On 7/4/2023 9:35 am, Sean Christopherson wrote:
>>> On Tue, Feb 14, 2023, Like Xu wrote:
>>>> +	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
>>>> +		if (!msr_info->host_initiated)
>>>> +			return 0; /* Writes are ignored */
>>>
>>> Where is the "writes ignored" behavior documented?  I can't find anything in the
>>> APM that defines write behavior.
>>
>> KVM would follow the real hardware behavior once specifications stay silent
>> on details or secret.
> 
> So is that a "this isn't actually documented anywhere" answer?  It's not your
> responsibility to get AMD to document their CPUs, but I want to clearly document
> when KVM's behavior is based solely off of observed hardware behavior, versus an
> actual specification.

Indeed, you draw a clearer line than the APM or PPR does.

Spec-defined:

	RO: Read-only. Readable; writes are ignored (Per PPR "AccessType Definitions")
	WO: Writable. Reads are undefined. (Per PPR "AccessType Definitions")

For the (hidden) undefined behaviour, the vPMU falls back to what real hardware is observed to do.
More comments in the new version may help. Please check.

> 
>> How about this:
>>
>> 	/*
>> 	 * Note, AMD ignores writes to reserved bits and read-only PMU MSRs,
>> 	 * whereas Intel generates #GP on attempts to write reserved/RO MSRs.
>> 	 */
> 
> Looks good.
> 
>>>> +		pmu->nr_arch_gp_counters = min_t(unsigned int,
>>>> +						 ebx.split.num_core_pmc,
>>>> +						 kvm_pmu_cap.num_counters_gp);
>>>> +	} else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
>>>>    		pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;
>>>
>>> This needs to be sanitized, no?  E.g. if KVM only has access to 4 counters, but
>>> userspace sets X86_FEATURE_PERFCTR_CORE anyways.  Hrm, unless I'm missing something,
>>> that's a pre-existing bug.
>>
>> Now your point is that if userspace advertises more capability than KVM can
>> support, KVM should constrain it.
>> Your previous preference was that userspace can set capabilities even if KVM
>> doesn't support them, as long as it doesn't break KVM or the host, and the
>> guest eats its own consequences.
> 
> Letting userspace define a "bad" configuration is perfectly ok, but KVM needs to
> be careful not to endanger itself by consuming the bad state.  A good example is
> the handling of nested SVM features in svm_vcpu_after_set_cpuid().  KVM lets
> userspace define anything and everything, but KVM only actually tries to utilize
> a feature if the feature is actually supported in hardware.
> 
> In this case, it's not clear to me that putting a bogus value into "nr_arch_gp_counters"
> is safe (for KVM).  And AIUI, the guest can't actually use more than
> kvm_pmu_cap.num_counters_gp counters, i.e. KVM isn't arbitrarily restricting the
> setup.

AFAIK, when a guest has more counters (N) than the host (M) and they are all
enabled, KVM creates an equal number (N) of perf_events, and these events
occupy the host's real hardware counters (M) in a round-robin fashion under
the host perf scheduler.

From the point of view of a vCPU, each of its virtual counters only gets a
share of the hardware time slice to count the guest payload, which affects
accuracy. From the host security point of view, however, too many counters
only result in too many perf_events created by KVM, which is normal usage of
the perf subsystem, known as perf counter multiplexing. It appears to be safe
(KVM goes through the perf API).

That said, scheduling too many perf_events is also a performance overhead, so
it could be seen as a performance attack on the scheduling of vCPU processes
on the host.

Back to the diff itself: the intel_pmu code does a similar sanity check, so
here we just let the AMD PMU follow the same pattern. Please refer to the
latest version.
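
For readers unfamiliar with the multiplexing behaviour referenced here: when
more perf_events are active than there are hardware counters, perf
time-slices them and reports enabled vs. running times so the consumer can
scale the raw count. A minimal, standalone userspace illustration of that
mechanism (ordinary perf_event_open() usage, independent of KVM):

	#include <linux/perf_event.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		struct perf_event_attr attr;
		struct { uint64_t value, enabled, running; } cnt;
		int fd;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_HARDWARE;
		attr.config = PERF_COUNT_HW_CPU_CYCLES;
		attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
				   PERF_FORMAT_TOTAL_TIME_RUNNING;

		fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
		if (fd < 0)
			return 1;

		usleep(100 * 1000);	/* let the event accumulate counts */

		if (read(fd, &cnt, sizeof(cnt)) != sizeof(cnt))
			return 1;

		/* running < enabled means the event was multiplexed off a PMC. */
		if (cnt.running)
			printf("cycles ~= %llu (scaled by %llu/%llu)\n",
			       (unsigned long long)(cnt.value * cnt.enabled / cnt.running),
			       (unsigned long long)cnt.enabled,
			       (unsigned long long)cnt.running);

		close(fd);
		return 0;
	}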
  

Patch

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 5a3428d212dd..d5be8320fd54 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -574,12 +574,15 @@  int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 
 	switch (msr) {
 	case MSR_CORE_PERF_GLOBAL_STATUS:
+	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
 		msr_info->data = pmu->global_status;
 		return 0;
 	case MSR_CORE_PERF_GLOBAL_CTRL:
+	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
 		msr_info->data = pmu->global_ctrl;
 		return 0;
 	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
 		msr_info->data = 0;
 		return 0;
 	default:
@@ -600,13 +603,19 @@  int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_CORE_PERF_GLOBAL_STATUS:
 		if (!msr_info->host_initiated || (data & pmu->global_ovf_ctrl_mask))
 			return 1; /* RO MSR */
+		fallthrough;
+	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+		if (!msr_info->host_initiated)
+			return 0; /* Writes are ignored */
 
 		pmu->global_status = data;
 		return 0;
 	case MSR_CORE_PERF_GLOBAL_CTRL:
 		if (!kvm_valid_perf_global_ctrl(pmu, data))
 			return 1;
-
+		fallthrough;
+	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+		data &= ~pmu->global_ctrl_mask;
 		if (pmu->global_ctrl != data) {
 			diff = pmu->global_ctrl ^ data;
 			pmu->global_ctrl = data;
@@ -616,7 +625,8 @@  int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
 		if (data & pmu->global_ovf_ctrl_mask)
 			return 1;
-
+		fallthrough;
+	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
 		if (!msr_info->host_initiated)
 			pmu->global_status &= ~data;
 		return 0;
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 9e12142e0c4b..8bbaace003b2 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -94,12 +94,6 @@  static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
 	return amd_pmc_idx_to_pmc(vcpu_to_pmu(vcpu), idx & ~(3u << 30));
 }
 
-static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
-{
-	/* All MSRs refer to exactly one PMC, so msr_idx_to_pmc is enough.  */
-	return false;
-}
-
 static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
 {
 	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -111,6 +105,29 @@  static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
 	return pmc;
 }
 
+static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+	switch (msr) {
+	case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3:
+		return pmu->version > 0;
+	case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
+		return guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE);
+	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
+		return pmu->version > 1;
+	default:
+		if (msr > MSR_F15H_PERF_CTR5 &&
+		    msr < MSR_F15H_PERF_CTL0 + 2 * pmu->nr_arch_gp_counters)
+			return pmu->version > 1;
+		break;
+	}
+
+	return amd_msr_idx_to_pmc(vcpu, msr);
+}
+
 static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
 	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -164,20 +181,34 @@  static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
 {
 	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	struct kvm_cpuid_entry2 *entry;
+	union cpuid_0x80000022_ebx ebx;
 
-	if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE))
+	pmu->version = 1;
+	if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) {
+		pmu->version = 2;
+		entry = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0);
+		ebx.full = entry->ebx;
+		pmu->nr_arch_gp_counters = min_t(unsigned int,
+						 ebx.split.num_core_pmc,
+						 kvm_pmu_cap.num_counters_gp);
+	} else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
 		pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;
-	else
+	} else {
 		pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
+	}
+
+	if (pmu->version > 1) {
+		pmu->global_ctrl_mask = ~((1ull << pmu->nr_arch_gp_counters) - 1);
+		pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask;
+	}
 
 	pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
 	pmu->reserved_bits = 0xfffffff000280000ull;
 	pmu->raw_event_mask = AMD64_RAW_EVENT_MASK;
-	pmu->version = 1;
 	/* not applicable to AMD; but clean them to prevent any fall out */
 	pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
 	pmu->nr_arch_fixed_counters = 0;
-	pmu->global_status = 0;
 	bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters);
 }
 
@@ -208,6 +239,8 @@  static void amd_pmu_reset(struct kvm_vcpu *vcpu)
 		pmc_stop_counter(pmc);
 		pmc->counter = pmc->prev_counter = pmc->eventsel = 0;
 	}
+
+	pmu->global_ctrl = pmu->global_status = 0;
 }
 
 struct kvm_pmu_ops amd_pmu_ops __initdata = {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 64c567a1b32b..74e0c53b6a00 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1464,6 +1464,10 @@  static const u32 msrs_to_save_pmu[] = {
 	MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
 	MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
 	MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
+
+	MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
+	MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
+	MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
 };
 
 static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
@@ -7067,6 +7071,12 @@  static void kvm_probe_msr_to_save(u32 msr_index)
 		    kvm_pmu_cap.num_counters_fixed)
 			return;
 		break;
+	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
+		if (!kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2))
+			return;
+		break;
 	case MSR_IA32_XFD:
 	case MSR_IA32_XFD_ERR:
 		if (!kvm_cpu_cap_has(X86_FEATURE_XFD))