x86/resctrl: Fix event counts regression in reused RMIDs

Message ID 20221207112924.3602960-1-peternewman@google.com
State New
Headers
Series x86/resctrl: Fix event counts regression in reused RMIDs |

Commit Message

Peter Newman Dec. 7, 2022, 11:29 a.m. UTC
  When creating a new monitoring group, the RMID allocated for it may have
been used by a group which was previously removed. In this case, the
hardware counters will have non-zero values which should be deducted
from what is reported in the new group's counts.

resctrl_arch_reset_rmid() initializes the prev_msr value for counters to
0, causing the initial count to be charged to the new group. Resurrect
__rmid_read() and use it to initialize prev_msr correctly.

Fixes: 1d81d15db39c ("x86/resctrl: Move mbm_overflow_count() into resctrl_arch_rmid_read()")
Signed-off-by: Peter Newman <peternewman@google.com>
---
 arch/x86/kernel/cpu/resctrl/monitor.c | 39 ++++++++++++++++++---------
 1 file changed, 27 insertions(+), 12 deletions(-)


base-commit: 76dcd734eca23168cb008912c0f69ff408905235
--
2.39.0.rc0.267.gcb52ba06e7-goog
  

Comments

Fenghua Yu Dec. 7, 2022, 7:26 p.m. UTC | #1
Hi, Peter,

> When creating a new monitoring group, the RMID allocated for it may have
> been used by a group which was previously removed. In this case, the hardware
> counters will have non-zero values which should be deducted from what is
> reported in the new group's counts.
> 
> resctrl_arch_reset_rmid() initializes the prev_msr value for counters to 0,
> causing the initial count to be charged to the new group. Resurrect
> __rmid_read() and use it to initialize prev_msr correctly.
> 
> Fixes: 1d81d15db39c ("x86/resctrl: Move mbm_overflow_count() into
> resctrl_arch_rmid_read()")

Are you sure the patch fixes 1d81d15db39c? This commit is just a refactoring patch and doesn't change resctrl_arch_reset_rmid().

Shouldn't the patch fix commit fea62d370d7a?
	
> Signed-off-by: Peter Newman <peternewman@google.com>
> ---
>  arch/x86/kernel/cpu/resctrl/monitor.c | 39 ++++++++++++++++++---------
>  1 file changed, 27 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c
> b/arch/x86/kernel/cpu/resctrl/monitor.c
> index efe0c30d3a12..404dd9c472c7 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -146,6 +146,24 @@ static inline struct rmid_entry *__rmid_entry(u32 rmid)
>  	return entry;
>  }
> 
> +static u64 __rmid_read(u32 rmid, enum resctrl_event_id eventid) {
> +	u64 val;
> +
> +	/*
> +	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
> +	 * with a valid event code for supported resource type and the bits
> +	 * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
> +	 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
> +	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
> +	 * are error bits.
> +	 */
> +	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
> +	rdmsrl(MSR_IA32_QM_CTR, val);
> +
> +	return val;
> +}
> +
>  static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain
> *hw_dom,
>  						 u32 rmid,
>  						 enum resctrl_event_id eventid)
> @@ -170,10 +188,17 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r,
> struct rdt_domain *d,  {
>  	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
>  	struct arch_mbm_state *am;
> +	uint64_t val;

Please move this declaration down to the location below:

> 
>  	am = get_arch_mbm_state(hw_dom, rmid, eventid);
> -	if (am)
> +	if (am) {

here:
+	u64 val;

And it's better to use the same type name "u64" as the other variables.

>  		memset(am, 0, sizeof(*am));
> +
> +		/* Record any initial, non-zero count value. */
> +		val = __rmid_read(rmid, eventid);
> +		if (!(val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)))
> +			am->prev_msr = val;
> +	}
>  }
> 
>  static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
> @@ -195,17 +220,7 @@ int resctrl_arch_rmid_read(struct rdt_resource *r,
> struct rdt_domain *d,
>  	if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
>  		return -EINVAL;
> 
> -	/*
> -	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
> -	 * with a valid event code for supported resource type and the bits
> -	 * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
> -	 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
> -	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
> -	 * are error bits.
> -	 */
> -	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
> -	rdmsrl(MSR_IA32_QM_CTR, msr_val);
> -
> +	msr_val = __rmid_read(rmid, eventid);
>  	if (msr_val & RMID_VAL_ERROR)
>  		return -EIO;
>  	if (msr_val & RMID_VAL_UNAVAIL)
> 
> base-commit: 76dcd734eca23168cb008912c0f69ff408905235
> --
> 2.39.0.rc0.267.gcb52ba06e7-goog

Thanks.

-Fenghua
  
Reinette Chatre Dec. 7, 2022, 7:48 p.m. UTC | #2
Hi Peter,

On 12/7/2022 3:29 AM, Peter Newman wrote:
> When creating a new monitoring group, the RMID allocated for it may have
> been used by a group which was previously removed. In this case, the
> hardware counters will have non-zero values which should be deducted
> from what is reported in the new group's counts.
> 
> resctrl_arch_reset_rmid() initializes the prev_msr value for counters to
> 0, causing the initial count to be charged to the new group. Resurrect
> __rmid_read() and use it to initialize prev_msr correctly.

Thank you very much for catching this.

> 
> Fixes: 1d81d15db39c ("x86/resctrl: Move mbm_overflow_count() into resctrl_arch_rmid_read()")
> Signed-off-by: Peter Newman <peternewman@google.com>
> ---
>  arch/x86/kernel/cpu/resctrl/monitor.c | 39 ++++++++++++++++++---------
>  1 file changed, 27 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
> index efe0c30d3a12..404dd9c472c7 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -146,6 +146,24 @@ static inline struct rmid_entry *__rmid_entry(u32 rmid)
>  	return entry;
>  }
> 
> +static u64 __rmid_read(u32 rmid, enum resctrl_event_id eventid)
> +{
> +	u64 val;
> +
> +	/*
> +	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
> +	 * with a valid event code for supported resource type and the bits
> +	 * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
> +	 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
> +	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
> +	 * are error bits.
> +	 */
> +	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
> +	rdmsrl(MSR_IA32_QM_CTR, val);
> +
> +	return val;
> +}
> +
>  static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom,
>  						 u32 rmid,
>  						 enum resctrl_event_id eventid)
> @@ -170,10 +188,17 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
>  {
>  	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
>  	struct arch_mbm_state *am;
> +	uint64_t val;
> 
>  	am = get_arch_mbm_state(hw_dom, rmid, eventid);
> -	if (am)
> +	if (am) {
>  		memset(am, 0, sizeof(*am));
> +
> +		/* Record any initial, non-zero count value. */
> +		val = __rmid_read(rmid, eventid);
> +		if (!(val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)))
> +			am->prev_msr = val;
> +	}
>  }
> 
>  static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
> @@ -195,17 +220,7 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
>  	if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
>  		return -EINVAL;
> 
> -	/*
> -	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
> -	 * with a valid event code for supported resource type and the bits
> -	 * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
> -	 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
> -	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
> -	 * are error bits.
> -	 */
> -	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
> -	rdmsrl(MSR_IA32_QM_CTR, msr_val);
> -
> +	msr_val = __rmid_read(rmid, eventid);
>  	if (msr_val & RMID_VAL_ERROR)
>  		return -EIO;
>  	if (msr_val & RMID_VAL_UNAVAIL)
> 
> base-commit: 76dcd734eca23168cb008912c0f69ff408905235
> --
> 2.39.0.rc0.267.gcb52ba06e7-goog
> 

To get back to the original behavior before the refactoring it also seems
that __mon_event_count() needs to return right after calling 
resctrl_arch_reset_rmid(). The only caller with rr->first set is when
the mon directory is created and the returned values are not used,
it is just run to get prev_msr set. This also avoids unnecessarily reading
the counters twice.

So, how about:

static int __mon_event_count(u32 rmid, struct rmid_read *rr)
{

	...
	if (rr->first) {
		resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid);
		return 0;
	}
	...

}

Also ... there appears to be a leftover related snippet in __mon_event_count()
that does not belong anymore and may still cause incorrect behavior:

static int __mon_event_count(u32 rmid, struct rmid_read *rr)
{
	...
	if (rr->first) {
		memset(m, 0, sizeof(struct mbm_state));
		return 0;
	}
	...
}


Reinette
  
Peter Newman Dec. 8, 2022, 9:45 a.m. UTC | #3
Hi Fenghua,

On Wed, Dec 7, 2022 at 8:26 PM Yu, Fenghua <fenghua.yu@intel.com> wrote:
> > Fixes: 1d81d15db39c ("x86/resctrl: Move mbm_overflow_count() into
> > resctrl_arch_rmid_read()")
>
> Are you sure the patch fixes 1d81d15db39c? This commit is just a refactoring patch and doesn't change resctrl_arch_reset_rmid().

Yes, the breaking change was actually in __mon_event_count(). That
patch removes the initialization of mbm_state::prev_msr and switches
to using arch_mbm_state::prev_msr.

-Peter
  
Peter Newman Dec. 8, 2022, 10:04 a.m. UTC | #4
Hi Reinette,

On Wed, Dec 7, 2022 at 8:48 PM Reinette Chatre
<reinette.chatre@intel.com> wrote:
>
> To get back to the original behavior before the refactoring it also seems
> that __mon_event_count() needs to return right after calling
> resctrl_arch_reset_rmid(). The only caller with rr->first set is when
> the mon directory is created and the returned values are not used,
> it is just run to get prev_msr set. This also avoids unnecessarily reading
> the counters twice.
>
> So, how about:
>
> static int __mon_event_count(u32 rmid, struct rmid_read *rr)
> {
>
> ...
> if (rr->first) {
> resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid);
> return 0;
> }
> ...
>
> }

Avoiding the double-read sounds good, but...

>
> Also ... there appears to be a leftover related snippet in __mon_event_count()
> that does not belong anymore and may still cause incorrect behavior:
>
> static int __mon_event_count(u32 rmid, struct rmid_read *rr)
> {
> ...
> if (rr->first) {
> memset(m, 0, sizeof(struct mbm_state));
> return 0;
> }
> ...
> }

I'm less sure about removing (or skipping) this. mbm_state::mbm_local
still seems to be used by the mba_sc code. That might be why James
left this code in.

I was sort of confused about the new role of mbm_state following the
refactoring when reviewing Babu's change. (which reminds me that I
should have CC'ed him on this patch)

-Peter
  
Reinette Chatre Dec. 8, 2022, 6:30 p.m. UTC | #5
Hi Peter,

On 12/8/2022 2:04 AM, Peter Newman wrote:
> Hi Reinette,
> 
> On Wed, Dec 7, 2022 at 8:48 PM Reinette Chatre
> <reinette.chatre@intel.com> wrote:
>>
>> To get back to the original behavior before the refactoring it also seems
>> that __mon_event_count() needs to return right after calling
>> resctrl_arch_reset_rmid(). The only caller with rr->first set is when
>> the mon directory is created and the returned values are not used,
>> it is just run to get prev_msr set. This also avoids unnecessarily reading
>> the counters twice.
>>
>> So, how about:
>>
>> static int __mon_event_count(u32 rmid, struct rmid_read *rr)
>> {
>>
>> ...
>> if (rr->first) {
>> resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid);
>> return 0;
>> }
>> ...
>>
>> }
> 
> Avoiding the double-read sounds good, but...
> 
>>
>> Also ... there appears to be a leftover related snippet in __mon_event_count()
>> that does not belong anymore and may still cause incorrect behavior:
>>
>> static int __mon_event_count(u32 rmid, struct rmid_read *rr)
>> {
>> ...
>> if (rr->first) {
>> memset(m, 0, sizeof(struct mbm_state));
>> return 0;
>> }
>> ...
>> }
> 
> I'm less sure about removing (or skipping) this. mbm_state::mbm_local
> still seems to be used by the mba_sc code. That might be why James
> left this code in.
> 
> I was sort of confused about the new role of mbm_state following the
> refactoring when reviewing Babu's change. (which reminds me that I
> should have CC'ed him on this patch)


I think this can be cleaned up to make the code clearer. Notice the
duplication of the following snippet in __mon_event_count():
	rr->val += tval;
	return 0;

I do not see any need to check the event id before doing the above. That
leaves the bulk of the switch just needed for the rr->first handling that
can be moved to resctrl_arch_reset_rmid().

Something like:

void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, ...
{
	...
	struct arch_mbm_state *am;
	struct mbm_state *m;
	u64 val = 0;
	int ret;
	
	m = get_mbm_state(d, rmid, eventid); /* get_mbm_state() to be created */
	if (m)
		memset(m, 0, sizeof(*m));	

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		memset(am, 0, sizeof(*am));	
		/* Record any initial, non-zero count value. */
		ret = __rmid_read(rmid, eventid, &val);
		if (!ret)
			am->prev_msr = val;
	}

}

Having this would be helpful as reference to Babu's usage. 

Also please note that I changed __rmid_read(). There is no need
to require each __rmid_read() caller to test the MSR bits for validity;
that check can be contained within __rmid_read() itself.

Something like below remains:

static int __mon_event_count(u32 rmid, struct rmid_read *rr)
{

	...

	if (rr->first) {
		resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid);
		return 0;
	}

	rr->err = resctrl_arch_rmid_read(rr->r, rr->d, rmid, rr->evtid, &tval);
	if (rr->err)
		return rr->err;

	rr->val += tval;
	return 0;

}

What do you think? 

Reinette
  
Peter Newman Dec. 14, 2022, 2:21 p.m. UTC | #6
Hi Reinette,

On Thu, Dec 8, 2022 at 7:31 PM Reinette Chatre
<reinette.chatre@intel.com> wrote:
>
> I think this can be cleaned up to make the code clearer. Notice the
> duplication of the following snippet in __mon_event_count():
> rr->val += tval;
> return 0;
>
> I do not see any need to check the event id before doing the above. That
> leaves the bulk of the switch just needed for the rr->first handling that
> can be moved to resctrl_arch_reset_rmid().
>
> Something like:
>
> void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, ...
> {
> ...
> struct arch_mbm_state *am;
> struct mbm_state *m;
> u64 val = 0;
> int ret;
>
> m = get_mbm_state(d, rmid, eventid); /* get_mbm_state() to be created */

Good call. When prototyping another change, I quickly found the need to
create this myself.

> if (m)
> memset(m, 0, sizeof(*m));

mbm_state is arch-independent, so I think putting it here would require
the MPAM version to copy this and for get_mbm_state() to be exported.

>
> am = get_arch_mbm_state(hw_dom, rmid, eventid);
> if (am) {
> memset(am, 0, sizeof(*am));
> /* Record any initial, non-zero count value. */
> ret = __rmid_read(rmid, eventid, &val);
> if (!ret)
> am->prev_msr = val;
> }
>
> }
>
> Having this would be helpful as reference to Babu's usage.

His usage looks a little different.

According to the comment in Babu's patch:

https://lore.kernel.org/lkml/166990903030.17806.5106229901730558377.stgit@bmoger-ubuntu/

+ /*
+ * When an Event Configuration is changed, the bandwidth counters
+ * for all RMIDs and Events will be cleared by the hardware. The
+ * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
+ * every RMID on the next read to any event for every RMID.
+ * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
+ * cleared while it is tracked by the hardware. Clear the
+ * mbm_local and mbm_total counts for all the RMIDs.
+ */
+ resctrl_arch_reset_rmid_all(r, d);

If all the hardware counters are zeroed as the comment suggests, then
leaving am->prev_msr zero seems correct. __rmid_read() would likely
return an error anyways. The bug I was addressing was one of reusing
an RMID which had not been reset.

>
> Also please note that I changed the __rmid_read(). There is no need
> to require each __rmid_read() caller to test MSR bits for validity, that
> can be contained within __rmid_read().
>
> Something like below remains:
>
> static int __mon_event_count(u32 rmid, struct rmid_read *rr)
> {
>
> ...
>
> if (rr->first) {
> resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid);
> return 0;
> }
>
> rr->err = resctrl_arch_rmid_read(rr->r, rr->d, rmid, rr->evtid, &tval);
> if (rr->err)
> return rr->err;
>
> rr->val += tval;
> return 0;
>
> }
>
> What do you think?

Looks much better. This function has been bothering me since the
refactor. I'll see how close I can get to this in the next patch.

Thanks!
-Peter
  
Reinette Chatre Dec. 14, 2022, 7:17 p.m. UTC | #7
Hi Peter,

On 12/14/2022 6:21 AM, Peter Newman wrote:
> On Thu, Dec 8, 2022 at 7:31 PM Reinette Chatre
> <reinette.chatre@intel.com> wrote:
>>
>> I think this can be cleaned up to make the code more clear. Notice the
>> duplication of following snippet in __mon_event_count():
>> rr->val += tval;
>> return 0;
>>
>> I do not see any need to check the event id before doing the above. That
>> leaves the bulk of the switch just needed for the rr->first handling that
>> can be moved to resctrl_arch_reset_rmid().
>>
>> Something like:
>>
>> void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, ...
>> {
>> ...
>> struct arch_mbm_state *am;
>> struct mbm_state *m;
>> u64 val = 0;
>> int ret;
>>
>> m = get_mbm_state(d, rmid, eventid); /* get_mbm_state() to be created */
> 
> Good call. When prototyping another change, I quickly found the need to
> create this myself.
> 
>> if (m)
>> memset(m, 0, sizeof(*m));
> 
> mbm_state is arch-independent, so I think putting it here would require
> the MPAM version to copy this and for get_mbm_state() to be exported.

You are correct, it is arch independent ... so every arch is expected to
have it.
I peeked at your series and that looks good also - having cleanup done in
a central place helps to avoid future mistakes.

>> am = get_arch_mbm_state(hw_dom, rmid, eventid);
>> if (am) {
>> memset(am, 0, sizeof(*am));
>> /* Record any initial, non-zero count value. */
>> ret = __rmid_read(rmid, eventid, &val);
>> if (!ret)
>> am->prev_msr = val;
>> }
>>
>> }
>>
>> Having this would be helpful as reference to Babu's usage.
> 
> His usage looks a little different.
> 
> According to the comment in Babu's patch:
> 
> https://lore.kernel.org/lkml/166990903030.17806.5106229901730558377.stgit@bmoger-ubuntu/
> 
> + /*
> + * When an Event Configuration is changed, the bandwidth counters
> + * for all RMIDs and Events will be cleared by the hardware. The
> + * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
> + * every RMID on the next read to any event for every RMID.
> + * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
> + * cleared while it is tracked by the hardware. Clear the
> + * mbm_local and mbm_total counts for all the RMIDs.
> + */
> + resctrl_arch_reset_rmid_all(r, d);
> 
> If all the hardware counters are zeroed as the comment suggests, then
> leaving am->prev_msr zero seems correct. __rmid_read() would likely
> return an error anyways. The bug I was addressing was one of reusing
> an RMID which had not been reset.

You are correct, but there are two things to keep in mind though:
* the change from which you copied the above snippet introduces a new
  _generic_ utility far away from this call site. It is thus reasonable to
  assume that this utility should work for all use cases, not just the one
  for which it is created. Since there are no other use cases at this time,
  this may be ok, but I think at minimum the utility will benefit from 
  a snippet indicating the caveats of its use as a heads up to any future users.
* the utility does not clear struct mbm_state contents. Again, this is ok
  for this usage since AMD does not support the software controller but 
  as far as a generic utility goes the usage should be clear to avoid
  traps for future changes.

Reinette
  
Peter Newman Dec. 16, 2022, 1:54 p.m. UTC | #8
Hi Reinette,

On Wed, Dec 14, 2022 at 8:17 PM Reinette Chatre
<reinette.chatre@intel.com> wrote:
> On 12/14/2022 6:21 AM, Peter Newman wrote:
> > mbm_state is arch-independent, so I think putting it here would require
> > the MPAM version to copy this and for get_mbm_state() to be exported.
>
> You are correct, it is arch independent ... so every arch is expected to
> have it.
> I peeked at your series and that looks good also - having cleanup done in
> a central place helps to avoid future mistakes.
>
> >> am = get_arch_mbm_state(hw_dom, rmid, eventid);
> >> if (am) {
> >> memset(am, 0, sizeof(*am));
> >> /* Record any initial, non-zero count value. */
> >> ret = __rmid_read(rmid, eventid, &val);
> >> if (!ret)
> >> am->prev_msr = val;
> >> }
> >>
> >> }
> >>
> >> Having this would be helpful as reference to Babu's usage.
> >
> > His usage looks a little different.
> >
> > According to the comment in Babu's patch:
> >
> > https://lore.kernel.org/lkml/166990903030.17806.5106229901730558377.stgit@bmoger-ubuntu/
> >
> > + /*
> > + * When an Event Configuration is changed, the bandwidth counters
> > + * for all RMIDs and Events will be cleared by the hardware. The
> > + * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
> > + * every RMID on the next read to any event for every RMID.
> > + * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
> > + * cleared while it is tracked by the hardware. Clear the
> > + * mbm_local and mbm_total counts for all the RMIDs.
> > + */
> > + resctrl_arch_reset_rmid_all(r, d);
> >
> > If all the hardware counters are zeroed as the comment suggests, then
> > leaving am->prev_msr zero seems correct. __rmid_read() would likely
> > return an error anyways. The bug I was addressing was one of reusing
> > an RMID which had not been reset.
>
> You are correct, but there are two things to keep in mind though:
> * the change from which you copied the above snippet introduces a new
>   _generic_ utility far away from this call site. It is thus reasonable to
>   assume that this utility should work for all use cases, not just the one
>   for which it is created. Since there are no other use cases at this time,
>   this may be ok, but I think at minimum the utility will benefit from
>   a snippet indicating the caveats of its use as a heads up to any future users.
> * the utility does not clear struct mbm_state contents. Again, this is ok
>   for this usage since AMD does not support the software controller but
>   as far as a generic utility goes the usage should be clear to avoid
>   traps for future changes.

To this end, would it help if I pulled the rr->first case into a
separate function like this:

-               resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid);
-               m = get_mbm_state(rr->d, rmid, rr->evtid);
-               if (m)
-                       memset(m, 0, sizeof(struct mbm_state));
+               resctrl_reset_rmid(rr->r, rr->d, rmid, rr->evtid);

I'm open to suggestions on the name.

-Peter
  
Reinette Chatre Dec. 16, 2022, 10:29 p.m. UTC | #9
Hi Peter,

On 12/16/2022 5:54 AM, Peter Newman wrote:
> On Wed, Dec 14, 2022 at 8:17 PM Reinette Chatre
> <reinette.chatre@intel.com> wrote:
>> On 12/14/2022 6:21 AM, Peter Newman wrote:
>>> mbm_state is arch-independent, so I think putting it here would require
>>> the MPAM version to copy this and for get_mbm_state() to be exported.
>>
>> You are correct, it is arch independent ... so every arch is expected to
>> have it.
>> I peeked at your series and that looks good also - having cleanup done in
>> a central place helps to avoid future mistakes.
>>
>>>> am = get_arch_mbm_state(hw_dom, rmid, eventid);
>>>> if (am) {
>>>> memset(am, 0, sizeof(*am));
>>>> /* Record any initial, non-zero count value. */
>>>> ret = __rmid_read(rmid, eventid, &val);
>>>> if (!ret)
>>>> am->prev_msr = val;
>>>> }
>>>>
>>>> }
>>>>
>>>> Having this would be helpful as reference to Babu's usage.
>>>
>>> His usage looks a little different.
>>>
>>> According to the comment in Babu's patch:
>>>
>>> https://lore.kernel.org/lkml/166990903030.17806.5106229901730558377.stgit@bmoger-ubuntu/
>>>
>>> + /*
>>> + * When an Event Configuration is changed, the bandwidth counters
>>> + * for all RMIDs and Events will be cleared by the hardware. The
>>> + * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
>>> + * every RMID on the next read to any event for every RMID.
>>> + * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
>>> + * cleared while it is tracked by the hardware. Clear the
>>> + * mbm_local and mbm_total counts for all the RMIDs.
>>> + */
>>> + resctrl_arch_reset_rmid_all(r, d);
>>>
>>> If all the hardware counters are zeroed as the comment suggests, then
>>> leaving am->prev_msr zero seems correct. __rmid_read() would likely
>>> return an error anyways. The bug I was addressing was one of reusing
>>> an RMID which had not been reset.
>>
>> You are correct, but there are two things to keep in mind though:
>> * the change from which you copied the above snippet introduces a new
>>   _generic_ utility far away from this call site. It is thus reasonable to
>>   assume that this utility should work for all use cases, not just the one
>>   for which it is created. Since there are no other use cases at this time,
>>   this may be ok, but I think at minimum the utility will benefit from
>>   a snippet indicating the caveats of its use as a heads up to any future users.
>> * the utility does not clear struct mbm_state contents. Again, this is ok
>>   for this usage since AMD does not support the software controller but
>>   as far as a generic utility goes the usage should be clear to avoid
>>   traps for future changes.
> 
> To this end, would it help if I pulled the rr->first case into a
> separate function like this:
> 
> -               resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid);
> -               m = get_mbm_state(rr->d, rmid, rr->evtid);
> -               if (m)
> -                       memset(m, 0, sizeof(struct mbm_state));
> +               resctrl_reset_rmid(rr->r, rr->d, rmid, rr->evtid);
> 
> I'm open to suggestions on the name.

This email thread started to talk about two generic utilities, the one relevant
to this fix (resctrl_arch_reset_rmid()) and the one being created by Babu 
(resctrl_arch_reset_rmid_all()). Focusing on the one related to this fix I do
think the way in which the utility is used in V2 makes it clear how cleanup
should be done. I could have been more explicit but that is what I meant earlier
when saying that the way that the cleanup is done in a central place looks good.
Any future scenario would have a good reference to follow and if needed a new
utility can be created at that time. 

Reinette
  

Patch

diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index efe0c30d3a12..404dd9c472c7 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -146,6 +146,24 @@  static inline struct rmid_entry *__rmid_entry(u32 rmid)
 	return entry;
 }

+static u64 __rmid_read(u32 rmid, enum resctrl_event_id eventid)
+{
+	u64 val;
+
+	/*
+	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
+	 * with a valid event code for supported resource type and the bits
+	 * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
+	 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
+	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
+	 * are error bits.
+	 */
+	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
+	rdmsrl(MSR_IA32_QM_CTR, val);
+
+	return val;
+}
+
 static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom,
 						 u32 rmid,
 						 enum resctrl_event_id eventid)
@@ -170,10 +188,17 @@  void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
 {
 	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
 	struct arch_mbm_state *am;
+	uint64_t val;

 	am = get_arch_mbm_state(hw_dom, rmid, eventid);
-	if (am)
+	if (am) {
 		memset(am, 0, sizeof(*am));
+
+		/* Record any initial, non-zero count value. */
+		val = __rmid_read(rmid, eventid);
+		if (!(val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)))
+			am->prev_msr = val;
+	}
 }

 static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
@@ -195,17 +220,7 @@  int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
 	if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
 		return -EINVAL;

-	/*
-	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
-	 * with a valid event code for supported resource type and the bits
-	 * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
-	 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
-	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
-	 * are error bits.
-	 */
-	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
-	rdmsrl(MSR_IA32_QM_CTR, msr_val);
-
+	msr_val = __rmid_read(rmid, eventid);
 	if (msr_val & RMID_VAL_ERROR)
 		return -EIO;
 	if (msr_val & RMID_VAL_UNAVAIL)