x86/resctrl: mba_MBps: Fall back to total b/w if local b/w unavailable

Message ID 20231024181600.8270-1-tony.luck@intel.com
State New
Headers
Series x86/resctrl: mba_MBps: Fall back to total b/w if local b/w unavailable |

Commit Message

Luck, Tony Oct. 24, 2023, 6:16 p.m. UTC
  On Intel the various resource director technology (RDT) features are all
orthogonal and independently enumerated. Thus it is possible to have
a system that  provides "total" memory bandwidth measurements without
providing "local" bandwidth measurements.

If local bandwidth measurement is not available, do not give up on
providing the "mba_MBps" feedback option completely, make the code fall
back to using total bandwidth.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/resctrl/monitor.c  | 34 ++++++++++++++++----------
 arch/x86/kernel/cpu/resctrl/rdtgroup.c |  2 +-
 2 files changed, 22 insertions(+), 14 deletions(-)
  

Comments

Luck, Tony Oct. 24, 2023, 6:24 p.m. UTC | #1
> If local bandwidth measurement is not available, do not give up on
> providing the "mba_MBps" feedback option completely, make the code fall
> back to using total bandwidth.

N.B. I don't have a system yet that does this. Tested using boot command line
"clearcpuid=cqm_mbm_local" argument to the kernel to fake the non-prescence
of local bandwidth monitoring.

-Tony
  
Moger, Babu Oct. 24, 2023, 11:20 p.m. UTC | #2
Hi Tony,

> -----Original Message-----
> From: Tony Luck <tony.luck@intel.com>
> Sent: Tuesday, October 24, 2023 1:16 PM
> To: Fenghua Yu <fenghua.yu@intel.com>; Reinette Chatre
> <reinette.chatre@intel.com>; Peter Newman <peternewman@google.com>;
> Jonathan Corbet <corbet@lwn.net>; Shuah Khan <skhan@linuxfoundation.org>;
> x86@kernel.org
> Cc: Shaopeng Tan <tan.shaopeng@fujitsu.com>; James Morse
> <james.morse@arm.com>; Jamie Iles <quic_jiles@quicinc.com>; Moger, Babu
> <Babu.Moger@amd.com>; Randy Dunlap <rdunlap@infradead.org>; linux-
> kernel@vger.kernel.org; linux-doc@vger.kernel.org; patches@lists.linux.dev;
> Tony Luck <tony.luck@intel.com>
> Subject: [PATCH] x86/resctrl: mba_MBps: Fall back to total b/w if local b/w
> unavailable
> 
> On Intel the various resource director technology (RDT) features are all
> orthogonal and independently enumerated. Thus it is possible to have a system
> that  provides "total" memory bandwidth measurements without providing
> "local" bandwidth measurements.
> 
> If local bandwidth measurement is not available, do not give up on providing
> the "mba_MBps" feedback option completely, make the code fall back to using
> total bandwidth.


Is this customer requirement ?
What do you mean by " If local bandwidth measurement is not available" ?
Is the hardware supports only total bandwidth and not local?

It can get real ugly if we try to handle one special case.

thanks
Babu

> 
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>  arch/x86/kernel/cpu/resctrl/monitor.c  | 34 ++++++++++++++++----------
> arch/x86/kernel/cpu/resctrl/rdtgroup.c |  2 +-
>  2 files changed, 22 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c
> b/arch/x86/kernel/cpu/resctrl/monitor.c
> index f136ac046851..3b9531cce807 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -418,6 +418,14 @@ static int __mon_event_count(u32 rmid, struct
> rmid_read *rr)
>  	return 0;
>  }
> 
> +static struct mbm_state *get_mbm_data(struct rdt_domain *dom_mbm, int
> +rmid) {
> +	if (is_mbm_local_enabled())
> +		return &dom_mbm->mbm_local[rmid];
> +
> +	return &dom_mbm->mbm_total[rmid];
> +}
> +
>  /*
>   * mbm_bw_count() - Update bw count from values previously read by
>   *		    __mon_event_count().
> @@ -431,7 +439,7 @@ static int __mon_event_count(u32 rmid, struct
> rmid_read *rr)
>   */
>  static void mbm_bw_count(u32 rmid, struct rmid_read *rr)  {
> -	struct mbm_state *m = &rr->d->mbm_local[rmid];
> +	struct mbm_state *m = get_mbm_data(rr->d, rmid);
>  	u64 cur_bw, bytes, cur_bytes;
> 
>  	cur_bytes = rr->val;
> @@ -526,14 +534,14 @@ static void update_mba_bw(struct rdtgroup *rgrp,
> struct rdt_domain *dom_mbm)
>  	struct list_head *head;
>  	struct rdtgroup *entry;
> 
> -	if (!is_mbm_local_enabled())
> +	if (!is_mbm_enabled())
>  		return;
> 
>  	r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
> 
>  	closid = rgrp->closid;
>  	rmid = rgrp->mon.rmid;
> -	pmbm_data = &dom_mbm->mbm_local[rmid];
> +	pmbm_data = get_mbm_data(dom_mbm, rmid);
> 
>  	dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
>  	if (!dom_mba) {
> @@ -553,7 +561,7 @@ static void update_mba_bw(struct rdtgroup *rgrp,
> struct rdt_domain *dom_mbm)
>  	 */
>  	head = &rgrp->mon.crdtgrp_list;
>  	list_for_each_entry(entry, head, mon.crdtgrp_list) {
> -		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
> +		cmbm_data = get_mbm_data(dom_mbm, entry->mon.rmid);
>  		cur_bw += cmbm_data->prev_bw;
>  		delta_bw += cmbm_data->delta_bw;
>  	}
> @@ -595,7 +603,7 @@ static void update_mba_bw(struct rdtgroup *rgrp,
> struct rdt_domain *dom_mbm)
>  	 */
>  	pmbm_data->delta_comp = true;
>  	list_for_each_entry(entry, head, mon.crdtgrp_list) {
> -		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
> +		cmbm_data = get_mbm_data(dom_mbm, entry->mon.rmid);
>  		cmbm_data->delta_comp = true;
>  	}
>  }
> @@ -621,15 +629,15 @@ static void mbm_update(struct rdt_resource *r, struct
> rdt_domain *d, int rmid)
>  		rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
>  		rr.val = 0;
>  		__mon_event_count(rmid, &rr);
> -
> -		/*
> -		 * Call the MBA software controller only for the
> -		 * control groups and when user has enabled
> -		 * the software controller explicitly.
> -		 */
> -		if (is_mba_sc(NULL))
> -			mbm_bw_count(rmid, &rr);
>  	}
> +
> +	/*
> +	 * Call the MBA software controller only for the
> +	 * control groups and when user has enabled
> +	 * the software controller explicitly.
> +	 */
> +	if (is_mba_sc(NULL))
> +		mbm_bw_count(rmid, &rr);
>  }
> 
>  /*
> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> index 69a1de92384a..0c4f8a1b8df0 100644
> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> @@ -2294,7 +2294,7 @@ static bool supports_mba_mbps(void)  {
>  	struct rdt_resource *r =
> &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
> 
> -	return (is_mbm_local_enabled() &&
> +	return (is_mbm_enabled() &&
>  		r->alloc_capable && is_mba_linear());  }
> 
> --
> 2.41.0
  
Luck, Tony Oct. 24, 2023, 11:43 p.m. UTC | #3
> Is this customer requirement ?

Any customer using the mba_MBps feedback mount option will need this
on platforms that don't support local bandwidth measurement.

> What do you mean by " If local bandwidth measurement is not available" ?
> Is the hardware supports only total bandwidth and not local?

There's going to be an Intel CPU that will only provide "total" bandwidth.

The CPUID enumeration in (CPUID.(EAX=0FH, ECX=1H) ).EDX{2}
will be "0" indicating that the local mbm monitor event is not supported.

> It can get real ugly if we try to handle one special case.

Hard to predict the future (I didn't see this coming, or I'd have had Vikas
implement the fallback in the original mba_MBps code). But I don't believe
this will be a one-off special case.

I'm also wondering why this feedback loop picked "local" rather than "total".
I dug into the e-mail archives, and I don't see any discussion. There's just
an RFC series, and then the v2 series was applied with a few small suggestions
from Thomas to make things cleaner..

-Tony
  
Peter Newman Oct. 25, 2023, 12:46 p.m. UTC | #4
Hi Tony,

On Tue, Oct 24, 2023 at 8:16 PM Tony Luck <tony.luck@intel.com> wrote:
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -418,6 +418,14 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
>         return 0;
>  }
>
> +static struct mbm_state *get_mbm_data(struct rdt_domain *dom_mbm, int rmid)
> +{
> +       if (is_mbm_local_enabled())
> +               return &dom_mbm->mbm_local[rmid];
> +
> +       return &dom_mbm->mbm_total[rmid];
> +}

That looks very similar to the get_mbm_state() function I added to
this same file recently:

https://lore.kernel.org/all/20221220164132.443083-2-peternewman%40google.com

I think the name you picked is misleadingly general. "local if
available, otherwise total" seems to be a choice specific to the mbps
controller. I think these functions should be reconciled a little
better.

Other than that, looks good.

Reviewed-by: Peter Newman <peternewman@google.com>
  
Moger, Babu Oct. 25, 2023, 4:01 p.m. UTC | #5
Hi Tony,

On 10/24/23 18:43, Luck, Tony wrote:
>> Is this customer requirement ?
> 
> Any customer using the mba_MBps feedback mount option will need this
> on platforms that don't support local bandwidth measurement.
> 
>> What do you mean by " If local bandwidth measurement is not available" ?
>> Is the hardware supports only total bandwidth and not local?
> 
> There's going to be an Intel CPU that will only provide "total" bandwidth.

ok.

Why dont you use get_mbm_state which is already available instead of
writing another function(get_mbm_data).

You can pass evtid, rmid, domain information. Decide the evtid based on
what is available. I think that will make code simpler.

> 
> The CPUID enumeration in (CPUID.(EAX=0FH, ECX=1H) ).EDX{2}
> will be "0" indicating that the local mbm monitor event is not supported.
> 
>> It can get real ugly if we try to handle one special case.
> 
> Hard to predict the future (I didn't see this coming, or I'd have had Vikas
> implement the fallback in the original mba_MBps code). But I don't believe
> this will be a one-off special case.
> 
> I'm also wondering why this feedback loop picked "local" rather than "total".
> I dug into the e-mail archives, and I don't see any discussion. There's just
> an RFC series, and then the v2 series was applied with a few small suggestions
> from Thomas to make things cleaner..

May be MSR write which feedback loop does only has local effect. This will
be interesting to know.
  
Luck, Tony Oct. 25, 2023, 7:38 p.m. UTC | #6
On Wed, Oct 25, 2023 at 02:46:53PM +0200, Peter Newman wrote:
> Hi Tony,
> 
> On Tue, Oct 24, 2023 at 8:16 PM Tony Luck <tony.luck@intel.com> wrote:
> > --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> > +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> > @@ -418,6 +418,14 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
> >         return 0;
> >  }
> >
> > +static struct mbm_state *get_mbm_data(struct rdt_domain *dom_mbm, int rmid)
> > +{
> > +       if (is_mbm_local_enabled())
> > +               return &dom_mbm->mbm_local[rmid];
> > +
> > +       return &dom_mbm->mbm_total[rmid];
> > +}
> 
> That looks very similar to the get_mbm_state() function I added to
> this same file recently:
> 
> https://lore.kernel.org/all/20221220164132.443083-2-peternewman%40google.com
> 
> I think the name you picked is misleadingly general. "local if
> available, otherwise total" seems to be a choice specific to the mbps
> controller. I think these functions should be reconciled a little
> better.
> 

Peter (and Babu, who made the same point about get_mbm_state().

Do you want to see your function extended to do the "pick an MBM event?"

I could add a s/w defined "event" to the enum resctrl_event_id and
extend get_mbm_state() like this:


static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 rmid,
				       enum resctrl_event_id evtid)
{
	switch (evtid) {
	case QOS_L3_MBM_TOTAL_EVENT_ID:
		return &d->mbm_total[rmid];
	case QOS_L3_MBM_LOCAL_EVENT_ID:
		return &d->mbm_local[rmid];
+	case QOS_L3_MBM_LOCAL_OR_TOTAL_EVENT_ID:
+		if (is_mbm_local_enabled())
+			return &d->mbm_local[rmid];
+		if (is_mbm_total_enabled())
+			return &d->mbm_total[rmid];
+		fallthrough;
	default:
		return NULL;
	}
}

Is this the direction you are thinking of?

Callers then look like:

static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
{
	struct mbm_state *m = get_mbm_state(rr->d, rmid, QOS_L3_MBM_LOCAL_OR_TOTAL_EVENT_ID);
	u64 cur_bw, bytes, cur_bytes;

similar for the other three places where this is needed.

Any suggestions on how "QOS_L3_MBM_LOCAL_OR_TOTAL_EVENT_ID" could be
abbreviated, or just have some different, but descriptive, name?

-Tony
  
Moger, Babu Oct. 25, 2023, 8:39 p.m. UTC | #7
Hi Tony,

On 10/25/23 14:38, Tony Luck wrote:
> On Wed, Oct 25, 2023 at 02:46:53PM +0200, Peter Newman wrote:
>> Hi Tony,
>>
>> On Tue, Oct 24, 2023 at 8:16 PM Tony Luck <tony.luck@intel.com> wrote:
>>> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
>>> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
>>> @@ -418,6 +418,14 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
>>>         return 0;
>>>  }
>>>
>>> +static struct mbm_state *get_mbm_data(struct rdt_domain *dom_mbm, int rmid)
>>> +{
>>> +       if (is_mbm_local_enabled())
>>> +               return &dom_mbm->mbm_local[rmid];
>>> +
>>> +       return &dom_mbm->mbm_total[rmid];
>>> +}
>>
>> That looks very similar to the get_mbm_state() function I added to
>> this same file recently:
>>
>> https://lore.kernel.org/all/20221220164132.443083-2-peternewman%40google.com
>>
>> I think the name you picked is misleadingly general. "local if
>> available, otherwise total" seems to be a choice specific to the mbps
>> controller. I think these functions should be reconciled a little
>> better.
>>
> 
> Peter (and Babu, who made the same point about get_mbm_state().
> 
> Do you want to see your function extended to do the "pick an MBM event?"
> 
> I could add a s/w defined "event" to the enum resctrl_event_id and
> extend get_mbm_state() like this:
> 
> 
> static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 rmid,
> 				       enum resctrl_event_id evtid)
> {
> 	switch (evtid) {
> 	case QOS_L3_MBM_TOTAL_EVENT_ID:
> 		return &d->mbm_total[rmid];
> 	case QOS_L3_MBM_LOCAL_EVENT_ID:
> 		return &d->mbm_local[rmid];
> +	case QOS_L3_MBM_LOCAL_OR_TOTAL_EVENT_ID:
> +		if (is_mbm_local_enabled())
> +			return &d->mbm_local[rmid];
> +		if (is_mbm_total_enabled())
> +			return &d->mbm_total[rmid];
> +		fallthrough;
> 	default:
> 		return NULL;
> 	}
> }
> 
> Is this the direction you are thinking of?

No. I was not thinking bit different.

You need these changes in only two functions, mbm_bw_count and
update_mba_bw. You decide which event you want to use based on availability,

Something like this. I updated mbm_bw_count.

diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c
b/arch/x86/kernel/cpu/resctrl/monitor.c
index 0ad23475fe16..302993e4fbc3 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -436,8 +436,16 @@ static int __mon_event_count(u32 rmid, struct
rmid_read *rr)
  */
 static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
 {
-       struct mbm_state *m = &rr->d->mbm_local[rmid];
        u64 cur_bw, bytes, cur_bytes;
+       struct mbm_state *m;
+       int evtid;
+
+       if (is_mbm_local_enabled())
+               evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
+       else
+               evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
+
+       m = get_mbm_state(rr->d, rmid, evtid);

        cur_bytes = rr->val;
        bytes = cur_bytes - m->prev_bw_bytes;


Will this work?

Thanks
Babu


> 
> Callers then look like:
> 
> static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
> {
> 	struct mbm_state *m = get_mbm_state(rr->d, rmid, QOS_L3_MBM_LOCAL_OR_TOTAL_EVENT_ID);
> 	u64 cur_bw, bytes, cur_bytes;
> 
> similar for the other three places where this is needed.
> 
> Any suggestions on how "QOS_L3_MBM_LOCAL_OR_TOTAL_EVENT_ID" could be
> abbreviated, or just have some different, but descriptive, name?
> 
> -Tony
  
Moger, Babu Oct. 25, 2023, 8:42 p.m. UTC | #8
Tony,

On 10/25/23 15:39, Moger, Babu wrote:
> Hi Tony,
> 
> On 10/25/23 14:38, Tony Luck wrote:
>> On Wed, Oct 25, 2023 at 02:46:53PM +0200, Peter Newman wrote:
>>> Hi Tony,
>>>
>>> On Tue, Oct 24, 2023 at 8:16 PM Tony Luck <tony.luck@intel.com> wrote:
>>>> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
>>>> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
>>>> @@ -418,6 +418,14 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
>>>>         return 0;
>>>>  }
>>>>
>>>> +static struct mbm_state *get_mbm_data(struct rdt_domain *dom_mbm, int rmid)
>>>> +{
>>>> +       if (is_mbm_local_enabled())
>>>> +               return &dom_mbm->mbm_local[rmid];
>>>> +
>>>> +       return &dom_mbm->mbm_total[rmid];
>>>> +}
>>>
>>> That looks very similar to the get_mbm_state() function I added to
>>> this same file recently:
>>>
>>> https://lore.kernel.org/all/20221220164132.443083-2-peternewman%40google.com
>>>
>>> I think the name you picked is misleadingly general. "local if
>>> available, otherwise total" seems to be a choice specific to the mbps
>>> controller. I think these functions should be reconciled a little
>>> better.
>>>
>>
>> Peter (and Babu, who made the same point about get_mbm_state().
>>
>> Do you want to see your function extended to do the "pick an MBM event?"
>>
>> I could add a s/w defined "event" to the enum resctrl_event_id and
>> extend get_mbm_state() like this:
>>
>>
>> static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 rmid,
>> 				       enum resctrl_event_id evtid)
>> {
>> 	switch (evtid) {
>> 	case QOS_L3_MBM_TOTAL_EVENT_ID:
>> 		return &d->mbm_total[rmid];
>> 	case QOS_L3_MBM_LOCAL_EVENT_ID:
>> 		return &d->mbm_local[rmid];
>> +	case QOS_L3_MBM_LOCAL_OR_TOTAL_EVENT_ID:
>> +		if (is_mbm_local_enabled())
>> +			return &d->mbm_local[rmid];
>> +		if (is_mbm_total_enabled())
>> +			return &d->mbm_total[rmid];
>> +		fallthrough;
>> 	default:
>> 		return NULL;
>> 	}
>> }
>>
>> Is this the direction you are thinking of?
> 
> No. I was not thinking bit different.

I meant, I was thinking bit different.

> 
> You need these changes in only two functions, mbm_bw_count and
> update_mba_bw. You decide which event you want to use based on availability,
> 
> Something like this. I updated mbm_bw_count.
> 
> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c
> b/arch/x86/kernel/cpu/resctrl/monitor.c
> index 0ad23475fe16..302993e4fbc3 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -436,8 +436,16 @@ static int __mon_event_count(u32 rmid, struct
> rmid_read *rr)
>   */
>  static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
>  {
> -       struct mbm_state *m = &rr->d->mbm_local[rmid];
>         u64 cur_bw, bytes, cur_bytes;
> +       struct mbm_state *m;
> +       int evtid;
> +
> +       if (is_mbm_local_enabled())
> +               evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
> +       else
> +               evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
> +
> +       m = get_mbm_state(rr->d, rmid, evtid);
> 
>         cur_bytes = rr->val;
>         bytes = cur_bytes - m->prev_bw_bytes;
> 
> 
> Will this work?
> 
> Thanks
> Babu
> 
> 
>>
>> Callers then look like:
>>
>> static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
>> {
>> 	struct mbm_state *m = get_mbm_state(rr->d, rmid, QOS_L3_MBM_LOCAL_OR_TOTAL_EVENT_ID);
>> 	u64 cur_bw, bytes, cur_bytes;
>>
>> similar for the other three places where this is needed.
>>
>> Any suggestions on how "QOS_L3_MBM_LOCAL_OR_TOTAL_EVENT_ID" could be
>> abbreviated, or just have some different, but descriptive, name?
>>
>> -Tony
>
  
Luck, Tony Oct. 25, 2023, 8:52 p.m. UTC | #9
On Wed, Oct 25, 2023 at 03:42:14PM -0500, Moger, Babu wrote:
> I meant, I was thinking bit different.
> 
> > 
> > You need these changes in only two functions, mbm_bw_count and
> > update_mba_bw. You decide which event you want to use based on availability,
> > 
> > Something like this. I updated mbm_bw_count.
> > 
> > diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c
> > b/arch/x86/kernel/cpu/resctrl/monitor.c
> > index 0ad23475fe16..302993e4fbc3 100644
> > --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> > +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> > @@ -436,8 +436,16 @@ static int __mon_event_count(u32 rmid, struct
> > rmid_read *rr)
> >   */
> >  static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
> >  {
> > -       struct mbm_state *m = &rr->d->mbm_local[rmid];
> >         u64 cur_bw, bytes, cur_bytes;
> > +       struct mbm_state *m;
> > +       int evtid;
> > +
> > +       if (is_mbm_local_enabled())
> > +               evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
> > +       else
> > +               evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
> > +
> > +       m = get_mbm_state(rr->d, rmid, evtid);

Ok. Yes. That seems simpler.

Maybe I should just set a global "mbm_evtid" at mount
time. No need to check every time to see if is_mbm_local_enabled()
somehow changed and local b/w measurements were suddenly
available!

-Tony
  
Peter Newman Oct. 25, 2023, 9:06 p.m. UTC | #10
Hi Tony,

On Wed, Oct 25, 2023, 21:38 Tony Luck <tony.luck@intel.com> wrote:
>
> On Wed, Oct 25, 2023 at 02:46:53PM +0200, Peter Newman wrote:
>
> > > +static struct mbm_state *get_mbm_data(struct rdt_domain *dom_mbm, int rmid)
> > > +{
> > > +       if (is_mbm_local_enabled())
> > > +               return &dom_mbm->mbm_local[rmid];
> > > +
> > > +       return &dom_mbm->mbm_total[rmid];
> > > +}
> >
> > That looks very similar to the get_mbm_state() function I added to
> > this same file recently:
> >
> > https://lore.kernel.org/all/20221220164132.443083-2-peternewman%40google.com
> >
> > I think the name you picked is misleadingly general. "local if
> > available, otherwise total" seems to be a choice specific to the mbps
> > controller. I think these functions should be reconciled a little
> > better.
> >
>
> Peter (and Babu, who made the same point about get_mbm_state().
>
> Do you want to see your function extended to do the "pick an MBM event?"

What I meant was I think it would be enough to just give the function
you added a name that's more specific to the Mbps controller use case.
For example, get_mba_sc_mbm_state().

It's only problematic that you added a function with an equivalent
name to an existing function that does something different.

-Peter
  
Moger, Babu Oct. 25, 2023, 11:41 p.m. UTC | #11
Hi Tony,

On 10/25/2023 3:52 PM, Tony Luck wrote:
> On Wed, Oct 25, 2023 at 03:42:14PM -0500, Moger, Babu wrote:
>> I meant, I was thinking bit different.
>>
>>> You need these changes in only two functions, mbm_bw_count and
>>> update_mba_bw. You decide which event you want to use based on availability,
>>>
>>> Something like this. I updated mbm_bw_count.
>>>
>>> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c
>>> b/arch/x86/kernel/cpu/resctrl/monitor.c
>>> index 0ad23475fe16..302993e4fbc3 100644
>>> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
>>> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
>>> @@ -436,8 +436,16 @@ static int __mon_event_count(u32 rmid, struct
>>> rmid_read *rr)
>>>    */
>>>   static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
>>>   {
>>> -       struct mbm_state *m = &rr->d->mbm_local[rmid];
>>>          u64 cur_bw, bytes, cur_bytes;
>>> +       struct mbm_state *m;
>>> +       int evtid;
>>> +
>>> +       if (is_mbm_local_enabled())
>>> +               evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
>>> +       else
>>> +               evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
>>> +
>>> +       m = get_mbm_state(rr->d, rmid, evtid);
> Ok. Yes. That seems simpler.
>
> Maybe I should just set a global "mbm_evtid" at mount

Lets not make it global yet. This is only affecting couple of functions 
when mba_MPps is enabled.

> time. No need to check every time to see if is_mbm_local_enabled()
> somehow changed and local b/w measurements were suddenly
> available!

What changed suddenly? Can you please elaborate.

Thanks

Babu
  
Luck, Tony Oct. 26, 2023, 12:07 a.m. UTC | #12
> Lets not make it global yet. This is only affecting couple of functions 
> when mba_MPps is enabled.

See v2 (just posted). I made it "static" in monitor.c since both the
functions that need it are in that file.

>> time. No need to check every time to see if is_mbm_local_enabled()
>> somehow changed and local b/w measurements were suddenly
>> available!
>
> What changed suddenly? Can you please elaborate.

Nothing is going to change. If the system doesn't support local memory
bandwidth reporting when it booted, it isn't going to start doing so.
Same in reverse. If you have local bandwidth reporting, it won't
go away.

So at resctrl init time when discovering which of local & total are supported,
just save the event id to use for mba_MBps at that point.

-Tony
  
Moger, Babu Oct. 26, 2023, 1:55 p.m. UTC | #13
Hi Tony,

On 10/25/23 16:06, Peter Newman wrote:
> Hi Tony,
> 
> On Wed, Oct 25, 2023, 21:38 Tony Luck <tony.luck@intel.com> wrote:
>>
>> On Wed, Oct 25, 2023 at 02:46:53PM +0200, Peter Newman wrote:
>>
>>>> +static struct mbm_state *get_mbm_data(struct rdt_domain *dom_mbm, int rmid)
>>>> +{
>>>> +       if (is_mbm_local_enabled())
>>>> +               return &dom_mbm->mbm_local[rmid];
>>>> +
>>>> +       return &dom_mbm->mbm_total[rmid];
>>>> +}
>>>
>>> That looks very similar to the get_mbm_state() function I added to
>>> this same file recently:
>>>
>>> https://lore.kernel.org/all/20221220164132.443083-2-peternewman%40google.com
>>>
>>> I think the name you picked is misleadingly general. "local if
>>> available, otherwise total" seems to be a choice specific to the mbps
>>> controller. I think these functions should be reconciled a little
>>> better.
>>>
>>
>> Peter (and Babu, who made the same point about get_mbm_state().
>>
>> Do you want to see your function extended to do the "pick an MBM event?"
> 
> What I meant was I think it would be enough to just give the function
> you added a name that's more specific to the Mbps controller use case.
> For example, get_mba_sc_mbm_state().

I actually liked this idea. Add a new function get_mba_sc_mbm_state. That
way we exactly know why this function is used. I see you already sent a v2
making the event global. Making it global may not be good idea. Can you
please update the patch and resend. Also please add the comment about why
you are adding that function.
  
Luck, Tony Oct. 26, 2023, 4:09 p.m. UTC | #14
> > What I meant was I think it would be enough to just give the function
> > you added a name that's more specific to the Mbps controller use case.
> > For example, get_mba_sc_mbm_state().
>
> I actually liked this idea. Add a new function get_mba_sc_mbm_state. That
> way we exactly know why this function is used. I see you already sent a v2
> making the event global. Making it global may not be good idea. Can you
> please update the patch and resend. Also please add the comment about why
> you are adding that function.

Can you explain why you don't like the global? If there is a better name for it,
or a better comment for what it does, or you think the code that sets the value
could be clearer, then I'm happy to make changes there.

Which events are supported by a system is a static property. Figuring out once
at "init" time which event to use for mba_MBps seems a better choice than
re-checking for each of possibly hundreds of RMIDs every second. Even though
the check is cheap, it is utterly pointless.

-Tony
  
Moger, Babu Oct. 26, 2023, 5:19 p.m. UTC | #15
Hi Tony,

On 10/26/23 11:09, Luck, Tony wrote:
>>> What I meant was I think it would be enough to just give the function
>>> you added a name that's more specific to the Mbps controller use case.
>>> For example, get_mba_sc_mbm_state().
>>
>> I actually liked this idea. Add a new function get_mba_sc_mbm_state. That
>> way we exactly know why this function is used. I see you already sent a v2
>> making the event global. Making it global may not be good idea. Can you
>> please update the patch and resend. Also please add the comment about why
>> you are adding that function.
> 
> Can you explain why you don't like the global? If there is a better name for it,
> or a better comment for what it does, or you think the code that sets the value
> could be clearer, then I'm happy to make changes there.

My theory is always try to localize the changes and avoid global variables
when there are other ways to do the same thing. It may not be strong argument.
> 
> Which events are supported by a system is a static property. Figuring out once
> at "init" time which event to use for mba_MBps seems a better choice than
> re-checking for each of possibly hundreds of RMIDs every second. Even though
> the check is cheap, it is utterly pointless.

mbm_update happens here only to the active group (not on all the available
rmids).
Also, I am not clear about weather this is going fix your problem.
You are setting the MSR limit based on total bandwidth. The MSR you are
writing may only have the local socket effect. In cases where all the
memory is allocated from remote socket then writing the MSR may not have
any effect.
Also you said you don't have the hardware to verify. Its always good to
verify if is really fixing the problem. my 02 cents.
Thanks
Babu Moger
  
Luck, Tony Oct. 26, 2023, 7:54 p.m. UTC | #16
On Thu, Oct 26, 2023 at 12:19:14PM -0500, Moger, Babu wrote:
> Hi Tony,
> 
> On 10/26/23 11:09, Luck, Tony wrote:
> >>> What I meant was I think it would be enough to just give the function
> >>> you added a name that's more specific to the Mbps controller use case.
> >>> For example, get_mba_sc_mbm_state().
> >>
> >> I actually liked this idea. Add a new function get_mba_sc_mbm_state. That
> >> way we exactly know why this function is used. I see you already sent a v2
> >> making the event global. Making it global may not be good idea. Can you
> >> please update the patch and resend. Also please add the comment about why
> >> you are adding that function.
> > 
> > Can you explain why you don't like the global? If there is a better name for it,
> > or a better comment for what it does, or you think the code that sets the value
> > could be clearer, then I'm happy to make changes there.
> 
> My theory is always try to localize the changes and avoid global variables
> when there are other ways to do the same thing. It may not be strong argument.

A good theory. I do this too. But it seems I'm more likely to go with
global variables if the cost of avoiding them is high. But "cost" is
a very subjective thing.

> > Which events are supported by a system is a static property. Figuring out once
> > at "init" time which event to use for mba_MBps seems a better choice than
> > re-checking for each of possibly hundreds of RMIDs every second. Even though
> > the check is cheap, it is utterly pointless.
> 
> mbm_update happens here only to the active group (not on all the available
> rmids).

mbaMBps needs to get data from all active RMIDs to provide input to
the feedback loop. That might be a lot of RMIDs if many jobs are being
monitored independently (which I believe is a common mode of operation).

> Also, I am not clear about weather this is going fix your problem.
> You are setting the MSR limit based on total bandwidth. The MSR you are
> writing may only have the local socket effect. In cases where all the
> memory is allocated from remote socket then writing the MSR may not have
> any effect.

Intel MBA controls operate on all memory operations that miss the L3
cache (whether they are going to a local memory controller, or across
a UPI link to a memory controller on another socket).

> Also you said you don't have the hardware to verify. Its always good to
> verify if is really fixing the problem. my 02 cents.

I don't have hardare that enforces this. But Linux does have a boot
option clearcpuid=cqm_mbm_local to tell Linux that the system doesn't
provide a local counter. I've been using that for all my testing.

-Tony
  

Patch

diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index f136ac046851..3b9531cce807 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -418,6 +418,14 @@  static int __mon_event_count(u32 rmid, struct rmid_read *rr)
 	return 0;
 }
 
+static struct mbm_state *get_mbm_data(struct rdt_domain *dom_mbm, int rmid)
+{
+	if (is_mbm_local_enabled())
+		return &dom_mbm->mbm_local[rmid];
+
+	return &dom_mbm->mbm_total[rmid];
+}
+
 /*
  * mbm_bw_count() - Update bw count from values previously read by
  *		    __mon_event_count().
@@ -431,7 +439,7 @@  static int __mon_event_count(u32 rmid, struct rmid_read *rr)
  */
 static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
 {
-	struct mbm_state *m = &rr->d->mbm_local[rmid];
+	struct mbm_state *m = get_mbm_data(rr->d, rmid);
 	u64 cur_bw, bytes, cur_bytes;
 
 	cur_bytes = rr->val;
@@ -526,14 +534,14 @@  static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
 	struct list_head *head;
 	struct rdtgroup *entry;
 
-	if (!is_mbm_local_enabled())
+	if (!is_mbm_enabled())
 		return;
 
 	r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
 
 	closid = rgrp->closid;
 	rmid = rgrp->mon.rmid;
-	pmbm_data = &dom_mbm->mbm_local[rmid];
+	pmbm_data = get_mbm_data(dom_mbm, rmid);
 
 	dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
 	if (!dom_mba) {
@@ -553,7 +561,7 @@  static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
 	 */
 	head = &rgrp->mon.crdtgrp_list;
 	list_for_each_entry(entry, head, mon.crdtgrp_list) {
-		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
+		cmbm_data = get_mbm_data(dom_mbm, entry->mon.rmid);
 		cur_bw += cmbm_data->prev_bw;
 		delta_bw += cmbm_data->delta_bw;
 	}
@@ -595,7 +603,7 @@  static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
 	 */
 	pmbm_data->delta_comp = true;
 	list_for_each_entry(entry, head, mon.crdtgrp_list) {
-		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
+		cmbm_data = get_mbm_data(dom_mbm, entry->mon.rmid);
 		cmbm_data->delta_comp = true;
 	}
 }
@@ -621,15 +629,15 @@  static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
 		rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
 		rr.val = 0;
 		__mon_event_count(rmid, &rr);
-
-		/*
-		 * Call the MBA software controller only for the
-		 * control groups and when user has enabled
-		 * the software controller explicitly.
-		 */
-		if (is_mba_sc(NULL))
-			mbm_bw_count(rmid, &rr);
 	}
+
+	/*
+	 * Call the MBA software controller only for the
+	 * control groups and when user has enabled
+	 * the software controller explicitly.
+	 */
+	if (is_mba_sc(NULL))
+		mbm_bw_count(rmid, &rr);
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 69a1de92384a..0c4f8a1b8df0 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -2294,7 +2294,7 @@  static bool supports_mba_mbps(void)
 {
 	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
 
-	return (is_mbm_local_enabled() &&
+	return (is_mbm_enabled() &&
 		r->alloc_capable && is_mba_linear());
 }