[v5] x86/resctrl: Add event choices for mba_MBps

Message ID 20231201214737.104444-1-tony.luck@intel.com
State New
Headers
Series [v5] x86/resctrl: Add event choices for mba_MBps |

Commit Message

Luck, Tony Dec. 1, 2023, 9:47 p.m. UTC
  The MBA Software Controller(mba_sc) is a feedback loop that uses
measurements of local memory bandwidth to adjust MBA throttling levels to
keep workloads in a resctrl group within a target bandwidth set in the
schemata file.

But on Intel systems the memory bandwidth monitoring events are
independently enumerated. It is possible for a system to support
total memory bandwidth monitoring, but not support local bandwidth
monitoring. On such a system a user could not enable mba_sc mode.
Users will see this highly unhelpful error message from mount:

 # mount -t resctrl -o mba_MBps resctrl /sys/fs/resctrl
 mount: /sys/fs/resctrl: wrong fs type, bad option, bad superblock on
 resctrl, missing codepage or helper program, or other error.
 dmesg(1) may have more information after failed mount system call.

dmesg(1) does not provide any additional information.

Add a new mount option "mba_MBps_event=[local|total]" that allows
a user to specify which monitoring event to use. Also modify the
existing "mba_MBps" option to switch to total bandwidth monitoring
if local monitoring is not available.

Update the once-per-second polling code to use the chosen event (local
or total memory bandwidth).

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 Documentation/arch/x86/resctrl.rst     |  7 +++-
 include/linux/resctrl.h                |  2 ++
 arch/x86/kernel/cpu/resctrl/internal.h |  3 +-
 arch/x86/kernel/cpu/resctrl/monitor.c  | 21 ++++++-----
 arch/x86/kernel/cpu/resctrl/rdtgroup.c | 48 ++++++++++++++++++++------
 5 files changed, 58 insertions(+), 23 deletions(-)
  

Comments

Moger, Babu Dec. 4, 2023, 4:24 p.m. UTC | #1
Hi Tony,

You are intending to achieve two things at once here.
1. Adding new mount option
2. Changing behaviour for the current option.
I think you need to split this patch into two. Few comments below.

On 12/1/23 15:47, Tony Luck wrote:
> The MBA Software Controller(mba_sc) is a feedback loop that uses
> measurements of local memory bandwidth to adjust MBA throttling levels to
> keep workloads in a resctrl group within a target bandwidth set in the
> schemata file.
> 
> But on Intel systems the memory bandwidth monitoring events are
> independently enumerated. It is possible for a system to support
> total memory bandwidth monitoring, but not support local bandwidth
> monitoring. On such a system a user could not enable mba_sc mode.
> Users will see this highly unhelpful error message from mount:
> 
>  # mount -t resctrl -o mba_MBps resctrl /sys/fs/resctrl
>  mount: /sys/fs/resctrl: wrong fs type, bad option, bad superblock on
>  resctrl, missing codepage or helper program, or other error.
>  dmesg(1) may have more information after failed mount system call.
> 
> dmesg(1) does not provide any additional information.
> 
> Add a new mount option "mba_MBps_event=[local|total]" that allows
> a user to specify which monitoring event to use. Also modify the
> existing "mba_MBps" option to switch to total bandwidth monitoring
> if local monitoring is not available.

I am not sure why you need both these options. I feel you just need one of
these options.

Thanks
Babu

> 
> Update the once-per-second polling code to use the chosen event (local
> or total memory bandwidth).
> 
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>  Documentation/arch/x86/resctrl.rst     |  7 +++-
>  include/linux/resctrl.h                |  2 ++
>  arch/x86/kernel/cpu/resctrl/internal.h |  3 +-
>  arch/x86/kernel/cpu/resctrl/monitor.c  | 21 ++++++-----
>  arch/x86/kernel/cpu/resctrl/rdtgroup.c | 48 ++++++++++++++++++++------
>  5 files changed, 58 insertions(+), 23 deletions(-)
> 
> diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst
> index a6279df64a9d..f06cb189911a 100644
> --- a/Documentation/arch/x86/resctrl.rst
> +++ b/Documentation/arch/x86/resctrl.rst
> @@ -45,7 +45,12 @@ mount options are:
>  	Enable code/data prioritization in L2 cache allocations.
>  "mba_MBps":
>  	Enable the MBA Software Controller(mba_sc) to specify MBA
> -	bandwidth in MBps
> +	bandwidth in MBps. Defaults to using MBM local bandwidth,
> +	but will use total bandwidth on systems that do not support
> +	local bandwidth monitoring.
> +"mba_MBps_event=[local|total]":
> +	Enable the MBA Software Controller(mba_sc) with a specific
> +	MBM event as input to the feedback loop.
>  "debug":
>  	Make debug files accessible. Available debug files are annotated with
>  	"Available only with debug option".
> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
> index 66942d7fba7f..1feb3b2e64fa 100644
> --- a/include/linux/resctrl.h
> +++ b/include/linux/resctrl.h
> @@ -129,6 +129,7 @@ enum membw_throttle_mode {
>   * @throttle_mode:	Bandwidth throttling mode when threads request
>   *			different memory bandwidths
>   * @mba_sc:		True if MBA software controller(mba_sc) is enabled
> + * @mba_mbps_event:	Event (local or total) for mba_sc
>   * @mb_map:		Mapping of memory B/W percentage to memory B/W delay
>   */
>  struct resctrl_membw {
> @@ -138,6 +139,7 @@ struct resctrl_membw {
>  	bool				arch_needs_linear;
>  	enum membw_throttle_mode	throttle_mode;
>  	bool				mba_sc;
> +	enum resctrl_event_id		mba_mbps_event;
>  	u32				*mb_map;
>  };
>  
> diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
> index a4f1aa15f0a2..8b9b8f664324 100644
> --- a/arch/x86/kernel/cpu/resctrl/internal.h
> +++ b/arch/x86/kernel/cpu/resctrl/internal.h
> @@ -58,7 +58,8 @@ struct rdt_fs_context {
>  	struct kernfs_fs_context	kfc;
>  	bool				enable_cdpl2;
>  	bool				enable_cdpl3;
> -	bool				enable_mba_mbps;
> +	bool				enable_mba_mbps_local;
> +	bool				enable_mba_mbps_total;
>  	bool				enable_debug;
>  };
>  
> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
> index f136ac046851..d9e590f1cbc3 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -431,9 +431,10 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
>   */
>  static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
>  {
> -	struct mbm_state *m = &rr->d->mbm_local[rmid];
>  	u64 cur_bw, bytes, cur_bytes;
> +	struct mbm_state *m;
>  
> +	m = get_mbm_state(rr->d, rmid, rr->evtid);
>  	cur_bytes = rr->val;
>  	bytes = cur_bytes - m->prev_bw_bytes;
>  	m->prev_bw_bytes = cur_bytes;
> @@ -521,19 +522,21 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
>  	u32 closid, rmid, cur_msr_val, new_msr_val;
>  	struct mbm_state *pmbm_data, *cmbm_data;
>  	u32 cur_bw, delta_bw, user_bw;
> +	enum resctrl_event_id evt_id;
>  	struct rdt_resource *r_mba;
>  	struct rdt_domain *dom_mba;
>  	struct list_head *head;
>  	struct rdtgroup *entry;
>  
> -	if (!is_mbm_local_enabled())
> +	if (!is_mbm_enabled())
>  		return;
>  
>  	r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
> +	evt_id = r_mba->membw.mba_mbps_event;
>  
>  	closid = rgrp->closid;
>  	rmid = rgrp->mon.rmid;
> -	pmbm_data = &dom_mbm->mbm_local[rmid];
> +	pmbm_data = get_mbm_state(dom_mbm, rmid, evt_id);
>  
>  	dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
>  	if (!dom_mba) {
> @@ -553,7 +556,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
>  	 */
>  	head = &rgrp->mon.crdtgrp_list;
>  	list_for_each_entry(entry, head, mon.crdtgrp_list) {
> -		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
> +		cmbm_data = get_mbm_state(dom_mbm, entry->mon.rmid, evt_id);
>  		cur_bw += cmbm_data->prev_bw;
>  		delta_bw += cmbm_data->delta_bw;
>  	}
> @@ -616,18 +619,14 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
>  		rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
>  		rr.val = 0;
>  		__mon_event_count(rmid, &rr);
> +		if (is_mba_sc(NULL) && rr.evtid == r->membw.mba_mbps_event)
> +			mbm_bw_count(rmid, &rr);
>  	}
>  	if (is_mbm_local_enabled()) {
>  		rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
>  		rr.val = 0;
>  		__mon_event_count(rmid, &rr);
> -
> -		/*
> -		 * Call the MBA software controller only for the
> -		 * control groups and when user has enabled
> -		 * the software controller explicitly.
> -		 */
> -		if (is_mba_sc(NULL))
> +		if (is_mba_sc(NULL) && rr.evtid == r->membw.mba_mbps_event)
>  			mbm_bw_count(rmid, &rr);
>  	}
>  }
> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> index 69a1de92384a..79141d33d5b4 100644
> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> @@ -2294,7 +2294,7 @@ static bool supports_mba_mbps(void)
>  {
>  	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
>  
> -	return (is_mbm_local_enabled() &&
> +	return (is_mbm_enabled() &&
>  		r->alloc_capable && is_mba_linear());
>  }
>  
> @@ -2302,7 +2302,7 @@ static bool supports_mba_mbps(void)
>   * Enable or disable the MBA software controller
>   * which helps user specify bandwidth in MBps.
>   */
> -static int set_mba_sc(bool mba_sc)
> +static int set_mba_sc(bool mba_sc, enum resctrl_event_id mba_mbps_event)
>  {
>  	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
>  	u32 num_closid = resctrl_arch_get_num_closid(r);
> @@ -2313,6 +2313,7 @@ static int set_mba_sc(bool mba_sc)
>  		return -EINVAL;
>  
>  	r->membw.mba_sc = mba_sc;
> +	r->membw.mba_mbps_event = mba_mbps_event;
>  
>  	list_for_each_entry(d, &r->domains, list) {
>  		for (i = 0; i < num_closid; i++)
> @@ -2445,13 +2446,14 @@ static void rdt_disable_ctx(void)
>  {
>  	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
>  	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
> -	set_mba_sc(false);
> +	set_mba_sc(false, QOS_L3_MBM_LOCAL_EVENT_ID);
>  
>  	resctrl_debug = false;
>  }
>  
>  static int rdt_enable_ctx(struct rdt_fs_context *ctx)
>  {
> +	enum resctrl_event_id mba_mbps_event;
>  	int ret = 0;
>  
>  	if (ctx->enable_cdpl2) {
> @@ -2466,8 +2468,12 @@ static int rdt_enable_ctx(struct rdt_fs_context *ctx)
>  			goto out_cdpl2;
>  	}
>  
> -	if (ctx->enable_mba_mbps) {
> -		ret = set_mba_sc(true);
> +	if (ctx->enable_mba_mbps_local || ctx->enable_mba_mbps_total) {
> +		if (ctx->enable_mba_mbps_total)
> +			mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID;
> +		else
> +			mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID;
> +		ret = set_mba_sc(true, mba_mbps_event);
>  		if (ret)
>  			goto out_cdpl3;
>  	}
> @@ -2683,15 +2689,17 @@ enum rdt_param {
>  	Opt_cdp,
>  	Opt_cdpl2,
>  	Opt_mba_mbps,
> +	Opt_mba_mbps_event,
>  	Opt_debug,
>  	nr__rdt_params
>  };
>  
>  static const struct fs_parameter_spec rdt_fs_parameters[] = {
> -	fsparam_flag("cdp",		Opt_cdp),
> -	fsparam_flag("cdpl2",		Opt_cdpl2),
> -	fsparam_flag("mba_MBps",	Opt_mba_mbps),
> -	fsparam_flag("debug",		Opt_debug),
> +	fsparam_flag("cdp",			Opt_cdp),
> +	fsparam_flag("cdpl2",			Opt_cdpl2),
> +	fsparam_flag("mba_MBps",		Opt_mba_mbps),
> +	fsparam_string("mba_MBps_event",	Opt_mba_mbps_event),
> +	fsparam_flag("debug",			Opt_debug),
>  	{}
>  };
>  
> @@ -2715,7 +2723,27 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
>  	case Opt_mba_mbps:
>  		if (!supports_mba_mbps())
>  			return -EINVAL;
> -		ctx->enable_mba_mbps = true;
> +		if (is_mbm_local_enabled())
> +			ctx->enable_mba_mbps_local = true;
> +		else if (is_mbm_total_enabled())
> +			ctx->enable_mba_mbps_total = true;
> +		else
> +			return -EINVAL;
> +		return 0;
> +	case Opt_mba_mbps_event:
> +		if (!supports_mba_mbps())
> +			return -EINVAL;
> +		if (!strcmp("local", param->string)) {
> +			if (!is_mbm_local_enabled())
> +				return -EINVAL;
> +			ctx->enable_mba_mbps_local = true;
> +		} else if (!strcmp("total", param->string)) {
> +			if (!is_mbm_total_enabled())
> +				return -EINVAL;
> +			ctx->enable_mba_mbps_total = true;
> +		} else {
> +			return -EINVAL;
> +		}
>  		return 0;
>  	case Opt_debug:
>  		ctx->enable_debug = true;
  
Luck, Tony Dec. 4, 2023, 6:16 p.m. UTC | #2
On Mon, Dec 04, 2023 at 10:24:58AM -0600, Moger, Babu wrote:
> Hi Tony,
>
> You are intending to achieve two things at once here.
> 1. Adding new mount option
> 2. Changing behaviour for the current option.
> I think you need to split this patch into two. Few comments below.

Hi Babu,

Thanks for looking at this patch.

You are right. I will split the patch into two as you suggest.

> On 12/1/23 15:47, Tony Luck wrote:
> > The MBA Software Controller(mba_sc) is a feedback loop that uses
> > measurements of local memory bandwidth to adjust MBA throttling levels to
> > keep workloads in a resctrl group within a target bandwidth set in the
> > schemata file.
> >
> > But on Intel systems the memory bandwidth monitoring events are
> > independently enumerated. It is possible for a system to support
> > total memory bandwidth monitoring, but not support local bandwidth
> > monitoring. On such a system a user could not enable mba_sc mode.
> > Users will see this highly unhelpful error message from mount:
> >
> >  # mount -t resctrl -o mba_MBps resctrl /sys/fs/resctrl
> >  mount: /sys/fs/resctrl: wrong fs type, bad option, bad superblock on
> >  resctrl, missing codepage or helper program, or other error.
> >  dmesg(1) may have more information after failed mount system call.
> >
> > dmesg(1) does not provide any additional information.
> >
> > Add a new mount option "mba_MBps_event=[local|total]" that allows
> > a user to specify which monitoring event to use. Also modify the
> > existing "mba_MBps" option to switch to total bandwidth monitoring
> > if local monitoring is not available.
>
> I am not sure why you need both these options. I feel you just need one of
> these options.

I should have included "changes since v4" in with this message, and
pasted in some parts of this earlier messge from the discussion about
v4:

https://lore.kernel.org/all/ZWpF5m4mIeZdK8kv@agluck-desk3/

Having the option take "local" would give a way for a user to
avoid the failover to using "total" if they really didn't want
that to happen.

Not in that message, because I didn't think of it until later, it
opens the door for different events in the future.

But I'm also open to other suggestions on naming and function of
mount options here.

Thanks

-Tony
  
Moger, Babu Dec. 4, 2023, 7:04 p.m. UTC | #3
Hi Tony,

On 12/4/23 12:16, Tony Luck wrote:
> On Mon, Dec 04, 2023 at 10:24:58AM -0600, Moger, Babu wrote:
>> Hi Tony,
>>
>> You are intending to achieve two things at once here.
>> 1. Adding new mount option
>> 2. Changing behaviour for the current option.
>> I think you need to split this patch into two. Few comments below.
> 
> Hi Babu,
> 
> Thanks for looking at this patch.
> 
> You are right. I will split the patch into two as you suggest.
> 
>> On 12/1/23 15:47, Tony Luck wrote:
>>> The MBA Software Controller(mba_sc) is a feedback loop that uses
>>> measurements of local memory bandwidth to adjust MBA throttling levels to
>>> keep workloads in a resctrl group within a target bandwidth set in the
>>> schemata file.
>>>
>>> But on Intel systems the memory bandwidth monitoring events are
>>> independently enumerated. It is possible for a system to support
>>> total memory bandwidth monitoring, but not support local bandwidth
>>> monitoring. On such a system a user could not enable mba_sc mode.
>>> Users will see this highly unhelpful error message from mount:
>>>
>>>  # mount -t resctrl -o mba_MBps resctrl /sys/fs/resctrl
>>>  mount: /sys/fs/resctrl: wrong fs type, bad option, bad superblock on
>>>  resctrl, missing codepage or helper program, or other error.
>>>  dmesg(1) may have more information after failed mount system call.
>>>
>>> dmesg(1) does not provide any additional information.
>>>
>>> Add a new mount option "mba_MBps_event=[local|total]" that allows
>>> a user to specify which monitoring event to use. Also modify the
>>> existing "mba_MBps" option to switch to total bandwidth monitoring
>>> if local monitoring is not available.
>>
>> I am not sure why you need both these options. I feel you just need one of
>> these options.
> 
> I should have included "changes since v4" in with this message, and
> pasted in some parts of this earlier messge from the discussion about
> v4:
> 
> https://lore.kernel.org/all/ZWpF5m4mIeZdK8kv@agluck-desk3/
> 
> Having the option take "local" would give a way for a user to
> avoid the failover to using "total" if they really didn't want
> that to happen.

Yes. I saw the thread. Even then I feel having two similar options can
cause confusion. I feel it is enough just to solve the original problem.
Giving more options to a corner cases is a overkill in my opinion.

Thanks
Babu


> 
> Not in that message, because I didn't think of it until later, it
> opens the door for different events in the future.
> 
> But I'm also open to other suggestions on naming and function of
> mount options here.
> 
> Thanks
> 
> -Tony
  
Luck, Tony Dec. 4, 2023, 7:45 p.m. UTC | #4
> Yes. I saw the thread. Even then I feel having two similar options can
> cause confusion. I feel it is enough just to solve the original problem.
> Giving more options to a corner cases is a overkill in my opinion.

The "original" problem was systems without "local" bandwidth event. I
wanted to give a way for users of mba_MBps to still have some way to
use it (assuming that "total" bandwidth event was present).

Reinette suggested that some people might want to use "total", even
on systems that support "local". I firmly agree with that.  It is easy to
construct scenarios where most bandwidth is to a remote node. using
"local" event will do nothing to throttle in these case. I'm not at all sure
why "local" event was picked. There's nothing in the LKML threads to
give clues.

I proposed a mount option "total" as a modifier to be used in conjunction
with "mba_MBps". Reinette said it was too generic. Her suggestion was
to add "mba_MBps_total" to be used instead of "mba_MBps".

Is that where I should have gone, instead of "mba_MBps={local|total}"?

-Tony
  
Reinette Chatre Dec. 4, 2023, 8:03 p.m. UTC | #5
Hi Tony,

On 12/4/2023 11:45 AM, Luck, Tony wrote:
>> Yes. I saw the thread. Even then I feel having two similar options can
>> cause confusion. I feel it is enough just to solve the original problem.
>> Giving more options to a corner cases is a overkill in my opinion.
> 
> The "original" problem was systems without "local" bandwidth event. I
> wanted to give a way for users of mba_MBps to still have some way to
> use it (assuming that "total" bandwidth event was present).
> 
> Reinette suggested that some people might want to use "total", even
> on systems that support "local". I firmly agree with that.  It is easy to
> construct scenarios where most bandwidth is to a remote node. using
> "local" event will do nothing to throttle in these case. I'm not at all sure
> why "local" event was picked. There's nothing in the LKML threads to
> give clues.
> 
> I proposed a mount option "total" as a modifier to be used in conjunction
> with "mba_MBps". Reinette said it was too generic. Her suggestion was
> to add "mba_MBps_total" to be used instead of "mba_MBps".

No, it cannot be used instead of "mba_MBps". My intention was for it to be
in addition to existing "mba_MBps" since taking "mba_MBps" away would be
considered breaking user space ABI.

Even so ...

> 
> Is that where I should have gone, instead of "mba_MBps={local|total}"?

While I did propose "mba_MBps_total" (in addition to "mba_MBps") I do
recognize your comment that a new key of mba_MBps_event does give more
flexibility if different events become available in future. Emphasis is
on "different" since I do not believe the parsing can support multiple
events and thus mba_MBps_event cannot be treated as a general bucket for all
mba_sc options, just different events guiding the feedback loop.

"mba_MBps" must be kept and having it continue to use local bw as default,
but total bw on systems that do not support local bw seems appropriate,
(which is what this patch does).

Reinette
  
Luck, Tony Dec. 4, 2023, 9:08 p.m. UTC | #6
On Mon, Dec 04, 2023 at 12:03:23PM -0800, Reinette Chatre wrote:
> Hi Tony,
> 
> On 12/4/2023 11:45 AM, Luck, Tony wrote:
> >> Yes. I saw the thread. Even then I feel having two similar options can
> >> cause confusion. I feel it is enough just to solve the original problem.
> >> Giving more options to a corner cases is a overkill in my opinion.
> > 
> > The "original" problem was systems without "local" bandwidth event. I
> > wanted to give a way for users of mba_MBps to still have some way to
> > use it (assuming that "total" bandwidth event was present).
> > 
> > Reinette suggested that some people might want to use "total", even
> > on systems that support "local". I firmly agree with that.  It is easy to
> > construct scenarios where most bandwidth is to a remote node. using
> > "local" event will do nothing to throttle in these case. I'm not at all sure
> > why "local" event was picked. There's nothing in the LKML threads to
> > give clues.
> > 
> > I proposed a mount option "total" as a modifier to be used in conjunction
> > with "mba_MBps". Reinette said it was too generic. Her suggestion was
> > to add "mba_MBps_total" to be used instead of "mba_MBps".
> 
> No, it cannot be used instead of "mba_MBps". My intention was for it to be
> in addition to existing "mba_MBps" since taking "mba_MBps" away would be
> considered breaking user space ABI.

I was unclear. The mba_MBps option must remain as legacy ABI. My
"instead of" was intended to convey that a user wanting total bandwidth
would use:

# mount -t resctrl -o mba_MBps_total resctrl /sys/fs/resctrl

rather than the new option being a modifier and requiring both
the legacy option and the modifier like this:

# mount -t resctrl -o mba_MBps,mba_MBps_total resctrl /sys/fs/resctrl

which seems overly verbose.

> 
> Even so ...
> 
> > 
> > Is that where I should have gone, instead of "mba_MBps={local|total}"?
> 
> While I did propose "mba_MBps_total" (in addition to "mba_MBps") I do
> recognize your comment that a new key of mba_MBps_event does give more
> flexibility if different events become available in future. Emphasis is
> on "different" since I do not believe the parsing can support multiple
> events and thus mba_MBps_event cannot be treated as a general bucket for all
> mba_sc options, just different events guiding the feedback loop.
> 
> "mba_MBps" must be kept and having it continue to use local bw as default,
> but total bw on systems that do not support local bw seems appropriate,
> (which is what this patch does).

So we defintely have:

"mba_MBps" - defaults to local, on systems without local may switch to
total if that is available. Should this switch get a pr_info()? Or just happen
silently (as I've done in patches so far).

and need to come to agreement on which of these to implement:

A) "mba_MBps_total" - forces use of total. Fails the mount if total is not
   available.

B) "mba_MBps={local|total)" forces use of chosen event, fails if event
is unavailable.

C) Something else.

D) Don't provide any way to force use of total event.

> 
> Reinette

-Tony
  
Reinette Chatre Dec. 4, 2023, 10:15 p.m. UTC | #7
Hi Tony,

On 12/4/2023 1:08 PM, Tony Luck wrote:
> On Mon, Dec 04, 2023 at 12:03:23PM -0800, Reinette Chatre wrote:
>> Hi Tony,
>>
>> On 12/4/2023 11:45 AM, Luck, Tony wrote:
>>>> Yes. I saw the thread. Even then I feel having two similar options can
>>>> cause confusion. I feel it is enough just to solve the original problem.
>>>> Giving more options to a corner cases is a overkill in my opinion.
>>>
>>> The "original" problem was systems without "local" bandwidth event. I
>>> wanted to give a way for users of mba_MBps to still have some way to
>>> use it (assuming that "total" bandwidth event was present).
>>>
>>> Reinette suggested that some people might want to use "total", even
>>> on systems that support "local". I firmly agree with that.  It is easy to
>>> construct scenarios where most bandwidth is to a remote node. using
>>> "local" event will do nothing to throttle in these case. I'm not at all sure
>>> why "local" event was picked. There's nothing in the LKML threads to
>>> give clues.
>>>
>>> I proposed a mount option "total" as a modifier to be used in conjunction
>>> with "mba_MBps". Reinette said it was too generic. Her suggestion was
>>> to add "mba_MBps_total" to be used instead of "mba_MBps".
>>
>> No, it cannot be used instead of "mba_MBps". My intention was for it to be
>> in addition to existing "mba_MBps" since taking "mba_MBps" away would be
>> considered breaking user space ABI.
> 
> I was unclear. The mba_MBps option must remain as legacy ABI. My
> "instead of" was intended to convey that a user wanting total bandwidth
> would use:
> 
> # mount -t resctrl -o mba_MBps_total resctrl /sys/fs/resctrl
> 
> rather than the new option being a modifier and requiring both
> the legacy option and the modifier like this:
> 
> # mount -t resctrl -o mba_MBps,mba_MBps_total resctrl /sys/fs/resctrl
> 
> which seems overly verbose.
> 

I misunderstood your comment. Thank you for clarifying.

>>
>> Even so ...
>>
>>>
>>> Is that where I should have gone, instead of "mba_MBps={local|total}"?
>>
>> While I did propose "mba_MBps_total" (in addition to "mba_MBps") I do
>> recognize your comment that a new key of mba_MBps_event does give more
>> flexibility if different events become available in future. Emphasis is
>> on "different" since I do not believe the parsing can support multiple
>> events and thus mba_MBps_event cannot be treated as a general bucket for all
>> mba_sc options, just different events guiding the feedback loop.
>>
>> "mba_MBps" must be kept and having it continue to use local bw as default,
>> but total bw on systems that do not support local bw seems appropriate,
>> (which is what this patch does).
> 
> So we defintely have:
> 
> "mba_MBps" - defaults to local, on systems without local may switch to
> total if that is available. Should this switch get a pr_info()? Or just happen
> silently (as I've done in patches so far).

I do not think a message is required ... I cannot see how it provides information
that user space does not already know. It surely does no harm and I would not
object if it is added. Even so, I do not think a kernel message should be what
is relied on to share with user space what guides the feedback loop. To that end
I think the mount options combined with the system capabilities (learned via
resctrl) provide that information.

Please do note that if the solution of this version is maintained then
rdtgroup_show_options() needs to be updated. With that done, user space should
be able to determine at any time during runtime (no matter if kernel log has been
cleared) which event is being used.

> and need to come to agreement on which of these to implement:
> 
> A) "mba_MBps_total" - forces use of total. Fails the mount if total is not
>    available.

The "cons" of this is that (a) user is not able to prevent the failover,
and (b) harder to support future events (which are unknown and difficult
to prepare for anyway).

> 
> B) "mba_MBps={local|total)" forces use of chosen event, fails if event
> is unavailable.

I assume you mean "mba_MBps_event={local|total}". This is my preference but
I would like to learn more about Babu's "overkill" argument. I do believe that
(B) is well motivated in [1] and has no impact on AMD.
My uncertainty here (considering one motivation is to future proof it against
adding more events) is the generic "local" and "total" names. Even so, all
I could come up with are "local_bw" and "total_bw", which I do not think are
improvements.

> 
> C) Something else.
> 
> D) Don't provide any way to force use of total event.

(D) is what would be needed at minimum. 

In support of Babu's comment I can see how having mba_MBps as well as
mba_MBps_event can cause confusion. To help with this the documentation could
be expanded with more detailed content and also examples.

Reinette

[1] https://lore.kernel.org/lkml/SJ1PR11MB60839F92B1C15A659CD32478FC86A@SJ1PR11MB6083.namprd11.prod.outlook.com/
  
Reinette Chatre Dec. 4, 2023, 10:51 p.m. UTC | #8
On 12/4/2023 2:15 PM, Reinette Chatre wrote:
> On 12/4/2023 1:08 PM, Tony Luck wrote:

>>
>> B) "mba_MBps={local|total)" forces use of chosen event, fails if event
>> is unavailable.
> 
> I assume you mean "mba_MBps_event={local|total}". This is my preference but
> I would like to learn more about Babu's "overkill" argument. I do believe that
> (B) is well motivated in [1] and has no impact on AMD.
> My uncertainty here (considering one motivation is to future proof it against
> adding more events) is the generic "local" and "total" names. Even so, all
> I could come up with are "local_bw" and "total_bw", which I do not think are
> improvements.

Thinking about this more the existing names in resctrl may help here
... "mbm_total_bytes" and "mbm_local_bytes"? Any new event would need to
integrate with these names.

Reinette
  

Patch

diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst
index a6279df64a9d..f06cb189911a 100644
--- a/Documentation/arch/x86/resctrl.rst
+++ b/Documentation/arch/x86/resctrl.rst
@@ -45,7 +45,12 @@  mount options are:
 	Enable code/data prioritization in L2 cache allocations.
 "mba_MBps":
 	Enable the MBA Software Controller(mba_sc) to specify MBA
-	bandwidth in MBps
+	bandwidth in MBps. Defaults to using MBM local bandwidth,
+	but will use total bandwidth on systems that do not support
+	local bandwidth monitoring.
+"mba_MBps_event=[local|total]":
+	Enable the MBA Software Controller(mba_sc) with a specific
+	MBM event as input to the feedback loop.
 "debug":
 	Make debug files accessible. Available debug files are annotated with
 	"Available only with debug option".
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 66942d7fba7f..1feb3b2e64fa 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -129,6 +129,7 @@  enum membw_throttle_mode {
  * @throttle_mode:	Bandwidth throttling mode when threads request
  *			different memory bandwidths
  * @mba_sc:		True if MBA software controller(mba_sc) is enabled
+ * @mba_mbps_event:	Event (local or total) for mba_sc
  * @mb_map:		Mapping of memory B/W percentage to memory B/W delay
  */
 struct resctrl_membw {
@@ -138,6 +139,7 @@  struct resctrl_membw {
 	bool				arch_needs_linear;
 	enum membw_throttle_mode	throttle_mode;
 	bool				mba_sc;
+	enum resctrl_event_id		mba_mbps_event;
 	u32				*mb_map;
 };
 
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index a4f1aa15f0a2..8b9b8f664324 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -58,7 +58,8 @@  struct rdt_fs_context {
 	struct kernfs_fs_context	kfc;
 	bool				enable_cdpl2;
 	bool				enable_cdpl3;
-	bool				enable_mba_mbps;
+	bool				enable_mba_mbps_local;
+	bool				enable_mba_mbps_total;
 	bool				enable_debug;
 };
 
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index f136ac046851..d9e590f1cbc3 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -431,9 +431,10 @@  static int __mon_event_count(u32 rmid, struct rmid_read *rr)
  */
 static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
 {
-	struct mbm_state *m = &rr->d->mbm_local[rmid];
 	u64 cur_bw, bytes, cur_bytes;
+	struct mbm_state *m;
 
+	m = get_mbm_state(rr->d, rmid, rr->evtid);
 	cur_bytes = rr->val;
 	bytes = cur_bytes - m->prev_bw_bytes;
 	m->prev_bw_bytes = cur_bytes;
@@ -521,19 +522,21 @@  static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
 	u32 closid, rmid, cur_msr_val, new_msr_val;
 	struct mbm_state *pmbm_data, *cmbm_data;
 	u32 cur_bw, delta_bw, user_bw;
+	enum resctrl_event_id evt_id;
 	struct rdt_resource *r_mba;
 	struct rdt_domain *dom_mba;
 	struct list_head *head;
 	struct rdtgroup *entry;
 
-	if (!is_mbm_local_enabled())
+	if (!is_mbm_enabled())
 		return;
 
 	r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
+	evt_id = r_mba->membw.mba_mbps_event;
 
 	closid = rgrp->closid;
 	rmid = rgrp->mon.rmid;
-	pmbm_data = &dom_mbm->mbm_local[rmid];
+	pmbm_data = get_mbm_state(dom_mbm, rmid, evt_id);
 
 	dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
 	if (!dom_mba) {
@@ -553,7 +556,7 @@  static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
 	 */
 	head = &rgrp->mon.crdtgrp_list;
 	list_for_each_entry(entry, head, mon.crdtgrp_list) {
-		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
+		cmbm_data = get_mbm_state(dom_mbm, entry->mon.rmid, evt_id);
 		cur_bw += cmbm_data->prev_bw;
 		delta_bw += cmbm_data->delta_bw;
 	}
@@ -616,18 +619,14 @@  static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
 		rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
 		rr.val = 0;
 		__mon_event_count(rmid, &rr);
+		if (is_mba_sc(NULL) && rr.evtid == r->membw.mba_mbps_event)
+			mbm_bw_count(rmid, &rr);
 	}
 	if (is_mbm_local_enabled()) {
 		rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
 		rr.val = 0;
 		__mon_event_count(rmid, &rr);
-
-		/*
-		 * Call the MBA software controller only for the
-		 * control groups and when user has enabled
-		 * the software controller explicitly.
-		 */
-		if (is_mba_sc(NULL))
+		if (is_mba_sc(NULL) && rr.evtid == r->membw.mba_mbps_event)
 			mbm_bw_count(rmid, &rr);
 	}
 }
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 69a1de92384a..79141d33d5b4 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -2294,7 +2294,7 @@  static bool supports_mba_mbps(void)
 {
 	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
 
-	return (is_mbm_local_enabled() &&
+	return (is_mbm_enabled() &&
 		r->alloc_capable && is_mba_linear());
 }
 
@@ -2302,7 +2302,7 @@  static bool supports_mba_mbps(void)
  * Enable or disable the MBA software controller
  * which helps user specify bandwidth in MBps.
  */
-static int set_mba_sc(bool mba_sc)
+static int set_mba_sc(bool mba_sc, enum resctrl_event_id mba_mbps_event)
 {
 	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
 	u32 num_closid = resctrl_arch_get_num_closid(r);
@@ -2313,6 +2313,7 @@  static int set_mba_sc(bool mba_sc)
 		return -EINVAL;
 
 	r->membw.mba_sc = mba_sc;
+	r->membw.mba_mbps_event = mba_mbps_event;
 
 	list_for_each_entry(d, &r->domains, list) {
 		for (i = 0; i < num_closid; i++)
@@ -2445,13 +2446,14 @@  static void rdt_disable_ctx(void)
 {
 	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
 	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
-	set_mba_sc(false);
+	set_mba_sc(false, QOS_L3_MBM_LOCAL_EVENT_ID);
 
 	resctrl_debug = false;
 }
 
 static int rdt_enable_ctx(struct rdt_fs_context *ctx)
 {
+	enum resctrl_event_id mba_mbps_event;
 	int ret = 0;
 
 	if (ctx->enable_cdpl2) {
@@ -2466,8 +2468,12 @@  static int rdt_enable_ctx(struct rdt_fs_context *ctx)
 			goto out_cdpl2;
 	}
 
-	if (ctx->enable_mba_mbps) {
-		ret = set_mba_sc(true);
+	if (ctx->enable_mba_mbps_local || ctx->enable_mba_mbps_total) {
+		if (ctx->enable_mba_mbps_total)
+			mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID;
+		else
+			mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID;
+		ret = set_mba_sc(true, mba_mbps_event);
 		if (ret)
 			goto out_cdpl3;
 	}
@@ -2683,15 +2689,17 @@  enum rdt_param {
 	Opt_cdp,
 	Opt_cdpl2,
 	Opt_mba_mbps,
+	Opt_mba_mbps_event,
 	Opt_debug,
 	nr__rdt_params
 };
 
 static const struct fs_parameter_spec rdt_fs_parameters[] = {
-	fsparam_flag("cdp",		Opt_cdp),
-	fsparam_flag("cdpl2",		Opt_cdpl2),
-	fsparam_flag("mba_MBps",	Opt_mba_mbps),
-	fsparam_flag("debug",		Opt_debug),
+	fsparam_flag("cdp",			Opt_cdp),
+	fsparam_flag("cdpl2",			Opt_cdpl2),
+	fsparam_flag("mba_MBps",		Opt_mba_mbps),
+	fsparam_string("mba_MBps_event",	Opt_mba_mbps_event),
+	fsparam_flag("debug",			Opt_debug),
 	{}
 };
 
@@ -2715,7 +2723,27 @@  static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
 	case Opt_mba_mbps:
 		if (!supports_mba_mbps())
 			return -EINVAL;
-		ctx->enable_mba_mbps = true;
+		if (is_mbm_local_enabled())
+			ctx->enable_mba_mbps_local = true;
+		else if (is_mbm_total_enabled())
+			ctx->enable_mba_mbps_total = true;
+		else
+			return -EINVAL;
+		return 0;
+	case Opt_mba_mbps_event:
+		if (!supports_mba_mbps())
+			return -EINVAL;
+		if (!strcmp("local", param->string)) {
+			if (!is_mbm_local_enabled())
+				return -EINVAL;
+			ctx->enable_mba_mbps_local = true;
+		} else if (!strcmp("total", param->string)) {
+			if (!is_mbm_total_enabled())
+				return -EINVAL;
+			ctx->enable_mba_mbps_total = true;
+		} else {
+			return -EINVAL;
+		}
 		return 0;
 	case Opt_debug:
 		ctx->enable_debug = true;