[v8,10/13] x86/resctrl: Add sysfs interface to write mbm_total_bytes_config

Message ID 166759206900.3281208.11975514088019160962.stgit@bmoger-ubuntu
State New
Headers
Series Support for AMD QoS new features |

Commit Message

Moger, Babu Nov. 4, 2022, 8:01 p.m. UTC
  The current event configuration for mbm_total_bytes can be changed by
the user by writing to the file
/sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config.

The event configuration settings are domain specific and will affect all
the CPUs in the domain.

Following are the types of events supported:

====  ===========================================================
Bits   Description
====  ===========================================================
6      Dirty Victims from the QOS domain to all types of memory
5      Reads to slow memory in the non-local NUMA domain
4      Reads to slow memory in the local NUMA domain
3      Non-temporal writes to non-local NUMA domain
2      Non-temporal writes to local NUMA domain
1      Reads to memory in the non-local NUMA domain
0      Reads to memory in the local NUMA domain
====  ===========================================================

For example:
To change the mbm_total_bytes to count only reads on domain 0, the bits
0, 1, 4 and 5 needs to be set, which is 110011b (in hex 0x33). Run the
command.
	$echo  0=0x33 > /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config

To change the mbm_total_bytes to count all the slow memory reads on
domain 1, the bits 4 and 5 needs to be set which is 110000b (in hex 0x30).
Run the command.
	$echo  1=0x30 > /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config

Signed-off-by: Babu Moger <babu.moger@amd.com>
---
 arch/x86/kernel/cpu/resctrl/rdtgroup.c |  130 ++++++++++++++++++++++++++++++++
 1 file changed, 129 insertions(+), 1 deletion(-)
  

Comments

Peter Newman Nov. 7, 2022, 10:21 a.m. UTC | #1
Hi Babu,

On Fri, Nov 04, 2022 at 03:01:09PM -0500, Babu Moger wrote:
> +	/*
> +	 * When an Event Configuration is changed, the bandwidth counters
> +	 * for all RMIDs and Events will be cleared by the hardware. The
> +	 * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
> +	 * every RMID on the next read to any event for every RMID.
> +	 * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
> +	 * cleared while it is tracked by the hardware. Clear the
> +	 * mbm_local and mbm_total counts for all the RMIDs.
> +	 */
> +	memset(d->mbm_local, 0, sizeof(struct mbm_state) * r->num_rmid);
> +	memset(d->mbm_total, 0, sizeof(struct mbm_state) * r->num_rmid);

Looking around, I can't find a reader for mbm_total anymore. It looks
like the last place it was used went away in James's recent change:

https://lore.kernel.org/all/20220902154829.30399-19-james.morse@arm.com

Are we supposed to be clearing arch_mbm_total now?

Thanks!
-Peter
  
Moger, Babu Nov. 7, 2022, 6:50 p.m. UTC | #2
Hi Peter,

On 11/7/22 04:21, Peter Newman wrote:
> Hi Babu,
>
> On Fri, Nov 04, 2022 at 03:01:09PM -0500, Babu Moger wrote:
>> +	/*
>> +	 * When an Event Configuration is changed, the bandwidth counters
>> +	 * for all RMIDs and Events will be cleared by the hardware. The
>> +	 * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
>> +	 * every RMID on the next read to any event for every RMID.
>> +	 * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
>> +	 * cleared while it is tracked by the hardware. Clear the
>> +	 * mbm_local and mbm_total counts for all the RMIDs.
>> +	 */
>> +	memset(d->mbm_local, 0, sizeof(struct mbm_state) * r->num_rmid);
>> +	memset(d->mbm_total, 0, sizeof(struct mbm_state) * r->num_rmid);
> Looking around, I can't find a reader for mbm_total anymore. It looks
> like the last place it was used went away in James's recent change:
>
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fall%2F20220902154829.30399-19-james.morse%40arm.com&amp;data=05%7C01%7Cbabu.moger%40amd.com%7C84a9d0f934894a3031a608dac0a9db33%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638034133003350939%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=i3isjCzxnBp4b2VblC7ZpH3hShUEe7unKiKfngG1kzE%3D&amp;reserved=0
>
> Are we supposed to be clearing arch_mbm_total now?

Yes. You are right. We should be using resctrl_arch_reset_rmid to reset
the rmids here.

This patch should work. Will fix it in next revision after the other comments.

diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
b/arch/x86/kernel/cpu/resctrl/rdtgroup.c             │
index 6b222f8e58ae..28d9d99a639e
100644                                                                  │
---
a/arch/x86/kernel/cpu/resctrl/rdtgroup.c                                                            
│
+++
b/arch/x86/kernel/cpu/resctrl/rdtgroup.c                                                            
│
@@ -1517,7 +1517,7 @@ static int mbm_config_write(struct rdt_resource *r,
struct rdt_domain *d,          │
                            u32 evtid, u32
val)                                                          │
 {                                                                                                      
│
        struct mon_config_info mon_info =
{0};                                                           │
-       int ret =
0;                                                                                    
│
+       int ret = 0,
i;                                                                                 
│
                                                                                                        
│
       
rdt_last_cmd_clear();                                                                           
│
                                                                                                        
│
@@ -1557,8 +1557,10 @@ static int mbm_config_write(struct rdt_resource *r,
struct rdt_domain *d,         │
         * cleared while it is tracked by the hardware. Clear
the                                        │
         * mbm_local and mbm_total counts for all the
RMIDs.                                             │
        
*/                                                                                             
│
-       memset(d->mbm_local, 0, sizeof(struct mbm_state) *
r->num_rmid);                                 │
-       memset(d->mbm_total, 0, sizeof(struct mbm_state) *
r->num_rmid);                                 │
+       for (i = 0; i < r->num_rmid; i++)
{                                                              │
+               resctrl_arch_reset_rmid(r, d, i,
QOS_L3_MBM_TOTAL_EVENT_ID);                             │
+               resctrl_arch_reset_rmid(r, d, i,
QOS_L3_MBM_LOCAL_EVENT_ID);                             │
+      
}                                                                                               
│
                                                                                                        
│
 write_exit:                                                                                            
│
        return ret;                                   


Thanks

Babu
  
Moger, Babu Nov. 7, 2022, 7 p.m. UTC | #3
On 11/7/22 04:21, Peter Newman wrote:
> Hi Babu,
>
> On Fri, Nov 04, 2022 at 03:01:09PM -0500, Babu Moger wrote:
>> +	/*
>> +	 * When an Event Configuration is changed, the bandwidth counters
>> +	 * for all RMIDs and Events will be cleared by the hardware. The
>> +	 * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
>> +	 * every RMID on the next read to any event for every RMID.
>> +	 * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
>> +	 * cleared while it is tracked by the hardware. Clear the
>> +	 * mbm_local and mbm_total counts for all the RMIDs.
>> +	 */
>> +	memset(d->mbm_local, 0, sizeof(struct mbm_state) * r->num_rmid);
>> +	memset(d->mbm_total, 0, sizeof(struct mbm_state) * r->num_rmid);
> Looking around, I can't find a reader for mbm_total anymore. It looks
> like the last place it was used went away in James's recent change:
>
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fall%2F20220902154829.30399-19-james.morse%40arm.com&amp;data=05%7C01%7Cbabu.moger%40amd.com%7C84a9d0f934894a3031a608dac0a9db33%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638034133003350939%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=i3isjCzxnBp4b2VblC7ZpH3hShUEe7unKiKfngG1kzE%3D&amp;reserved=0
>
> Are we supposed to be clearing arch_mbm_total now?
>
Patch got garbled in previous response.

Here is it now.

diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 6b222f8e58ae..28d9d99a639e 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -1517,7 +1517,7 @@ static int mbm_config_write(struct rdt_resource *r,
struct rdt_domain *d,
                            u32 evtid, u32 val)
 {
        struct mon_config_info mon_info = {0};
-       int ret = 0;
+       int ret = 0, i;
 
        rdt_last_cmd_clear();
 
@@ -1557,8 +1557,10 @@ static int mbm_config_write(struct rdt_resource *r,
struct rdt_domain *d,
         * cleared while it is tracked by the hardware. Clear the
         * mbm_local and mbm_total counts for all the RMIDs.
         */
-       memset(d->mbm_local, 0, sizeof(struct mbm_state) * r->num_rmid);
-       memset(d->mbm_total, 0, sizeof(struct mbm_state) * r->num_rmid);
+       for (i = 0; i < r->num_rmid; i++) {
+               resctrl_arch_reset_rmid(r, d, i, QOS_L3_MBM_TOTAL_EVENT_ID);
+               resctrl_arch_reset_rmid(r, d, i, QOS_L3_MBM_LOCAL_EVENT_ID);
+       }
 
 write_exit:
        return ret;

Tthanks

Babu
  
Reinette Chatre Nov. 22, 2022, 11:43 p.m. UTC | #4
Hi Babu,

On 11/7/2022 11:00 AM, Moger, Babu wrote:
> 
> On 11/7/22 04:21, Peter Newman wrote:
>> Hi Babu,
>>
>> On Fri, Nov 04, 2022 at 03:01:09PM -0500, Babu Moger wrote:
>>> +	/*
>>> +	 * When an Event Configuration is changed, the bandwidth counters
>>> +	 * for all RMIDs and Events will be cleared by the hardware. The
>>> +	 * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
>>> +	 * every RMID on the next read to any event for every RMID.
>>> +	 * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
>>> +	 * cleared while it is tracked by the hardware. Clear the
>>> +	 * mbm_local and mbm_total counts for all the RMIDs.
>>> +	 */
>>> +	memset(d->mbm_local, 0, sizeof(struct mbm_state) * r->num_rmid);
>>> +	memset(d->mbm_total, 0, sizeof(struct mbm_state) * r->num_rmid);
>> Looking around, I can't find a reader for mbm_total anymore. It looks
>> like the last place it was used went away in James's recent change:
>>
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fall%2F20220902154829.30399-19-james.morse%40arm.com&amp;data=05%7C01%7Cbabu.moger%40amd.com%7C84a9d0f934894a3031a608dac0a9db33%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638034133003350939%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=i3isjCzxnBp4b2VblC7ZpH3hShUEe7unKiKfngG1kzE%3D&amp;reserved=0
>>
>> Are we supposed to be clearing arch_mbm_total now?
>>
> Patch got garbled in previous response.
> 
> Here is it now.
> 
> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> index 6b222f8e58ae..28d9d99a639e 100644
> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> @@ -1517,7 +1517,7 @@ static int mbm_config_write(struct rdt_resource *r,
> struct rdt_domain *d,
>                             u32 evtid, u32 val)
>  {
>         struct mon_config_info mon_info = {0};
> -       int ret = 0;
> +       int ret = 0, i;
>  
>         rdt_last_cmd_clear();
>  
> @@ -1557,8 +1557,10 @@ static int mbm_config_write(struct rdt_resource *r,
> struct rdt_domain *d,
>          * cleared while it is tracked by the hardware. Clear the
>          * mbm_local and mbm_total counts for all the RMIDs.
>          */
> -       memset(d->mbm_local, 0, sizeof(struct mbm_state) * r->num_rmid);
> -       memset(d->mbm_total, 0, sizeof(struct mbm_state) * r->num_rmid);
> +       for (i = 0; i < r->num_rmid; i++) {
> +               resctrl_arch_reset_rmid(r, d, i, QOS_L3_MBM_TOTAL_EVENT_ID);
> +               resctrl_arch_reset_rmid(r, d, i, QOS_L3_MBM_LOCAL_EVENT_ID);
> +       }
>  
>  write_exit:
>         return ret;

Resetting each member of an array individually seems unnecessary when the
array could just be reset as a unit. How about instead a new
resctrl_arch_reset_rmid_all() that can do so?

Reinette
  
Reinette Chatre Nov. 23, 2022, 12:22 a.m. UTC | #5
Hi Babu,

On 11/4/2022 1:01 PM, Babu Moger wrote:
> The current event configuration for mbm_total_bytes can be changed by
> the user by writing to the file
> /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config.
> 
> The event configuration settings are domain specific and will affect all
> the CPUs in the domain.
> 
> Following are the types of events supported:
> 
> ====  ===========================================================
> Bits   Description
> ====  ===========================================================
> 6      Dirty Victims from the QOS domain to all types of memory
> 5      Reads to slow memory in the non-local NUMA domain
> 4      Reads to slow memory in the local NUMA domain
> 3      Non-temporal writes to non-local NUMA domain
> 2      Non-temporal writes to local NUMA domain
> 1      Reads to memory in the non-local NUMA domain
> 0      Reads to memory in the local NUMA domain
> ====  ===========================================================
> 
> For example:
> To change the mbm_total_bytes to count only reads on domain 0, the bits
> 0, 1, 4 and 5 needs to be set, which is 110011b (in hex 0x33). Run the
> command.
> 	$echo  0=0x33 > /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config
> 
> To change the mbm_total_bytes to count all the slow memory reads on
> domain 1, the bits 4 and 5 needs to be set which is 110000b (in hex 0x30).
> Run the command.
> 	$echo  1=0x30 > /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config
> 
> Signed-off-by: Babu Moger <babu.moger@amd.com>
> ---
>  arch/x86/kernel/cpu/resctrl/rdtgroup.c |  130 ++++++++++++++++++++++++++++++++
>  1 file changed, 129 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> index 18f9588a41cf..0cdccb69386e 100644
> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> @@ -1505,6 +1505,133 @@ static int mbm_local_bytes_config_show(struct kernfs_open_file *of,
>  	return 0;
>  }
>  
> +static void mon_event_config_write(void *info)
> +{
> +	struct mon_config_info *mon_info = info;
> +	u32 index;
> +
> +	index = mon_event_config_index_get(mon_info->evtid);
> +	if (index >= MAX_CONFIG_EVENTS) {
> +		pr_warn_once("Invalid event id %d\n", mon_info->evtid);
> +		return;
> +	}
> +	wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0);
> +}
> +
> +static int mbm_config_write(struct rdt_resource *r, struct rdt_domain *d,
> +			    u32 evtid, u32 val)
> +{
> +	struct mon_config_info mon_info = {0};
> +	int ret = 0;
> +
> +	rdt_last_cmd_clear();
> +

Why is this extra clear() needed?

> +	/* mon_config cannot be more than the supported set of events */
> +	if (val > MAX_EVT_CONFIG_BITS) {
> +		rdt_last_cmd_puts("Invalid event configuration\n");
> +		return -EINVAL;
> +	}
> +
> +	/*
> +	 * Read the current config value first. If both are same then
> +	 * we don't need to write it again.

Please no "we". Maybe just "If both are the same then no need to write it again."

> +	 */
> +	mon_info.evtid = evtid;
> +	mondata_config_read(d, &mon_info);

Here I see motivation for doing validity check in mondata_config_read() as
mentioned in feedback for patch #8. If hardware decides to use the other bits
in that MSR then the check below would have trouble.

> +	if (mon_info.mon_config == val)
> +		goto write_exit;
> +

Could you please follow the custom in this area? Please see goto usage in the rest
of the file that you are changing. The label should reflect the action being
jumped to. In that sense, "write_exit" is not clear. A simple "goto out"
would be clear and matches usage in rest of file.

> +	mon_info.mon_config = val;
> +
> +	/*
> +	 * Update MSR_IA32_EVT_CFG_BASE MSRs on all the CPUs in the
> +	 * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE
> +	 * are scoped at the domain level. Writing any of these MSRs
> +	 * on one CPU is supposed to be observed by all CPUs in the
> +	 * domain. However, the hardware team recommends to update
> +	 * these MSRs on all the CPUs in the domain.
> +	 */
> +	on_each_cpu_mask(&d->cpu_mask, mon_event_config_write, &mon_info, 1);
> +
> +	/*
> +	 * When an Event Configuration is changed, the bandwidth counters
> +	 * for all RMIDs and Events will be cleared by the hardware. The
> +	 * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
> +	 * every RMID on the next read to any event for every RMID.
> +	 * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
> +	 * cleared while it is tracked by the hardware. Clear the
> +	 * mbm_local and mbm_total counts for all the RMIDs.
> +	 */
> +	memset(d->mbm_local, 0, sizeof(struct mbm_state) * r->num_rmid);
> +	memset(d->mbm_total, 0, sizeof(struct mbm_state) * r->num_rmid);
> +
> +write_exit:
> +	return ret;
> +}
> +
> +static int mon_config_parse(struct rdt_resource *r, char *tok, u32 evtid)
> +{
> +	char *dom_str = NULL, *id_str;
> +	unsigned long dom_id, val;
> +	struct rdt_domain *d;
> +	int ret = 0;
> +
> +next:
> +	if (!tok || tok[0] == '\0')
> +		return 0;
> +
> +	/* Start processing the strings for each domain */
> +	dom_str = strim(strsep(&tok, ";"));
> +	id_str = strsep(&dom_str, "=");
> +
> +	if (!dom_str || kstrtoul(id_str, 10, &dom_id)) {
> +		rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n");
> +		return -EINVAL;
> +	}
> +
> +	if (!dom_str || kstrtoul(dom_str, 16, &val)) {
> +		rdt_last_cmd_puts("Missing '=' or non-numeric event configuration value\n");
> +		return -EINVAL;
> +	}

There is some duplication above ... both if () statememts
check for "!dom_str" - is one intended to be "!id_str"? 
Could both checks really mean that a "=" may be missing?

> +
> +	list_for_each_entry(d, &r->domains, list) {
> +		if (d->id == dom_id) {
> +			ret = mbm_config_write(r, d, evtid, val);
> +			if (ret)
> +				return -EINVAL;
> +			goto next;
> +		}
> +	}
> +
> +	return -EINVAL;
> +}
> +
> +static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
> +					    char *buf, size_t nbytes,
> +					    loff_t off)
> +{
> +	struct rdt_resource *r = of->kn->parent->priv;
> +	int ret;
> +
> +	/* Valid input requires a trailing newline */
> +	if (nbytes == 0 || buf[nbytes - 1] != '\n')
> +		return -EINVAL;
> +
> +	cpus_read_lock();
> +	mutex_lock(&rdtgroup_mutex);
> +
> +	rdt_last_cmd_clear();
> +
> +	buf[nbytes - 1] = '\0';
> +
> +	ret = mon_config_parse(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID);
> +

The naming here does not reflect what is done ... much more than
parsing is done here.

How about renaming mon_config_parse() to mon_config_write(), and
renaming mon_config_write() to mon_config_write_domain() ? 


Reinette
  
Moger, Babu Nov. 23, 2022, 9:44 p.m. UTC | #6
[AMD Official Use Only - General]

Hi Reinette,

> -----Original Message-----
> From: Reinette Chatre <reinette.chatre@intel.com>
> Sent: Tuesday, November 22, 2022 5:43 PM
> To: Moger, Babu <Babu.Moger@amd.com>; Peter Newman
> <peternewman@google.com>
> Cc: akpm@linux-foundation.org; bagasdotme@gmail.com; bp@alien8.de;
> chang.seok.bae@intel.com; corbet@lwn.net;
> damien.lemoal@opensource.wdc.com; daniel.sneddon@linux.intel.com;
> dave.hansen@linux.intel.com; eranian@google.com; fenghua.yu@intel.com;
> hpa@zytor.com; james.morse@arm.com; jmattson@google.com;
> jpoimboe@kernel.org; linux-doc@vger.kernel.org; linux-
> kernel@vger.kernel.org; mingo@redhat.com; paulmck@kernel.org;
> pawan.kumar.gupta@linux.intel.com; pbonzini@redhat.com;
> peterz@infradead.org; quic_neeraju@quicinc.com; rdunlap@infradead.org;
> Das1, Sandipan <Sandipan.Das@amd.com>; songmuchun@bytedance.com;
> tglx@linutronix.de; tony.luck@intel.com; x86@kernel.org
> Subject: Re: [PATCH v8 10/13] x86/resctrl: Add sysfs interface to write
> mbm_total_bytes_config
> 
> Hi Babu,
> 
> On 11/7/2022 11:00 AM, Moger, Babu wrote:
> >
> > On 11/7/22 04:21, Peter Newman wrote:
> >> Hi Babu,
> >>
> >> On Fri, Nov 04, 2022 at 03:01:09PM -0500, Babu Moger wrote:
> >>> +	/*
> >>> +	 * When an Event Configuration is changed, the bandwidth counters
> >>> +	 * for all RMIDs and Events will be cleared by the hardware. The
> >>> +	 * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
> >>> +	 * every RMID on the next read to any event for every RMID.
> >>> +	 * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
> >>> +	 * cleared while it is tracked by the hardware. Clear the
> >>> +	 * mbm_local and mbm_total counts for all the RMIDs.
> >>> +	 */
> >>> +	memset(d->mbm_local, 0, sizeof(struct mbm_state) * r->num_rmid);
> >>> +	memset(d->mbm_total, 0, sizeof(struct mbm_state) * r->num_rmid);
> >> Looking around, I can't find a reader for mbm_total anymore. It looks
> >> like the last place it was used went away in James's recent change:
> >>
> >> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flor
> >> e.kernel.org%2Fall%2F20220902154829.30399-19-
> james.morse%40arm.com&am
> >>
> p;data=05%7C01%7Cbabu.moger%40amd.com%7Ccb4a2daf65b84b45aeac08da
> cce35
> >>
> 66d%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C63804757402544
> 6241%7
> >>
> CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI
> 6Ik
> >>
> 1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=QZUVrpdr0YQFSJ
> BbS0BHSu
> >> q%2BhMwZHAA06MUqx98hD0U%3D&amp;reserved=0
> >>
> >> Are we supposed to be clearing arch_mbm_total now?
> >>
> > Patch got garbled in previous response.
> >
> > Here is it now.
> >
> > diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> > b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> > index 6b222f8e58ae..28d9d99a639e 100644
> > --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> > +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> > @@ -1517,7 +1517,7 @@ static int mbm_config_write(struct rdt_resource
> > *r, struct rdt_domain *d,
> >                             u32 evtid, u32 val)
> >  {
> >         struct mon_config_info mon_info = {0};
> > -       int ret = 0;
> > +       int ret = 0, i;
> >
> >         rdt_last_cmd_clear();
> >
> > @@ -1557,8 +1557,10 @@ static int mbm_config_write(struct rdt_resource
> > *r, struct rdt_domain *d,
> >          * cleared while it is tracked by the hardware. Clear the
> >          * mbm_local and mbm_total counts for all the RMIDs.
> >          */
> > -       memset(d->mbm_local, 0, sizeof(struct mbm_state) *
> > r->num_rmid);
> > -       memset(d->mbm_total, 0, sizeof(struct mbm_state) *
> > r->num_rmid);
> > +       for (i = 0; i < r->num_rmid; i++) {
> > +               resctrl_arch_reset_rmid(r, d, i,
> > +QOS_L3_MBM_TOTAL_EVENT_ID);
> > +               resctrl_arch_reset_rmid(r, d, i,
> > +QOS_L3_MBM_LOCAL_EVENT_ID);
> > +       }
> >
> >  write_exit:
> >         return ret;
> 
> Resetting each member of an array individually seems unnecessary when the
> array could just be reset as a unit. How about instead a new
> resctrl_arch_reset_rmid_all() that can do so?

Yes. We can do something like this below. 

diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index a188dacab6c8..2e67de911222 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -176,6 +176,14 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
                memset(am, 0, sizeof(*am));
 }

+void resctrl_arch_reset_rmid_all(struct rdt_domain *d)
+{
+       struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+
+       memset(hw_dom->arch_mbm_total, 0, sizeof(*hw_dom->arch_mbm_total));
+       memset(hw_dom->arch_mbm_local, 0, sizeof(*hw_dom->arch_mbm_local));
+}
+
 static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
 {
        u64 shift = 64 - width, chunks;
  
Reinette Chatre Nov. 23, 2022, 10:22 p.m. UTC | #7
Hi Babu,

On 11/23/2022 1:44 PM, Moger, Babu wrote:
> [AMD Official Use Only - General]
> 
> Hi Reinette,
> 
>> -----Original Message-----
>> From: Reinette Chatre <reinette.chatre@intel.com>
>> Sent: Tuesday, November 22, 2022 5:43 PM
>> To: Moger, Babu <Babu.Moger@amd.com>; Peter Newman
>> <peternewman@google.com>
>> Cc: akpm@linux-foundation.org; bagasdotme@gmail.com; bp@alien8.de;
>> chang.seok.bae@intel.com; corbet@lwn.net;
>> damien.lemoal@opensource.wdc.com; daniel.sneddon@linux.intel.com;
>> dave.hansen@linux.intel.com; eranian@google.com; fenghua.yu@intel.com;
>> hpa@zytor.com; james.morse@arm.com; jmattson@google.com;
>> jpoimboe@kernel.org; linux-doc@vger.kernel.org; linux-
>> kernel@vger.kernel.org; mingo@redhat.com; paulmck@kernel.org;
>> pawan.kumar.gupta@linux.intel.com; pbonzini@redhat.com;
>> peterz@infradead.org; quic_neeraju@quicinc.com; rdunlap@infradead.org;
>> Das1, Sandipan <Sandipan.Das@amd.com>; songmuchun@bytedance.com;
>> tglx@linutronix.de; tony.luck@intel.com; x86@kernel.org
>> Subject: Re: [PATCH v8 10/13] x86/resctrl: Add sysfs interface to write
>> mbm_total_bytes_config
>>
>> Hi Babu,
>>
>> On 11/7/2022 11:00 AM, Moger, Babu wrote:
>>>
>>> On 11/7/22 04:21, Peter Newman wrote:
>>>> Hi Babu,
>>>>
>>>> On Fri, Nov 04, 2022 at 03:01:09PM -0500, Babu Moger wrote:
>>>>> + /*
>>>>> +  * When an Event Configuration is changed, the bandwidth counters
>>>>> +  * for all RMIDs and Events will be cleared by the hardware. The
>>>>> +  * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
>>>>> +  * every RMID on the next read to any event for every RMID.
>>>>> +  * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
>>>>> +  * cleared while it is tracked by the hardware. Clear the
>>>>> +  * mbm_local and mbm_total counts for all the RMIDs.
>>>>> +  */
>>>>> + memset(d->mbm_local, 0, sizeof(struct mbm_state) * r->num_rmid);
>>>>> + memset(d->mbm_total, 0, sizeof(struct mbm_state) * r->num_rmid);
>>>> Looking around, I can't find a reader for mbm_total anymore. It looks
>>>> like the last place it was used went away in James's recent change:
>>>>
>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flor
>>>> e.kernel.org%2Fall%2F20220902154829.30399-19-
>> james.morse%40arm.com&am
>>>>
>> p;data=05%7C01%7Cbabu.moger%40amd.com%7Ccb4a2daf65b84b45aeac08da
>> cce35
>>>>
>> 66d%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C63804757402544
>> 6241%7
>>>>
>> CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI
>> 6Ik
>>>>
>> 1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=QZUVrpdr0YQFSJ
>> BbS0BHSu
>>>> q%2BhMwZHAA06MUqx98hD0U%3D&amp;reserved=0
>>>>
>>>> Are we supposed to be clearing arch_mbm_total now?
>>>>
>>> Patch got garbled in previous response.
>>>
>>> Here is it now.
>>>
>>> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>>> b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>>> index 6b222f8e58ae..28d9d99a639e 100644
>>> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>>> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>>> @@ -1517,7 +1517,7 @@ static int mbm_config_write(struct rdt_resource
>>> *r, struct rdt_domain *d,
>>>                             u32 evtid, u32 val)
>>>  {
>>>         struct mon_config_info mon_info = {0};
>>> -       int ret = 0;
>>> +       int ret = 0, i;
>>>
>>>         rdt_last_cmd_clear();
>>>
>>> @@ -1557,8 +1557,10 @@ static int mbm_config_write(struct rdt_resource
>>> *r, struct rdt_domain *d,
>>>          * cleared while it is tracked by the hardware. Clear the
>>>          * mbm_local and mbm_total counts for all the RMIDs.
>>>          */
>>> -       memset(d->mbm_local, 0, sizeof(struct mbm_state) *
>>> r->num_rmid);
>>> -       memset(d->mbm_total, 0, sizeof(struct mbm_state) *
>>> r->num_rmid);
>>> +       for (i = 0; i < r->num_rmid; i++) {
>>> +               resctrl_arch_reset_rmid(r, d, i,
>>> +QOS_L3_MBM_TOTAL_EVENT_ID);
>>> +               resctrl_arch_reset_rmid(r, d, i,
>>> +QOS_L3_MBM_LOCAL_EVENT_ID);
>>> +       }
>>>
>>>  write_exit:
>>>         return ret;
>>
>> Resetting each member of an array individually seems unnecessary when the
>> array could just be reset as a unit. How about instead a new
>> resctrl_arch_reset_rmid_all() that can do so?
> 
> Yes. We can do something like this below.
> 
> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
> index a188dacab6c8..2e67de911222 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -176,6 +176,14 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
>                 memset(am, 0, sizeof(*am));
>  }
> 
> +void resctrl_arch_reset_rmid_all(struct rdt_domain *d)
> +{
> +       struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
> +
> +       memset(hw_dom->arch_mbm_total, 0, sizeof(*hw_dom->arch_mbm_total));
> +       memset(hw_dom->arch_mbm_local, 0, sizeof(*hw_dom->arch_mbm_local));
> +}
> +
>  static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
>  {
>         u64 shift = 64 - width, chunks;

It looks like the above would only reset the first entry of the array.
I expect that the resource should also be provided as parameter so that
the num_rmid can be obtained to be able to clear the entire array.
Also, what is the likelihood of this being called when the array does not
exist? It may be safer to wrap each memset() with an is_mbm_total_enabled()
or is_mbm_local_enabled().

Reinette
  
Moger, Babu Nov. 23, 2022, 10:44 p.m. UTC | #8
[AMD Official Use Only - General]

Hi Reinette,

> -----Original Message-----
> From: Reinette Chatre <reinette.chatre@intel.com>
> Sent: Tuesday, November 22, 2022 6:22 PM
> To: Moger, Babu <Babu.Moger@amd.com>; corbet@lwn.net;
> tglx@linutronix.de; mingo@redhat.com; bp@alien8.de
> Cc: fenghua.yu@intel.com; dave.hansen@linux.intel.com; x86@kernel.org;
> hpa@zytor.com; paulmck@kernel.org; akpm@linux-foundation.org;
> quic_neeraju@quicinc.com; rdunlap@infradead.org;
> damien.lemoal@opensource.wdc.com; songmuchun@bytedance.com;
> peterz@infradead.org; jpoimboe@kernel.org; pbonzini@redhat.com;
> chang.seok.bae@intel.com; pawan.kumar.gupta@linux.intel.com;
> jmattson@google.com; daniel.sneddon@linux.intel.com; Das1, Sandipan
> <Sandipan.Das@amd.com>; tony.luck@intel.com; james.morse@arm.com;
> linux-doc@vger.kernel.org; linux-kernel@vger.kernel.org;
> bagasdotme@gmail.com; eranian@google.com
> Subject: Re: [PATCH v8 10/13] x86/resctrl: Add sysfs interface to write
> mbm_total_bytes_config
> 
> Hi Babu,
> 
> On 11/4/2022 1:01 PM, Babu Moger wrote:
> > The current event configuration for mbm_total_bytes can be changed by
> > the user by writing to the file
> > /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config.
> >
> > The event configuration settings are domain specific and will affect
> > all the CPUs in the domain.
> >
> > Following are the types of events supported:
> >
> > ====
> ===========================================================
> > Bits   Description
> > ====
> ===========================================================
> > 6      Dirty Victims from the QOS domain to all types of memory
> > 5      Reads to slow memory in the non-local NUMA domain
> > 4      Reads to slow memory in the local NUMA domain
> > 3      Non-temporal writes to non-local NUMA domain
> > 2      Non-temporal writes to local NUMA domain
> > 1      Reads to memory in the non-local NUMA domain
> > 0      Reads to memory in the local NUMA domain
> > ====
> ===========================================================
> >
> > For example:
> > To change the mbm_total_bytes to count only reads on domain 0, the
> > bits 0, 1, 4 and 5 needs to be set, which is 110011b (in hex 0x33).
> > Run the command.
> > 	$echo  0=0x33 > /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config
> >
> > To change the mbm_total_bytes to count all the slow memory reads on
> > domain 1, the bits 4 and 5 needs to be set which is 110000b (in hex 0x30).
> > Run the command.
> > 	$echo  1=0x30 > /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config
> >
> > Signed-off-by: Babu Moger <babu.moger@amd.com>
> > ---
> >  arch/x86/kernel/cpu/resctrl/rdtgroup.c |  130
> > ++++++++++++++++++++++++++++++++
> >  1 file changed, 129 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> > b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> > index 18f9588a41cf..0cdccb69386e 100644
> > --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> > +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> > @@ -1505,6 +1505,133 @@ static int mbm_local_bytes_config_show(struct
> kernfs_open_file *of,
> >  	return 0;
> >  }
> >
> > +static void mon_event_config_write(void *info) {
> > +	struct mon_config_info *mon_info = info;
> > +	u32 index;
> > +
> > +	index = mon_event_config_index_get(mon_info->evtid);
> > +	if (index >= MAX_CONFIG_EVENTS) {
> > +		pr_warn_once("Invalid event id %d\n", mon_info->evtid);
> > +		return;
> > +	}
> > +	wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0);
> }
> > +
> > +static int mbm_config_write(struct rdt_resource *r, struct rdt_domain *d,
> > +			    u32 evtid, u32 val)
> > +{
> > +	struct mon_config_info mon_info = {0};
> > +	int ret = 0;
> > +
> > +	rdt_last_cmd_clear();
> > +
> 
> Why is this extra clear() needed?

I am not sure why I added that. It does not seem required. I can remove it.
> 
> > +	/* mon_config cannot be more than the supported set of events */
> > +	if (val > MAX_EVT_CONFIG_BITS) {
> > +		rdt_last_cmd_puts("Invalid event configuration\n");
> > +		return -EINVAL;
> > +	}
> > +
> > +	/*
> > +	 * Read the current config value first. If both are same then
> > +	 * we don't need to write it again.
> 
> Please no "we". Maybe just "If both are the same then no need to write it
> again."

Ok.

> 
> > +	 */
> > +	mon_info.evtid = evtid;
> > +	mondata_config_read(d, &mon_info);
> 
> Here I see motivation for doing validity check in mondata_config_read() as
> mentioned in feedback for patch #8. If hardware decides to use the other bits in
> that MSR then the check below would have trouble.
> 
> > +	if (mon_info.mon_config == val)
> > +		goto write_exit;
> > +
> 
> Could you please follow the custom in this area? Please see goto usage in the
> rest of the file that you are changing. The label should reflect the action being
> jumped to. In that sense, "write_exit" is not clear. A simple "goto out"
> would be clear and matches usage in rest of file.

Ok. Sure
> 
> > +	mon_info.mon_config = val;
> > +
> > +	/*
> > +	 * Update MSR_IA32_EVT_CFG_BASE MSRs on all the CPUs in the
> > +	 * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE
> > +	 * are scoped at the domain level. Writing any of these MSRs
> > +	 * on one CPU is supposed to be observed by all CPUs in the
> > +	 * domain. However, the hardware team recommends to update
> > +	 * these MSRs on all the CPUs in the domain.
> > +	 */
> > +	on_each_cpu_mask(&d->cpu_mask, mon_event_config_write,
> &mon_info,
> > +1);
> > +
> > +	/*
> > +	 * When an Event Configuration is changed, the bandwidth counters
> > +	 * for all RMIDs and Events will be cleared by the hardware. The
> > +	 * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
> > +	 * every RMID on the next read to any event for every RMID.
> > +	 * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
> > +	 * cleared while it is tracked by the hardware. Clear the
> > +	 * mbm_local and mbm_total counts for all the RMIDs.
> > +	 */
> > +	memset(d->mbm_local, 0, sizeof(struct mbm_state) * r->num_rmid);
> > +	memset(d->mbm_total, 0, sizeof(struct mbm_state) * r->num_rmid);
> > +
> > +write_exit:
> > +	return ret;
> > +}
> > +
> > +static int mon_config_parse(struct rdt_resource *r, char *tok, u32
> > +evtid) {
> > +	char *dom_str = NULL, *id_str;
> > +	unsigned long dom_id, val;
> > +	struct rdt_domain *d;
> > +	int ret = 0;
> > +
> > +next:
> > +	if (!tok || tok[0] == '\0')
> > +		return 0;
> > +
> > +	/* Start processing the strings for each domain */
> > +	dom_str = strim(strsep(&tok, ";"));
> > +	id_str = strsep(&dom_str, "=");
> > +
> > +	if (!dom_str || kstrtoul(id_str, 10, &dom_id)) {
> > +		rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n");
> > +		return -EINVAL;
> > +	}
> > +
> > +	if (!dom_str || kstrtoul(dom_str, 16, &val)) {
> > +		rdt_last_cmd_puts("Missing '=' or non-numeric event
> configuration value\n");
> > +		return -EINVAL;
> > +	}
> 
> There is some duplication above ... both if () statememts check for "!dom_str" -
> is one intended to be "!id_str"?

The first check should be !id_str. Will correct it.

> Could both checks really mean that a "=" may be missing?

The second check failure means, there is a missing event configuration value. Will remove "missing =".

> 
> > +
> > +	list_for_each_entry(d, &r->domains, list) {
> > +		if (d->id == dom_id) {
> > +			ret = mbm_config_write(r, d, evtid, val);
> > +			if (ret)
> > +				return -EINVAL;
> > +			goto next;
> > +		}
> > +	}
> > +
> > +	return -EINVAL;
> > +}
> > +
> > +static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
> > +					    char *buf, size_t nbytes,
> > +					    loff_t off)
> > +{
> > +	struct rdt_resource *r = of->kn->parent->priv;
> > +	int ret;
> > +
> > +	/* Valid input requires a trailing newline */
> > +	if (nbytes == 0 || buf[nbytes - 1] != '\n')
> > +		return -EINVAL;
> > +
> > +	cpus_read_lock();
> > +	mutex_lock(&rdtgroup_mutex);
> > +
> > +	rdt_last_cmd_clear();
> > +
> > +	buf[nbytes - 1] = '\0';
> > +
> > +	ret = mon_config_parse(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID);
> > +
> 
> The naming here does not reflect what is done ... much more than parsing is
> done here.
> 
> How about renaming mon_config_parse() to mon_config_write(), and
> renaming mon_config_write() to mon_config_write_domain() ?

Sure. Thanks
Babu
  
Moger, Babu Nov. 28, 2022, 4:01 p.m. UTC | #9
This thread did not land in my mailbox. Replying with git send mail link.

Ok. Sure. I am fine with these changes. Thanks
  
James Morse Dec. 7, 2022, 5:20 p.m. UTC | #10
Hi Babu,

(Nit: all the 'sysfs' in the subjects should really be 'resctrl', but as they already have
'x86/resctrl', could you just remove the sysfs?
This patch would be "x86/resctrl: Add interface to write mbm_total_bytes_config")

On 04/11/2022 20:01, Babu Moger wrote:
> The current event configuration for mbm_total_bytes can be changed by
> the user by writing to the file
> /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config.
> 
> The event configuration settings are domain specific and will affect all
> the CPUs in the domain.
> 
> Following are the types of events supported:
> 
> ====  ===========================================================
> Bits   Description
> ====  ===========================================================
> 6      Dirty Victims from the QOS domain to all types of memory
> 5      Reads to slow memory in the non-local NUMA domain
> 4      Reads to slow memory in the local NUMA domain
> 3      Non-temporal writes to non-local NUMA domain
> 2      Non-temporal writes to local NUMA domain
> 1      Reads to memory in the non-local NUMA domain
> 0      Reads to memory in the local NUMA domain
> ====  ===========================================================
> 
> For example:
> To change the mbm_total_bytes to count only reads on domain 0, the bits
> 0, 1, 4 and 5 needs to be set, which is 110011b (in hex 0x33). Run the
> command.
> 	$echo  0=0x33 > /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config
> 
> To change the mbm_total_bytes to count all the slow memory reads on
> domain 1, the bits 4 and 5 needs to be set which is 110000b (in hex 0x30).
> Run the command.
> 	$echo  1=0x30 > /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config

> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> index 18f9588a41cf..0cdccb69386e 100644
> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> @@ -1505,6 +1505,133 @@ static int mbm_local_bytes_config_show(struct kernfs_open_file *of,
>  	return 0;
>  }
>  
> +static void mon_event_config_write(void *info)
> +{
> +	struct mon_config_info *mon_info = info;
> +	u32 index;
> +
> +	index = mon_event_config_index_get(mon_info->evtid);
> +	if (index >= MAX_CONFIG_EVENTS) {
> +		pr_warn_once("Invalid event id %d\n", mon_info->evtid);
> +		return;
> +	}
> +	wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0);
> +}
> +
> +static int mbm_config_write(struct rdt_resource *r, struct rdt_domain *d,
> +			    u32 evtid, u32 val)
> +{
> +	struct mon_config_info mon_info = {0};
> +	int ret = 0;
> +
> +	rdt_last_cmd_clear();
> +
> +	/* mon_config cannot be more than the supported set of events */
> +	if (val > MAX_EVT_CONFIG_BITS) {
> +		rdt_last_cmd_puts("Invalid event configuration\n");
> +		return -EINVAL;
> +	}
> +
> +	/*
> +	 * Read the current config value first. If both are same then
> +	 * we don't need to write it again.
> +	 */
> +	mon_info.evtid = evtid;

> +	mondata_config_read(d, &mon_info);

This reads the MSR on this CPU, which gets the result for this domain...


> +	if (mon_info.mon_config == val)
> +		goto write_exit;
> +
> +	mon_info.mon_config = val;
> +
> +	/*
> +	 * Update MSR_IA32_EVT_CFG_BASE MSRs on all the CPUs in the
> +	 * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE
> +	 * are scoped at the domain level. Writing any of these MSRs
> +	 * on one CPU is supposed to be observed by all CPUs in the
> +	 * domain. However, the hardware team recommends to update
> +	 * these MSRs on all the CPUs in the domain.
> +	 */

> +	on_each_cpu_mask(&d->cpu_mask, mon_event_config_write, &mon_info, 1);

... but here you IPI all the CPUs in the target domain to update them.

This means you unnecessarily IPI the CPUs in the target domain if they already had this
value, but the write syscall occurred on a domain that differs. This isn't what you
intended, but its benign.
More of a problem is: Won't this get skipped if the write syscall occurs on a domain that
happens to have the target configuration already?

Because you need the same value to be written on every CPU ... what happens to CPUs that
are offline when the configuration is changed? Do they keep their previous value, or does
it get reset?


I think this is best solved with a percpu variable for the current value of the MSR. You
can then read it for CPUs in a remote domain, and only issue IPIs to 'sync' the value if
needed. You can then re-use the sync call in resctrl_online_cpu() to set the MSR to
whatever value it should currently be.


> +
> +	/*
> +	 * When an Event Configuration is changed, the bandwidth counters
> +	 * for all RMIDs and Events will be cleared by the hardware. The
> +	 * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
> +	 * every RMID on the next read to any event for every RMID.
> +	 * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
> +	 * cleared while it is tracked by the hardware. Clear the
> +	 * mbm_local and mbm_total counts for all the RMIDs.
> +	 */
> +	memset(d->mbm_local, 0, sizeof(struct mbm_state) * r->num_rmid);
> +	memset(d->mbm_total, 0, sizeof(struct mbm_state) * r->num_rmid);
> +
> +write_exit:
> +	return ret;
> +}


> +static int mon_config_parse(struct rdt_resource *r, char *tok, u32 evtid)
> +{
> +	char *dom_str = NULL, *id_str;
> +	unsigned long dom_id, val;
> +	struct rdt_domain *d;
> +	int ret = 0;
> +
> +next:
> +	if (!tok || tok[0] == '\0')
> +		return 0;
> +
> +	/* Start processing the strings for each domain */
> +	dom_str = strim(strsep(&tok, ";"));
> +	id_str = strsep(&dom_str, "=");
> +
> +	if (!dom_str || kstrtoul(id_str, 10, &dom_id)) {
> +		rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n");
> +		return -EINVAL;
> +	}
> +
> +	if (!dom_str || kstrtoul(dom_str, 16, &val)) {
> +		rdt_last_cmd_puts("Missing '=' or non-numeric event configuration value\n");
> +		return -EINVAL;
> +	}

This is parsing the same format strings as parse_line(). Is there any chance that code
could be re-used instead of duplicated? This way anything that is added to the format (or
bugs found!) only need supporting in once place.



> +	list_for_each_entry(d, &r->domains, list) {
> +		if (d->id == dom_id) {
> +			ret = mbm_config_write(r, d, evtid, val);
> +			if (ret)
> +				return -EINVAL;
> +			goto next;
> +		}
> +	}
> +
> +	return -EINVAL;
> +}


Thanks,

James
  
James Morse Dec. 7, 2022, 5:24 p.m. UTC | #11
Bah! Sorry, I thought I was looking at v9!
(but the same comments apply)


Thanks,

James
  
Moger, Babu Dec. 8, 2022, 12:02 a.m. UTC | #12
[AMD Official Use Only - General]

Hi James,

> -----Original Message-----
> From: James Morse <james.morse@arm.com>
> Sent: Wednesday, December 7, 2022 11:21 AM
> To: Moger, Babu <Babu.Moger@amd.com>
> Cc: fenghua.yu@intel.com; dave.hansen@linux.intel.com; x86@kernel.org;
> hpa@zytor.com; paulmck@kernel.org; akpm@linux-foundation.org;
> quic_neeraju@quicinc.com; rdunlap@infradead.org;
> damien.lemoal@opensource.wdc.com; songmuchun@bytedance.com;
> peterz@infradead.org; jpoimboe@kernel.org; pbonzini@redhat.com;
> chang.seok.bae@intel.com; pawan.kumar.gupta@linux.intel.com;
> jmattson@google.com; daniel.sneddon@linux.intel.com; Das1, Sandipan
> <Sandipan.Das@amd.com>; tony.luck@intel.com; linux-doc@vger.kernel.org;
> linux-kernel@vger.kernel.org; bagasdotme@gmail.com; eranian@google.com;
> corbet@lwn.net; tglx@linutronix.de; mingo@redhat.com; bp@alien8.de;
> reinette.chatre@intel.com
> Subject: Re: [PATCH v8 10/13] x86/resctrl: Add sysfs interface to write
> mbm_total_bytes_config
> 
> Hi Babu,
> 
> (Nit: all the 'sysfs' in the subjects should really be 'resctrl', but as they already
> have 'x86/resctrl', could you just remove the sysfs?
> This patch would be "x86/resctrl: Add interface to write
> mbm_total_bytes_config")

Sure. Will change it.
> 
> On 04/11/2022 20:01, Babu Moger wrote:
> > The current event configuration for mbm_total_bytes can be changed by
> > the user by writing to the file
> > /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config.
> >
> > The event configuration settings are domain specific and will affect
> > all the CPUs in the domain.
> >
> > Following are the types of events supported:
> >
> > ====
> ===========================================================
> > Bits   Description
> > ====
> ===========================================================
> > 6      Dirty Victims from the QOS domain to all types of memory
> > 5      Reads to slow memory in the non-local NUMA domain
> > 4      Reads to slow memory in the local NUMA domain
> > 3      Non-temporal writes to non-local NUMA domain
> > 2      Non-temporal writes to local NUMA domain
> > 1      Reads to memory in the non-local NUMA domain
> > 0      Reads to memory in the local NUMA domain
> > ====
> ===========================================================
> >
> > For example:
> > To change the mbm_total_bytes to count only reads on domain 0, the
> > bits 0, 1, 4 and 5 needs to be set, which is 110011b (in hex 0x33).
> > Run the command.
> > 	$echo  0=0x33 > /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config
> >
> > To change the mbm_total_bytes to count all the slow memory reads on
> > domain 1, the bits 4 and 5 needs to be set which is 110000b (in hex 0x30).
> > Run the command.
> > 	$echo  1=0x30 > /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config
> 
> > diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> > b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> > index 18f9588a41cf..0cdccb69386e 100644
> > --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> > +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> > @@ -1505,6 +1505,133 @@ static int mbm_local_bytes_config_show(struct
> kernfs_open_file *of,
> >  	return 0;
> >  }
> >
> > +static void mon_event_config_write(void *info) {
> > +	struct mon_config_info *mon_info = info;
> > +	u32 index;
> > +
> > +	index = mon_event_config_index_get(mon_info->evtid);
> > +	if (index >= MAX_CONFIG_EVENTS) {
> > +		pr_warn_once("Invalid event id %d\n", mon_info->evtid);
> > +		return;
> > +	}
> > +	wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0);
> }
> > +
> > +static int mbm_config_write(struct rdt_resource *r, struct rdt_domain *d,
> > +			    u32 evtid, u32 val)
> > +{
> > +	struct mon_config_info mon_info = {0};
> > +	int ret = 0;
> > +
> > +	rdt_last_cmd_clear();
> > +
> > +	/* mon_config cannot be more than the supported set of events */
> > +	if (val > MAX_EVT_CONFIG_BITS) {
> > +		rdt_last_cmd_puts("Invalid event configuration\n");
> > +		return -EINVAL;
> > +	}
> > +
> > +	/*
> > +	 * Read the current config value first. If both are same then
> > +	 * we don't need to write it again.
> > +	 */
> > +	mon_info.evtid = evtid;
> 
> > +	mondata_config_read(d, &mon_info);
> 
> This reads the MSR on this CPU, which gets the result for this domain...

[1] No. This read happens at the target domain. 

static void mondata_config_read(struct rdt_domain *d, struct mon_config_info *mon_info)
{
        smp_call_function_any(&d->cpu_mask, mon_event_config_read, mon_info, 1);
}

> 
> 
> > +	if (mon_info.mon_config == val)
> > +		goto write_exit;
> > +
> > +	mon_info.mon_config = val;
> > +
> > +	/*
> > +	 * Update MSR_IA32_EVT_CFG_BASE MSRs on all the CPUs in the
> > +	 * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE
> > +	 * are scoped at the domain level. Writing any of these MSRs
> > +	 * on one CPU is supposed to be observed by all CPUs in the
> > +	 * domain. However, the hardware team recommends to update
> > +	 * these MSRs on all the CPUs in the domain.
> > +	 */
> 
> > +	on_each_cpu_mask(&d->cpu_mask, mon_event_config_write,
> &mon_info,
> > +1);
> 
> ... but here you IPI all the CPUs in the target domain to update them.

[2] There have been some changes in this area recently. The requirement of writing the value on all the CPUs in the domain is not required anymore. I am working on verifying this right now.  If everything works, then I can do 
smp_call_function_any(&d->cpu_mask, mon_event_config_write,  &mon_info, 1);

I will confirm this soon.

> 
> This means you unnecessarily IPI the CPUs in the target domain if they already
> had this value, but the write syscall occurred on a domain that differs. This isn't
> what you intended, but its benign.
> More of a problem is: Won't this get skipped if the write syscall occurs on a
> domain that happens to have the target configuration already?

Do you still think this is a problem after my comment [1] above?  Or Am I missing something?

> 
> Because you need the same value to be written on every CPU ... what happens
> to CPUs that are offline when the configuration is changed? Do they keep their
> previous value, or does it get reset?

The contents of this MSR register are held outside of all the cores.  If the value changes while a cpu is offline, and it reads it once it comes online, it will get the new value.
> 
> 
> I think this is best solved with a percpu variable for the current value of the
> MSR. You can then read it for CPUs in a remote domain, and only issue IPIs to
> 'sync' the value if needed. You can then re-use the sync call in
> resctrl_online_cpu() to set the MSR to whatever value it should currently be.

This may not be required with my comment 1 and 2 above.

> 
> 
> > +
> > +	/*
> > +	 * When an Event Configuration is changed, the bandwidth counters
> > +	 * for all RMIDs and Events will be cleared by the hardware. The
> > +	 * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
> > +	 * every RMID on the next read to any event for every RMID.
> > +	 * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
> > +	 * cleared while it is tracked by the hardware. Clear the
> > +	 * mbm_local and mbm_total counts for all the RMIDs.
> > +	 */
> > +	memset(d->mbm_local, 0, sizeof(struct mbm_state) * r->num_rmid);
> > +	memset(d->mbm_total, 0, sizeof(struct mbm_state) * r->num_rmid);
> > +
> > +write_exit:
> > +	return ret;
> > +}
> 
> 
> > +static int mon_config_parse(struct rdt_resource *r, char *tok, u32
> > +evtid) {
> > +	char *dom_str = NULL, *id_str;
> > +	unsigned long dom_id, val;
> > +	struct rdt_domain *d;
> > +	int ret = 0;
> > +
> > +next:
> > +	if (!tok || tok[0] == '\0')
> > +		return 0;
> > +
> > +	/* Start processing the strings for each domain */
> > +	dom_str = strim(strsep(&tok, ";"));
> > +	id_str = strsep(&dom_str, "=");
> > +
> > +	if (!dom_str || kstrtoul(id_str, 10, &dom_id)) {
> > +		rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n");
> > +		return -EINVAL;
> > +	}
> > +
> > +	if (!dom_str || kstrtoul(dom_str, 16, &val)) {
> > +		rdt_last_cmd_puts("Missing '=' or non-numeric event
> configuration value\n");
> > +		return -EINVAL;
> > +	}
> 
> This is parsing the same format strings as parse_line(). Is there any chance that
> code could be re-used instead of duplicated? This way anything that is added to
> the format (or bugs found!) only need supporting in once place.

I have checked on reusing the parse_line. The parse_line is specifically written for schemata.  We can't reuse parse_line without changing it completely.

Thanks
Babu
> 
> 
> 
> > +	list_for_each_entry(d, &r->domains, list) {
> > +		if (d->id == dom_id) {
> > +			ret = mbm_config_write(r, d, evtid, val);
> > +			if (ret)
> > +				return -EINVAL;
> > +			goto next;
> > +		}
> > +	}
> > +
> > +	return -EINVAL;
> > +}
> 
> 
> Thanks,
> 
> James
  
James Morse Dec. 13, 2022, 5:55 p.m. UTC | #13
Hi Babu,

On 08/12/2022 00:02, Moger, Babu wrote:
> [AMD Official Use Only - General]
>> -----Original Message-----
>> From: James Morse <james.morse@arm.com>
>> Sent: Wednesday, December 7, 2022 11:21 AM
>> To: Moger, Babu <Babu.Moger@amd.com>
>> Cc: fenghua.yu@intel.com; dave.hansen@linux.intel.com; x86@kernel.org;
>> hpa@zytor.com; paulmck@kernel.org; akpm@linux-foundation.org;
>> quic_neeraju@quicinc.com; rdunlap@infradead.org;
>> damien.lemoal@opensource.wdc.com; songmuchun@bytedance.com;
>> peterz@infradead.org; jpoimboe@kernel.org; pbonzini@redhat.com;
>> chang.seok.bae@intel.com; pawan.kumar.gupta@linux.intel.com;
>> jmattson@google.com; daniel.sneddon@linux.intel.com; Das1, Sandipan
>> <Sandipan.Das@amd.com>; tony.luck@intel.com; linux-doc@vger.kernel.org;
>> linux-kernel@vger.kernel.org; bagasdotme@gmail.com; eranian@google.com;
>> corbet@lwn.net; tglx@linutronix.de; mingo@redhat.com; bp@alien8.de;
>> reinette.chatre@intel.com
>> Subject: Re: [PATCH v8 10/13] x86/resctrl: Add sysfs interface to write
>> mbm_total_bytes_config

>> On 04/11/2022 20:01, Babu Moger wrote:
>>> The current event configuration for mbm_total_bytes can be changed by
>>> the user by writing to the file
>>> /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config.
>>>
>>> The event configuration settings are domain specific and will affect
>>> all the CPUs in the domain.
>>>
>>> Following are the types of events supported:
>>>
>>> ====
>> ===========================================================
>>> Bits   Description
>>> ====
>> ===========================================================
>>> 6      Dirty Victims from the QOS domain to all types of memory
>>> 5      Reads to slow memory in the non-local NUMA domain
>>> 4      Reads to slow memory in the local NUMA domain
>>> 3      Non-temporal writes to non-local NUMA domain
>>> 2      Non-temporal writes to local NUMA domain
>>> 1      Reads to memory in the non-local NUMA domain
>>> 0      Reads to memory in the local NUMA domain
>>> ====
>> ===========================================================
>>>
>>> For example:
>>> To change the mbm_total_bytes to count only reads on domain 0, the
>>> bits 0, 1, 4 and 5 needs to be set, which is 110011b (in hex 0x33).
>>> Run the command.
>>> 	$echo  0=0x33 > /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config
>>>
>>> To change the mbm_total_bytes to count all the slow memory reads on
>>> domain 1, the bits 4 and 5 needs to be set which is 110000b (in hex 0x30).
>>> Run the command.
>>> 	$echo  1=0x30 > /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config
>>
>>> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>>> b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>>> index 18f9588a41cf..0cdccb69386e 100644
>>> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>>> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>>> @@ -1505,6 +1505,133 @@ static int mbm_local_bytes_config_show(struct
>> kernfs_open_file *of,
>>>  	return 0;
>>>  }
>>>
>>> +static void mon_event_config_write(void *info) {
>>> +	struct mon_config_info *mon_info = info;
>>> +	u32 index;
>>> +
>>> +	index = mon_event_config_index_get(mon_info->evtid);
>>> +	if (index >= MAX_CONFIG_EVENTS) {
>>> +		pr_warn_once("Invalid event id %d\n", mon_info->evtid);
>>> +		return;
>>> +	}
>>> +	wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0);
>> }
>>> +
>>> +static int mbm_config_write(struct rdt_resource *r, struct rdt_domain *d,
>>> +			    u32 evtid, u32 val)
>>> +{
>>> +	struct mon_config_info mon_info = {0};
>>> +	int ret = 0;
>>> +
>>> +	rdt_last_cmd_clear();
>>> +
>>> +	/* mon_config cannot be more than the supported set of events */
>>> +	if (val > MAX_EVT_CONFIG_BITS) {
>>> +		rdt_last_cmd_puts("Invalid event configuration\n");
>>> +		return -EINVAL;
>>> +	}
>>> +
>>> +	/*
>>> +	 * Read the current config value first. If both are same then
>>> +	 * we don't need to write it again.
>>> +	 */
>>> +	mon_info.evtid = evtid;
>>
>>> +	mondata_config_read(d, &mon_info);
>>
>> This reads the MSR on this CPU, which gets the result for this domain...
> 
> [1] No. This read happens at the target domain. 

Oops ... looks like I muddled that with mon_event_config_read().


> static void mondata_config_read(struct rdt_domain *d, struct mon_config_info *mon_info)
> {
>         smp_call_function_any(&d->cpu_mask, mon_event_config_read, mon_info, 1);
> }

>>> +	if (mon_info.mon_config == val)
>>> +		goto write_exit;
>>> +
>>> +	mon_info.mon_config = val;
>>> +
>>> +	/*
>>> +	 * Update MSR_IA32_EVT_CFG_BASE MSRs on all the CPUs in the
>>> +	 * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE
>>> +	 * are scoped at the domain level. Writing any of these MSRs
>>> +	 * on one CPU is supposed to be observed by all CPUs in the
>>> +	 * domain. However, the hardware team recommends to update
>>> +	 * these MSRs on all the CPUs in the domain.
>>> +	 */
>>
>>> +	on_each_cpu_mask(&d->cpu_mask, mon_event_config_write,
>> &mon_info,
>>> +1);
>>
>> ... but here you IPI all the CPUs in the target domain to update them.

> [2] There have been some changes in this area recently. The requirement of writing the
> value on all the CPUs in the domain is not required anymore. I am working on verifying
> this right now.  If everything works, then I can do 
> smp_call_function_any(&d->cpu_mask, mon_event_config_write,  &mon_info, 1);
> 
> I will confirm this soon.

Okay, that makes my next question more confusing then ....


>> This means you unnecessarily IPI the CPUs in the target domain if they already
>> had this value, but the write syscall occurred on a domain that differs. This isn't
>> what you intended, but its benign.
>> More of a problem is: Won't this get skipped if the write syscall occurs on a
>> domain that happens to have the target configuration already?

> Do you still think this is a problem after my comment [1] above?  Or Am I missing something?

I'd muddled two similarly named functions. Sorry for the noise!

I think what you're left with is the question "What is the monitor config for CPUs that
were offline when it was last changed?". If its preserved by the CPU, then its some
unknown value, and needs to be made the same as the value user-space/the-domain currently
expects.

If there is only one config value for the domain (as your comment above suggests), then
nothing needs doing here.


>> Because you need the same value to be written on every CPU ... what happens
>> to CPUs that are offline when the configuration is changed? Do they keep their
>> previous value, or does it get reset?
> 
> The contents of this MSR register are held outside of all the cores.  If the value changes
> while a cpu is offline, and it reads it once it comes online, it will get the new value.

This fits with your new description of the value only needing to be written from one CPU
in the domain.


>> I think this is best solved with a percpu variable for the current value of the
>> MSR. You can then read it for CPUs in a remote domain, and only issue IPIs to
>> 'sync' the value if needed. You can then re-use the sync call in
>> resctrl_online_cpu() to set the MSR to whatever value it should currently be.
> 
> This may not be required with my comment 1 and 2 above.
> 
>>
>>
>>> +
>>> +	/*
>>> +	 * When an Event Configuration is changed, the bandwidth counters
>>> +	 * for all RMIDs and Events will be cleared by the hardware. The
>>> +	 * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
>>> +	 * every RMID on the next read to any event for every RMID.
>>> +	 * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
>>> +	 * cleared while it is tracked by the hardware. Clear the
>>> +	 * mbm_local and mbm_total counts for all the RMIDs.
>>> +	 */
>>> +	memset(d->mbm_local, 0, sizeof(struct mbm_state) * r->num_rmid);
>>> +	memset(d->mbm_total, 0, sizeof(struct mbm_state) * r->num_rmid);
>>> +
>>> +write_exit:
>>> +	return ret;
>>> +}
>>
>>
>>> +static int mon_config_parse(struct rdt_resource *r, char *tok, u32
>>> +evtid) {
>>> +	char *dom_str = NULL, *id_str;
>>> +	unsigned long dom_id, val;
>>> +	struct rdt_domain *d;
>>> +	int ret = 0;
>>> +
>>> +next:
>>> +	if (!tok || tok[0] == '\0')
>>> +		return 0;
>>> +
>>> +	/* Start processing the strings for each domain */
>>> +	dom_str = strim(strsep(&tok, ";"));
>>> +	id_str = strsep(&dom_str, "=");
>>> +
>>> +	if (!dom_str || kstrtoul(id_str, 10, &dom_id)) {
>>> +		rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n");
>>> +		return -EINVAL;
>>> +	}
>>> +
>>> +	if (!dom_str || kstrtoul(dom_str, 16, &val)) {
>>> +		rdt_last_cmd_puts("Missing '=' or non-numeric event
>> configuration value\n");
>>> +		return -EINVAL;
>>> +	}
>>
>> This is parsing the same format strings as parse_line(). Is there any chance that
>> code could be re-used instead of duplicated? This way anything that is added to
>> the format (or bugs found!) only need supporting in once place.
> 
> I have checked on reusing the parse_line. The parse_line is specifically written for
> schemata.  We can't reuse parse_line without changing it completely.

I agree its a little more complicated than it looked at first. I might have a go at it
later...


Thanks,

James
  
Moger, Babu Dec. 13, 2022, 11:46 p.m. UTC | #14
[AMD Official Use Only - General]

Hi James,

> -----Original Message-----
> From: James Morse <james.morse@arm.com>
> Sent: Tuesday, December 13, 2022 11:55 AM
> To: Moger, Babu <Babu.Moger@amd.com>
> Cc: fenghua.yu@intel.com; dave.hansen@linux.intel.com; x86@kernel.org;
> hpa@zytor.com; paulmck@kernel.org; akpm@linux-foundation.org;
> quic_neeraju@quicinc.com; rdunlap@infradead.org;
> damien.lemoal@opensource.wdc.com; songmuchun@bytedance.com;
> peterz@infradead.org; jpoimboe@kernel.org; pbonzini@redhat.com;
> chang.seok.bae@intel.com; pawan.kumar.gupta@linux.intel.com;
> jmattson@google.com; daniel.sneddon@linux.intel.com; Das1, Sandipan
> <Sandipan.Das@amd.com>; tony.luck@intel.com; linux-doc@vger.kernel.org;
> linux-kernel@vger.kernel.org; bagasdotme@gmail.com; eranian@google.com;
> corbet@lwn.net; tglx@linutronix.de; mingo@redhat.com; bp@alien8.de;
> reinette.chatre@intel.com
> Subject: Re: [PATCH v8 10/13] x86/resctrl: Add sysfs interface to write
> mbm_total_bytes_config
> 
> Hi Babu,
> 
> On 08/12/2022 00:02, Moger, Babu wrote:
> > [AMD Official Use Only - General]
> >> -----Original Message-----
> >> From: James Morse <james.morse@arm.com>
> >> Sent: Wednesday, December 7, 2022 11:21 AM
> >> To: Moger, Babu <Babu.Moger@amd.com>
> >> Cc: fenghua.yu@intel.com; dave.hansen@linux.intel.com;
> >> x86@kernel.org; hpa@zytor.com; paulmck@kernel.org;
> >> akpm@linux-foundation.org; quic_neeraju@quicinc.com;
> >> rdunlap@infradead.org; damien.lemoal@opensource.wdc.com;
> >> songmuchun@bytedance.com; peterz@infradead.org;
> jpoimboe@kernel.org;
> >> pbonzini@redhat.com; chang.seok.bae@intel.com;
> >> pawan.kumar.gupta@linux.intel.com;
> >> jmattson@google.com; daniel.sneddon@linux.intel.com; Das1, Sandipan
> >> <Sandipan.Das@amd.com>; tony.luck@intel.com;
> >> linux-doc@vger.kernel.org; linux-kernel@vger.kernel.org;
> >> bagasdotme@gmail.com; eranian@google.com; corbet@lwn.net;
> >> tglx@linutronix.de; mingo@redhat.com; bp@alien8.de;
> >> reinette.chatre@intel.com
> >> Subject: Re: [PATCH v8 10/13] x86/resctrl: Add sysfs interface to
> >> write mbm_total_bytes_config
> 
> >> On 04/11/2022 20:01, Babu Moger wrote:
> >>> The current event configuration for mbm_total_bytes can be changed
> >>> by the user by writing to the file
> >>> /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config.
> >>>
> >>> The event configuration settings are domain specific and will affect
> >>> all the CPUs in the domain.
> >>>
> >>> Following are the types of events supported:
> >>>
> >>> ====
> >> ===========================================================
> >>> Bits   Description
> >>> ====
> >> ===========================================================
> >>> 6      Dirty Victims from the QOS domain to all types of memory
> >>> 5      Reads to slow memory in the non-local NUMA domain
> >>> 4      Reads to slow memory in the local NUMA domain
> >>> 3      Non-temporal writes to non-local NUMA domain
> >>> 2      Non-temporal writes to local NUMA domain
> >>> 1      Reads to memory in the non-local NUMA domain
> >>> 0      Reads to memory in the local NUMA domain
> >>> ====
> >> ===========================================================
> >>>
> >>> For example:
> >>> To change the mbm_total_bytes to count only reads on domain 0, the
> >>> bits 0, 1, 4 and 5 needs to be set, which is 110011b (in hex 0x33).
> >>> Run the command.
> >>> 	$echo  0=0x33 > /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config
> >>>
> >>> To change the mbm_total_bytes to count all the slow memory reads on
> >>> domain 1, the bits 4 and 5 needs to be set which is 110000b (in hex 0x30).
> >>> Run the command.
> >>> 	$echo  1=0x30 > /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config
> >>
> >>> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> >>> b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> >>> index 18f9588a41cf..0cdccb69386e 100644
> >>> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> >>> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> >>> @@ -1505,6 +1505,133 @@ static int
> >>> mbm_local_bytes_config_show(struct
> >> kernfs_open_file *of,
> >>>  	return 0;
> >>>  }
> >>>
> >>> +static void mon_event_config_write(void *info) {
> >>> +	struct mon_config_info *mon_info = info;
> >>> +	u32 index;
> >>> +
> >>> +	index = mon_event_config_index_get(mon_info->evtid);
> >>> +	if (index >= MAX_CONFIG_EVENTS) {
> >>> +		pr_warn_once("Invalid event id %d\n", mon_info->evtid);
> >>> +		return;
> >>> +	}
> >>> +	wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0);
> >> }
> >>> +
> >>> +static int mbm_config_write(struct rdt_resource *r, struct rdt_domain *d,
> >>> +			    u32 evtid, u32 val)
> >>> +{
> >>> +	struct mon_config_info mon_info = {0};
> >>> +	int ret = 0;
> >>> +
> >>> +	rdt_last_cmd_clear();
> >>> +
> >>> +	/* mon_config cannot be more than the supported set of events */
> >>> +	if (val > MAX_EVT_CONFIG_BITS) {
> >>> +		rdt_last_cmd_puts("Invalid event configuration\n");
> >>> +		return -EINVAL;
> >>> +	}
> >>> +
> >>> +	/*
> >>> +	 * Read the current config value first. If both are same then
> >>> +	 * we don't need to write it again.
> >>> +	 */
> >>> +	mon_info.evtid = evtid;
> >>
> >>> +	mondata_config_read(d, &mon_info);
> >>
> >> This reads the MSR on this CPU, which gets the result for this domain...
> >
> > [1] No. This read happens at the target domain.
> 
> Oops ... looks like I muddled that with mon_event_config_read().
> 
> 
> > static void mondata_config_read(struct rdt_domain *d, struct
> > mon_config_info *mon_info) {
> >         smp_call_function_any(&d->cpu_mask, mon_event_config_read,
> > mon_info, 1); }
> 
> >>> +	if (mon_info.mon_config == val)
> >>> +		goto write_exit;
> >>> +
> >>> +	mon_info.mon_config = val;
> >>> +
> >>> +	/*
> >>> +	 * Update MSR_IA32_EVT_CFG_BASE MSRs on all the CPUs in the
> >>> +	 * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE
> >>> +	 * are scoped at the domain level. Writing any of these MSRs
> >>> +	 * on one CPU is supposed to be observed by all CPUs in the
> >>> +	 * domain. However, the hardware team recommends to update
> >>> +	 * these MSRs on all the CPUs in the domain.
> >>> +	 */
> >>
> >>> +	on_each_cpu_mask(&d->cpu_mask, mon_event_config_write,
> >> &mon_info,
> >>> +1);
> >>
> >> ... but here you IPI all the CPUs in the target domain to update them.
> 
> > [2] There have been some changes in this area recently. The
> > requirement of writing the value on all the CPUs in the domain is not
> > required anymore. I am working on verifying this right now.  If
> > everything works, then I can do smp_call_function_any(&d->cpu_mask,
> > mon_event_config_write,  &mon_info, 1);
> >
> > I will confirm this soon.
> 
> Okay, that makes my next question more confusing then ....
> 
> 
> >> This means you unnecessarily IPI the CPUs in the target domain if
> >> they already had this value, but the write syscall occurred on a
> >> domain that differs. This isn't what you intended, but its benign.
> >> More of a problem is: Won't this get skipped if the write syscall
> >> occurs on a domain that happens to have the target configuration already?
> 
> > Do you still think this is a problem after my comment [1] above?  Or Am I
> missing something?
> 
> I'd muddled two similarly named functions. Sorry for the noise!

No worries. It made me look closely. 
> 
> I think what you're left with is the question "What is the monitor config for
> CPUs that were offline when it was last changed?". If its preserved by the CPU,
> then its some unknown value, and needs to be made the same as the value
> user-space/the-domain currently expects.
> 
> If there is only one config value for the domain (as your comment above
> suggests), then nothing needs doing here.

Ok. Thanks

> 
> 
> >> Because you need the same value to be written on every CPU ... what
> >> happens to CPUs that are offline when the configuration is changed?
> >> Do they keep their previous value, or does it get reset?
> >
> > The contents of this MSR register are held outside of all the cores.
> > If the value changes while a cpu is offline, and it reads it once it comes
> online, it will get the new value.
> 
> This fits with your new description of the value only needing to be written from
> one CPU in the domain.

Yes. Will change the comment about one CPU. 
Still waiting for the comment for the whole series from Reinette before I re-spin the next version.
> 
> 
> >> I think this is best solved with a percpu variable for the current
> >> value of the MSR. You can then read it for CPUs in a remote domain,
> >> and only issue IPIs to 'sync' the value if needed. You can then
> >> re-use the sync call in
> >> resctrl_online_cpu() to set the MSR to whatever value it should currently be.
> >
> > This may not be required with my comment 1 and 2 above.
> >
> >>
> >>
> >>> +
> >>> +	/*
> >>> +	 * When an Event Configuration is changed, the bandwidth counters
> >>> +	 * for all RMIDs and Events will be cleared by the hardware. The
> >>> +	 * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
> >>> +	 * every RMID on the next read to any event for every RMID.
> >>> +	 * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
> >>> +	 * cleared while it is tracked by the hardware. Clear the
> >>> +	 * mbm_local and mbm_total counts for all the RMIDs.
> >>> +	 */
> >>> +	memset(d->mbm_local, 0, sizeof(struct mbm_state) * r->num_rmid);
> >>> +	memset(d->mbm_total, 0, sizeof(struct mbm_state) * r->num_rmid);
> >>> +
> >>> +write_exit:
> >>> +	return ret;
> >>> +}
> >>
> >>
> >>> +static int mon_config_parse(struct rdt_resource *r, char *tok, u32
> >>> +evtid) {
> >>> +	char *dom_str = NULL, *id_str;
> >>> +	unsigned long dom_id, val;
> >>> +	struct rdt_domain *d;
> >>> +	int ret = 0;
> >>> +
> >>> +next:
> >>> +	if (!tok || tok[0] == '\0')
> >>> +		return 0;
> >>> +
> >>> +	/* Start processing the strings for each domain */
> >>> +	dom_str = strim(strsep(&tok, ";"));
> >>> +	id_str = strsep(&dom_str, "=");
> >>> +
> >>> +	if (!dom_str || kstrtoul(id_str, 10, &dom_id)) {
> >>> +		rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n");
> >>> +		return -EINVAL;
> >>> +	}
> >>> +
> >>> +	if (!dom_str || kstrtoul(dom_str, 16, &val)) {
> >>> +		rdt_last_cmd_puts("Missing '=' or non-numeric event
> >> configuration value\n");
> >>> +		return -EINVAL;
> >>> +	}
> >>
> >> This is parsing the same format strings as parse_line(). Is there any
> >> chance that code could be re-used instead of duplicated? This way
> >> anything that is added to the format (or bugs found!) only need supporting in
> once place.
> >
> > I have checked on reusing the parse_line. The parse_line is
> > specifically written for schemata.  We can't reuse parse_line without
> changing it completely.
> 
> I agree its a little more complicated than it looked at first. I might have a go at
> it later...

Ok Thanks
Babu
  

Patch

diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 18f9588a41cf..0cdccb69386e 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -1505,6 +1505,133 @@  static int mbm_local_bytes_config_show(struct kernfs_open_file *of,
 	return 0;
 }
 
+static void mon_event_config_write(void *info)
+{
+	struct mon_config_info *mon_info = info;
+	u32 index;
+
+	index = mon_event_config_index_get(mon_info->evtid);
+	if (index >= MAX_CONFIG_EVENTS) {
+		pr_warn_once("Invalid event id %d\n", mon_info->evtid);
+		return;
+	}
+	wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0);
+}
+
+static int mbm_config_write(struct rdt_resource *r, struct rdt_domain *d,
+			    u32 evtid, u32 val)
+{
+	struct mon_config_info mon_info = {0};
+	int ret = 0;
+
+	rdt_last_cmd_clear();
+
+	/* mon_config cannot be more than the supported set of events */
+	if (val > MAX_EVT_CONFIG_BITS) {
+		rdt_last_cmd_puts("Invalid event configuration\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Read the current config value first. If both are same then
+	 * we don't need to write it again.
+	 */
+	mon_info.evtid = evtid;
+	mondata_config_read(d, &mon_info);
+	if (mon_info.mon_config == val)
+		goto write_exit;
+
+	mon_info.mon_config = val;
+
+	/*
+	 * Update MSR_IA32_EVT_CFG_BASE MSRs on all the CPUs in the
+	 * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE
+	 * are scoped at the domain level. Writing any of these MSRs
+	 * on one CPU is supposed to be observed by all CPUs in the
+	 * domain. However, the hardware team recommends to update
+	 * these MSRs on all the CPUs in the domain.
+	 */
+	on_each_cpu_mask(&d->cpu_mask, mon_event_config_write, &mon_info, 1);
+
+	/*
+	 * When an Event Configuration is changed, the bandwidth counters
+	 * for all RMIDs and Events will be cleared by the hardware. The
+	 * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
+	 * every RMID on the next read to any event for every RMID.
+	 * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
+	 * cleared while it is tracked by the hardware. Clear the
+	 * mbm_local and mbm_total counts for all the RMIDs.
+	 */
+	memset(d->mbm_local, 0, sizeof(struct mbm_state) * r->num_rmid);
+	memset(d->mbm_total, 0, sizeof(struct mbm_state) * r->num_rmid);
+
+write_exit:
+	return ret;
+}
+
+static int mon_config_parse(struct rdt_resource *r, char *tok, u32 evtid)
+{
+	char *dom_str = NULL, *id_str;
+	unsigned long dom_id, val;
+	struct rdt_domain *d;
+	int ret = 0;
+
+next:
+	if (!tok || tok[0] == '\0')
+		return 0;
+
+	/* Start processing the strings for each domain */
+	dom_str = strim(strsep(&tok, ";"));
+	id_str = strsep(&dom_str, "=");
+
+	if (!dom_str || kstrtoul(id_str, 10, &dom_id)) {
+		rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n");
+		return -EINVAL;
+	}
+
+	if (!dom_str || kstrtoul(dom_str, 16, &val)) {
+		rdt_last_cmd_puts("Missing '=' or non-numeric event configuration value\n");
+		return -EINVAL;
+	}
+
+	list_for_each_entry(d, &r->domains, list) {
+		if (d->id == dom_id) {
+			ret = mbm_config_write(r, d, evtid, val);
+			if (ret)
+				return -EINVAL;
+			goto next;
+		}
+	}
+
+	return -EINVAL;
+}
+
+static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
+					    char *buf, size_t nbytes,
+					    loff_t off)
+{
+	struct rdt_resource *r = of->kn->parent->priv;
+	int ret;
+
+	/* Valid input requires a trailing newline */
+	if (nbytes == 0 || buf[nbytes - 1] != '\n')
+		return -EINVAL;
+
+	cpus_read_lock();
+	mutex_lock(&rdtgroup_mutex);
+
+	rdt_last_cmd_clear();
+
+	buf[nbytes - 1] = '\0';
+
+	ret = mon_config_parse(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID);
+
+	mutex_unlock(&rdtgroup_mutex);
+	cpus_read_unlock();
+
+	return ret ?: nbytes;
+}
+
 /* rdtgroup information files for one cache resource. */
 static struct rftype res_common_files[] = {
 	{
@@ -1605,9 +1732,10 @@  static struct rftype res_common_files[] = {
 	},
 	{
 		.name		= "mbm_total_bytes_config",
-		.mode		= 0444,
+		.mode		= 0644,
 		.kf_ops		= &rdtgroup_kf_single_ops,
 		.seq_show	= mbm_total_bytes_config_show,
+		.write		= mbm_total_bytes_config_write,
 	},
 	{
 		.name		= "mbm_local_bytes_config",