[v9,08/13] x86/resctrl: Add sysfs interface to read mbm_total_bytes_config
Commit Message
The current event configuration can be viewed by the user by reading
the configuration file /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config.
The event configuration settings are domain specific and will affect all
the CPUs in the domain.
Following are the types of events supported:
==== ===========================================================
Bits Description
==== ===========================================================
6 Dirty Victims from the QOS domain to all types of memory
5 Reads to slow memory in the non-local NUMA domain
4 Reads to slow memory in the local NUMA domain
3 Non-temporal writes to non-local NUMA domain
2 Non-temporal writes to local NUMA domain
1 Reads to memory in the non-local NUMA domain
0 Reads to memory in the local NUMA domain
==== ===========================================================
By default, the mbm_total_bytes_config is set to 0x7f to count all the
event types.
For example:
$cat /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config
0=0x7f;1=0x7f;2=0x7f;3=0x7f
In this case, the event mbm_total_bytes is currently configured
with 0x7f on domains 0 to 3.
Signed-off-by: Babu Moger <babu.moger@amd.com>
---
arch/x86/include/asm/msr-index.h | 1
arch/x86/kernel/cpu/resctrl/internal.h | 24 ++++++++
arch/x86/kernel/cpu/resctrl/monitor.c | 4 +
arch/x86/kernel/cpu/resctrl/rdtgroup.c | 99 ++++++++++++++++++++++++++++++++
4 files changed, 127 insertions(+), 1 deletion(-)
Comments
Hi Babu,
I would like to second James's suggestion to replace sysfs with resctrl
or just remove it. I am concerned that you mentioned in recent message
that you only plan changes to patch 10 while James highlighted that this
needs to be addressed in entire series. Could you please ensure that
you check all the patches?
On 12/1/2022 7:36 AM, Babu Moger wrote:
> The current event configuration can be viewed by the user by reading
What "current" means is not clear and the term could just be removed.
> the configuration file /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config.
> The event configuration settings are domain specific and will affect all
> the CPUs in the domain.
>
> Following are the types of events supported:
> ==== ===========================================================
> Bits Description
> ==== ===========================================================
> 6 Dirty Victims from the QOS domain to all types of memory
> 5 Reads to slow memory in the non-local NUMA domain
> 4 Reads to slow memory in the local NUMA domain
> 3 Non-temporal writes to non-local NUMA domain
> 2 Non-temporal writes to local NUMA domain
> 1 Reads to memory in the non-local NUMA domain
> 0 Reads to memory in the local NUMA domain
> ==== ===========================================================
>
> By default, the mbm_total_bytes_config is set to 0x7f to count all the
> event types.
>
> For example:
> $cat /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config
> 0=0x7f;1=0x7f;2=0x7f;3=0x7f
>
> In this case, the event mbm_total_bytes is currently configured
> with 0x7f on domains 0 to 3.
"currently" can be removed since it already starts with "In this case".
> Signed-off-by: Babu Moger <babu.moger@amd.com>
> ---
> arch/x86/include/asm/msr-index.h | 1
> arch/x86/kernel/cpu/resctrl/internal.h | 24 ++++++++
> arch/x86/kernel/cpu/resctrl/monitor.c | 4 +
> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 99 ++++++++++++++++++++++++++++++++
> 4 files changed, 127 insertions(+), 1 deletion(-)
>
...
> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> index 8342feb54a7f..e93b1c206116 100644
> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> @@ -1423,6 +1423,90 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
> return ret;
> }
>
> +struct mon_config_info {
> + u32 evtid;
> + u32 mon_config;
> +};
> +
> +#define INVALID_CONFIG_INDEX UINT_MAX
> +
> +/**
> + * mon_event_config_index_get - get the index for the configurable event
Could you say "get the hardware index" to help clarify what the index is
for?
> + * @evtid: event id.
> + *
> + * Return: 0 for evtid == QOS_L3_MBM_TOTAL_EVENT_ID
> + * 1 for evtid == QOS_L3_MBM_LOCAL_EVENT_ID
> + * INVALID_CONFIG_INDEX for invalid evtid
> + */
> +static inline unsigned int mon_event_config_index_get(u32 evtid)
> +{
> + switch (evtid) {
> + case QOS_L3_MBM_TOTAL_EVENT_ID:
> + return 0;
> + case QOS_L3_MBM_LOCAL_EVENT_ID:
> + return 1;
> + default:
> + /* WARN */
> + return INVALID_CONFIG_INDEX;
> + }
> +}
I see that you copied my sample code here. My intention was that the
/* WARN */ comment be replaced with an actual warning. As a comment
it does not add value. Since the caller now prints a subtler warning it
could just be:
/* Should never reach here */
return INVALID_CONFIG_INDEX;
> +
> +static void mon_event_config_read(void *info)
> +{
> + struct mon_config_info *mon_info = info;
> + u32 h, index;
index can be "unsigned int" as returned by mon_event_config_index_get()
> +
> + index = mon_event_config_index_get(mon_info->evtid);
> + if (index == INVALID_CONFIG_INDEX) {
> + pr_warn_once("Invalid event id %d\n", mon_info->evtid);
> + return;
> + }
> + rdmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, h);
> +
> + /* Report only the valid event configuration bits */
> + mon_info->mon_config &= MAX_EVT_CONFIG_BITS;
> +}
> +
> +static void mondata_config_read(struct rdt_domain *d, struct mon_config_info *mon_info)
> +{
> + smp_call_function_any(&d->cpu_mask, mon_event_config_read, mon_info, 1);
> +}
> +
> +static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid)
> +{
> + struct mon_config_info mon_info = {0};
> + struct rdt_domain *dom;
> + bool sep = false;
> +
> + mutex_lock(&rdtgroup_mutex);
> +
> + list_for_each_entry(dom, &r->domains, list) {
> + if (sep)
> + seq_puts(s, ";");
> +
> + mon_info.evtid = evtid;
> + mondata_config_read(dom, &mon_info);
> +
For robustness, please reset mon_config before calling mondata_config_read(). Since
mon_event_config_read() may (yes this is very unlikely) exit early then mon_config
will contain the data from the previous domain.
> + seq_printf(s, "%d=0x%02x", dom->id, mon_info.mon_config);
> + sep = true;
> + }
> + seq_puts(s, "\n");
> +
> + mutex_unlock(&rdtgroup_mutex);
> +
> + return 0;
> +}
> +
...
Reinette
[AMD Official Use Only - General]
Hi Reinette,
> -----Original Message-----
> From: Reinette Chatre <reinette.chatre@intel.com>
> Sent: Thursday, December 15, 2022 11:41 AM
> To: Moger, Babu <Babu.Moger@amd.com>; corbet@lwn.net;
> tglx@linutronix.de; mingo@redhat.com; bp@alien8.de
> Cc: fenghua.yu@intel.com; dave.hansen@linux.intel.com; x86@kernel.org;
> hpa@zytor.com; paulmck@kernel.org; akpm@linux-foundation.org;
> quic_neeraju@quicinc.com; rdunlap@infradead.org;
> damien.lemoal@opensource.wdc.com; songmuchun@bytedance.com;
> peterz@infradead.org; jpoimboe@kernel.org; pbonzini@redhat.com;
> chang.seok.bae@intel.com; pawan.kumar.gupta@linux.intel.com;
> jmattson@google.com; daniel.sneddon@linux.intel.com; Das1, Sandipan
> <Sandipan.Das@amd.com>; tony.luck@intel.com; james.morse@arm.com;
> linux-doc@vger.kernel.org; linux-kernel@vger.kernel.org;
> bagasdotme@gmail.com; eranian@google.com; christophe.leroy@csgroup.eu;
> jarkko@kernel.org; adrian.hunter@intel.com; quic_jiles@quicinc.com;
> peternewman@google.com
> Subject: Re: [PATCH v9 08/13] x86/resctrl: Add sysfs interface to read
> mbm_total_bytes_config
>
> Hi Babu,
>
> I would like to second James's suggestion to replace sysfs with resctrl or just
> remove it. I am concerned that you mentioned in recent message that you only
> plan changes to patch 10 while James highlighted that this needs to be
> addressed in entire series. Could you please ensure that you check all the
> patches?
Ok. Sure. Will remove it.
x86/resctrl: Add interface to write mbm_total_bytes_config
Will check other patches for similar changes.
>
> On 12/1/2022 7:36 AM, Babu Moger wrote:
> > The current event configuration can be viewed by the user by reading
>
> What "current" means is not clear and the term could just be removed.
Will remove it.
>
> > the configuration file /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config.
> > The event configuration settings are domain specific and will affect
> > all the CPUs in the domain.
> >
> > Following are the types of events supported:
> > ====
> ===========================================================
> > Bits Description
> > ====
> ===========================================================
> > 6 Dirty Victims from the QOS domain to all types of memory
> > 5 Reads to slow memory in the non-local NUMA domain
> > 4 Reads to slow memory in the local NUMA domain
> > 3 Non-temporal writes to non-local NUMA domain
> > 2 Non-temporal writes to local NUMA domain
> > 1 Reads to memory in the non-local NUMA domain
> > 0 Reads to memory in the local NUMA domain
> > ====
> ===========================================================
> >
> > By default, the mbm_total_bytes_config is set to 0x7f to count all the
> > event types.
> >
> > For example:
> > $cat /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config
> > 0=0x7f;1=0x7f;2=0x7f;3=0x7f
> >
> > In this case, the event mbm_total_bytes is currently configured
> > with 0x7f on domains 0 to 3.
>
> "currently" can be removed since it already starts with "In this case".
Sure.
>
>
> > Signed-off-by: Babu Moger <babu.moger@amd.com>
> > ---
> > arch/x86/include/asm/msr-index.h | 1
> > arch/x86/kernel/cpu/resctrl/internal.h | 24 ++++++++
> > arch/x86/kernel/cpu/resctrl/monitor.c | 4 +
> > arch/x86/kernel/cpu/resctrl/rdtgroup.c | 99
> ++++++++++++++++++++++++++++++++
> > 4 files changed, 127 insertions(+), 1 deletion(-)
> >
>
> ...
>
> > diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> > b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> > index 8342feb54a7f..e93b1c206116 100644
> > --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> > +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> > @@ -1423,6 +1423,90 @@ static int rdtgroup_size_show(struct
> kernfs_open_file *of,
> > return ret;
> > }
> >
> > +struct mon_config_info {
> > + u32 evtid;
> > + u32 mon_config;
> > +};
> > +
> > +#define INVALID_CONFIG_INDEX UINT_MAX
> > +
> > +/**
> > + * mon_event_config_index_get - get the index for the configurable
> > +event
>
> Could you say "get the hardware index" to help clarify what the index is for?
Sure.
>
> > + * @evtid: event id.
> > + *
> > + * Return: 0 for evtid == QOS_L3_MBM_TOTAL_EVENT_ID
> > + * 1 for evtid == QOS_L3_MBM_LOCAL_EVENT_ID
> > + * INVALID_CONFIG_INDEX for invalid evtid
> > + */
> > +static inline unsigned int mon_event_config_index_get(u32 evtid) {
> > + switch (evtid) {
> > + case QOS_L3_MBM_TOTAL_EVENT_ID:
> > + return 0;
> > + case QOS_L3_MBM_LOCAL_EVENT_ID:
> > + return 1;
> > + default:
> > + /* WARN */
> > + return INVALID_CONFIG_INDEX;
> > + }
> > +}
>
> I see that you copied my sample code here. My intention was that the
> /* WARN */ comment be replaced with an actual warning. As a comment it
> does not add value. Since the caller now prints a subtler warning it could just
> be:
>
> /* Should never reach here */
> return INVALID_CONFIG_INDEX;
Sure.
>
> > +
> > +static void mon_event_config_read(void *info) {
> > + struct mon_config_info *mon_info = info;
> > + u32 h, index;
>
> index can be "unsigned int" as returned by mon_event_config_index_get()
Ok. Sure
>
> > +
> > + index = mon_event_config_index_get(mon_info->evtid);
> > + if (index == INVALID_CONFIG_INDEX) {
> > + pr_warn_once("Invalid event id %d\n", mon_info->evtid);
> > + return;
> > + }
> > + rdmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, h);
> > +
> > + /* Report only the valid event configuration bits */
> > + mon_info->mon_config &= MAX_EVT_CONFIG_BITS; }
> > +
> > +static void mondata_config_read(struct rdt_domain *d, struct
> > +mon_config_info *mon_info) {
> > + smp_call_function_any(&d->cpu_mask, mon_event_config_read,
> mon_info,
> > +1); }
> > +
> > +static int mbm_config_show(struct seq_file *s, struct rdt_resource
> > +*r, u32 evtid) {
> > + struct mon_config_info mon_info = {0};
> > + struct rdt_domain *dom;
> > + bool sep = false;
> > +
> > + mutex_lock(&rdtgroup_mutex);
> > +
> > + list_for_each_entry(dom, &r->domains, list) {
> > + if (sep)
> > + seq_puts(s, ";");
> > +
> > + mon_info.evtid = evtid;
> > + mondata_config_read(dom, &mon_info);
> > +
>
> For robustness, please reset mon_config before calling mondata_config_read().
> Since
> mon_event_config_read() may (yes this is very unlikely) exit early then
> mon_config will contain the data from the previous domain.
Sure. I can call memset to reset.
Thanks
Babu
>
> > + seq_printf(s, "%d=0x%02x", dom->id, mon_info.mon_config);
> > + sep = true;
> > + }
> > + seq_puts(s, "\n");
> > +
> > + mutex_unlock(&rdtgroup_mutex);
> > +
> > + return 0;
> > +}
> > +
>
> ...
>
> Reinette
@@ -1062,6 +1062,7 @@
/* - AMD: */
#define MSR_IA32_MBA_BW_BASE 0xc0000200
#define MSR_IA32_SMBA_BW_BASE 0xc0000280
+#define MSR_IA32_EVT_CFG_BASE 0xc0000400
/* MSR_IA32_VMX_MISC bits */
#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14)
@@ -30,6 +30,29 @@
*/
#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE)
+/* Reads to Local DRAM Memory */
+#define READS_TO_LOCAL_MEM BIT(0)
+
+/* Reads to Remote DRAM Memory */
+#define READS_TO_REMOTE_MEM BIT(1)
+
+/* Non-Temporal Writes to Local Memory */
+#define NON_TEMP_WRITE_TO_LOCAL_MEM BIT(2)
+
+/* Non-Temporal Writes to Remote Memory */
+#define NON_TEMP_WRITE_TO_REMOTE_MEM BIT(3)
+
+/* Reads to Local Memory the system identifies as "Slow Memory" */
+#define READS_TO_LOCAL_S_MEM BIT(4)
+
+/* Reads to Remote Memory the system identifies as "Slow Memory" */
+#define READS_TO_REMOTE_S_MEM BIT(5)
+
+/* Dirty Victims to All Types of Memory */
+#define DIRTY_VICTIMS_TO_ALL_MEM BIT(6)
+
+/* Max event bits supported */
+#define MAX_EVT_CONFIG_BITS GENMASK(6, 0)
struct rdt_fs_context {
struct kernfs_fs_context kfc;
@@ -531,5 +554,6 @@ bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
void __check_limbo(struct rdt_domain *d, bool force_free);
void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
void __init thread_throttle_mode_init(void);
+void __init mbm_config_rftype_init(const char *config);
#endif /* _ASM_X86_RESCTRL_INTERNAL_H */
@@ -784,8 +784,10 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
return ret;
if (rdt_cpu_has(X86_FEATURE_BMEC)) {
- if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL))
+ if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
mbm_total_event.configurable = true;
+ mbm_config_rftype_init("mbm_total_bytes_config");
+ }
if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))
mbm_local_event.configurable = true;
}
@@ -1423,6 +1423,90 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
return ret;
}
+struct mon_config_info {
+ u32 evtid;
+ u32 mon_config;
+};
+
+#define INVALID_CONFIG_INDEX UINT_MAX
+
+/**
+ * mon_event_config_index_get - get the index for the configurable event
+ * @evtid: event id.
+ *
+ * Return: 0 for evtid == QOS_L3_MBM_TOTAL_EVENT_ID
+ * 1 for evtid == QOS_L3_MBM_LOCAL_EVENT_ID
+ * INVALID_CONFIG_INDEX for invalid evtid
+ */
+static inline unsigned int mon_event_config_index_get(u32 evtid)
+{
+ switch (evtid) {
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ return 0;
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ return 1;
+ default:
+ /* WARN */
+ return INVALID_CONFIG_INDEX;
+ }
+}
+
+static void mon_event_config_read(void *info)
+{
+ struct mon_config_info *mon_info = info;
+ u32 h, index;
+
+ index = mon_event_config_index_get(mon_info->evtid);
+ if (index == INVALID_CONFIG_INDEX) {
+ pr_warn_once("Invalid event id %d\n", mon_info->evtid);
+ return;
+ }
+ rdmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, h);
+
+ /* Report only the valid event configuration bits */
+ mon_info->mon_config &= MAX_EVT_CONFIG_BITS;
+}
+
+static void mondata_config_read(struct rdt_domain *d, struct mon_config_info *mon_info)
+{
+ smp_call_function_any(&d->cpu_mask, mon_event_config_read, mon_info, 1);
+}
+
+static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid)
+{
+ struct mon_config_info mon_info = {0};
+ struct rdt_domain *dom;
+ bool sep = false;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ list_for_each_entry(dom, &r->domains, list) {
+ if (sep)
+ seq_puts(s, ";");
+
+ mon_info.evtid = evtid;
+ mondata_config_read(dom, &mon_info);
+
+ seq_printf(s, "%d=0x%02x", dom->id, mon_info.mon_config);
+ sep = true;
+ }
+ seq_puts(s, "\n");
+
+ mutex_unlock(&rdtgroup_mutex);
+
+ return 0;
+}
+
+static int mbm_total_bytes_config_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID);
+
+ return 0;
+}
+
/* rdtgroup information files for one cache resource. */
static struct rftype res_common_files[] = {
{
@@ -1521,6 +1605,12 @@ static struct rftype res_common_files[] = {
.seq_show = max_threshold_occ_show,
.fflags = RF_MON_INFO | RFTYPE_RES_CACHE,
},
+ {
+ .name = "mbm_total_bytes_config",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = mbm_total_bytes_config_show,
+ },
{
.name = "cpus",
.mode = 0644,
@@ -1627,6 +1717,15 @@ void __init thread_throttle_mode_init(void)
rft->fflags = RF_CTRL_INFO | RFTYPE_RES_MB;
}
+void __init mbm_config_rftype_init(const char *config)
+{
+ struct rftype *rft;
+
+ rft = rdtgroup_get_rftype_by_name(config);
+ if (rft)
+ rft->fflags = RF_MON_INFO | RFTYPE_RES_CACHE;
+}
+
/**
* rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
* @r: The resource group with which the file is associated.