[V2,3/5] perf mem: Clean up perf_mem_events__name()

Message ID 20231207192338.400336-4-kan.liang@linux.intel.com
State New
Headers
Series Clean up perf mem |

Commit Message

Liang, Kan Dec. 7, 2023, 7:23 p.m. UTC
  From: Kan Liang <kan.liang@linux.intel.com>

Introduce a generic perf_mem_events__name(). Remove the ARCH-specific
one.

The mem_load events may have a different format. Add ldlat and aux_event
in the struct perf_mem_event to indicate the format and the extra aux
event.

Add perf_mem_events_intel_aux[] to support the extra mem_load_aux event.

Rename perf_mem_events__name to perf_pmu__mem_events_name.

Tested-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
 tools/perf/arch/arm64/util/mem-events.c   | 26 ++-------
 tools/perf/arch/powerpc/util/mem-events.c | 13 ++---
 tools/perf/arch/powerpc/util/mem-events.h |  7 +++
 tools/perf/arch/powerpc/util/pmu.c        | 11 ++++
 tools/perf/arch/x86/util/mem-events.c     | 70 +++++------------------
 tools/perf/arch/x86/util/mem-events.h     |  1 +
 tools/perf/arch/x86/util/pmu.c            |  8 ++-
 tools/perf/util/mem-events.c              | 56 ++++++++++++------
 tools/perf/util/mem-events.h              |  3 +-
 9 files changed, 89 insertions(+), 106 deletions(-)
 create mode 100644 tools/perf/arch/powerpc/util/mem-events.h
 create mode 100644 tools/perf/arch/powerpc/util/pmu.c
  

Comments

Ian Rogers Dec. 8, 2023, 12:01 a.m. UTC | #1
On Thu, Dec 7, 2023 at 11:24 AM <kan.liang@linux.intel.com> wrote:
>
> From: Kan Liang <kan.liang@linux.intel.com>
>
> Introduce a generic perf_mem_events__name(). Remove the ARCH-specific
> one.
>
> The mem_load events may have a different format. Add ldlat and aux_event
> in the struct perf_mem_event to indicate the format and the extra aux
> event.
>
> Add perf_mem_events_intel_aux[] to support the extra mem_load_aux event.
>
> Rename perf_mem_events__name to perf_pmu__mem_events_name.
>
> Tested-by: Ravi Bangoria <ravi.bangoria@amd.com>
> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>

Reviewed-by: Ian Rogers <irogers@google.com>

Thanks,
Ian

> ---
>  tools/perf/arch/arm64/util/mem-events.c   | 26 ++-------
>  tools/perf/arch/powerpc/util/mem-events.c | 13 ++---
>  tools/perf/arch/powerpc/util/mem-events.h |  7 +++
>  tools/perf/arch/powerpc/util/pmu.c        | 11 ++++
>  tools/perf/arch/x86/util/mem-events.c     | 70 +++++------------------
>  tools/perf/arch/x86/util/mem-events.h     |  1 +
>  tools/perf/arch/x86/util/pmu.c            |  8 ++-
>  tools/perf/util/mem-events.c              | 56 ++++++++++++------
>  tools/perf/util/mem-events.h              |  3 +-
>  9 files changed, 89 insertions(+), 106 deletions(-)
>  create mode 100644 tools/perf/arch/powerpc/util/mem-events.h
>  create mode 100644 tools/perf/arch/powerpc/util/pmu.c
>
> diff --git a/tools/perf/arch/arm64/util/mem-events.c b/tools/perf/arch/arm64/util/mem-events.c
> index 2602e8688727..eb2ef84f0fc8 100644
> --- a/tools/perf/arch/arm64/util/mem-events.c
> +++ b/tools/perf/arch/arm64/util/mem-events.c
> @@ -2,28 +2,10 @@
>  #include "map_symbol.h"
>  #include "mem-events.h"
>
> -#define E(t, n, s) { .tag = t, .name = n, .sysfs_name = s }
> +#define E(t, n, s, l, a) { .tag = t, .name = n, .sysfs_name = s, .ldlat = l, .aux_event = a }
>
>  struct perf_mem_event perf_mem_events_arm[PERF_MEM_EVENTS__MAX] = {
> -       E("spe-load",   "arm_spe_0/ts_enable=1,pa_enable=1,load_filter=1,store_filter=0,min_latency=%u/",       "arm_spe_0"),
> -       E("spe-store",  "arm_spe_0/ts_enable=1,pa_enable=1,load_filter=0,store_filter=1/",                      "arm_spe_0"),
> -       E("spe-ldst",   "arm_spe_0/ts_enable=1,pa_enable=1,load_filter=1,store_filter=1,min_latency=%u/",       "arm_spe_0"),
> +       E("spe-load",   "%s/ts_enable=1,pa_enable=1,load_filter=1,store_filter=0,min_latency=%u/",      "arm_spe_0",    true,   0),
> +       E("spe-store",  "%s/ts_enable=1,pa_enable=1,load_filter=0,store_filter=1/",                     "arm_spe_0",    false,  0),
> +       E("spe-ldst",   "%s/ts_enable=1,pa_enable=1,load_filter=1,store_filter=1,min_latency=%u/",      "arm_spe_0",    true,   0),
>  };
> -
> -static char mem_ev_name[100];
> -
> -const char *perf_mem_events__name(int i, const char *pmu_name __maybe_unused)
> -{
> -       struct perf_mem_event *e = &perf_mem_events_arm[i];
> -
> -       if (i >= PERF_MEM_EVENTS__MAX)
> -               return NULL;
> -
> -       if (i == PERF_MEM_EVENTS__LOAD || i == PERF_MEM_EVENTS__LOAD_STORE)
> -               scnprintf(mem_ev_name, sizeof(mem_ev_name),
> -                         e->name, perf_mem_events__loads_ldlat);
> -       else /* PERF_MEM_EVENTS__STORE */
> -               scnprintf(mem_ev_name, sizeof(mem_ev_name), e->name);
> -
> -       return mem_ev_name;
> -}
> diff --git a/tools/perf/arch/powerpc/util/mem-events.c b/tools/perf/arch/powerpc/util/mem-events.c
> index 78b986e5268d..b7883e38950f 100644
> --- a/tools/perf/arch/powerpc/util/mem-events.c
> +++ b/tools/perf/arch/powerpc/util/mem-events.c
> @@ -2,11 +2,10 @@
>  #include "map_symbol.h"
>  #include "mem-events.h"
>
> -/* PowerPC does not support 'ldlat' parameter. */
> -const char *perf_mem_events__name(int i, const char *pmu_name __maybe_unused)
> -{
> -       if (i == PERF_MEM_EVENTS__LOAD)
> -               return "cpu/mem-loads/";
> +#define E(t, n, s, l, a) { .tag = t, .name = n, .sysfs_name = s, .ldlat = l, .aux_event = a }
>
> -       return "cpu/mem-stores/";
> -}
> +struct perf_mem_event perf_mem_events_power[PERF_MEM_EVENTS__MAX] = {
> +       E("ldlat-loads",        "%s/mem-loads/",        "cpu/events/mem-loads",         false,  0),
> +       E("ldlat-stores",       "%s/mem-stores/",       "cpu/events/mem-stores",        false,  0),
> +       E(NULL,                 NULL,                   NULL,                           false,  0),
> +};
> diff --git a/tools/perf/arch/powerpc/util/mem-events.h b/tools/perf/arch/powerpc/util/mem-events.h
> new file mode 100644
> index 000000000000..6acc3d1b6873
> --- /dev/null
> +++ b/tools/perf/arch/powerpc/util/mem-events.h
> @@ -0,0 +1,7 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _POWER_MEM_EVENTS_H
> +#define _POWER_MEM_EVENTS_H
> +
> +extern struct perf_mem_event perf_mem_events_power[PERF_MEM_EVENTS__MAX];
> +
> +#endif /* _POWER_MEM_EVENTS_H */
> diff --git a/tools/perf/arch/powerpc/util/pmu.c b/tools/perf/arch/powerpc/util/pmu.c
> new file mode 100644
> index 000000000000..168173f88ddb
> --- /dev/null
> +++ b/tools/perf/arch/powerpc/util/pmu.c
> @@ -0,0 +1,11 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <string.h>
> +
> +#include "../../../util/pmu.h"
> +
> +void perf_pmu__arch_init(struct perf_pmu *pmu)
> +{
> +       if (pmu->is_core)
> +               pmu->mem_events = perf_mem_events_power;
> +}
> diff --git a/tools/perf/arch/x86/util/mem-events.c b/tools/perf/arch/x86/util/mem-events.c
> index 5fb41d50118d..f0e66a0151a0 100644
> --- a/tools/perf/arch/x86/util/mem-events.c
> +++ b/tools/perf/arch/x86/util/mem-events.c
> @@ -7,25 +7,26 @@
>  #include "linux/string.h"
>  #include "env.h"
>
> -static char mem_loads_name[100];
> -static bool mem_loads_name__init;
> -static char mem_stores_name[100];
> -
>  #define MEM_LOADS_AUX          0x8203
> -#define MEM_LOADS_AUX_NAME     "{%s/mem-loads-aux/,%s/mem-loads,ldlat=%u/}:P"
>
> -#define E(t, n, s) { .tag = t, .name = n, .sysfs_name = s }
> +#define E(t, n, s, l, a) { .tag = t, .name = n, .sysfs_name = s, .ldlat = l, .aux_event = a }
>
>  struct perf_mem_event perf_mem_events_intel[PERF_MEM_EVENTS__MAX] = {
> -       E("ldlat-loads",        "%s/mem-loads,ldlat=%u/P",      "%s/events/mem-loads"),
> -       E("ldlat-stores",       "%s/mem-stores/P",              "%s/events/mem-stores"),
> -       E(NULL,                 NULL,                           NULL),
> +       E("ldlat-loads",        "%s/mem-loads,ldlat=%u/P",      "%s/events/mem-loads",  true,   0),
> +       E("ldlat-stores",       "%s/mem-stores/P",              "%s/events/mem-stores", false,  0),
> +       E(NULL,                 NULL,                           NULL,                   false,  0),
> +};
> +
> +struct perf_mem_event perf_mem_events_intel_aux[PERF_MEM_EVENTS__MAX] = {
> +       E("ldlat-loads",        "{%s/mem-loads-aux/,%s/mem-loads,ldlat=%u/}:P", "%s/events/mem-loads",  true,   MEM_LOADS_AUX),
> +       E("ldlat-stores",       "%s/mem-stores/P",              "%s/events/mem-stores", false,  0),
> +       E(NULL,                 NULL,                           NULL,                   false,  0),
>  };
>
>  struct perf_mem_event perf_mem_events_amd[PERF_MEM_EVENTS__MAX] = {
> -       E(NULL,         NULL,           NULL),
> -       E(NULL,         NULL,           NULL),
> -       E("mem-ldst",   "ibs_op//",     "ibs_op"),
> +       E(NULL,         NULL,           NULL,           false,  0),
> +       E(NULL,         NULL,           NULL,           false,  0),
> +       E("mem-ldst",   "%s//",         "ibs_op",       false,  0),
>  };
>
>  bool is_mem_loads_aux_event(struct evsel *leader)
> @@ -40,48 +41,3 @@ bool is_mem_loads_aux_event(struct evsel *leader)
>
>         return leader->core.attr.config == MEM_LOADS_AUX;
>  }
> -
> -const char *perf_mem_events__name(int i, const char *pmu_name)
> -{
> -       struct perf_mem_event *e;
> -
> -       if (x86__is_amd_cpu())
> -               e = &perf_mem_events_amd[i];
> -       else
> -               e = &perf_mem_events_intel[i];
> -
> -       if (!e)
> -               return NULL;
> -
> -       if (i == PERF_MEM_EVENTS__LOAD) {
> -               if (mem_loads_name__init && !pmu_name)
> -                       return mem_loads_name;
> -
> -               if (!pmu_name) {
> -                       mem_loads_name__init = true;
> -                       pmu_name = "cpu";
> -               }
> -
> -               if (perf_pmus__have_event(pmu_name, "mem-loads-aux")) {
> -                       scnprintf(mem_loads_name, sizeof(mem_loads_name),
> -                                 MEM_LOADS_AUX_NAME, pmu_name, pmu_name,
> -                                 perf_mem_events__loads_ldlat);
> -               } else {
> -                       scnprintf(mem_loads_name, sizeof(mem_loads_name),
> -                                 e->name, pmu_name,
> -                                 perf_mem_events__loads_ldlat);
> -               }
> -               return mem_loads_name;
> -       }
> -
> -       if (i == PERF_MEM_EVENTS__STORE) {
> -               if (!pmu_name)
> -                       pmu_name = "cpu";
> -
> -               scnprintf(mem_stores_name, sizeof(mem_stores_name),
> -                         e->name, pmu_name);
> -               return mem_stores_name;
> -       }
> -
> -       return e->name;
> -}
> diff --git a/tools/perf/arch/x86/util/mem-events.h b/tools/perf/arch/x86/util/mem-events.h
> index 3959e427f482..f55c8d3b7d59 100644
> --- a/tools/perf/arch/x86/util/mem-events.h
> +++ b/tools/perf/arch/x86/util/mem-events.h
> @@ -3,6 +3,7 @@
>  #define _X86_MEM_EVENTS_H
>
>  extern struct perf_mem_event perf_mem_events_intel[PERF_MEM_EVENTS__MAX];
> +extern struct perf_mem_event perf_mem_events_intel_aux[PERF_MEM_EVENTS__MAX];
>
>  extern struct perf_mem_event perf_mem_events_amd[PERF_MEM_EVENTS__MAX];
>
> diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c
> index cd22e80e5657..0f49ff13cfe2 100644
> --- a/tools/perf/arch/x86/util/pmu.c
> +++ b/tools/perf/arch/x86/util/pmu.c
> @@ -35,8 +35,12 @@ void perf_pmu__arch_init(struct perf_pmu *pmu __maybe_unused)
>         if (x86__is_amd_cpu()) {
>                 if (!strcmp(pmu->name, "ibs_op"))
>                         pmu->mem_events = perf_mem_events_amd;
> -       } else if (pmu->is_core)
> -               pmu->mem_events = perf_mem_events_intel;
> +       } else if (pmu->is_core) {
> +               if (perf_pmu__have_event(pmu, "mem-loads-aux"))
> +                       pmu->mem_events = perf_mem_events_intel_aux;
> +               else
> +                       pmu->mem_events = perf_mem_events_intel;
> +       }
>  }
>
>  int perf_pmus__num_mem_pmus(void)
> diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c
> index 27a33dc44964..c9a40b64e538 100644
> --- a/tools/perf/util/mem-events.c
> +++ b/tools/perf/util/mem-events.c
> @@ -17,17 +17,17 @@
>
>  unsigned int perf_mem_events__loads_ldlat = 30;
>
> -#define E(t, n, s) { .tag = t, .name = n, .sysfs_name = s }
> +#define E(t, n, s, l, a) { .tag = t, .name = n, .sysfs_name = s, .ldlat = l, .aux_event = a }
>
>  struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX] = {
> -       E("ldlat-loads",        "cpu/mem-loads,ldlat=%u/P",     "cpu/events/mem-loads"),
> -       E("ldlat-stores",       "cpu/mem-stores/P",             "cpu/events/mem-stores"),
> -       E(NULL,                 NULL,                           NULL),
> +       E("ldlat-loads",        "%s/mem-loads,ldlat=%u/P",      "cpu/events/mem-loads",         true,   0),
> +       E("ldlat-stores",       "%s/mem-stores/P",              "cpu/events/mem-stores",        false,  0),
> +       E(NULL,                 NULL,                           NULL,                           false,  0),
>  };
>  #undef E
>
>  static char mem_loads_name[100];
> -static bool mem_loads_name__init;
> +static char mem_stores_name[100];
>
>  struct perf_mem_event *perf_pmu__mem_events_ptr(struct perf_pmu *pmu, int i)
>  {
> @@ -62,23 +62,45 @@ struct perf_pmu *perf_mem_events_find_pmu(void)
>         return perf_pmus__scan_mem(NULL);
>  }
>
> -const char * __weak perf_mem_events__name(int i, const char *pmu_name  __maybe_unused)
> +static const char *perf_pmu__mem_events_name(int i, struct perf_pmu *pmu)
>  {
> -       struct perf_mem_event *e = &perf_mem_events[i];
> +       struct perf_mem_event *e = &pmu->mem_events[i];
>
>         if (!e)
>                 return NULL;
>
> -       if (i == PERF_MEM_EVENTS__LOAD) {
> -               if (!mem_loads_name__init) {
> -                       mem_loads_name__init = true;
> -                       scnprintf(mem_loads_name, sizeof(mem_loads_name),
> -                                 e->name, perf_mem_events__loads_ldlat);
> +       if (i == PERF_MEM_EVENTS__LOAD || i == PERF_MEM_EVENTS__LOAD_STORE) {
> +               if (e->ldlat) {
> +                       if (!e->aux_event) {
> +                               /* ARM and Most of Intel */
> +                               scnprintf(mem_loads_name, sizeof(mem_loads_name),
> +                                         e->name, pmu->name,
> +                                         perf_mem_events__loads_ldlat);
> +                       } else {
> +                               /* Intel with mem-loads-aux event */
> +                               scnprintf(mem_loads_name, sizeof(mem_loads_name),
> +                                         e->name, pmu->name, pmu->name,
> +                                         perf_mem_events__loads_ldlat);
> +                       }
> +               } else {
> +                       if (!e->aux_event) {
> +                               /* AMD and POWER */
> +                               scnprintf(mem_loads_name, sizeof(mem_loads_name),
> +                                         e->name, pmu->name);
> +                       } else
> +                               return NULL;
>                 }
> +
>                 return mem_loads_name;
>         }
>
> -       return e->name;
> +       if (i == PERF_MEM_EVENTS__STORE) {
> +               scnprintf(mem_stores_name, sizeof(mem_stores_name),
> +                         e->name, pmu->name);
> +               return mem_stores_name;
> +       }
> +
> +       return NULL;
>  }
>
>  __weak bool is_mem_loads_aux_event(struct evsel *leader __maybe_unused)
> @@ -175,7 +197,7 @@ void perf_pmu__mem_events_list(struct perf_pmu *pmu)
>                         e->tag ? 13 : 0,
>                         e->tag ? : "",
>                         e->tag && verbose > 0 ? 25 : 0,
> -                       e->tag && verbose > 0 ? perf_mem_events__name(j, NULL) : "",
> +                       e->tag && verbose > 0 ? perf_pmu__mem_events_name(j, pmu) : "",
>                         e->supported ? ": available\n" : "");
>         }
>  }
> @@ -198,15 +220,15 @@ int perf_mem_events__record_args(const char **rec_argv, int *argv_nr,
>
>                         if (!e->supported) {
>                                 pr_err("failed: event '%s' not supported\n",
> -                                       perf_mem_events__name(j, pmu->name));
> +                                       perf_pmu__mem_events_name(j, pmu));
>                                 return -1;
>                         }
>
>                         if (perf_pmus__num_mem_pmus() == 1) {
>                                 rec_argv[i++] = "-e";
> -                               rec_argv[i++] = perf_mem_events__name(j, NULL);
> +                               rec_argv[i++] = perf_pmu__mem_events_name(j, pmu);
>                         } else {
> -                               const char *s = perf_mem_events__name(j, pmu->name);
> +                               const char *s = perf_pmu__mem_events_name(j, pmu);
>
>                                 if (!perf_mem_event__supported(mnt, pmu, e))
>                                         continue;
> diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h
> index 0ad301a2e424..79d342768d12 100644
> --- a/tools/perf/util/mem-events.h
> +++ b/tools/perf/util/mem-events.h
> @@ -14,6 +14,8 @@
>  struct perf_mem_event {
>         bool            record;
>         bool            supported;
> +       bool            ldlat;
> +       u32             aux_event;
>         const char      *tag;
>         const char      *name;
>         const char      *sysfs_name;
> @@ -39,7 +41,6 @@ extern struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX];
>  int perf_pmu__mem_events_parse(struct perf_pmu *pmu, const char *str);
>  int perf_pmu__mem_events_init(struct perf_pmu *pmu);
>
> -const char *perf_mem_events__name(int i, const char *pmu_name);
>  struct perf_mem_event *perf_pmu__mem_events_ptr(struct perf_pmu *pmu, int i);
>  struct perf_pmu *perf_mem_events_find_pmu(void);
>  bool is_mem_loads_aux_event(struct evsel *leader);
> --
> 2.35.1
>
  
Leo Yan Dec. 9, 2023, 5:48 a.m. UTC | #2
On Thu, Dec 07, 2023 at 11:23:36AM -0800, kan.liang@linux.intel.com wrote:
> From: Kan Liang <kan.liang@linux.intel.com>
> 
> Introduce a generic perf_mem_events__name(). Remove the ARCH-specific
> one.

Now memory event naming is arch dependent, this is because every arch
has different memory PMU names, and it appends specific configurations
(e.g. aarch64 appends 'ts_enable=1,...,min_latency=%u', and x86_64
appends 'ldlat=%u', etc).  This is not friendly for extension, e.g. it's
impossible for users to specify any extra attributes for memory events.

This patch tries to consolidate in a central place for generating memory
event names, its approach is to add more flags to meet special usage
cases, which means the code gets more complex (and more difficult for
later's maintenance).

I think we need to distinguish the event naming into two levels: in
util/mem-events.c, we can consider it as a common layer, and we maintain
common options in it, e.g. 'latency', 'all-user', 'all-kernel',
'timestamp', 'physical_address', etc.  In every arch's mem-events.c
file, it converts the common options to arch specific configurations.

In the end, this can avoid to add more and more flags into the
structure perf_mem_event.

Anyway, I also have a question for this patch itself, please see below
inline comment.

[...]

> diff --git a/tools/perf/arch/arm64/util/mem-events.c b/tools/perf/arch/arm64/util/mem-events.c
> index 2602e8688727..eb2ef84f0fc8 100644
> --- a/tools/perf/arch/arm64/util/mem-events.c
> +++ b/tools/perf/arch/arm64/util/mem-events.c
> @@ -2,28 +2,10 @@
>  #include "map_symbol.h"
>  #include "mem-events.h"
>  
> -#define E(t, n, s) { .tag = t, .name = n, .sysfs_name = s }
> +#define E(t, n, s, l, a) { .tag = t, .name = n, .sysfs_name = s, .ldlat = l, .aux_event = a }
>  
>  struct perf_mem_event perf_mem_events_arm[PERF_MEM_EVENTS__MAX] = {
> -	E("spe-load",	"arm_spe_0/ts_enable=1,pa_enable=1,load_filter=1,store_filter=0,min_latency=%u/",	"arm_spe_0"),
> -	E("spe-store",	"arm_spe_0/ts_enable=1,pa_enable=1,load_filter=0,store_filter=1/",			"arm_spe_0"),
> -	E("spe-ldst",	"arm_spe_0/ts_enable=1,pa_enable=1,load_filter=1,store_filter=1,min_latency=%u/",	"arm_spe_0"),
> +	E("spe-load",	"%s/ts_enable=1,pa_enable=1,load_filter=1,store_filter=0,min_latency=%u/",	"arm_spe_0",	true,	0),
> +	E("spe-store",	"%s/ts_enable=1,pa_enable=1,load_filter=0,store_filter=1/",			"arm_spe_0",	false,	0),
> +	E("spe-ldst",	"%s/ts_enable=1,pa_enable=1,load_filter=1,store_filter=1,min_latency=%u/",	"arm_spe_0",	true,	0),

Arm SPE is AUX event, should set '1' to the field '.aux_event'.

I am a bit suspect if we really need the field '.aux_event', the
'.aux_event' field is only used for generating event string.

So in the below function perf_pmu__mem_events_name(), I prefer to call
an arch specific function, e.g. arch_memory_event_name(event_id, cfg),
the parameter 'event_id' passes memory event ID for load, store, and
load-store, and the parameter 'cfg' which is a pointer to the common
memory options (as mentioned above for latency, all-user or all-kernel
mode, timestamp tracing, etc).

In the end, the common layer just passes the common options to low
level arch's layer and get a event string for recording.

Thanks,
Leo
  
Liang, Kan Dec. 11, 2023, 6:39 p.m. UTC | #3
On 2023-12-09 12:48 a.m., Leo Yan wrote:
> On Thu, Dec 07, 2023 at 11:23:36AM -0800, kan.liang@linux.intel.com wrote:
>> From: Kan Liang <kan.liang@linux.intel.com>
>>
>> Introduce a generic perf_mem_events__name(). Remove the ARCH-specific
>> one.
> 
> Now memory event naming is arch dependent, this is because every arch
> has different memory PMU names, and it appends specific configurations
> (e.g. aarch64 appends 'ts_enable=1,...,min_latency=%u', and x86_64
> appends 'ldlat=%u', etc).  This is not friendly for extension, e.g. it's
> impossible for users to specify any extra attributes for memory events.
> 
> This patch tries to consolidate in a central place for generating memory
> event names, its approach is to add more flags to meet special usage
> cases, which means the code gets more complex (and more difficult for
> later's maintenance).
> 
> I think we need to distinguish the event naming into two levels: in
> util/mem-events.c, we can consider it as a common layer, and we maintain
> common options in it, e.g. 'latency', 'all-user', 'all-kernel',
> 'timestamp', 'physical_address', etc.  In every arch's mem-events.c
> file, it converts the common options to arch specific configurations.
> 
> In the end, this can avoid to add more and more flags into the
> structure perf_mem_event.

The purpose of this cleanup patch set is to remove the unnecessary
__weak functions, and try to make the code more generic.
The two flags has already covered all the current usage.
For now, it's good enough.

I agree that if there are more flags, it should not be a perfect
solution. But we don't have the third flag right now. Could we clean up
it later e.g., when introducing the third flag?

I just don't want to make the patch bigger and bigger. Also, I don't
think we usually implement something just for future possibilities.


> 
> Anyway, I also have a question for this patch itself, please see below
> inline comment.
> 
> [...]
> 
>> diff --git a/tools/perf/arch/arm64/util/mem-events.c b/tools/perf/arch/arm64/util/mem-events.c
>> index 2602e8688727..eb2ef84f0fc8 100644
>> --- a/tools/perf/arch/arm64/util/mem-events.c
>> +++ b/tools/perf/arch/arm64/util/mem-events.c
>> @@ -2,28 +2,10 @@
>>  #include "map_symbol.h"
>>  #include "mem-events.h"
>>  
>> -#define E(t, n, s) { .tag = t, .name = n, .sysfs_name = s }
>> +#define E(t, n, s, l, a) { .tag = t, .name = n, .sysfs_name = s, .ldlat = l, .aux_event = a }
>>  
>>  struct perf_mem_event perf_mem_events_arm[PERF_MEM_EVENTS__MAX] = {
>> -	E("spe-load",	"arm_spe_0/ts_enable=1,pa_enable=1,load_filter=1,store_filter=0,min_latency=%u/",	"arm_spe_0"),
>> -	E("spe-store",	"arm_spe_0/ts_enable=1,pa_enable=1,load_filter=0,store_filter=1/",			"arm_spe_0"),
>> -	E("spe-ldst",	"arm_spe_0/ts_enable=1,pa_enable=1,load_filter=1,store_filter=1,min_latency=%u/",	"arm_spe_0"),
>> +	E("spe-load",	"%s/ts_enable=1,pa_enable=1,load_filter=1,store_filter=0,min_latency=%u/",	"arm_spe_0",	true,	0),
>> +	E("spe-store",	"%s/ts_enable=1,pa_enable=1,load_filter=0,store_filter=1/",			"arm_spe_0",	false,	0),
>> +	E("spe-ldst",	"%s/ts_enable=1,pa_enable=1,load_filter=1,store_filter=1,min_latency=%u/",	"arm_spe_0",	true,	0),
> 
> Arm SPE is AUX event, should set '1' to the field '.aux_event'.

It actually means whether an extra event is required with a mem_loads
event.  I guess for ARM the answer is no.

> 
> I am a bit suspect if we really need the field '.aux_event', the
> '.aux_event' field is only used for generating event string.

No, it stores the event encoding for the extra event.
ARM doesn't need it, so it's 0.

> 
> So in the below function perf_pmu__mem_events_name(), I prefer to call
> an arch specific function, e.g. arch_memory_event_name(event_id, cfg),
> the parameter 'event_id' passes memory event ID for load, store, and
> load-store, and the parameter 'cfg' which is a pointer to the common
> memory options (as mentioned above for latency, all-user or all-kernel
> mode, timestamp tracing, etc).

If I understand correctly, I think we try to avoid the __weak function
for the perf tool. If there will be more parameters later, a mask may be
used for the parameters. But, again, I think it should be an improvement
for later.

Thanks,
Kan
> 
> In the end, the common layer just passes the common options to low
> level arch's layer and get a event string for recording.
> 
> Thanks,
> Leo
  
Leo Yan Dec. 13, 2023, 1:33 p.m. UTC | #4
On Mon, Dec 11, 2023 at 01:39:36PM -0500, Liang, Kan wrote:
> On 2023-12-09 12:48 a.m., Leo Yan wrote:
> > On Thu, Dec 07, 2023 at 11:23:36AM -0800, kan.liang@linux.intel.com wrote:
> >> From: Kan Liang <kan.liang@linux.intel.com>
> >>
> >> Introduce a generic perf_mem_events__name(). Remove the ARCH-specific
> >> one.
> > 
> > Now memory event naming is arch dependent, this is because every arch
> > has different memory PMU names, and it appends specific configurations
> > (e.g. aarch64 appends 'ts_enable=1,...,min_latency=%u', and x86_64
> > appends 'ldlat=%u', etc).  This is not friendly for extension, e.g. it's
> > impossible for users to specify any extra attributes for memory events.
> > 
> > This patch tries to consolidate in a central place for generating memory
> > event names, its approach is to add more flags to meet special usage
> > cases, which means the code gets more complex (and more difficult for
> > later's maintenance).
> > 
> > I think we need to distinguish the event naming into two levels: in
> > util/mem-events.c, we can consider it as a common layer, and we maintain
> > common options in it, e.g. 'latency', 'all-user', 'all-kernel',
> > 'timestamp', 'physical_address', etc.  In every arch's mem-events.c
> > file, it converts the common options to arch specific configurations.
> > 
> > In the end, this can avoid to add more and more flags into the
> > structure perf_mem_event.
> 
> The purpose of this cleanup patch set is to remove the unnecessary
> __weak functions, and try to make the code more generic.

I understand weak functions are not very good practice.  The point is
weak functions are used for some archs have implemented a feature but
other archs have not.

I can think a potential case to use a central place to maintain the
code for all archs - when we want to support cross analysis.  But this
patch series is for supporting cross analysis, to be honest, I still
don't see benefit for this series, though I know you might try to
support hybrid modes.

> The two flags has already covered all the current usage.
> For now, it's good enough.
> 
> I agree that if there are more flags, it should not be a perfect
> solution. But we don't have the third flag right now. Could we clean up
> it later e.g., when introducing the third flag?
> 
> I just don't want to make the patch bigger and bigger. Also, I don't
> think we usually implement something just for future possibilities.

Fine for me, but please address two main concerns in next spin:

- Fix building failure in patch 01;
- Fix the concerned regression on Arm64/AMD archs in patch 04.  I will
  give a bit more explanation in another reply.


[...]

> >> -#define E(t, n, s) { .tag = t, .name = n, .sysfs_name = s }
> >> +#define E(t, n, s, l, a) { .tag = t, .name = n, .sysfs_name = s, .ldlat = l, .aux_event = a }
> >>  
> >>  struct perf_mem_event perf_mem_events_arm[PERF_MEM_EVENTS__MAX] = {
> >> -	E("spe-load",	"arm_spe_0/ts_enable=1,pa_enable=1,load_filter=1,store_filter=0,min_latency=%u/",	"arm_spe_0"),
> >> -	E("spe-store",	"arm_spe_0/ts_enable=1,pa_enable=1,load_filter=0,store_filter=1/",			"arm_spe_0"),
> >> -	E("spe-ldst",	"arm_spe_0/ts_enable=1,pa_enable=1,load_filter=1,store_filter=1,min_latency=%u/",	"arm_spe_0"),
> >> +	E("spe-load",	"%s/ts_enable=1,pa_enable=1,load_filter=1,store_filter=0,min_latency=%u/",	"arm_spe_0",	true,	0),
> >> +	E("spe-store",	"%s/ts_enable=1,pa_enable=1,load_filter=0,store_filter=1/",			"arm_spe_0",	false,	0),
> >> +	E("spe-ldst",	"%s/ts_enable=1,pa_enable=1,load_filter=1,store_filter=1,min_latency=%u/",	"arm_spe_0",	true,	0),
> > 
> > Arm SPE is AUX event, should set '1' to the field '.aux_event'.
> 
> It actually means whether an extra event is required with a mem_loads
> event.  I guess for ARM the answer is no.

Thanks for confirmation.

> > I am a bit suspect if we really need the field '.aux_event', the
> > '.aux_event' field is only used for generating event string.
> 
> No, it stores the event encoding for the extra event.
> ARM doesn't need it, so it's 0.

I searched a bit and confirmed '.aux_event' is only used in
util/mem-events.c and for 'perf record'.

I failed to connect the code with "it stores the event encoding for the
extra event".  Could you elaborate a bit for this?

Thanks,
Leo
  
Liang, Kan Dec. 13, 2023, 4:17 p.m. UTC | #5
On 2023-12-13 8:33 a.m., Leo Yan wrote:
>>> I am a bit suspect if we really need the field '.aux_event', the
>>> '.aux_event' field is only used for generating event string.
>> No, it stores the event encoding for the extra event.
>> ARM doesn't need it, so it's 0.
> I searched a bit and confirmed '.aux_event' is only used in
> util/mem-events.c and for 'perf record'.
> 
> I failed to connect the code with "it stores the event encoding for the
> extra event".  Could you elaborate a bit for this?

The details of the reason of introducing the mem_load aux event can be
found here.
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=61b985e3e775a3a75fda04ce7ef1b1aefc4758bc

A mem_load_aux event is a new requirement for SPR. For the other Intel
platforms, a single mem_load event is good enough to collect the data
source information. But for SPR, we have to group both the mem_load
event and the mem_load_aux event when collecting the data source
information. In the group, the mem_load_aux event must be the leader
event. But for the sample read case, only the sampling of the mem_load
make sense. So the is_mem_loads_aux_event() is introduced to switch the
sampling event to the mem_load event. Here is the perf tool patch.
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2a57d40832dc8366bc517bcbbfdb1d7fb583735b

The .aux_event is to store the event encoding of the mem_load_aux event.
If it's the leader of a sampling read group, we should use the second
event as a sampling event.

Thanks,
Kan
  
Ian Rogers Dec. 13, 2023, 5:33 p.m. UTC | #6
On Wed, Dec 13, 2023 at 5:33 AM Leo Yan <leo.yan@linaro.org> wrote:
>
> On Mon, Dec 11, 2023 at 01:39:36PM -0500, Liang, Kan wrote:
> > On 2023-12-09 12:48 a.m., Leo Yan wrote:
> > > On Thu, Dec 07, 2023 at 11:23:36AM -0800, kan.liang@linux.intel.com wrote:
> > >> From: Kan Liang <kan.liang@linux.intel.com>
> > >>
> > >> Introduce a generic perf_mem_events__name(). Remove the ARCH-specific
> > >> one.
> > >
> > > Now memory event naming is arch dependent, this is because every arch
> > > has different memory PMU names, and it appends specific configurations
> > > (e.g. aarch64 appends 'ts_enable=1,...,min_latency=%u', and x86_64
> > > appends 'ldlat=%u', etc).  This is not friendly for extension, e.g. it's
> > > impossible for users to specify any extra attributes for memory events.
> > >
> > > This patch tries to consolidate in a central place for generating memory
> > > event names, its approach is to add more flags to meet special usage
> > > cases, which means the code gets more complex (and more difficult for
> > > later's maintenance).
> > >
> > > I think we need to distinguish the event naming into two levels: in
> > > util/mem-events.c, we can consider it as a common layer, and we maintain
> > > common options in it, e.g. 'latency', 'all-user', 'all-kernel',
> > > 'timestamp', 'physical_address', etc.  In every arch's mem-events.c
> > > file, it converts the common options to arch specific configurations.
> > >
> > > In the end, this can avoid to add more and more flags into the
> > > structure perf_mem_event.
> >
> > The purpose of this cleanup patch set is to remove the unnecessary
> > __weak functions, and try to make the code more generic.
>
> I understand weak functions are not very good practice.  The point is
> weak functions are used for some archs have implemented a feature but
> other archs have not.
>
> I can think a potential case to use a central place to maintain the
> code for all archs - when we want to support cross analysis.  But this
> patch series is for supporting cross analysis, to be honest, I still
> don't see benefit for this series, though I know you might try to
> support hybrid modes.

So thanks to Kan for doing this series and trying to clean the code
up. My complaint about the code is that it was overly hard wired.
Heck, we have event strings to parse that hard code PMU and event
names. In fixing hybrid my complaint was that we were adding yet more
complexity and as a lot of this was resting on printf format strings
it was hard to check that these were being used correctly. The
direction of this patch series I agree with.

Imo we should avoid weak definitions. Weak definitions are outside of
the C specification and have introduced bugs into the code base -
specifically a weak const array was having its size propagated into
code but then the linker later replaced that weak array. An
architecture #ifdef is better than a weak definition as the behavior
is defined and things blow up early rather than suffering from subtle
corruptions.

The Linux kernel device driver is abstracting what the hardware is
doing and since the more recent changes the PMU abstraction in the
perf tool is trying to match that. If we need to interface with PMUs
differently on each architecture then something is wrong. It happens
that things are wrong and we need to work with that. For example, on
intel there are topdown events that introduce ordering issues. We have
default initialization functions for different PMUs. The perf_pmu
struct is created in perf_pmu__lookup and that always calls an
perf_pmu__arch_init where the name of the PMU is already set. In the
weak perf_pmu__arch_init we tweak the perf_pmu struct so that it will
behave correctly elsewhere in the code. These changes are paralleling
that. That said, the Intel perf_pmu__arch_init does strcmps against
"intel_pt" and "intel_bts", does it really need to be arch specific
given it is already PMU specific? A situation we see in testing is
people trying to run user space ISA emulators like qemu, say ARM on
x86, should the PMU set up for intel_pt and intel_bts be broken in
this environment? I think that as the names are very specific I'd
prefer if the code were outside of the tools/perf/arch directory.
There are cases with PMU names like "cpu" where we're probably going
to need ifdefs or runtime is_intel() calls.

Anyway, my point is that I think we should be moving away from arch
specific stuff, as this patch series tries, and that way we have the
best chance of changes benefitting users regardless of hardware. It
may be that to make all of this work properly we need to modify PMUs,
but that all seems good, such as adding the extended type support for
legacy events on ARM PMUs so that legacy events can work without a
fixed CPU. We haven't got core PMUs working properly, see the recent
perf top hybrid problem. There are obviously issues with uncore as,
for example, most memory controllers are replicated PMUs but some
kernel perf drivers select a memory controller via a config value. We
either need to change the drivers for some kind of uniformity or do
some kind of abstracting in the perf tool. I think we'll probably need
to do it in the tool and when doing that we really shouldn't be doing
it in an arch specific or weak function way.

Thanks,
Ian

> > The two flags has already covered all the current usage.
> > For now, it's good enough.
> >
> > I agree that if there are more flags, it should not be a perfect
> > solution. But we don't have the third flag right now. Could we clean up
> > it later e.g., when introducing the third flag?
> >
> > I just don't want to make the patch bigger and bigger. Also, I don't
> > think we usually implement something just for future possibilities.
>
> Fine for me, but please address two main concerns in next spin:
>
> - Fix building failure in patch 01;
> - Fix the concerned regression on Arm64/AMD archs in patch 04.  I will
>   give a bit more explanation in another reply.
>
>
> [...]
>
> > >> -#define E(t, n, s) { .tag = t, .name = n, .sysfs_name = s }
> > >> +#define E(t, n, s, l, a) { .tag = t, .name = n, .sysfs_name = s, .ldlat = l, .aux_event = a }
> > >>
> > >>  struct perf_mem_event perf_mem_events_arm[PERF_MEM_EVENTS__MAX] = {
> > >> -  E("spe-load",   "arm_spe_0/ts_enable=1,pa_enable=1,load_filter=1,store_filter=0,min_latency=%u/",       "arm_spe_0"),
> > >> -  E("spe-store",  "arm_spe_0/ts_enable=1,pa_enable=1,load_filter=0,store_filter=1/",                      "arm_spe_0"),
> > >> -  E("spe-ldst",   "arm_spe_0/ts_enable=1,pa_enable=1,load_filter=1,store_filter=1,min_latency=%u/",       "arm_spe_0"),
> > >> +  E("spe-load",   "%s/ts_enable=1,pa_enable=1,load_filter=1,store_filter=0,min_latency=%u/",      "arm_spe_0",    true,   0),
> > >> +  E("spe-store",  "%s/ts_enable=1,pa_enable=1,load_filter=0,store_filter=1/",                     "arm_spe_0",    false,  0),
> > >> +  E("spe-ldst",   "%s/ts_enable=1,pa_enable=1,load_filter=1,store_filter=1,min_latency=%u/",      "arm_spe_0",    true,   0),
> > >
> > > Arm SPE is AUX event, should set '1' to the field '.aux_event'.
> >
> > It actually means whether an extra event is required with a mem_loads
> > event.  I guess for ARM the answer is no.
>
> Thanks for confirmation.
>
> > > I am a bit suspect if we really need the field '.aux_event', the
> > > '.aux_event' field is only used for generating event string.
> >
> > No, it stores the event encoding for the extra event.
> > ARM doesn't need it, so it's 0.
>
> I searched a bit and confirmed '.aux_event' is only used in
> util/mem-events.c and for 'perf record'.
>
> I failed to connect the code with "it stores the event encoding for the
> extra event".  Could you elaborate a bit for this?
>
> Thanks,
> Leo
  
Leo Yan Dec. 18, 2023, 3:21 a.m. UTC | #7
Hi Ian,

On Wed, Dec 13, 2023 at 09:33:24AM -0800, Ian Rogers wrote:

Sorry for late response due to I took a leave at end of last week.

[...]

> > > The purpose of this cleanup patch set is to remove the unnecessary
> > > __weak functions, and try to make the code more generic.
> >
> > I understand weak functions are not very good practice.  The point is
> > weak functions are used for some archs have implemented a feature but
> > other archs have not.
> >
> > I can think a potential case to use a central place to maintain the
> > code for all archs - when we want to support cross analysis.  But this
> > patch series is for supporting cross analysis, to be honest, I still
> > don't see benefit for this series, though I know you might try to
> > support hybrid modes.
> 
> So thanks to Kan for doing this series and trying to clean the code
> up. My complaint about the code is that it was overly hard wired.
> Heck, we have event strings to parse that hard code PMU and event
> names. In fixing hybrid my complaint was that we were adding yet more
> complexity and as a lot of this was resting on printf format strings
> it was hard to check that these were being used correctly. The
> direction of this patch series I agree with.

I agreed as well ;)

> Imo we should avoid weak definitions. Weak definitions are outside of
> the C specification and have introduced bugs into the code base -
> specifically a weak const array was having its size propagated into
> code but then the linker later replaced that weak array. An
> architecture #ifdef is better than a weak definition as the behavior
> is defined and things blow up early rather than suffering from subtle
> corruptions.

Thanks a lot for sharing.

> The Linux kernel device driver is abstracting what the hardware is
> doing and since the more recent changes the PMU abstraction in the
> perf tool is trying to match that. If we need to interface with PMUs
> differently on each architecture then something is wrong. It happens
> that things are wrong and we need to work with that. For example, on
> intel there are topdown events that introduce ordering issues. We have
> default initialization functions for different PMUs. The perf_pmu
> struct is created in perf_pmu__lookup and that always calls an
> perf_pmu__arch_init where the name of the PMU is already set. In the
> weak perf_pmu__arch_init we tweak the perf_pmu struct so that it will
> behave correctly elsewhere in the code. These changes are paralleling
> that. That said, the Intel perf_pmu__arch_init does strcmps against
> "intel_pt" and "intel_bts", does it really need to be arch specific
> given it is already PMU specific?

To be clear, I don't object refactoring, I am just wandering if have
better approach.

Your above question is a good point. I admit the implementation in
arch's perf_pmu__arch_init() is not a good practice, IIUC, you are
seeking an general approach for dynamically probing PMU (and the
associated events).

What I can think about is using the static linked PMU descriptor +
init callback function, which is widely used in Linux kernel for
machine's initialization (refer to [1]).

Likewise, we can define a descriptor for every PMU and link the
descriptor into a data section, e.g.:

  static const struct perf_pmu __intel_pt_pmu
  __section(".pmu.info.init") = {
        .name = "intel_pt",
        .auxtrace = true,
        .selectable = true,
        .perf_event_attr_init_default = intel_pt_pmu_default_config,
        .mem_events = NULL,
        ...
  }

As a result, perf_pmu__lookup() just iterates the descriptor array
stored in the data section ".pmu.info.init".  To support more complex
case, we even can add a callback pointer in the structure perf_pmu to
invoke PMU specific initialization.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm/include/asm/mach/arch.h#n78

> A situation we see in testing is
> people trying to run user space ISA emulators like qemu, say ARM on
> x86, should the PMU set up for intel_pt and intel_bts be broken in
> this environment?

This is a good case, also is a complex case.

> I think that as the names are very specific I'd
> prefer if the code were outside of the tools/perf/arch directory.

I am not sure if understand your meaning.

Anyway, let me extend a bit with this patch series. For instance,
perf_pmu__mem_events_name() function as a central place generates memory
event naming for different archs (and even with different attributes).
This leads to architecture specific things are maintained in perf core
layer.

Rather than adding a data pointer '.mem_events' in to struct perf_pmu,
I'd like to add a new callback (say .mem_event_init()) into struct
perf_pmu, this function can return back the memory event string.

In the end, we can de-couple the perf core layer with arch specific
codes - and if we really want to support cross runtime case (e.g. Arm
binary on x86 machine), we can connect with linked descriptor as
mentioned above.

> There are cases with PMU names like "cpu" where we're probably going
> to need ifdefs or runtime is_intel() calls.
> 
> Anyway, my point is that I think we should be moving away from arch
> specific stuff, as this patch series tries, and that way we have the
> best chance of changes benefitting users regardless of hardware.

To be clear, I agree it's great if we can build in all archs into single
perf binary for support cross runtime.

On the other hand, I don't think it's a good idea to totally remove arch
folders.

> It may be that to make all of this work properly we need to modify PMUs,
> but that all seems good, such as adding the extended type support for
> legacy events on ARM PMUs so that legacy events can work without a
> fixed CPU. We haven't got core PMUs working properly, see the recent
> perf top hybrid problem. There are obviously issues with uncore as,
> for example, most memory controllers are replicated PMUs but some
> kernel perf drivers select a memory controller via a config value. We
> either need to change the drivers for some kind of uniformity or do
> some kind of abstracting in the perf tool. I think we'll probably need
> to do it in the tool and when doing that we really shouldn't be doing
> it in an arch specific or weak function way.

Thanks for bringing up.  Now I understand a bit your concerns.

Leo
  

Patch

diff --git a/tools/perf/arch/arm64/util/mem-events.c b/tools/perf/arch/arm64/util/mem-events.c
index 2602e8688727..eb2ef84f0fc8 100644
--- a/tools/perf/arch/arm64/util/mem-events.c
+++ b/tools/perf/arch/arm64/util/mem-events.c
@@ -2,28 +2,10 @@ 
 #include "map_symbol.h"
 #include "mem-events.h"
 
-#define E(t, n, s) { .tag = t, .name = n, .sysfs_name = s }
+#define E(t, n, s, l, a) { .tag = t, .name = n, .sysfs_name = s, .ldlat = l, .aux_event = a }
 
 struct perf_mem_event perf_mem_events_arm[PERF_MEM_EVENTS__MAX] = {
-	E("spe-load",	"arm_spe_0/ts_enable=1,pa_enable=1,load_filter=1,store_filter=0,min_latency=%u/",	"arm_spe_0"),
-	E("spe-store",	"arm_spe_0/ts_enable=1,pa_enable=1,load_filter=0,store_filter=1/",			"arm_spe_0"),
-	E("spe-ldst",	"arm_spe_0/ts_enable=1,pa_enable=1,load_filter=1,store_filter=1,min_latency=%u/",	"arm_spe_0"),
+	E("spe-load",	"%s/ts_enable=1,pa_enable=1,load_filter=1,store_filter=0,min_latency=%u/",	"arm_spe_0",	true,	0),
+	E("spe-store",	"%s/ts_enable=1,pa_enable=1,load_filter=0,store_filter=1/",			"arm_spe_0",	false,	0),
+	E("spe-ldst",	"%s/ts_enable=1,pa_enable=1,load_filter=1,store_filter=1,min_latency=%u/",	"arm_spe_0",	true,	0),
 };
-
-static char mem_ev_name[100];
-
-const char *perf_mem_events__name(int i, const char *pmu_name __maybe_unused)
-{
-	struct perf_mem_event *e = &perf_mem_events_arm[i];
-
-	if (i >= PERF_MEM_EVENTS__MAX)
-		return NULL;
-
-	if (i == PERF_MEM_EVENTS__LOAD || i == PERF_MEM_EVENTS__LOAD_STORE)
-		scnprintf(mem_ev_name, sizeof(mem_ev_name),
-			  e->name, perf_mem_events__loads_ldlat);
-	else /* PERF_MEM_EVENTS__STORE */
-		scnprintf(mem_ev_name, sizeof(mem_ev_name), e->name);
-
-	return mem_ev_name;
-}
diff --git a/tools/perf/arch/powerpc/util/mem-events.c b/tools/perf/arch/powerpc/util/mem-events.c
index 78b986e5268d..b7883e38950f 100644
--- a/tools/perf/arch/powerpc/util/mem-events.c
+++ b/tools/perf/arch/powerpc/util/mem-events.c
@@ -2,11 +2,10 @@ 
 #include "map_symbol.h"
 #include "mem-events.h"
 
-/* PowerPC does not support 'ldlat' parameter. */
-const char *perf_mem_events__name(int i, const char *pmu_name __maybe_unused)
-{
-	if (i == PERF_MEM_EVENTS__LOAD)
-		return "cpu/mem-loads/";
+#define E(t, n, s, l, a) { .tag = t, .name = n, .sysfs_name = s, .ldlat = l, .aux_event = a }
 
-	return "cpu/mem-stores/";
-}
+struct perf_mem_event perf_mem_events_power[PERF_MEM_EVENTS__MAX] = {
+	E("ldlat-loads",	"%s/mem-loads/",	"cpu/events/mem-loads",		false,	0),
+	E("ldlat-stores",	"%s/mem-stores/",	"cpu/events/mem-stores",	false,	0),
+	E(NULL,			NULL,			NULL,				false,	0),
+};
diff --git a/tools/perf/arch/powerpc/util/mem-events.h b/tools/perf/arch/powerpc/util/mem-events.h
new file mode 100644
index 000000000000..6acc3d1b6873
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/mem-events.h
@@ -0,0 +1,7 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _POWER_MEM_EVENTS_H
+#define _POWER_MEM_EVENTS_H
+
+extern struct perf_mem_event perf_mem_events_power[PERF_MEM_EVENTS__MAX];
+
+#endif /* _POWER_MEM_EVENTS_H */
diff --git a/tools/perf/arch/powerpc/util/pmu.c b/tools/perf/arch/powerpc/util/pmu.c
new file mode 100644
index 000000000000..168173f88ddb
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/pmu.c
@@ -0,0 +1,11 @@ 
+// SPDX-License-Identifier: GPL-2.0
+
+#include <string.h>
+
+#include "../../../util/pmu.h"
+
+void perf_pmu__arch_init(struct perf_pmu *pmu)
+{
+	if (pmu->is_core)
+		pmu->mem_events = perf_mem_events_power;
+}
diff --git a/tools/perf/arch/x86/util/mem-events.c b/tools/perf/arch/x86/util/mem-events.c
index 5fb41d50118d..f0e66a0151a0 100644
--- a/tools/perf/arch/x86/util/mem-events.c
+++ b/tools/perf/arch/x86/util/mem-events.c
@@ -7,25 +7,26 @@ 
 #include "linux/string.h"
 #include "env.h"
 
-static char mem_loads_name[100];
-static bool mem_loads_name__init;
-static char mem_stores_name[100];
-
 #define MEM_LOADS_AUX		0x8203
-#define MEM_LOADS_AUX_NAME     "{%s/mem-loads-aux/,%s/mem-loads,ldlat=%u/}:P"
 
-#define E(t, n, s) { .tag = t, .name = n, .sysfs_name = s }
+#define E(t, n, s, l, a) { .tag = t, .name = n, .sysfs_name = s, .ldlat = l, .aux_event = a }
 
 struct perf_mem_event perf_mem_events_intel[PERF_MEM_EVENTS__MAX] = {
-	E("ldlat-loads",	"%s/mem-loads,ldlat=%u/P",	"%s/events/mem-loads"),
-	E("ldlat-stores",	"%s/mem-stores/P",		"%s/events/mem-stores"),
-	E(NULL,			NULL,				NULL),
+	E("ldlat-loads",	"%s/mem-loads,ldlat=%u/P",	"%s/events/mem-loads",	true,	0),
+	E("ldlat-stores",	"%s/mem-stores/P",		"%s/events/mem-stores",	false,	0),
+	E(NULL,			NULL,				NULL,			false,	0),
+};
+
+struct perf_mem_event perf_mem_events_intel_aux[PERF_MEM_EVENTS__MAX] = {
+	E("ldlat-loads",	"{%s/mem-loads-aux/,%s/mem-loads,ldlat=%u/}:P",	"%s/events/mem-loads",	true,	MEM_LOADS_AUX),
+	E("ldlat-stores",	"%s/mem-stores/P",		"%s/events/mem-stores",	false,	0),
+	E(NULL,			NULL,				NULL,			false,	0),
 };
 
 struct perf_mem_event perf_mem_events_amd[PERF_MEM_EVENTS__MAX] = {
-	E(NULL,		NULL,		NULL),
-	E(NULL,		NULL,		NULL),
-	E("mem-ldst",	"ibs_op//",	"ibs_op"),
+	E(NULL,		NULL,		NULL,		false,	0),
+	E(NULL,		NULL,		NULL,		false,	0),
+	E("mem-ldst",	"%s//",		"ibs_op",	false,	0),
 };
 
 bool is_mem_loads_aux_event(struct evsel *leader)
@@ -40,48 +41,3 @@  bool is_mem_loads_aux_event(struct evsel *leader)
 
 	return leader->core.attr.config == MEM_LOADS_AUX;
 }
-
-const char *perf_mem_events__name(int i, const char *pmu_name)
-{
-	struct perf_mem_event *e;
-
-	if (x86__is_amd_cpu())
-		e = &perf_mem_events_amd[i];
-	else
-		e = &perf_mem_events_intel[i];
-
-	if (!e)
-		return NULL;
-
-	if (i == PERF_MEM_EVENTS__LOAD) {
-		if (mem_loads_name__init && !pmu_name)
-			return mem_loads_name;
-
-		if (!pmu_name) {
-			mem_loads_name__init = true;
-			pmu_name = "cpu";
-		}
-
-		if (perf_pmus__have_event(pmu_name, "mem-loads-aux")) {
-			scnprintf(mem_loads_name, sizeof(mem_loads_name),
-				  MEM_LOADS_AUX_NAME, pmu_name, pmu_name,
-				  perf_mem_events__loads_ldlat);
-		} else {
-			scnprintf(mem_loads_name, sizeof(mem_loads_name),
-				  e->name, pmu_name,
-				  perf_mem_events__loads_ldlat);
-		}
-		return mem_loads_name;
-	}
-
-	if (i == PERF_MEM_EVENTS__STORE) {
-		if (!pmu_name)
-			pmu_name = "cpu";
-
-		scnprintf(mem_stores_name, sizeof(mem_stores_name),
-			  e->name, pmu_name);
-		return mem_stores_name;
-	}
-
-	return e->name;
-}
diff --git a/tools/perf/arch/x86/util/mem-events.h b/tools/perf/arch/x86/util/mem-events.h
index 3959e427f482..f55c8d3b7d59 100644
--- a/tools/perf/arch/x86/util/mem-events.h
+++ b/tools/perf/arch/x86/util/mem-events.h
@@ -3,6 +3,7 @@ 
 #define _X86_MEM_EVENTS_H
 
 extern struct perf_mem_event perf_mem_events_intel[PERF_MEM_EVENTS__MAX];
+extern struct perf_mem_event perf_mem_events_intel_aux[PERF_MEM_EVENTS__MAX];
 
 extern struct perf_mem_event perf_mem_events_amd[PERF_MEM_EVENTS__MAX];
 
diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c
index cd22e80e5657..0f49ff13cfe2 100644
--- a/tools/perf/arch/x86/util/pmu.c
+++ b/tools/perf/arch/x86/util/pmu.c
@@ -35,8 +35,12 @@  void perf_pmu__arch_init(struct perf_pmu *pmu __maybe_unused)
 	if (x86__is_amd_cpu()) {
 		if (!strcmp(pmu->name, "ibs_op"))
 			pmu->mem_events = perf_mem_events_amd;
-	} else if (pmu->is_core)
-		pmu->mem_events = perf_mem_events_intel;
+	} else if (pmu->is_core) {
+		if (perf_pmu__have_event(pmu, "mem-loads-aux"))
+			pmu->mem_events = perf_mem_events_intel_aux;
+		else
+			pmu->mem_events = perf_mem_events_intel;
+	}
 }
 
 int perf_pmus__num_mem_pmus(void)
diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c
index 27a33dc44964..c9a40b64e538 100644
--- a/tools/perf/util/mem-events.c
+++ b/tools/perf/util/mem-events.c
@@ -17,17 +17,17 @@ 
 
 unsigned int perf_mem_events__loads_ldlat = 30;
 
-#define E(t, n, s) { .tag = t, .name = n, .sysfs_name = s }
+#define E(t, n, s, l, a) { .tag = t, .name = n, .sysfs_name = s, .ldlat = l, .aux_event = a }
 
 struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX] = {
-	E("ldlat-loads",	"cpu/mem-loads,ldlat=%u/P",	"cpu/events/mem-loads"),
-	E("ldlat-stores",	"cpu/mem-stores/P",		"cpu/events/mem-stores"),
-	E(NULL,			NULL,				NULL),
+	E("ldlat-loads",	"%s/mem-loads,ldlat=%u/P",	"cpu/events/mem-loads",		true,	0),
+	E("ldlat-stores",	"%s/mem-stores/P",		"cpu/events/mem-stores",	false,	0),
+	E(NULL,			NULL,				NULL,				false,	0),
 };
 #undef E
 
 static char mem_loads_name[100];
-static bool mem_loads_name__init;
+static char mem_stores_name[100];
 
 struct perf_mem_event *perf_pmu__mem_events_ptr(struct perf_pmu *pmu, int i)
 {
@@ -62,23 +62,45 @@  struct perf_pmu *perf_mem_events_find_pmu(void)
 	return perf_pmus__scan_mem(NULL);
 }
 
-const char * __weak perf_mem_events__name(int i, const char *pmu_name  __maybe_unused)
+static const char *perf_pmu__mem_events_name(int i, struct perf_pmu *pmu)
 {
-	struct perf_mem_event *e = &perf_mem_events[i];
+	struct perf_mem_event *e = &pmu->mem_events[i];
 
 	if (!e)
 		return NULL;
 
-	if (i == PERF_MEM_EVENTS__LOAD) {
-		if (!mem_loads_name__init) {
-			mem_loads_name__init = true;
-			scnprintf(mem_loads_name, sizeof(mem_loads_name),
-				  e->name, perf_mem_events__loads_ldlat);
+	if (i == PERF_MEM_EVENTS__LOAD || i == PERF_MEM_EVENTS__LOAD_STORE) {
+		if (e->ldlat) {
+			if (!e->aux_event) {
+				/* ARM and Most of Intel */
+				scnprintf(mem_loads_name, sizeof(mem_loads_name),
+					  e->name, pmu->name,
+					  perf_mem_events__loads_ldlat);
+			} else {
+				/* Intel with mem-loads-aux event */
+				scnprintf(mem_loads_name, sizeof(mem_loads_name),
+					  e->name, pmu->name, pmu->name,
+					  perf_mem_events__loads_ldlat);
+			}
+		} else {
+			if (!e->aux_event) {
+				/* AMD and POWER */
+				scnprintf(mem_loads_name, sizeof(mem_loads_name),
+					  e->name, pmu->name);
+			} else
+				return NULL;
 		}
+
 		return mem_loads_name;
 	}
 
-	return e->name;
+	if (i == PERF_MEM_EVENTS__STORE) {
+		scnprintf(mem_stores_name, sizeof(mem_stores_name),
+			  e->name, pmu->name);
+		return mem_stores_name;
+	}
+
+	return NULL;
 }
 
 __weak bool is_mem_loads_aux_event(struct evsel *leader __maybe_unused)
@@ -175,7 +197,7 @@  void perf_pmu__mem_events_list(struct perf_pmu *pmu)
 			e->tag ? 13 : 0,
 			e->tag ? : "",
 			e->tag && verbose > 0 ? 25 : 0,
-			e->tag && verbose > 0 ? perf_mem_events__name(j, NULL) : "",
+			e->tag && verbose > 0 ? perf_pmu__mem_events_name(j, pmu) : "",
 			e->supported ? ": available\n" : "");
 	}
 }
@@ -198,15 +220,15 @@  int perf_mem_events__record_args(const char **rec_argv, int *argv_nr,
 
 			if (!e->supported) {
 				pr_err("failed: event '%s' not supported\n",
-					perf_mem_events__name(j, pmu->name));
+					perf_pmu__mem_events_name(j, pmu));
 				return -1;
 			}
 
 			if (perf_pmus__num_mem_pmus() == 1) {
 				rec_argv[i++] = "-e";
-				rec_argv[i++] = perf_mem_events__name(j, NULL);
+				rec_argv[i++] = perf_pmu__mem_events_name(j, pmu);
 			} else {
-				const char *s = perf_mem_events__name(j, pmu->name);
+				const char *s = perf_pmu__mem_events_name(j, pmu);
 
 				if (!perf_mem_event__supported(mnt, pmu, e))
 					continue;
diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h
index 0ad301a2e424..79d342768d12 100644
--- a/tools/perf/util/mem-events.h
+++ b/tools/perf/util/mem-events.h
@@ -14,6 +14,8 @@ 
 struct perf_mem_event {
 	bool		record;
 	bool		supported;
+	bool		ldlat;
+	u32		aux_event;
 	const char	*tag;
 	const char	*name;
 	const char	*sysfs_name;
@@ -39,7 +41,6 @@  extern struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX];
 int perf_pmu__mem_events_parse(struct perf_pmu *pmu, const char *str);
 int perf_pmu__mem_events_init(struct perf_pmu *pmu);
 
-const char *perf_mem_events__name(int i, const char *pmu_name);
 struct perf_mem_event *perf_pmu__mem_events_ptr(struct perf_pmu *pmu, int i);
 struct perf_pmu *perf_mem_events_find_pmu(void);
 bool is_mem_loads_aux_event(struct evsel *leader);