[6/8] perf stat,metrics: New metricgroup output for the default mode

Message ID 20230607162700.3234712-7-kan.liang@linux.intel.com
State New
Headers
Series New metricgroup output in perf stat default mode |

Commit Message

Liang, Kan June 7, 2023, 4:26 p.m. UTC
  From: Kan Liang <kan.liang@linux.intel.com>

In the default mode, the current output of the metricgroup include both
events and metrics, which is not necessary and just makes the output
hard to read. Since different ARCHs (even different generations in the
same ARCH) may use different events. The output also vary on different
platforms.

For a metricgroup, only outputting the value of each metric is good
enough.

Current perf may append different metric groups to the same leader
event, or append the metrics from the same metricgroup to different
events. That could bring confusion when perf only prints the
metricgroup output mode. For example, print the same metricgroup name
several times.
Reorganize metricgroup for the default mode and make sure that
a metricgroup can only be appended to one event.
Sort the metricgroup for the default mode by the name of the
metricgroup.

Add a new field default_metricgroup in evsel to indicate an event of
the default metricgroup. For those events, printout() should print
the metricgroup name rather than events.

Add print_metricgroup_header() to print out the metricgroup name in
different output formats.

On SPR
Before:

 ./perf_old stat sleep 1

 Performance counter stats for 'sleep 1':

              0.54 msec task-clock:u                     #    0.001 CPUs utilized
                 0      context-switches:u               #    0.000 /sec
                 0      cpu-migrations:u                 #    0.000 /sec
                68      page-faults:u                    #  125.445 K/sec
           540,970      cycles:u                         #    0.998 GHz
           556,325      instructions:u                   #    1.03  insn per cycle
           123,602      branches:u                       #  228.018 M/sec
             6,889      branch-misses:u                  #    5.57% of all branches
         3,245,820      TOPDOWN.SLOTS:u                  #     18.4 %  tma_backend_bound
                                                  #     17.2 %  tma_retiring
                                                  #     23.1 %  tma_bad_speculation
                                                  #     41.4 %  tma_frontend_bound
           564,859      topdown-retiring:u
         1,370,999      topdown-fe-bound:u
           603,271      topdown-be-bound:u
           744,874      topdown-bad-spec:u
            12,661      INT_MISC.UOP_DROPPING:u          #   23.357 M/sec

       1.001798215 seconds time elapsed

       0.000193000 seconds user
       0.001700000 seconds sys

After:

$ ./perf stat sleep 1

 Performance counter stats for 'sleep 1':

              0.51 msec task-clock:u                     #    0.001 CPUs utilized
                 0      context-switches:u               #    0.000 /sec
                 0      cpu-migrations:u                 #    0.000 /sec
                68      page-faults:u                    #  132.683 K/sec
           545,228      cycles:u                         #    1.064 GHz
           555,509      instructions:u                   #    1.02  insn per cycle
           123,574      branches:u                       #  241.120 M/sec
             6,957      branch-misses:u                  #    5.63% of all branches
                        TopdownL1                 #     17.5 %  tma_backend_bound
                                                  #     22.6 %  tma_bad_speculation
                                                  #     42.7 %  tma_frontend_bound
                                                  #     17.1 %  tma_retiring
                        TopdownL2                 #     21.8 %  tma_branch_mispredicts
                                                  #     11.5 %  tma_core_bound
                                                  #     13.4 %  tma_fetch_bandwidth
                                                  #     29.3 %  tma_fetch_latency
                                                  #      2.7 %  tma_heavy_operations
                                                  #     14.5 %  tma_light_operations
                                                  #      0.8 %  tma_machine_clears
                                                  #      6.1 %  tma_memory_bound

       1.001712086 seconds time elapsed

       0.000151000 seconds user
       0.001618000 seconds sys


Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
 tools/perf/builtin-stat.c      |   1 +
 tools/perf/util/evsel.h        |   1 +
 tools/perf/util/metricgroup.c  | 106 ++++++++++++++++++++++++++++++++-
 tools/perf/util/metricgroup.h  |   1 +
 tools/perf/util/stat-display.c |  69 ++++++++++++++++++++-
 5 files changed, 172 insertions(+), 6 deletions(-)
  

Comments

Ian Rogers June 13, 2023, 8:16 p.m. UTC | #1
On Wed, Jun 7, 2023 at 9:27 AM <kan.liang@linux.intel.com> wrote:
>
> From: Kan Liang <kan.liang@linux.intel.com>
>
> In the default mode, the current output of the metricgroup include both
> events and metrics, which is not necessary and just makes the output
> hard to read. Since different ARCHs (even different generations in the
> same ARCH) may use different events. The output also vary on different
> platforms.
>
> For a metricgroup, only outputting the value of each metric is good
> enough.
>
> Current perf may append different metric groups to the same leader
> event, or append the metrics from the same metricgroup to different
> events. That could bring confusion when perf only prints the
> metricgroup output mode. For example, print the same metricgroup name
> several times.
> Reorganize metricgroup for the default mode and make sure that
> a metricgroup can only be appended to one event.
> Sort the metricgroup for the default mode by the name of the
> metricgroup.
>
> Add a new field default_metricgroup in evsel to indicate an event of
> the default metricgroup. For those events, printout() should print
> the metricgroup name rather than events.
>
> Add print_metricgroup_header() to print out the metricgroup name in
> different output formats.
>
> On SPR
> Before:
>
>  ./perf_old stat sleep 1
>
>  Performance counter stats for 'sleep 1':
>
>               0.54 msec task-clock:u                     #    0.001 CPUs utilized
>                  0      context-switches:u               #    0.000 /sec
>                  0      cpu-migrations:u                 #    0.000 /sec
>                 68      page-faults:u                    #  125.445 K/sec
>            540,970      cycles:u                         #    0.998 GHz
>            556,325      instructions:u                   #    1.03  insn per cycle
>            123,602      branches:u                       #  228.018 M/sec
>              6,889      branch-misses:u                  #    5.57% of all branches
>          3,245,820      TOPDOWN.SLOTS:u                  #     18.4 %  tma_backend_bound
>                                                   #     17.2 %  tma_retiring
>                                                   #     23.1 %  tma_bad_speculation
>                                                   #     41.4 %  tma_frontend_bound
>            564,859      topdown-retiring:u
>          1,370,999      topdown-fe-bound:u
>            603,271      topdown-be-bound:u
>            744,874      topdown-bad-spec:u
>             12,661      INT_MISC.UOP_DROPPING:u          #   23.357 M/sec
>
>        1.001798215 seconds time elapsed
>
>        0.000193000 seconds user
>        0.001700000 seconds sys
>
> After:
>
> $ ./perf stat sleep 1
>
>  Performance counter stats for 'sleep 1':
>
>               0.51 msec task-clock:u                     #    0.001 CPUs utilized
>                  0      context-switches:u               #    0.000 /sec
>                  0      cpu-migrations:u                 #    0.000 /sec
>                 68      page-faults:u                    #  132.683 K/sec
>            545,228      cycles:u                         #    1.064 GHz
>            555,509      instructions:u                   #    1.02  insn per cycle
>            123,574      branches:u                       #  241.120 M/sec
>              6,957      branch-misses:u                  #    5.63% of all branches
>                         TopdownL1                 #     17.5 %  tma_backend_bound
>                                                   #     22.6 %  tma_bad_speculation
>                                                   #     42.7 %  tma_frontend_bound
>                                                   #     17.1 %  tma_retiring
>                         TopdownL2                 #     21.8 %  tma_branch_mispredicts
>                                                   #     11.5 %  tma_core_bound
>                                                   #     13.4 %  tma_fetch_bandwidth
>                                                   #     29.3 %  tma_fetch_latency
>                                                   #      2.7 %  tma_heavy_operations
>                                                   #     14.5 %  tma_light_operations
>                                                   #      0.8 %  tma_machine_clears
>                                                   #      6.1 %  tma_memory_bound
>
>        1.001712086 seconds time elapsed
>
>        0.000151000 seconds user
>        0.001618000 seconds sys
>
>
> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
> ---
>  tools/perf/builtin-stat.c      |   1 +
>  tools/perf/util/evsel.h        |   1 +
>  tools/perf/util/metricgroup.c  | 106 ++++++++++++++++++++++++++++++++-
>  tools/perf/util/metricgroup.h  |   1 +
>  tools/perf/util/stat-display.c |  69 ++++++++++++++++++++-
>  5 files changed, 172 insertions(+), 6 deletions(-)
>
> diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> index 2269b3e90e9b..b274cc264d56 100644
> --- a/tools/perf/builtin-stat.c
> +++ b/tools/perf/builtin-stat.c
> @@ -2172,6 +2172,7 @@ static int add_default_attributes(void)
>
>                         evlist__for_each_entry(metric_evlist, metric_evsel) {
>                                 metric_evsel->skippable = true;
> +                               metric_evsel->default_metricgroup = true;
>                         }
>                         evlist__splice_list_tail(evsel_list, &metric_evlist->core.entries);
>                         evlist__delete(metric_evlist);
> diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
> index 36a32e4ca168..61b1385108f4 100644
> --- a/tools/perf/util/evsel.h
> +++ b/tools/perf/util/evsel.h
> @@ -130,6 +130,7 @@ struct evsel {
>         bool                    reset_group;
>         bool                    errored;
>         bool                    needs_auxtrace_mmap;
> +       bool                    default_metricgroup;

A comment would be useful here, something like:

If running perf stat, is this evsel a member of a Default metric group metric.

>         struct hashmap          *per_pkg_mask;
>         int                     err;
>         struct {
> diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
> index efafa02db5e5..22181ce4f27f 100644
> --- a/tools/perf/util/metricgroup.c
> +++ b/tools/perf/util/metricgroup.c
> @@ -79,6 +79,7 @@ static struct rb_node *metric_event_new(struct rblist *rblist __maybe_unused,
>                 return NULL;
>         memcpy(me, entry, sizeof(struct metric_event));
>         me->evsel = ((struct metric_event *)entry)->evsel;
> +       me->default_metricgroup_name = NULL;
>         INIT_LIST_HEAD(&me->head);
>         return &me->nd;
>  }
> @@ -1133,14 +1134,19 @@ static int metricgroup__add_metric_sys_event_iter(const struct pmu_metric *pm,
>  /**
>   * metric_list_cmp - list_sort comparator that sorts metrics with more events to
>   *                   the front. tool events are excluded from the count.
> + *                   For the default metrics, sort them by metricgroup name.
>   */
> -static int metric_list_cmp(void *priv __maybe_unused, const struct list_head *l,
> +static int metric_list_cmp(void *priv, const struct list_head *l,
>                            const struct list_head *r)
>  {
>         const struct metric *left = container_of(l, struct metric, nd);
>         const struct metric *right = container_of(r, struct metric, nd);
>         struct expr_id_data *data;
>         int i, left_count, right_count;
> +       bool is_default = *(bool *)priv;
> +
> +       if (is_default && left->default_metricgroup_name && right->default_metricgroup_name)
> +               return strcmp(left->default_metricgroup_name, right->default_metricgroup_name);

This breaks the comment above. The events are now sorted prioritizing
default metric group names. This potentially will have an effect of
reducing sharing of events between groups, it will also break
assumptions within that code that there are always the same number of
fewer events in a metric as you process the list. To remedy this I
think you need to re-sort the metrics after the event sharing has had
a chance to share events between groups.


>
>         left_count = hashmap__size(left->pctx->ids);
>         perf_tool_event__for_each_event(i) {
> @@ -1497,6 +1503,91 @@ static int parse_ids(bool metric_no_merge, struct perf_pmu *fake_pmu,
>         return ret;
>  }
>
> +static struct metric_event *
> +metricgroup__lookup_default_metricgroup(struct rblist *metric_events,
> +                                       struct evsel *evsel,
> +                                       struct metric *m)
> +{
> +       struct metric_event *me;
> +       char *name;
> +       int err;
> +
> +       me = metricgroup__lookup(metric_events, evsel, true);
> +       if (!me->default_metricgroup_name) {
> +               if (m->pmu && strcmp(m->pmu, "cpu"))
> +                       err = asprintf(&name, "%s (%s)", m->default_metricgroup_name, m->pmu);
> +               else
> +                       err = asprintf(&name, "%s", m->default_metricgroup_name);
> +               if (err < 0)
> +                       return NULL;
> +               me->default_metricgroup_name = name;
> +       }
> +       if (!strncmp(m->default_metricgroup_name,
> +                    me->default_metricgroup_name,
> +                    strlen(m->default_metricgroup_name)))
> +               return me;
> +
> +       return NULL;
> +}

A function comment would be useful as the name is confusing, why
lookup? Doesn't it create the value? Leak sanitizer isn't happy here:

```
==1545918==ERROR: LeakSanitizer: detected memory leaks

Direct leak of 10 byte(s) in 1 object(s) allocated from:
    #0 0x7f2755a7077b in __interceptor_strdup
../../../../src/libsanitizer/asan/asan_interceptors.cpp:439
    #1 0x564986a8df31 in asprintf util/util.c:566
    #2 0x5649869b5901 in metricgroup__lookup_default_metricgroup
util/metricgroup.c:1520
    #3 0x5649869b5e57 in metricgroup__lookup_create util/metricgroup.c:1579
    #4 0x5649869b6ddc in parse_groups util/metricgroup.c:1698
    #5 0x5649869b7714 in metricgroup__parse_groups util/metricgroup.c:1771
    #6 0x5649867da9d5 in add_default_attributes tools/perf/builtin-stat.c:2164
    #7 0x5649867ddbfb in cmd_stat tools/perf/builtin-stat.c:2707
    #8 0x5649868fa5a2 in run_builtin tools/perf/perf.c:323
    #9 0x5649868fab13 in handle_internal_command tools/perf/perf.c:377
    #10 0x5649868faedb in run_argv tools/perf/perf.c:421
    #11 0x5649868fb443 in main tools/perf/perf.c:537
    #12 0x7f2754846189 in __libc_start_call_main
../sysdeps/nptl/libc_start_call_main.h:58
```

> +static struct metric_event *
> +metricgroup__lookup_create(struct rblist *metric_events,
> +                          struct evsel **evsel,
> +                          struct list_head *metric_list,
> +                          struct metric *m,
> +                          bool is_default)
> +{
> +       struct metric_event *me;
> +       struct metric *cur;
> +       struct evsel *ev;
> +       size_t i;
> +
> +       if (!is_default)
> +               return metricgroup__lookup(metric_events, evsel[0], true);
> +
> +       /*
> +        * If the metric group has been attached to a previous
> +        * event/metric, use that metric event.
> +        */
> +       list_for_each_entry(cur, metric_list, nd) {
> +               if (cur == m)
> +                       break;
> +               if (cur->pmu && strcmp(m->pmu, cur->pmu))
> +                       continue;
> +               if (strncmp(m->default_metricgroup_name,
> +                           cur->default_metricgroup_name,
> +                           strlen(m->default_metricgroup_name)))
> +                       continue;
> +               if (!cur->evlist)
> +                       continue;
> +               evlist__for_each_entry(cur->evlist, ev) {
> +                       me = metricgroup__lookup(metric_events, ev, false);
> +                       if (!strncmp(m->default_metricgroup_name,
> +                                    me->default_metricgroup_name,
> +                                    strlen(m->default_metricgroup_name)))
> +                               return me;
> +               }
> +       }
> +
> +       /*
> +        * Different metric groups may append to the same leader event.
> +        * For example, TopdownL1 and TopdownL2 are appended to the
> +        * TOPDOWN.SLOTS event.
> +        * Split it and append the new metric group to the next available
> +        * event.
> +        */
> +       me = metricgroup__lookup_default_metricgroup(metric_events, evsel[0], m);
> +       if (me)
> +               return me;
> +
> +       for (i = 1; i < hashmap__size(m->pctx->ids); i++) {
> +               me = metricgroup__lookup_default_metricgroup(metric_events, evsel[i], m);
> +               if (me)
> +                       return me;
> +       }
> +       return NULL;
> +}
> +

I have a hard time understanding this function, does it just go away
if you do the two sorts that I proposed above? Should this be
metric_event__lookup_create? A function comment saying what the code
is trying to achieve would be useful.

This appears to be trying to correct output issues by changing how
metrics are associated with events, shouldn't output issues be
resolved by fixing the output code? If not, why don't we apply this
logic to TopdownL1, why just Default?

>  static int parse_groups(struct evlist *perf_evlist,
>                         const char *pmu, const char *str,
>                         bool metric_no_group,
> @@ -1512,6 +1603,7 @@ static int parse_groups(struct evlist *perf_evlist,
>         LIST_HEAD(metric_list);
>         struct metric *m;
>         bool tool_events[PERF_TOOL_MAX] = {false};
> +       bool is_default = !strcmp(str, "Default");
>         int ret;
>
>         if (metric_events_list->nr_entries == 0)
> @@ -1523,7 +1615,7 @@ static int parse_groups(struct evlist *perf_evlist,
>                 goto out;
>
>         /* Sort metrics from largest to smallest. */
> -       list_sort(NULL, &metric_list, metric_list_cmp);
> +       list_sort((void *)&is_default, &metric_list, metric_list_cmp);
>
>         if (!metric_no_merge) {
>                 struct expr_parse_ctx *combined = NULL;
> @@ -1603,7 +1695,15 @@ static int parse_groups(struct evlist *perf_evlist,
>                         goto out;
>                 }
>
> -               me = metricgroup__lookup(metric_events_list, metric_events[0], true);
> +               me = metricgroup__lookup_create(metric_events_list,
> +                                               metric_events,
> +                                               &metric_list, m,
> +                                               is_default);
> +               if (!me) {
> +                       pr_err("Cannot create metric group for default!\n");
> +                       ret = -EINVAL;
> +                       goto out;
> +               }
>
>                 expr = malloc(sizeof(struct metric_expr));
>                 if (!expr) {
> diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h
> index bf18274c15df..e3609b853213 100644
> --- a/tools/perf/util/metricgroup.h
> +++ b/tools/perf/util/metricgroup.h
> @@ -22,6 +22,7 @@ struct cgroup;
>  struct metric_event {
>         struct rb_node nd;
>         struct evsel *evsel;
> +       char *default_metricgroup_name;
>         struct list_head head; /* list of metric_expr */
>  };
>
> diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
> index a2bbdc25d979..efe5fd04c033 100644
> --- a/tools/perf/util/stat-display.c
> +++ b/tools/perf/util/stat-display.c
> @@ -21,10 +21,12 @@
>  #include "iostat.h"
>  #include "pmu.h"
>  #include "pmus.h"
> +#include "metricgroup.h"

This is bringing metric code from stat-shadow, which is kind of the
whole reason there is a separation and stat-shadow exists. Should the
logic exist in stat-shadow instead?

>
>  #define CNTR_NOT_SUPPORTED     "<not supported>"
>  #define CNTR_NOT_COUNTED       "<not counted>"
>
> +#define MGROUP_LEN   50
>  #define METRIC_LEN   38
>  #define EVNAME_LEN   32
>  #define COUNTS_LEN   18
> @@ -707,6 +709,55 @@ static bool evlist__has_hybrid(struct evlist *evlist)
>         return false;
>  }
>
> +static void print_metricgroup_header_json(struct perf_stat_config *config,
> +                                         struct outstate *os __maybe_unused,
> +                                         const char *metricgroup_name)
> +{
> +       fprintf(config->output, "\"metricgroup\" : \"%s\"}", metricgroup_name);
> +       new_line_json(config, (void *)os);
> +}
> +

Should the output part of this patch be separate from the
evsel/evlist/meitric modifications?

Thanks,
Ian

> +static void print_metricgroup_header_csv(struct perf_stat_config *config,
> +                                        struct outstate *os,
> +                                        const char *metricgroup_name)
> +{
> +       int i;
> +
> +       for (i = 0; i < os->nfields; i++)
> +               fputs(config->csv_sep, os->fh);
> +       fprintf(config->output, "%s", metricgroup_name);
> +       new_line_csv(config, (void *)os);
> +}
> +
> +static void print_metricgroup_header_std(struct perf_stat_config *config,
> +                                        struct outstate *os __maybe_unused,
> +                                        const char *metricgroup_name)
> +{
> +       int n = fprintf(config->output, " %*s", EVNAME_LEN, metricgroup_name);
> +
> +       fprintf(config->output, "%*s", MGROUP_LEN - n - 1, "");
> +}
> +
> +static void print_metricgroup_header(struct perf_stat_config *config,
> +                                    struct outstate *os,
> +                                    struct evsel *counter,
> +                                    double noise, u64 run, u64 ena,
> +                                    const char *metricgroup_name)
> +{
> +       aggr_printout(config, os->evsel, os->id, os->aggr_nr);
> +
> +       print_noise(config, counter, noise, /*before_metric=*/true);
> +       print_running(config, run, ena, /*before_metric=*/true);
> +
> +       if (config->json_output) {
> +               print_metricgroup_header_json(config, os, metricgroup_name);
> +       } else if (config->csv_output) {
> +               print_metricgroup_header_csv(config, os, metricgroup_name);
> +       } else
> +               print_metricgroup_header_std(config, os, metricgroup_name);
> +
> +}
> +
>  static void printout(struct perf_stat_config *config, struct outstate *os,
>                      double uval, u64 run, u64 ena, double noise, int aggr_idx)
>  {
> @@ -751,10 +802,17 @@ static void printout(struct perf_stat_config *config, struct outstate *os,
>         out.force_header = false;
>
>         if (!config->metric_only) {
> -               abs_printout(config, os->id, os->aggr_nr, counter, uval, ok);
> +               if (counter->default_metricgroup) {
> +                       struct metric_event *me;
>
> -               print_noise(config, counter, noise, /*before_metric=*/true);
> -               print_running(config, run, ena, /*before_metric=*/true);
> +                       me = metricgroup__lookup(&config->metric_events, counter, false);
> +                       print_metricgroup_header(config, os, counter, noise, run, ena,
> +                                                me->default_metricgroup_name);
> +               } else {
> +                       abs_printout(config, os->id, os->aggr_nr, counter, uval, ok);
> +                       print_noise(config, counter, noise, /*before_metric=*/true);
> +                       print_running(config, run, ena, /*before_metric=*/true);
> +               }
>         }
>
>         if (ok) {
> @@ -883,6 +941,11 @@ static void print_counter_aggrdata(struct perf_stat_config *config,
>         if (counter->merged_stat)
>                 return;
>
> +       /* Only print the metric group for the default mode */
> +       if (counter->default_metricgroup &&
> +           !metricgroup__lookup(&config->metric_events, counter, false))
> +               return;
> +
>         uniquify_counter(config, counter);
>
>         val = aggr->counts.val;
> --
> 2.35.1
>
  
Liang, Kan June 13, 2023, 8:50 p.m. UTC | #2
On 2023-06-13 4:16 p.m., Ian Rogers wrote:
> On Wed, Jun 7, 2023 at 9:27 AM <kan.liang@linux.intel.com> wrote:
>>
>> From: Kan Liang <kan.liang@linux.intel.com>
>>
>> In the default mode, the current output of the metricgroup include both
>> events and metrics, which is not necessary and just makes the output
>> hard to read. Since different ARCHs (even different generations in the
>> same ARCH) may use different events. The output also vary on different
>> platforms.
>>
>> For a metricgroup, only outputting the value of each metric is good
>> enough.
>>
>> Current perf may append different metric groups to the same leader
>> event, or append the metrics from the same metricgroup to different
>> events. That could bring confusion when perf only prints the
>> metricgroup output mode. For example, print the same metricgroup name
>> several times.
>> Reorganize metricgroup for the default mode and make sure that
>> a metricgroup can only be appended to one event.
>> Sort the metricgroup for the default mode by the name of the
>> metricgroup.
>>
>> Add a new field default_metricgroup in evsel to indicate an event of
>> the default metricgroup. For those events, printout() should print
>> the metricgroup name rather than events.
>>
>> Add print_metricgroup_header() to print out the metricgroup name in
>> different output formats.
>>
>> On SPR
>> Before:
>>
>>  ./perf_old stat sleep 1
>>
>>  Performance counter stats for 'sleep 1':
>>
>>               0.54 msec task-clock:u                     #    0.001 CPUs utilized
>>                  0      context-switches:u               #    0.000 /sec
>>                  0      cpu-migrations:u                 #    0.000 /sec
>>                 68      page-faults:u                    #  125.445 K/sec
>>            540,970      cycles:u                         #    0.998 GHz
>>            556,325      instructions:u                   #    1.03  insn per cycle
>>            123,602      branches:u                       #  228.018 M/sec
>>              6,889      branch-misses:u                  #    5.57% of all branches
>>          3,245,820      TOPDOWN.SLOTS:u                  #     18.4 %  tma_backend_bound
>>                                                   #     17.2 %  tma_retiring
>>                                                   #     23.1 %  tma_bad_speculation
>>                                                   #     41.4 %  tma_frontend_bound
>>            564,859      topdown-retiring:u
>>          1,370,999      topdown-fe-bound:u
>>            603,271      topdown-be-bound:u
>>            744,874      topdown-bad-spec:u
>>             12,661      INT_MISC.UOP_DROPPING:u          #   23.357 M/sec
>>
>>        1.001798215 seconds time elapsed
>>
>>        0.000193000 seconds user
>>        0.001700000 seconds sys
>>
>> After:
>>
>> $ ./perf stat sleep 1
>>
>>  Performance counter stats for 'sleep 1':
>>
>>               0.51 msec task-clock:u                     #    0.001 CPUs utilized
>>                  0      context-switches:u               #    0.000 /sec
>>                  0      cpu-migrations:u                 #    0.000 /sec
>>                 68      page-faults:u                    #  132.683 K/sec
>>            545,228      cycles:u                         #    1.064 GHz
>>            555,509      instructions:u                   #    1.02  insn per cycle
>>            123,574      branches:u                       #  241.120 M/sec
>>              6,957      branch-misses:u                  #    5.63% of all branches
>>                         TopdownL1                 #     17.5 %  tma_backend_bound
>>                                                   #     22.6 %  tma_bad_speculation
>>                                                   #     42.7 %  tma_frontend_bound
>>                                                   #     17.1 %  tma_retiring
>>                         TopdownL2                 #     21.8 %  tma_branch_mispredicts
>>                                                   #     11.5 %  tma_core_bound
>>                                                   #     13.4 %  tma_fetch_bandwidth
>>                                                   #     29.3 %  tma_fetch_latency
>>                                                   #      2.7 %  tma_heavy_operations
>>                                                   #     14.5 %  tma_light_operations
>>                                                   #      0.8 %  tma_machine_clears
>>                                                   #      6.1 %  tma_memory_bound
>>
>>        1.001712086 seconds time elapsed
>>
>>        0.000151000 seconds user
>>        0.001618000 seconds sys
>>
>>
>> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
>> ---
>>  tools/perf/builtin-stat.c      |   1 +
>>  tools/perf/util/evsel.h        |   1 +
>>  tools/perf/util/metricgroup.c  | 106 ++++++++++++++++++++++++++++++++-
>>  tools/perf/util/metricgroup.h  |   1 +
>>  tools/perf/util/stat-display.c |  69 ++++++++++++++++++++-
>>  5 files changed, 172 insertions(+), 6 deletions(-)
>>
>> diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
>> index 2269b3e90e9b..b274cc264d56 100644
>> --- a/tools/perf/builtin-stat.c
>> +++ b/tools/perf/builtin-stat.c
>> @@ -2172,6 +2172,7 @@ static int add_default_attributes(void)
>>
>>                         evlist__for_each_entry(metric_evlist, metric_evsel) {
>>                                 metric_evsel->skippable = true;
>> +                               metric_evsel->default_metricgroup = true;
>>                         }
>>                         evlist__splice_list_tail(evsel_list, &metric_evlist->core.entries);
>>                         evlist__delete(metric_evlist);
>> diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
>> index 36a32e4ca168..61b1385108f4 100644
>> --- a/tools/perf/util/evsel.h
>> +++ b/tools/perf/util/evsel.h
>> @@ -130,6 +130,7 @@ struct evsel {
>>         bool                    reset_group;
>>         bool                    errored;
>>         bool                    needs_auxtrace_mmap;
>> +       bool                    default_metricgroup;
> 
> A comment would be useful here, something like:
> 
> If running perf stat, is this evsel a member of a Default metric group metric.

Yes, it's the member of the 'default' metric group.
I will add a comment.

> 
>>         struct hashmap          *per_pkg_mask;
>>         int                     err;
>>         struct {
>> diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
>> index efafa02db5e5..22181ce4f27f 100644
>> --- a/tools/perf/util/metricgroup.c
>> +++ b/tools/perf/util/metricgroup.c
>> @@ -79,6 +79,7 @@ static struct rb_node *metric_event_new(struct rblist *rblist __maybe_unused,
>>                 return NULL;
>>         memcpy(me, entry, sizeof(struct metric_event));
>>         me->evsel = ((struct metric_event *)entry)->evsel;
>> +       me->default_metricgroup_name = NULL;
>>         INIT_LIST_HEAD(&me->head);
>>         return &me->nd;
>>  }
>> @@ -1133,14 +1134,19 @@ static int metricgroup__add_metric_sys_event_iter(const struct pmu_metric *pm,
>>  /**
>>   * metric_list_cmp - list_sort comparator that sorts metrics with more events to
>>   *                   the front. tool events are excluded from the count.
>> + *                   For the default metrics, sort them by metricgroup name.
>>   */
>> -static int metric_list_cmp(void *priv __maybe_unused, const struct list_head *l,
>> +static int metric_list_cmp(void *priv, const struct list_head *l,
>>                            const struct list_head *r)
>>  {
>>         const struct metric *left = container_of(l, struct metric, nd);
>>         const struct metric *right = container_of(r, struct metric, nd);
>>         struct expr_id_data *data;
>>         int i, left_count, right_count;
>> +       bool is_default = *(bool *)priv;
>> +
>> +       if (is_default && left->default_metricgroup_name && right->default_metricgroup_name)
>> +               return strcmp(left->default_metricgroup_name, right->default_metricgroup_name);
> 
> This breaks the comment above. The events are now sorted prioritizing
> default metric group names. This potentially will have an effect of
> reducing sharing of events between groups, it will also break
> assumptions within that code that there are always the same number of
> fewer events in a metric as you process the list. To remedy this I
> think you need to re-sort the metrics after the event sharing has had
> a chance to share events between groups.
> 
> 
>>
>>         left_count = hashmap__size(left->pctx->ids);
>>         perf_tool_event__for_each_event(i) {
>> @@ -1497,6 +1503,91 @@ static int parse_ids(bool metric_no_merge, struct perf_pmu *fake_pmu,
>>         return ret;
>>  }
>>
>> +static struct metric_event *
>> +metricgroup__lookup_default_metricgroup(struct rblist *metric_events,
>> +                                       struct evsel *evsel,
>> +                                       struct metric *m)
>> +{
>> +       struct metric_event *me;
>> +       char *name;
>> +       int err;
>> +
>> +       me = metricgroup__lookup(metric_events, evsel, true);
>> +       if (!me->default_metricgroup_name) {
>> +               if (m->pmu && strcmp(m->pmu, "cpu"))
>> +                       err = asprintf(&name, "%s (%s)", m->default_metricgroup_name, m->pmu);
>> +               else
>> +                       err = asprintf(&name, "%s", m->default_metricgroup_name);
>> +               if (err < 0)
>> +                       return NULL;
>> +               me->default_metricgroup_name = name;
>> +       }
>> +       if (!strncmp(m->default_metricgroup_name,
>> +                    me->default_metricgroup_name,
>> +                    strlen(m->default_metricgroup_name)))
>> +               return me;
>> +
>> +       return NULL;
>> +}
> 
> A function comment would be useful as the name is confusing, why
> lookup? Doesn't it create the value? Leak sanitizer isn't happy here:
> 
> ```
> ==1545918==ERROR: LeakSanitizer: detected memory leaks
> 
> Direct leak of 10 byte(s) in 1 object(s) allocated from:
>     #0 0x7f2755a7077b in __interceptor_strdup
> ../../../../src/libsanitizer/asan/asan_interceptors.cpp:439
>     #1 0x564986a8df31 in asprintf util/util.c:566
>     #2 0x5649869b5901 in metricgroup__lookup_default_metricgroup
> util/metricgroup.c:1520
>     #3 0x5649869b5e57 in metricgroup__lookup_create util/metricgroup.c:1579
>     #4 0x5649869b6ddc in parse_groups util/metricgroup.c:1698
>     #5 0x5649869b7714 in metricgroup__parse_groups util/metricgroup.c:1771
>     #6 0x5649867da9d5 in add_default_attributes tools/perf/builtin-stat.c:2164
>     #7 0x5649867ddbfb in cmd_stat tools/perf/builtin-stat.c:2707
>     #8 0x5649868fa5a2 in run_builtin tools/perf/perf.c:323
>     #9 0x5649868fab13 in handle_internal_command tools/perf/perf.c:377
>     #10 0x5649868faedb in run_argv tools/perf/perf.c:421
>     #11 0x5649868fb443 in main tools/perf/perf.c:537
>     #12 0x7f2754846189 in __libc_start_call_main
> ../sysdeps/nptl/libc_start_call_main.h:58
> ```
> 
>> +static struct metric_event *
>> +metricgroup__lookup_create(struct rblist *metric_events,
>> +                          struct evsel **evsel,
>> +                          struct list_head *metric_list,
>> +                          struct metric *m,
>> +                          bool is_default)
>> +{
>> +       struct metric_event *me;
>> +       struct metric *cur;
>> +       struct evsel *ev;
>> +       size_t i;
>> +
>> +       if (!is_default)
>> +               return metricgroup__lookup(metric_events, evsel[0], true);
>> +
>> +       /*
>> +        * If the metric group has been attached to a previous
>> +        * event/metric, use that metric event.
>> +        */
>> +       list_for_each_entry(cur, metric_list, nd) {
>> +               if (cur == m)
>> +                       break;
>> +               if (cur->pmu && strcmp(m->pmu, cur->pmu))
>> +                       continue;
>> +               if (strncmp(m->default_metricgroup_name,
>> +                           cur->default_metricgroup_name,
>> +                           strlen(m->default_metricgroup_name)))
>> +                       continue;
>> +               if (!cur->evlist)
>> +                       continue;
>> +               evlist__for_each_entry(cur->evlist, ev) {
>> +                       me = metricgroup__lookup(metric_events, ev, false);
>> +                       if (!strncmp(m->default_metricgroup_name,
>> +                                    me->default_metricgroup_name,
>> +                                    strlen(m->default_metricgroup_name)))
>> +                               return me;
>> +               }
>> +       }
>> +
>> +       /*
>> +        * Different metric groups may append to the same leader event.
>> +        * For example, TopdownL1 and TopdownL2 are appended to the
>> +        * TOPDOWN.SLOTS event.
>> +        * Split it and append the new metric group to the next available
>> +        * event.
>> +        */
>> +       me = metricgroup__lookup_default_metricgroup(metric_events, evsel[0], m);
>> +       if (me)
>> +               return me;
>> +
>> +       for (i = 1; i < hashmap__size(m->pctx->ids); i++) {
>> +               me = metricgroup__lookup_default_metricgroup(metric_events, evsel[i], m);
>> +               if (me)
>> +                       return me;
>> +       }
>> +       return NULL;
>> +}
>> +
> 
> I have a hard time understanding this function, does it just go away
> if you do the two sorts that I proposed above? Should this be
> metric_event__lookup_create? A function comment saying what the code
> is trying to achieve would be useful.
> 
> This appears to be trying to correct output issues by changing how
> metrics are associated with events, shouldn't output issues be
> resolved by fixing the output code? If not, why don't we apply this
> logic to TopdownL1, why just Default?

Yes, the above codes try to re-organize the metric and append the metric
from the same metricgroup into the same event. So it can be easily print
out later.

With the second sort, I think it should not be problem to address it in
the output code. Let me do some experiments.

> 
>>  static int parse_groups(struct evlist *perf_evlist,
>>                         const char *pmu, const char *str,
>>                         bool metric_no_group,
>> @@ -1512,6 +1603,7 @@ static int parse_groups(struct evlist *perf_evlist,
>>         LIST_HEAD(metric_list);
>>         struct metric *m;
>>         bool tool_events[PERF_TOOL_MAX] = {false};
>> +       bool is_default = !strcmp(str, "Default");
>>         int ret;
>>
>>         if (metric_events_list->nr_entries == 0)
>> @@ -1523,7 +1615,7 @@ static int parse_groups(struct evlist *perf_evlist,
>>                 goto out;
>>
>>         /* Sort metrics from largest to smallest. */
>> -       list_sort(NULL, &metric_list, metric_list_cmp);
>> +       list_sort((void *)&is_default, &metric_list, metric_list_cmp);
>>
>>         if (!metric_no_merge) {
>>                 struct expr_parse_ctx *combined = NULL;
>> @@ -1603,7 +1695,15 @@ static int parse_groups(struct evlist *perf_evlist,
>>                         goto out;
>>                 }
>>
>> -               me = metricgroup__lookup(metric_events_list, metric_events[0], true);
>> +               me = metricgroup__lookup_create(metric_events_list,
>> +                                               metric_events,
>> +                                               &metric_list, m,
>> +                                               is_default);
>> +               if (!me) {
>> +                       pr_err("Cannot create metric group for default!\n");
>> +                       ret = -EINVAL;
>> +                       goto out;
>> +               }
>>
>>                 expr = malloc(sizeof(struct metric_expr));
>>                 if (!expr) {
>> diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h
>> index bf18274c15df..e3609b853213 100644
>> --- a/tools/perf/util/metricgroup.h
>> +++ b/tools/perf/util/metricgroup.h
>> @@ -22,6 +22,7 @@ struct cgroup;
>>  struct metric_event {
>>         struct rb_node nd;
>>         struct evsel *evsel;
>> +       char *default_metricgroup_name;
>>         struct list_head head; /* list of metric_expr */
>>  };
>>
>> diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
>> index a2bbdc25d979..efe5fd04c033 100644
>> --- a/tools/perf/util/stat-display.c
>> +++ b/tools/perf/util/stat-display.c
>> @@ -21,10 +21,12 @@
>>  #include "iostat.h"
>>  #include "pmu.h"
>>  #include "pmus.h"
>> +#include "metricgroup.h"
> 
> This is bringing metric code from stat-shadow, which is kind of the
> whole reason there is a separation and stat-shadow exists. Should the
> logic exist in stat-shadow instead?
> 
>>
>>  #define CNTR_NOT_SUPPORTED     "<not supported>"
>>  #define CNTR_NOT_COUNTED       "<not counted>"
>>
>> +#define MGROUP_LEN   50
>>  #define METRIC_LEN   38
>>  #define EVNAME_LEN   32
>>  #define COUNTS_LEN   18
>> @@ -707,6 +709,55 @@ static bool evlist__has_hybrid(struct evlist *evlist)
>>         return false;
>>  }
>>
>> +static void print_metricgroup_header_json(struct perf_stat_config *config,
>> +                                         struct outstate *os __maybe_unused,
>> +                                         const char *metricgroup_name)
>> +{
>> +       fprintf(config->output, "\"metricgroup\" : \"%s\"}", metricgroup_name);
>> +       new_line_json(config, (void *)os);
>> +}
>> +
> 
> Should the output part of this patch be separate from the
> evsel/evlist/meitric modifications?
> 

Sure, I will split the patch.

Thanks,
Kan

> Thanks,
> Ian
> 
>> +static void print_metricgroup_header_csv(struct perf_stat_config *config,
>> +                                        struct outstate *os,
>> +                                        const char *metricgroup_name)
>> +{
>> +       int i;
>> +
>> +       for (i = 0; i < os->nfields; i++)
>> +               fputs(config->csv_sep, os->fh);
>> +       fprintf(config->output, "%s", metricgroup_name);
>> +       new_line_csv(config, (void *)os);
>> +}
>> +
>> +static void print_metricgroup_header_std(struct perf_stat_config *config,
>> +                                        struct outstate *os __maybe_unused,
>> +                                        const char *metricgroup_name)
>> +{
>> +       int n = fprintf(config->output, " %*s", EVNAME_LEN, metricgroup_name);
>> +
>> +       fprintf(config->output, "%*s", MGROUP_LEN - n - 1, "");
>> +}
>> +
>> +static void print_metricgroup_header(struct perf_stat_config *config,
>> +                                    struct outstate *os,
>> +                                    struct evsel *counter,
>> +                                    double noise, u64 run, u64 ena,
>> +                                    const char *metricgroup_name)
>> +{
>> +       aggr_printout(config, os->evsel, os->id, os->aggr_nr);
>> +
>> +       print_noise(config, counter, noise, /*before_metric=*/true);
>> +       print_running(config, run, ena, /*before_metric=*/true);
>> +
>> +       if (config->json_output) {
>> +               print_metricgroup_header_json(config, os, metricgroup_name);
>> +       } else if (config->csv_output) {
>> +               print_metricgroup_header_csv(config, os, metricgroup_name);
>> +       } else
>> +               print_metricgroup_header_std(config, os, metricgroup_name);
>> +
>> +}
>> +
>>  static void printout(struct perf_stat_config *config, struct outstate *os,
>>                      double uval, u64 run, u64 ena, double noise, int aggr_idx)
>>  {
>> @@ -751,10 +802,17 @@ static void printout(struct perf_stat_config *config, struct outstate *os,
>>         out.force_header = false;
>>
>>         if (!config->metric_only) {
>> -               abs_printout(config, os->id, os->aggr_nr, counter, uval, ok);
>> +               if (counter->default_metricgroup) {
>> +                       struct metric_event *me;
>>
>> -               print_noise(config, counter, noise, /*before_metric=*/true);
>> -               print_running(config, run, ena, /*before_metric=*/true);
>> +                       me = metricgroup__lookup(&config->metric_events, counter, false);
>> +                       print_metricgroup_header(config, os, counter, noise, run, ena,
>> +                                                me->default_metricgroup_name);
>> +               } else {
>> +                       abs_printout(config, os->id, os->aggr_nr, counter, uval, ok);
>> +                       print_noise(config, counter, noise, /*before_metric=*/true);
>> +                       print_running(config, run, ena, /*before_metric=*/true);
>> +               }
>>         }
>>
>>         if (ok) {
>> @@ -883,6 +941,11 @@ static void print_counter_aggrdata(struct perf_stat_config *config,
>>         if (counter->merged_stat)
>>                 return;
>>
>> +       /* Only print the metric group for the default mode */
>> +       if (counter->default_metricgroup &&
>> +           !metricgroup__lookup(&config->metric_events, counter, false))
>> +               return;
>> +
>>         uniquify_counter(config, counter);
>>
>>         val = aggr->counts.val;
>> --
>> 2.35.1
>>
  

Patch

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 2269b3e90e9b..b274cc264d56 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -2172,6 +2172,7 @@  static int add_default_attributes(void)
 
 			evlist__for_each_entry(metric_evlist, metric_evsel) {
 				metric_evsel->skippable = true;
+				metric_evsel->default_metricgroup = true;
 			}
 			evlist__splice_list_tail(evsel_list, &metric_evlist->core.entries);
 			evlist__delete(metric_evlist);
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 36a32e4ca168..61b1385108f4 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -130,6 +130,7 @@  struct evsel {
 	bool			reset_group;
 	bool			errored;
 	bool			needs_auxtrace_mmap;
+	bool			default_metricgroup;
 	struct hashmap		*per_pkg_mask;
 	int			err;
 	struct {
diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
index efafa02db5e5..22181ce4f27f 100644
--- a/tools/perf/util/metricgroup.c
+++ b/tools/perf/util/metricgroup.c
@@ -79,6 +79,7 @@  static struct rb_node *metric_event_new(struct rblist *rblist __maybe_unused,
 		return NULL;
 	memcpy(me, entry, sizeof(struct metric_event));
 	me->evsel = ((struct metric_event *)entry)->evsel;
+	me->default_metricgroup_name = NULL;
 	INIT_LIST_HEAD(&me->head);
 	return &me->nd;
 }
@@ -1133,14 +1134,19 @@  static int metricgroup__add_metric_sys_event_iter(const struct pmu_metric *pm,
 /**
  * metric_list_cmp - list_sort comparator that sorts metrics with more events to
  *                   the front. tool events are excluded from the count.
+ *                   For the default metrics, sort them by metricgroup name.
  */
-static int metric_list_cmp(void *priv __maybe_unused, const struct list_head *l,
+static int metric_list_cmp(void *priv, const struct list_head *l,
 			   const struct list_head *r)
 {
 	const struct metric *left = container_of(l, struct metric, nd);
 	const struct metric *right = container_of(r, struct metric, nd);
 	struct expr_id_data *data;
 	int i, left_count, right_count;
+	bool is_default = *(bool *)priv;
+
+	if (is_default && left->default_metricgroup_name && right->default_metricgroup_name)
+		return strcmp(left->default_metricgroup_name, right->default_metricgroup_name);
 
 	left_count = hashmap__size(left->pctx->ids);
 	perf_tool_event__for_each_event(i) {
@@ -1497,6 +1503,91 @@  static int parse_ids(bool metric_no_merge, struct perf_pmu *fake_pmu,
 	return ret;
 }
 
+static struct metric_event *
+metricgroup__lookup_default_metricgroup(struct rblist *metric_events,
+					struct evsel *evsel,
+					struct metric *m)
+{
+	struct metric_event *me;
+	char *name;
+	int err;
+
+	me = metricgroup__lookup(metric_events, evsel, true);
+	if (!me->default_metricgroup_name) {
+		if (m->pmu && strcmp(m->pmu, "cpu"))
+			err = asprintf(&name, "%s (%s)", m->default_metricgroup_name, m->pmu);
+		else
+			err = asprintf(&name, "%s", m->default_metricgroup_name);
+		if (err < 0)
+			return NULL;
+		me->default_metricgroup_name = name;
+	}
+	if (!strncmp(m->default_metricgroup_name,
+		     me->default_metricgroup_name,
+		     strlen(m->default_metricgroup_name)))
+		return me;
+
+	return NULL;
+}
+
+static struct metric_event *
+metricgroup__lookup_create(struct rblist *metric_events,
+			   struct evsel **evsel,
+			   struct list_head *metric_list,
+			   struct metric *m,
+			   bool is_default)
+{
+	struct metric_event *me;
+	struct metric *cur;
+	struct evsel *ev;
+	size_t i;
+
+	if (!is_default)
+		return metricgroup__lookup(metric_events, evsel[0], true);
+
+	/*
+	 * If the metric group has been attached to a previous
+	 * event/metric, use that metric event.
+	 */
+	list_for_each_entry(cur, metric_list, nd) {
+		if (cur == m)
+			break;
+		if (cur->pmu && strcmp(m->pmu, cur->pmu))
+			continue;
+		if (strncmp(m->default_metricgroup_name,
+			    cur->default_metricgroup_name,
+			    strlen(m->default_metricgroup_name)))
+			continue;
+		if (!cur->evlist)
+			continue;
+		evlist__for_each_entry(cur->evlist, ev) {
+			me = metricgroup__lookup(metric_events, ev, false);
+			if (!strncmp(m->default_metricgroup_name,
+				     me->default_metricgroup_name,
+				     strlen(m->default_metricgroup_name)))
+				return me;
+		}
+	}
+
+	/*
+	 * Different metric groups may append to the same leader event.
+	 * For example, TopdownL1 and TopdownL2 are appended to the
+	 * TOPDOWN.SLOTS event.
+	 * Split it and append the new metric group to the next available
+	 * event.
+	 */
+	me = metricgroup__lookup_default_metricgroup(metric_events, evsel[0], m);
+	if (me)
+		return me;
+
+	for (i = 1; i < hashmap__size(m->pctx->ids); i++) {
+		me = metricgroup__lookup_default_metricgroup(metric_events, evsel[i], m);
+		if (me)
+			return me;
+	}
+	return NULL;
+}
+
 static int parse_groups(struct evlist *perf_evlist,
 			const char *pmu, const char *str,
 			bool metric_no_group,
@@ -1512,6 +1603,7 @@  static int parse_groups(struct evlist *perf_evlist,
 	LIST_HEAD(metric_list);
 	struct metric *m;
 	bool tool_events[PERF_TOOL_MAX] = {false};
+	bool is_default = !strcmp(str, "Default");
 	int ret;
 
 	if (metric_events_list->nr_entries == 0)
@@ -1523,7 +1615,7 @@  static int parse_groups(struct evlist *perf_evlist,
 		goto out;
 
 	/* Sort metrics from largest to smallest. */
-	list_sort(NULL, &metric_list, metric_list_cmp);
+	list_sort((void *)&is_default, &metric_list, metric_list_cmp);
 
 	if (!metric_no_merge) {
 		struct expr_parse_ctx *combined = NULL;
@@ -1603,7 +1695,15 @@  static int parse_groups(struct evlist *perf_evlist,
 			goto out;
 		}
 
-		me = metricgroup__lookup(metric_events_list, metric_events[0], true);
+		me = metricgroup__lookup_create(metric_events_list,
+						metric_events,
+						&metric_list, m,
+						is_default);
+		if (!me) {
+			pr_err("Cannot create metric group for default!\n");
+			ret = -EINVAL;
+			goto out;
+		}
 
 		expr = malloc(sizeof(struct metric_expr));
 		if (!expr) {
diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h
index bf18274c15df..e3609b853213 100644
--- a/tools/perf/util/metricgroup.h
+++ b/tools/perf/util/metricgroup.h
@@ -22,6 +22,7 @@  struct cgroup;
 struct metric_event {
 	struct rb_node nd;
 	struct evsel *evsel;
+	char *default_metricgroup_name;
 	struct list_head head; /* list of metric_expr */
 };
 
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index a2bbdc25d979..efe5fd04c033 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -21,10 +21,12 @@ 
 #include "iostat.h"
 #include "pmu.h"
 #include "pmus.h"
+#include "metricgroup.h"
 
 #define CNTR_NOT_SUPPORTED	"<not supported>"
 #define CNTR_NOT_COUNTED	"<not counted>"
 
+#define MGROUP_LEN   50
 #define METRIC_LEN   38
 #define EVNAME_LEN   32
 #define COUNTS_LEN   18
@@ -707,6 +709,55 @@  static bool evlist__has_hybrid(struct evlist *evlist)
 	return false;
 }
 
+static void print_metricgroup_header_json(struct perf_stat_config *config,
+					  struct outstate *os __maybe_unused,
+					  const char *metricgroup_name)
+{
+	fprintf(config->output, "\"metricgroup\" : \"%s\"}", metricgroup_name);
+	new_line_json(config, (void *)os);
+}
+
+static void print_metricgroup_header_csv(struct perf_stat_config *config,
+					 struct outstate *os,
+					 const char *metricgroup_name)
+{
+	int i;
+
+	for (i = 0; i < os->nfields; i++)
+		fputs(config->csv_sep, os->fh);
+	fprintf(config->output, "%s", metricgroup_name);
+	new_line_csv(config, (void *)os);
+}
+
+static void print_metricgroup_header_std(struct perf_stat_config *config,
+					 struct outstate *os __maybe_unused,
+					 const char *metricgroup_name)
+{
+	int n = fprintf(config->output, " %*s", EVNAME_LEN, metricgroup_name);
+
+	fprintf(config->output, "%*s", MGROUP_LEN - n - 1, "");
+}
+
+static void print_metricgroup_header(struct perf_stat_config *config,
+				     struct outstate *os,
+				     struct evsel *counter,
+				     double noise, u64 run, u64 ena,
+				     const char *metricgroup_name)
+{
+	aggr_printout(config, os->evsel, os->id, os->aggr_nr);
+
+	print_noise(config, counter, noise, /*before_metric=*/true);
+	print_running(config, run, ena, /*before_metric=*/true);
+
+	if (config->json_output) {
+		print_metricgroup_header_json(config, os, metricgroup_name);
+	} else if (config->csv_output) {
+		print_metricgroup_header_csv(config, os, metricgroup_name);
+	} else
+		print_metricgroup_header_std(config, os, metricgroup_name);
+
+}
+
 static void printout(struct perf_stat_config *config, struct outstate *os,
 		     double uval, u64 run, u64 ena, double noise, int aggr_idx)
 {
@@ -751,10 +802,17 @@  static void printout(struct perf_stat_config *config, struct outstate *os,
 	out.force_header = false;
 
 	if (!config->metric_only) {
-		abs_printout(config, os->id, os->aggr_nr, counter, uval, ok);
+		if (counter->default_metricgroup) {
+			struct metric_event *me;
 
-		print_noise(config, counter, noise, /*before_metric=*/true);
-		print_running(config, run, ena, /*before_metric=*/true);
+			me = metricgroup__lookup(&config->metric_events, counter, false);
+			print_metricgroup_header(config, os, counter, noise, run, ena,
+						 me->default_metricgroup_name);
+		} else {
+			abs_printout(config, os->id, os->aggr_nr, counter, uval, ok);
+			print_noise(config, counter, noise, /*before_metric=*/true);
+			print_running(config, run, ena, /*before_metric=*/true);
+		}
 	}
 
 	if (ok) {
@@ -883,6 +941,11 @@  static void print_counter_aggrdata(struct perf_stat_config *config,
 	if (counter->merged_stat)
 		return;
 
+	/* Only print the metric group for the default mode */
+	if (counter->default_metricgroup &&
+	    !metricgroup__lookup(&config->metric_events, counter, false))
+		return;
+
 	uniquify_counter(config, counter);
 
 	val = aggr->counts.val;