[03/10] perf record: Add BPF event filter support

Message ID 20230314234237.3008956-4-namhyung@kernel.org
State New
Headers
Series perf record: Implement BPF sample filter (v5) |

Commit Message

Namhyung Kim March 14, 2023, 11:42 p.m. UTC
Use the --filter option to set a BPF filter for generic events other than
tracepoints or Intel PT.  The BPF program will check the sample data and
filter according to the expression.

For example, below is typical perf record output in frequency mode.
The sample period starts from 1 and increases gradually.

$ sudo ./perf record -e cycles true
$ sudo ./perf script
       perf-exec 2272336 546683.916875:          1 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
       perf-exec 2272336 546683.916892:          1 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
       perf-exec 2272336 546683.916899:          3 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
       perf-exec 2272336 546683.916905:         17 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
       perf-exec 2272336 546683.916911:        100 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
       perf-exec 2272336 546683.916917:        589 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
       perf-exec 2272336 546683.916924:       3470 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
       perf-exec 2272336 546683.916930:      20465 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
            true 2272336 546683.916940:     119873 cycles:  ffffffff8283afdd perf_iterate_ctx+0x2d ([kernel.kallsyms])
            true 2272336 546683.917003:     461349 cycles:  ffffffff82892517 vma_interval_tree_insert+0x37 ([kernel.kallsyms])
            true 2272336 546683.917237:     635778 cycles:  ffffffff82a11400 security_mmap_file+0x20 ([kernel.kallsyms])

When you add a BPF filter to get samples having periods greater than 1000,
the output would look like below:

$ sudo ./perf record -e cycles --filter 'period > 1000' true
$ sudo ./perf script
       perf-exec 2273949 546850.708501:       5029 cycles:  ffffffff826f9e25 finish_wait+0x5 ([kernel.kallsyms])
       perf-exec 2273949 546850.708508:      32409 cycles:  ffffffff826f9e25 finish_wait+0x5 ([kernel.kallsyms])
       perf-exec 2273949 546850.708526:     143369 cycles:  ffffffff82b4cdbf xas_start+0x5f ([kernel.kallsyms])
       perf-exec 2273949 546850.708600:     372650 cycles:  ffffffff8286b8f7 __pagevec_lru_add+0x117 ([kernel.kallsyms])
       perf-exec 2273949 546850.708791:     482953 cycles:  ffffffff829190de __mod_memcg_lruvec_state+0x4e ([kernel.kallsyms])
            true 2273949 546850.709036:     501985 cycles:  ffffffff828add7c tlb_gather_mmu+0x4c ([kernel.kallsyms])
            true 2273949 546850.709292:     503065 cycles:      7f2446d97c03 _dl_map_object_deps+0x973 (/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2)

Acked-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/Documentation/perf-record.txt | 15 +++++++++++---
 tools/perf/util/bpf_counter.c            |  3 +--
 tools/perf/util/evlist.c                 | 25 +++++++++++++++++-------
 tools/perf/util/evsel.c                  |  2 ++
 tools/perf/util/parse-events.c           |  8 +++-----
 5 files changed, 36 insertions(+), 17 deletions(-)
  

Comments

Arnaldo Carvalho de Melo March 15, 2023, 1:47 p.m. UTC | #1
Em Tue, Mar 14, 2023 at 04:42:30PM -0700, Namhyung Kim escreveu:
> Use --filter option to set BPF filter for generic events other than the
> tracepoints or Intel PT.  The BPF program will check the sample data and
> filter according to the expression.
> 
> For example, the below is the typical perf record for frequency mode.
> The sample period started from 1 and increased gradually.
> 
> $ sudo ./perf record -e cycles true
> $ sudo ./perf script
>        perf-exec 2272336 546683.916875:          1 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
>        perf-exec 2272336 546683.916892:          1 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
>        perf-exec 2272336 546683.916899:          3 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
>        perf-exec 2272336 546683.916905:         17 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
>        perf-exec 2272336 546683.916911:        100 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
>        perf-exec 2272336 546683.916917:        589 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
>        perf-exec 2272336 546683.916924:       3470 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
>        perf-exec 2272336 546683.916930:      20465 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
>             true 2272336 546683.916940:     119873 cycles:  ffffffff8283afdd perf_iterate_ctx+0x2d ([kernel.kallsyms])
>             true 2272336 546683.917003:     461349 cycles:  ffffffff82892517 vma_interval_tree_insert+0x37 ([kernel.kallsyms])
>             true 2272336 546683.917237:     635778 cycles:  ffffffff82a11400 security_mmap_file+0x20 ([kernel.kallsyms])
> 
> When you add a BPF filter to get samples having periods greater than 1000,
> the output would look like below:

Had to add:

diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c
index be336f1b2b689602..153a13cdca9df1ea 100644
--- a/tools/perf/util/python.c
+++ b/tools/perf/util/python.c
@@ -19,6 +19,7 @@
 #include "mmap.h"
 #include "stat.h"
 #include "metricgroup.h"
+#include "util/bpf-filter.h"
 #include "util/env.h"
 #include "util/pmu.h"
 #include <internal/lib.h>
@@ -135,6 +136,18 @@ int bpf_counter__disable(struct evsel *evsel __maybe_unused)
 	return 0;
 }
 
+// not to drag util/bpf-filter.c
+
+int perf_bpf_filter__prepare(struct evsel *evsel __maybe_unused)
+{
+	return 0;
+}
+
+int perf_bpf_filter__destroy(struct evsel *evsel __maybe_unused)
+{
+	return 0;
+}
+
 /*
  * Support debug printing even though util/debug.c is not linked.  That means
  * implementing 'verbose' and 'eprintf'.


Please run 'perf test' before submitting patches.

- Arnaldo
 
> $ sudo ./perf record -e cycles --filter 'period > 1000' true
> $ sudo ./perf script
>        perf-exec 2273949 546850.708501:       5029 cycles:  ffffffff826f9e25 finish_wait+0x5 ([kernel.kallsyms])
>        perf-exec 2273949 546850.708508:      32409 cycles:  ffffffff826f9e25 finish_wait+0x5 ([kernel.kallsyms])
>        perf-exec 2273949 546850.708526:     143369 cycles:  ffffffff82b4cdbf xas_start+0x5f ([kernel.kallsyms])
>        perf-exec 2273949 546850.708600:     372650 cycles:  ffffffff8286b8f7 __pagevec_lru_add+0x117 ([kernel.kallsyms])
>        perf-exec 2273949 546850.708791:     482953 cycles:  ffffffff829190de __mod_memcg_lruvec_state+0x4e ([kernel.kallsyms])
>             true 2273949 546850.709036:     501985 cycles:  ffffffff828add7c tlb_gather_mmu+0x4c ([kernel.kallsyms])
>             true 2273949 546850.709292:     503065 cycles:      7f2446d97c03 _dl_map_object_deps+0x973 (/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2)
> 
> Acked-by: Jiri Olsa <jolsa@kernel.org>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
>  tools/perf/Documentation/perf-record.txt | 15 +++++++++++---
>  tools/perf/util/bpf_counter.c            |  3 +--
>  tools/perf/util/evlist.c                 | 25 +++++++++++++++++-------
>  tools/perf/util/evsel.c                  |  2 ++
>  tools/perf/util/parse-events.c           |  8 +++-----
>  5 files changed, 36 insertions(+), 17 deletions(-)
> 
> diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
> index ff815c2f67e8..122f71726eaa 100644
> --- a/tools/perf/Documentation/perf-record.txt
> +++ b/tools/perf/Documentation/perf-record.txt
> @@ -119,9 +119,12 @@ OPTIONS
>  	  "perf report" to view group events together.
>  
>  --filter=<filter>::
> -        Event filter. This option should follow an event selector (-e) which
> -	selects either tracepoint event(s) or a hardware trace PMU
> -	(e.g. Intel PT or CoreSight).
> +	Event filter.  This option should follow an event selector (-e).
> +	If the event is a tracepoint, the filter string will be parsed by
> +	the kernel.  If the event is a hardware trace PMU (e.g. Intel PT
> +	or CoreSight), it'll be processed as an address filter.  Otherwise
> +	it means a general filter using BPF which can be applied for any
> +	kind of event.
>  
>  	- tracepoint filters
>  
> @@ -176,6 +179,12 @@ OPTIONS
>  
>  	Multiple filters can be separated with space or comma.
>  
> +	- bpf filters
> +
> +	A BPF filter can access the sample data and make a decision based on the
> +	data.  Users need to set an appropriate sample type to use the BPF
> +	filter.
> +
>  --exclude-perf::
>  	Don't record events issued by perf itself. This option should follow
>  	an event selector (-e) which selects tracepoint event(s). It adds a
> diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c
> index aa78a15a6f0a..1b77436e067e 100644
> --- a/tools/perf/util/bpf_counter.c
> +++ b/tools/perf/util/bpf_counter.c
> @@ -763,8 +763,7 @@ extern struct bpf_counter_ops bperf_cgrp_ops;
>  
>  static inline bool bpf_counter_skip(struct evsel *evsel)
>  {
> -	return list_empty(&evsel->bpf_counter_list) &&
> -		evsel->follower_skel == NULL;
> +	return evsel->bpf_counter_ops == NULL;
>  }
>  
>  int bpf_counter__install_pe(struct evsel *evsel, int cpu_map_idx, int fd)
> diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
> index b74e12239aec..cc491a037836 100644
> --- a/tools/perf/util/evlist.c
> +++ b/tools/perf/util/evlist.c
> @@ -31,6 +31,7 @@
>  #include "util/evlist-hybrid.h"
>  #include "util/pmu.h"
>  #include "util/sample.h"
> +#include "util/bpf-filter.h"
>  #include <signal.h>
>  #include <unistd.h>
>  #include <sched.h>
> @@ -1086,17 +1087,27 @@ int evlist__apply_filters(struct evlist *evlist, struct evsel **err_evsel)
>  	int err = 0;
>  
>  	evlist__for_each_entry(evlist, evsel) {
> -		if (evsel->filter == NULL)
> -			continue;
> -
>  		/*
>  		 * filters only work for tracepoint event, which doesn't have cpu limit.
>  		 * So evlist and evsel should always be same.
>  		 */
> -		err = perf_evsel__apply_filter(&evsel->core, evsel->filter);
> -		if (err) {
> -			*err_evsel = evsel;
> -			break;
> +		if (evsel->filter) {
> +			err = perf_evsel__apply_filter(&evsel->core, evsel->filter);
> +			if (err) {
> +				*err_evsel = evsel;
> +				break;
> +			}
> +		}
> +
> +		/*
> +		 * non-tracepoint events can have BPF filters.
> +		 */
> +		if (!list_empty(&evsel->bpf_filters)) {
> +			err = perf_bpf_filter__prepare(evsel);
> +			if (err) {
> +				*err_evsel = evsel;
> +				break;
> +			}
>  		}
>  	}
>  
> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> index a83d8cd5eb51..dc3faf005c3b 100644
> --- a/tools/perf/util/evsel.c
> +++ b/tools/perf/util/evsel.c
> @@ -50,6 +50,7 @@
>  #include "off_cpu.h"
>  #include "../perf-sys.h"
>  #include "util/parse-branch-options.h"
> +#include "util/bpf-filter.h"
>  #include <internal/xyarray.h>
>  #include <internal/lib.h>
>  #include <internal/threadmap.h>
> @@ -1517,6 +1518,7 @@ void evsel__exit(struct evsel *evsel)
>  	assert(list_empty(&evsel->core.node));
>  	assert(evsel->evlist == NULL);
>  	bpf_counter__destroy(evsel);
> +	perf_bpf_filter__destroy(evsel);
>  	evsel__free_counts(evsel);
>  	perf_evsel__free_fd(&evsel->core);
>  	perf_evsel__free_id(&evsel->core);
> diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
> index 3b2e5bb3e852..6c5cf5244486 100644
> --- a/tools/perf/util/parse-events.c
> +++ b/tools/perf/util/parse-events.c
> @@ -28,6 +28,7 @@
>  #include "perf.h"
>  #include "util/parse-events-hybrid.h"
>  #include "util/pmu-hybrid.h"
> +#include "util/bpf-filter.h"
>  #include "tracepoint.h"
>  #include "thread_map.h"
>  
> @@ -2542,11 +2543,8 @@ static int set_filter(struct evsel *evsel, const void *arg)
>  		perf_pmu__scan_file(pmu, "nr_addr_filters",
>  				    "%d", &nr_addr_filters);
>  
> -	if (!nr_addr_filters) {
> -		fprintf(stderr,
> -			"This CPU does not support address filtering\n");
> -		return -1;
> -	}
> +	if (!nr_addr_filters)
> +		return perf_bpf_filter__parse(&evsel->bpf_filters, str);
>  
>  	if (evsel__append_addr_filter(evsel, str) < 0) {
>  		fprintf(stderr,
> -- 
> 2.40.0.rc1.284.g88254d51c5-goog
>
  
Namhyung Kim March 15, 2023, 4:41 p.m. UTC | #2
Hi Arnaldo,

On Wed, Mar 15, 2023 at 6:47 AM Arnaldo Carvalho de Melo
<acme@kernel.org> wrote:
>
> Em Tue, Mar 14, 2023 at 04:42:30PM -0700, Namhyung Kim escreveu:
> > Use --filter option to set BPF filter for generic events other than the
> > tracepoints or Intel PT.  The BPF program will check the sample data and
> > filter according to the expression.
> >
> > For example, the below is the typical perf record for frequency mode.
> > The sample period started from 1 and increased gradually.
> >
> > $ sudo ./perf record -e cycles true
> > $ sudo ./perf script
> >        perf-exec 2272336 546683.916875:          1 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
> >        perf-exec 2272336 546683.916892:          1 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
> >        perf-exec 2272336 546683.916899:          3 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
> >        perf-exec 2272336 546683.916905:         17 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
> >        perf-exec 2272336 546683.916911:        100 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
> >        perf-exec 2272336 546683.916917:        589 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
> >        perf-exec 2272336 546683.916924:       3470 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
> >        perf-exec 2272336 546683.916930:      20465 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
> >             true 2272336 546683.916940:     119873 cycles:  ffffffff8283afdd perf_iterate_ctx+0x2d ([kernel.kallsyms])
> >             true 2272336 546683.917003:     461349 cycles:  ffffffff82892517 vma_interval_tree_insert+0x37 ([kernel.kallsyms])
> >             true 2272336 546683.917237:     635778 cycles:  ffffffff82a11400 security_mmap_file+0x20 ([kernel.kallsyms])
> >
> > When you add a BPF filter to get samples having periods greater than 1000,
> > the output would look like below:
>
> Had to add:
>
> diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c
> index be336f1b2b689602..153a13cdca9df1ea 100644
> --- a/tools/perf/util/python.c
> +++ b/tools/perf/util/python.c
> @@ -19,6 +19,7 @@
>  #include "mmap.h"
>  #include "stat.h"
>  #include "metricgroup.h"
> +#include "util/bpf-filter.h"
>  #include "util/env.h"
>  #include "util/pmu.h"
>  #include <internal/lib.h>
> @@ -135,6 +136,18 @@ int bpf_counter__disable(struct evsel *evsel __maybe_unused)
>         return 0;
>  }
>
> +// not to drag util/bpf-filter.c
> +
> +int perf_bpf_filter__prepare(struct evsel *evsel __maybe_unused)
> +{
> +       return 0;
> +}
> +
> +int perf_bpf_filter__destroy(struct evsel *evsel __maybe_unused)
> +{
> +       return 0;
> +}
> +
>  /*
>   * Support debug printing even though util/debug.c is not linked.  That means
>   * implementing 'verbose' and 'eprintf'.
>
>
> Please run 'perf test' before submitting patches,

Ugh, sorry.  I think I ran it at some point but missed the python test :-p

Anyway, I'm afraid you need to enclose with #ifndef HAVE_BPF_SKEL.

Thanks,
Namhyung


>
> - Arnaldo
>
> > $ sudo ./perf record -e cycles --filter 'period > 1000' true
> > $ sudo ./perf script
> >        perf-exec 2273949 546850.708501:       5029 cycles:  ffffffff826f9e25 finish_wait+0x5 ([kernel.kallsyms])
> >        perf-exec 2273949 546850.708508:      32409 cycles:  ffffffff826f9e25 finish_wait+0x5 ([kernel.kallsyms])
> >        perf-exec 2273949 546850.708526:     143369 cycles:  ffffffff82b4cdbf xas_start+0x5f ([kernel.kallsyms])
> >        perf-exec 2273949 546850.708600:     372650 cycles:  ffffffff8286b8f7 __pagevec_lru_add+0x117 ([kernel.kallsyms])
> >        perf-exec 2273949 546850.708791:     482953 cycles:  ffffffff829190de __mod_memcg_lruvec_state+0x4e ([kernel.kallsyms])
> >             true 2273949 546850.709036:     501985 cycles:  ffffffff828add7c tlb_gather_mmu+0x4c ([kernel.kallsyms])
> >             true 2273949 546850.709292:     503065 cycles:      7f2446d97c03 _dl_map_object_deps+0x973 (/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2)
> >
> > Acked-by: Jiri Olsa <jolsa@kernel.org>
> > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > ---
> >  tools/perf/Documentation/perf-record.txt | 15 +++++++++++---
> >  tools/perf/util/bpf_counter.c            |  3 +--
> >  tools/perf/util/evlist.c                 | 25 +++++++++++++++++-------
> >  tools/perf/util/evsel.c                  |  2 ++
> >  tools/perf/util/parse-events.c           |  8 +++-----
> >  5 files changed, 36 insertions(+), 17 deletions(-)
> >
> > diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
> > index ff815c2f67e8..122f71726eaa 100644
> > --- a/tools/perf/Documentation/perf-record.txt
> > +++ b/tools/perf/Documentation/perf-record.txt
> > @@ -119,9 +119,12 @@ OPTIONS
> >         "perf report" to view group events together.
> >
> >  --filter=<filter>::
> > -        Event filter. This option should follow an event selector (-e) which
> > -     selects either tracepoint event(s) or a hardware trace PMU
> > -     (e.g. Intel PT or CoreSight).
> > +     Event filter.  This option should follow an event selector (-e).
> > +     If the event is a tracepoint, the filter string will be parsed by
> > +     the kernel.  If the event is a hardware trace PMU (e.g. Intel PT
> > +     or CoreSight), it'll be processed as an address filter.  Otherwise
> > +     it means a general filter using BPF which can be applied for any
> > +     kind of event.
> >
> >       - tracepoint filters
> >
> > @@ -176,6 +179,12 @@ OPTIONS
> >
> >       Multiple filters can be separated with space or comma.
> >
> > +     - bpf filters
> > +
> > +     A BPF filter can access the sample data and make a decision based on the
> > +     data.  Users need to set an appropriate sample type to use the BPF
> > +     filter.
> > +
> >  --exclude-perf::
> >       Don't record events issued by perf itself. This option should follow
> >       an event selector (-e) which selects tracepoint event(s). It adds a
> > diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c
> > index aa78a15a6f0a..1b77436e067e 100644
> > --- a/tools/perf/util/bpf_counter.c
> > +++ b/tools/perf/util/bpf_counter.c
> > @@ -763,8 +763,7 @@ extern struct bpf_counter_ops bperf_cgrp_ops;
> >
> >  static inline bool bpf_counter_skip(struct evsel *evsel)
> >  {
> > -     return list_empty(&evsel->bpf_counter_list) &&
> > -             evsel->follower_skel == NULL;
> > +     return evsel->bpf_counter_ops == NULL;
> >  }
> >
> >  int bpf_counter__install_pe(struct evsel *evsel, int cpu_map_idx, int fd)
> > diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
> > index b74e12239aec..cc491a037836 100644
> > --- a/tools/perf/util/evlist.c
> > +++ b/tools/perf/util/evlist.c
> > @@ -31,6 +31,7 @@
> >  #include "util/evlist-hybrid.h"
> >  #include "util/pmu.h"
> >  #include "util/sample.h"
> > +#include "util/bpf-filter.h"
> >  #include <signal.h>
> >  #include <unistd.h>
> >  #include <sched.h>
> > @@ -1086,17 +1087,27 @@ int evlist__apply_filters(struct evlist *evlist, struct evsel **err_evsel)
> >       int err = 0;
> >
> >       evlist__for_each_entry(evlist, evsel) {
> > -             if (evsel->filter == NULL)
> > -                     continue;
> > -
> >               /*
> >                * filters only work for tracepoint event, which doesn't have cpu limit.
> >                * So evlist and evsel should always be same.
> >                */
> > -             err = perf_evsel__apply_filter(&evsel->core, evsel->filter);
> > -             if (err) {
> > -                     *err_evsel = evsel;
> > -                     break;
> > +             if (evsel->filter) {
> > +                     err = perf_evsel__apply_filter(&evsel->core, evsel->filter);
> > +                     if (err) {
> > +                             *err_evsel = evsel;
> > +                             break;
> > +                     }
> > +             }
> > +
> > +             /*
> > +              * non-tracepoint events can have BPF filters.
> > +              */
> > +             if (!list_empty(&evsel->bpf_filters)) {
> > +                     err = perf_bpf_filter__prepare(evsel);
> > +                     if (err) {
> > +                             *err_evsel = evsel;
> > +                             break;
> > +                     }
> >               }
> >       }
> >
> > diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> > index a83d8cd5eb51..dc3faf005c3b 100644
> > --- a/tools/perf/util/evsel.c
> > +++ b/tools/perf/util/evsel.c
> > @@ -50,6 +50,7 @@
> >  #include "off_cpu.h"
> >  #include "../perf-sys.h"
> >  #include "util/parse-branch-options.h"
> > +#include "util/bpf-filter.h"
> >  #include <internal/xyarray.h>
> >  #include <internal/lib.h>
> >  #include <internal/threadmap.h>
> > @@ -1517,6 +1518,7 @@ void evsel__exit(struct evsel *evsel)
> >       assert(list_empty(&evsel->core.node));
> >       assert(evsel->evlist == NULL);
> >       bpf_counter__destroy(evsel);
> > +     perf_bpf_filter__destroy(evsel);
> >       evsel__free_counts(evsel);
> >       perf_evsel__free_fd(&evsel->core);
> >       perf_evsel__free_id(&evsel->core);
> > diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
> > index 3b2e5bb3e852..6c5cf5244486 100644
> > --- a/tools/perf/util/parse-events.c
> > +++ b/tools/perf/util/parse-events.c
> > @@ -28,6 +28,7 @@
> >  #include "perf.h"
> >  #include "util/parse-events-hybrid.h"
> >  #include "util/pmu-hybrid.h"
> > +#include "util/bpf-filter.h"
> >  #include "tracepoint.h"
> >  #include "thread_map.h"
> >
> > @@ -2542,11 +2543,8 @@ static int set_filter(struct evsel *evsel, const void *arg)
> >               perf_pmu__scan_file(pmu, "nr_addr_filters",
> >                                   "%d", &nr_addr_filters);
> >
> > -     if (!nr_addr_filters) {
> > -             fprintf(stderr,
> > -                     "This CPU does not support address filtering\n");
> > -             return -1;
> > -     }
> > +     if (!nr_addr_filters)
> > +             return perf_bpf_filter__parse(&evsel->bpf_filters, str);
> >
> >       if (evsel__append_addr_filter(evsel, str) < 0) {
> >               fprintf(stderr,
> > --
> > 2.40.0.rc1.284.g88254d51c5-goog
> >
>
> --
>
> - Arnaldo
  
Arnaldo Carvalho de Melo March 15, 2023, 4:51 p.m. UTC | #3
On March 15, 2023 1:41:29 PM GMT-03:00, Namhyung Kim <namhyung@kernel.org> wrote:
>Hi Arnaldo,
>
>On Wed, Mar 15, 2023 at 6:47 AM Arnaldo Carvalho de Melo
><acme@kernel.org> wrote:
>>
>> Em Tue, Mar 14, 2023 at 04:42:30PM -0700, Namhyung Kim escreveu:
>> > Use --filter option to set BPF filter for generic events other than the
>> > tracepoints or Intel PT.  The BPF program will check the sample data and
>> > filter according to the expression.
>> >
>> > For example, the below is the typical perf record for frequency mode.
>> > The sample period started from 1 and increased gradually.
>> >
>> > $ sudo ./perf record -e cycles true
>> > $ sudo ./perf script
>> >        perf-exec 2272336 546683.916875:          1 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
>> >        perf-exec 2272336 546683.916892:          1 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
>> >        perf-exec 2272336 546683.916899:          3 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
>> >        perf-exec 2272336 546683.916905:         17 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
>> >        perf-exec 2272336 546683.916911:        100 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
>> >        perf-exec 2272336 546683.916917:        589 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
>> >        perf-exec 2272336 546683.916924:       3470 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
>> >        perf-exec 2272336 546683.916930:      20465 cycles:  ffffffff828499b8 perf_event_exec+0x298 ([kernel.kallsyms])
>> >             true 2272336 546683.916940:     119873 cycles:  ffffffff8283afdd perf_iterate_ctx+0x2d ([kernel.kallsyms])
>> >             true 2272336 546683.917003:     461349 cycles:  ffffffff82892517 vma_interval_tree_insert+0x37 ([kernel.kallsyms])
>> >             true 2272336 546683.917237:     635778 cycles:  ffffffff82a11400 security_mmap_file+0x20 ([kernel.kallsyms])
>> >
>> > When you add a BPF filter to get samples having periods greater than 1000,
>> > the output would look like below:
>>
>> Had to add:
>>
>> diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c
>> index be336f1b2b689602..153a13cdca9df1ea 100644
>> --- a/tools/perf/util/python.c
>> +++ b/tools/perf/util/python.c
>> @@ -19,6 +19,7 @@
>>  #include "mmap.h"
>>  #include "stat.h"
>>  #include "metricgroup.h"
>> +#include "util/bpf-filter.h"
>>  #include "util/env.h"
>>  #include "util/pmu.h"
>>  #include <internal/lib.h>
>> @@ -135,6 +136,18 @@ int bpf_counter__disable(struct evsel *evsel __maybe_unused)
>>         return 0;
>>  }
>>
>> +// not to drag util/bpf-filter.c
>> +
>> +int perf_bpf_filter__prepare(struct evsel *evsel __maybe_unused)
>> +{
>> +       return 0;
>> +}
>> +
>> +int perf_bpf_filter__destroy(struct evsel *evsel __maybe_unused)
>> +{
>> +       return 0;
>> +}
>> +
>>  /*
>>   * Support debug printing even though util/debug.c is not linked.  That means
>>   * implementing 'verbose' and 'eprintf'.
>>
>>
>> Please run 'perf test' before submitting patches,
>
>Ugh, sorry.  I think I ran it at some point but missed the python test :-p
>
>Anyway, I'm afraid you need to enclose with #ifndef HAVE_BPF_SKEL.

Right, I noticed that

>
>Thanks,
>Namhyung
>
>
>>
>> - Arnaldo
>>
>> > $ sudo ./perf record -e cycles --filter 'period > 1000' true
>> > $ sudo ./perf script
>> >        perf-exec 2273949 546850.708501:       5029 cycles:  ffffffff826f9e25 finish_wait+0x5 ([kernel.kallsyms])
>> >        perf-exec 2273949 546850.708508:      32409 cycles:  ffffffff826f9e25 finish_wait+0x5 ([kernel.kallsyms])
>> >        perf-exec 2273949 546850.708526:     143369 cycles:  ffffffff82b4cdbf xas_start+0x5f ([kernel.kallsyms])
>> >        perf-exec 2273949 546850.708600:     372650 cycles:  ffffffff8286b8f7 __pagevec_lru_add+0x117 ([kernel.kallsyms])
>> >        perf-exec 2273949 546850.708791:     482953 cycles:  ffffffff829190de __mod_memcg_lruvec_state+0x4e ([kernel.kallsyms])
>> >             true 2273949 546850.709036:     501985 cycles:  ffffffff828add7c tlb_gather_mmu+0x4c ([kernel.kallsyms])
>> >             true 2273949 546850.709292:     503065 cycles:      7f2446d97c03 _dl_map_object_deps+0x973 (/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2)
>> >
>> > Acked-by: Jiri Olsa <jolsa@kernel.org>
>> > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
>> > ---
>> >  tools/perf/Documentation/perf-record.txt | 15 +++++++++++---
>> >  tools/perf/util/bpf_counter.c            |  3 +--
>> >  tools/perf/util/evlist.c                 | 25 +++++++++++++++++-------
>> >  tools/perf/util/evsel.c                  |  2 ++
>> >  tools/perf/util/parse-events.c           |  8 +++-----
>> >  5 files changed, 36 insertions(+), 17 deletions(-)
>> >
>> > diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
>> > index ff815c2f67e8..122f71726eaa 100644
>> > --- a/tools/perf/Documentation/perf-record.txt
>> > +++ b/tools/perf/Documentation/perf-record.txt
>> > @@ -119,9 +119,12 @@ OPTIONS
>> >         "perf report" to view group events together.
>> >
>> >  --filter=<filter>::
>> > -        Event filter. This option should follow an event selector (-e) which
>> > -     selects either tracepoint event(s) or a hardware trace PMU
>> > -     (e.g. Intel PT or CoreSight).
>> > +     Event filter.  This option should follow an event selector (-e).
>> > +     If the event is a tracepoint, the filter string will be parsed by
>> > +     the kernel.  If the event is a hardware trace PMU (e.g. Intel PT
>> > +     or CoreSight), it'll be processed as an address filter.  Otherwise
>> > +     it means a general filter using BPF which can be applied for any
>> > +     kind of event.
>> >
>> >       - tracepoint filters
>> >
>> > @@ -176,6 +179,12 @@ OPTIONS
>> >
>> >       Multiple filters can be separated with space or comma.
>> >
>> > +     - bpf filters
>> > +
>> > +     A BPF filter can access the sample data and make a decision based on the
>> > +     data.  Users need to set an appropriate sample type to use the BPF
>> > +     filter.
>> > +
>> >  --exclude-perf::
>> >       Don't record events issued by perf itself. This option should follow
>> >       an event selector (-e) which selects tracepoint event(s). It adds a
>> > diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c
>> > index aa78a15a6f0a..1b77436e067e 100644
>> > --- a/tools/perf/util/bpf_counter.c
>> > +++ b/tools/perf/util/bpf_counter.c
>> > @@ -763,8 +763,7 @@ extern struct bpf_counter_ops bperf_cgrp_ops;
>> >
>> >  static inline bool bpf_counter_skip(struct evsel *evsel)
>> >  {
>> > -     return list_empty(&evsel->bpf_counter_list) &&
>> > -             evsel->follower_skel == NULL;
>> > +     return evsel->bpf_counter_ops == NULL;
>> >  }
>> >
>> >  int bpf_counter__install_pe(struct evsel *evsel, int cpu_map_idx, int fd)
>> > diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
>> > index b74e12239aec..cc491a037836 100644
>> > --- a/tools/perf/util/evlist.c
>> > +++ b/tools/perf/util/evlist.c
>> > @@ -31,6 +31,7 @@
>> >  #include "util/evlist-hybrid.h"
>> >  #include "util/pmu.h"
>> >  #include "util/sample.h"
>> > +#include "util/bpf-filter.h"
>> >  #include <signal.h>
>> >  #include <unistd.h>
>> >  #include <sched.h>
>> > @@ -1086,17 +1087,27 @@ int evlist__apply_filters(struct evlist *evlist, struct evsel **err_evsel)
>> >       int err = 0;
>> >
>> >       evlist__for_each_entry(evlist, evsel) {
>> > -             if (evsel->filter == NULL)
>> > -                     continue;
>> > -
>> >               /*
>> >                * filters only work for tracepoint event, which doesn't have cpu limit.
>> >                * So evlist and evsel should always be same.
>> >                */
>> > -             err = perf_evsel__apply_filter(&evsel->core, evsel->filter);
>> > -             if (err) {
>> > -                     *err_evsel = evsel;
>> > -                     break;
>> > +             if (evsel->filter) {
>> > +                     err = perf_evsel__apply_filter(&evsel->core, evsel->filter);
>> > +                     if (err) {
>> > +                             *err_evsel = evsel;
>> > +                             break;
>> > +                     }
>> > +             }
>> > +
>> > +             /*
>> > +              * non-tracepoint events can have BPF filters.
>> > +              */
>> > +             if (!list_empty(&evsel->bpf_filters)) {
>> > +                     err = perf_bpf_filter__prepare(evsel);
>> > +                     if (err) {
>> > +                             *err_evsel = evsel;
>> > +                             break;
>> > +                     }
>> >               }
>> >       }
>> >
>> > diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
>> > index a83d8cd5eb51..dc3faf005c3b 100644
>> > --- a/tools/perf/util/evsel.c
>> > +++ b/tools/perf/util/evsel.c
>> > @@ -50,6 +50,7 @@
>> >  #include "off_cpu.h"
>> >  #include "../perf-sys.h"
>> >  #include "util/parse-branch-options.h"
>> > +#include "util/bpf-filter.h"
>> >  #include <internal/xyarray.h>
>> >  #include <internal/lib.h>
>> >  #include <internal/threadmap.h>
>> > @@ -1517,6 +1518,7 @@ void evsel__exit(struct evsel *evsel)
>> >       assert(list_empty(&evsel->core.node));
>> >       assert(evsel->evlist == NULL);
>> >       bpf_counter__destroy(evsel);
>> > +     perf_bpf_filter__destroy(evsel);
>> >       evsel__free_counts(evsel);
>> >       perf_evsel__free_fd(&evsel->core);
>> >       perf_evsel__free_id(&evsel->core);
>> > diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
>> > index 3b2e5bb3e852..6c5cf5244486 100644
>> > --- a/tools/perf/util/parse-events.c
>> > +++ b/tools/perf/util/parse-events.c
>> > @@ -28,6 +28,7 @@
>> >  #include "perf.h"
>> >  #include "util/parse-events-hybrid.h"
>> >  #include "util/pmu-hybrid.h"
>> > +#include "util/bpf-filter.h"
>> >  #include "tracepoint.h"
>> >  #include "thread_map.h"
>> >
>> > @@ -2542,11 +2543,8 @@ static int set_filter(struct evsel *evsel, const void *arg)
>> >               perf_pmu__scan_file(pmu, "nr_addr_filters",
>> >                                   "%d", &nr_addr_filters);
>> >
>> > -     if (!nr_addr_filters) {
>> > -             fprintf(stderr,
>> > -                     "This CPU does not support address filtering\n");
>> > -             return -1;
>> > -     }
>> > +     if (!nr_addr_filters)
>> > +             return perf_bpf_filter__parse(&evsel->bpf_filters, str);
>> >
>> >       if (evsel__append_addr_filter(evsel, str) < 0) {
>> >               fprintf(stderr,
>> > --
>> > 2.40.0.rc1.284.g88254d51c5-goog
>> >
>>
>> --
>>
>> - Arnaldo
  

Patch

diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index ff815c2f67e8..122f71726eaa 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -119,9 +119,12 @@  OPTIONS
 	  "perf report" to view group events together.
 
 --filter=<filter>::
-        Event filter. This option should follow an event selector (-e) which
-	selects either tracepoint event(s) or a hardware trace PMU
-	(e.g. Intel PT or CoreSight).
+	Event filter.  This option should follow an event selector (-e).
+	If the event is a tracepoint, the filter string will be parsed by
+	the kernel.  If the event is a hardware trace PMU (e.g. Intel PT
+	or CoreSight), it'll be processed as an address filter.  Otherwise
+	it is treated as a general BPF filter, which can be applied to any
+	kind of event.
 
 	- tracepoint filters
 
@@ -176,6 +179,12 @@  OPTIONS
 
 	Multiple filters can be separated with space or comma.
 
+	- bpf filters
+
+	A BPF filter can access the sample data and make a decision based on the
+	data.  Users need to set an appropriate sample type to use the BPF
+	filter.
+
 --exclude-perf::
 	Don't record events issued by perf itself. This option should follow
 	an event selector (-e) which selects tracepoint event(s). It adds a
diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c
index aa78a15a6f0a..1b77436e067e 100644
--- a/tools/perf/util/bpf_counter.c
+++ b/tools/perf/util/bpf_counter.c
@@ -763,8 +763,7 @@  extern struct bpf_counter_ops bperf_cgrp_ops;
 
 static inline bool bpf_counter_skip(struct evsel *evsel)
 {
-	return list_empty(&evsel->bpf_counter_list) &&
-		evsel->follower_skel == NULL;
+	return evsel->bpf_counter_ops == NULL;
 }
 
 int bpf_counter__install_pe(struct evsel *evsel, int cpu_map_idx, int fd)
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index b74e12239aec..cc491a037836 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -31,6 +31,7 @@ 
 #include "util/evlist-hybrid.h"
 #include "util/pmu.h"
 #include "util/sample.h"
+#include "util/bpf-filter.h"
 #include <signal.h>
 #include <unistd.h>
 #include <sched.h>
@@ -1086,17 +1087,27 @@  int evlist__apply_filters(struct evlist *evlist, struct evsel **err_evsel)
 	int err = 0;
 
 	evlist__for_each_entry(evlist, evsel) {
-		if (evsel->filter == NULL)
-			continue;
-
 		/*
 		 * filters only work for tracepoint event, which doesn't have cpu limit.
 		 * So evlist and evsel should always be same.
 		 */
-		err = perf_evsel__apply_filter(&evsel->core, evsel->filter);
-		if (err) {
-			*err_evsel = evsel;
-			break;
+		if (evsel->filter) {
+			err = perf_evsel__apply_filter(&evsel->core, evsel->filter);
+			if (err) {
+				*err_evsel = evsel;
+				break;
+			}
+		}
+
+		/*
+		 * non-tracepoint events can have BPF filters.
+		 */
+		if (!list_empty(&evsel->bpf_filters)) {
+			err = perf_bpf_filter__prepare(evsel);
+			if (err) {
+				*err_evsel = evsel;
+				break;
+			}
 		}
 	}
 
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index a83d8cd5eb51..dc3faf005c3b 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -50,6 +50,7 @@ 
 #include "off_cpu.h"
 #include "../perf-sys.h"
 #include "util/parse-branch-options.h"
+#include "util/bpf-filter.h"
 #include <internal/xyarray.h>
 #include <internal/lib.h>
 #include <internal/threadmap.h>
@@ -1517,6 +1518,7 @@  void evsel__exit(struct evsel *evsel)
 	assert(list_empty(&evsel->core.node));
 	assert(evsel->evlist == NULL);
 	bpf_counter__destroy(evsel);
+	perf_bpf_filter__destroy(evsel);
 	evsel__free_counts(evsel);
 	perf_evsel__free_fd(&evsel->core);
 	perf_evsel__free_id(&evsel->core);
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 3b2e5bb3e852..6c5cf5244486 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -28,6 +28,7 @@ 
 #include "perf.h"
 #include "util/parse-events-hybrid.h"
 #include "util/pmu-hybrid.h"
+#include "util/bpf-filter.h"
 #include "tracepoint.h"
 #include "thread_map.h"
 
@@ -2542,11 +2543,8 @@  static int set_filter(struct evsel *evsel, const void *arg)
 		perf_pmu__scan_file(pmu, "nr_addr_filters",
 				    "%d", &nr_addr_filters);
 
-	if (!nr_addr_filters) {
-		fprintf(stderr,
-			"This CPU does not support address filtering\n");
-		return -1;
-	}
+	if (!nr_addr_filters)
+		return perf_bpf_filter__parse(&evsel->bpf_filters, str);
 
 	if (evsel__append_addr_filter(evsel, str) < 0) {
 		fprintf(stderr,