[V4,01/11] perf/core: Add aux_pause, aux_resume, aux_start_paused
Commit Message
Hardware traces, such as instruction traces, can produce a vast amount of
trace data, so being able to reduce tracing to more specific circumstances
can be useful.
The ability to pause or resume tracing when another event happens, can do
that.
Add ability for an event to "pause" or "resume" AUX area tracing.
Add aux_pause bit to perf_event_attr to indicate that, if the event
happens, the associated AUX area tracing should be paused. Ditto
aux_resume. Do not allow aux_pause and aux_resume to be set together.
Add aux_start_paused bit to perf_event_attr to indicate to an AUX area
event that it should start in a "paused" state.
Add aux_paused to struct perf_event for AUX area events to keep track of
the "paused" state. aux_paused is initialized to aux_start_paused.
Add PERF_EF_PAUSE and PERF_EF_RESUME modes for ->stop() and ->start()
callbacks. Call as needed, during __perf_event_output(). Add
aux_in_pause_resume to struct perf_buffer to prevent races with the NMI
handler. Pause/resume in NMI context will miss out if it coincides with
another pause/resume.
To use aux_pause or aux_resume, an event must be in a group with the AUX
area event as the group leader.
Example (requires Intel PT and tools patches also):
$ perf record --kcore -e intel_pt/aux-action=start-paused/k,syscalls:sys_enter_newuname/aux-action=resume/,syscalls:sys_exit_newuname/aux-action=pause/ uname
Linux
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.043 MB perf.data ]
$ perf script --call-trace
uname 30805 [000] 24001.058782799: name: 0x7ffc9c1865b0
uname 30805 [000] 24001.058784424: psb offs: 0
uname 30805 [000] 24001.058784424: cbr: 39 freq: 3904 MHz (139%)
uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) debug_smp_processor_id
uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) __x64_sys_newuname
uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) down_read
uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) __cond_resched
uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) preempt_count_add
uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) in_lock_functions
uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) preempt_count_sub
uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) up_read
uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) preempt_count_add
uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) in_lock_functions
uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) preempt_count_sub
uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) _copy_to_user
uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) syscall_exit_to_user_mode
uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) syscall_exit_work
uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) perf_syscall_exit
uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) debug_smp_processor_id
uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_trace_buf_alloc
uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_swevent_get_recursion_context
uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) debug_smp_processor_id
uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) debug_smp_processor_id
uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_tp_event
uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_trace_buf_update
uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) tracing_gen_ctx_irq_test
uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_swevent_event
uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) __perf_event_account_interrupt
uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) __this_cpu_preempt_check
uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_event_output_forward
uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_event_aux_pause
uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) ring_buffer_get
uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) __rcu_read_lock
uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) __rcu_read_unlock
uname 30805 [000] 24001.058785254: ([kernel.kallsyms]) pt_event_stop
uname 30805 [000] 24001.058785254: ([kernel.kallsyms]) debug_smp_processor_id
uname 30805 [000] 24001.058785254: ([kernel.kallsyms]) debug_smp_processor_id
uname 30805 [000] 24001.058785254: ([kernel.kallsyms]) native_write_msr
uname 30805 [000] 24001.058785463: ([kernel.kallsyms]) native_write_msr
uname 30805 [000] 24001.058785639: 0x0
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
---
Changes in V4:
Rename aux_output_cfg -> aux_action
Reorder aux_action bits from:
aux_pause, aux_resume, aux_start_paused
to:
aux_start_paused, aux_pause, aux_resume
Fix aux_action bits __u64 -> __u32
include/linux/perf_event.h | 15 +++++++
include/uapi/linux/perf_event.h | 11 ++++-
kernel/events/core.c | 72 +++++++++++++++++++++++++++++++--
kernel/events/internal.h | 1 +
4 files changed, 95 insertions(+), 4 deletions(-)
Comments
On Thu, Jan 11, 2024 at 12:19 AM Adrian Hunter <adrian.hunter@intel.com> wrote:
>
> Hardware traces, such as instruction traces, can produce a vast amount of
> trace data, so being able to reduce tracing to more specific circumstances
> can be useful.
>
> The ability to pause or resume tracing when another event happens, can do
> that.
>
> Add ability for an event to "pause" or "resume" AUX area tracing.
>
> Add aux_pause bit to perf_event_attr to indicate that, if the event
> happens, the associated AUX area tracing should be paused. Ditto
> aux_resume. Do not allow aux_pause and aux_resume to be set together.
>
> Add aux_start_paused bit to perf_event_attr to indicate to an AUX area
> event that it should start in a "paused" state.
>
> Add aux_paused to struct perf_event for AUX area events to keep track of
> the "paused" state. aux_paused is initialized to aux_start_paused.
>
> Add PERF_EF_PAUSE and PERF_EF_RESUME modes for ->stop() and ->start()
> callbacks. Call as needed, during __perf_event_output(). Add
> aux_in_pause_resume to struct perf_buffer to prevent races with the NMI
> handler. Pause/resume in NMI context will miss out if it coincides with
> another pause/resume.
>
> To use aux_pause or aux_resume, an event must be in a group with the AUX
> area event as the group leader.
>
> Example (requires Intel PT and tools patches also):
>
> $ perf record --kcore -e intel_pt/aux-action=start-paused/k,syscalls:sys_enter_newuname/aux-action=resume/,syscalls:sys_exit_newuname/aux-action=pause/ uname
> Linux
> [ perf record: Woken up 1 times to write data ]
> [ perf record: Captured and wrote 0.043 MB perf.data ]
> $ perf script --call-trace
> uname 30805 [000] 24001.058782799: name: 0x7ffc9c1865b0
> uname 30805 [000] 24001.058784424: psb offs: 0
> uname 30805 [000] 24001.058784424: cbr: 39 freq: 3904 MHz (139%)
> uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) debug_smp_processor_id
> uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) __x64_sys_newuname
> uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) down_read
> uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) __cond_resched
> uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) preempt_count_add
> uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) in_lock_functions
> uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) preempt_count_sub
> uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) up_read
> uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) preempt_count_add
> uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) in_lock_functions
> uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) preempt_count_sub
> uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) _copy_to_user
> uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) syscall_exit_to_user_mode
> uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) syscall_exit_work
> uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) perf_syscall_exit
> uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) debug_smp_processor_id
> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_trace_buf_alloc
> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_swevent_get_recursion_context
> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) debug_smp_processor_id
> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) debug_smp_processor_id
> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_tp_event
> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_trace_buf_update
> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) tracing_gen_ctx_irq_test
> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_swevent_event
> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) __perf_event_account_interrupt
> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) __this_cpu_preempt_check
> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_event_output_forward
> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_event_aux_pause
> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) ring_buffer_get
> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) __rcu_read_lock
> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) __rcu_read_unlock
> uname 30805 [000] 24001.058785254: ([kernel.kallsyms]) pt_event_stop
> uname 30805 [000] 24001.058785254: ([kernel.kallsyms]) debug_smp_processor_id
> uname 30805 [000] 24001.058785254: ([kernel.kallsyms]) debug_smp_processor_id
> uname 30805 [000] 24001.058785254: ([kernel.kallsyms]) native_write_msr
> uname 30805 [000] 24001.058785463: ([kernel.kallsyms]) native_write_msr
> uname 30805 [000] 24001.058785639: 0x0
Looks great! I think this is very similar to what Kees asked in
https://lore.kernel.org/linux-perf-users/202401091452.B73E21B6C@keescook/
I have a couple of basic questions:
* Can we do that for regular events too?
* What's the difference between start/stop and pause/resume?
(IOW can we do that just using start/stop callbacks?)
Actually I was thinking about dropping samples using a BPF filter
outside the target scope (e.g. a syscall) but it'd be nice if we can
have builtin support for that.
Thanks,
Namhyung
>
> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
> ---
>
>
> Changes in V4:
> Rename aux_output_cfg -> aux_action
> Reorder aux_action bits from:
> aux_pause, aux_resume, aux_start_paused
> to:
> aux_start_paused, aux_pause, aux_resume
> Fix aux_action bits __u64 -> __u32
>
>
> include/linux/perf_event.h | 15 +++++++
> include/uapi/linux/perf_event.h | 11 ++++-
> kernel/events/core.c | 72 +++++++++++++++++++++++++++++++--
> kernel/events/internal.h | 1 +
> 4 files changed, 95 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 5547ba68e6e4..342879168269 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -291,6 +291,7 @@ struct perf_event_pmu_context;
> #define PERF_PMU_CAP_NO_EXCLUDE 0x0040
> #define PERF_PMU_CAP_AUX_OUTPUT 0x0080
> #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100
> +#define PERF_PMU_CAP_AUX_PAUSE 0x0200
>
> struct perf_output_handle;
>
> @@ -363,6 +364,8 @@ struct pmu {
> #define PERF_EF_START 0x01 /* start the counter when adding */
> #define PERF_EF_RELOAD 0x02 /* reload the counter when starting */
> #define PERF_EF_UPDATE 0x04 /* update the counter when stopping */
> +#define PERF_EF_PAUSE 0x08 /* AUX area event, pause tracing */
> +#define PERF_EF_RESUME 0x10 /* AUX area event, resume tracing */
>
> /*
> * Adds/Removes a counter to/from the PMU, can be done inside a
> @@ -402,6 +405,15 @@ struct pmu {
> *
> * ->start() with PERF_EF_RELOAD will reprogram the counter
> * value, must be preceded by a ->stop() with PERF_EF_UPDATE.
> + *
> + * ->stop() with PERF_EF_PAUSE will stop as simply as possible. Will not
> + * overlap another ->stop() with PERF_EF_PAUSE nor ->start() with
> + * PERF_EF_RESUME.
> + *
> + * ->start() with PERF_EF_RESUME will start as simply as possible but
> + * only if the counter is not otherwise stopped. Will not overlap
> + * another ->start() with PERF_EF_RESUME nor ->stop() with
> + * PERF_EF_PAUSE.
> */
> void (*start) (struct perf_event *event, int flags);
> void (*stop) (struct perf_event *event, int flags);
> @@ -798,6 +810,9 @@ struct perf_event {
> /* for aux_output events */
> struct perf_event *aux_event;
>
> + /* for AUX area events */
> + unsigned int aux_paused;
> +
> void (*destroy)(struct perf_event *);
> struct rcu_head rcu_head;
>
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index 39c6a250dd1b..5f6b3b494184 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -507,7 +507,16 @@ struct perf_event_attr {
> __u16 sample_max_stack;
> __u16 __reserved_2;
> __u32 aux_sample_size;
> - __u32 __reserved_3;
> +
> + union {
> + __u32 aux_action;
> + struct {
> + __u32 aux_start_paused : 1, /* start AUX area tracing paused */
> + aux_pause : 1, /* on overflow, pause AUX area tracing */
> + aux_resume : 1, /* on overflow, resume AUX area tracing */
> + __reserved_3 : 29;
> + };
> + };
>
> /*
> * User provided data if sigtrap=1, passed back to user via
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 9efd0d7775e7..dc9ec2443ac9 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -2097,7 +2097,8 @@ static void perf_put_aux_event(struct perf_event *event)
>
> static bool perf_need_aux_event(struct perf_event *event)
> {
> - return !!event->attr.aux_output || !!event->attr.aux_sample_size;
> + return event->attr.aux_output || event->attr.aux_sample_size ||
> + event->attr.aux_pause || event->attr.aux_resume;
> }
>
> static int perf_get_aux_event(struct perf_event *event,
> @@ -2122,6 +2123,10 @@ static int perf_get_aux_event(struct perf_event *event,
> !perf_aux_output_match(event, group_leader))
> return 0;
>
> + if ((event->attr.aux_pause || event->attr.aux_resume) &&
> + !(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE))
> + return 0;
> +
> if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
> return 0;
>
> @@ -7846,6 +7851,47 @@ void perf_prepare_header(struct perf_event_header *header,
> WARN_ON_ONCE(header->size & 7);
> }
>
> +static void __perf_event_aux_pause(struct perf_event *event, bool pause)
> +{
> + if (pause) {
> + if (!READ_ONCE(event->aux_paused)) {
> + WRITE_ONCE(event->aux_paused, 1);
> + event->pmu->stop(event, PERF_EF_PAUSE);
> + }
> + } else {
> + if (READ_ONCE(event->aux_paused)) {
> + WRITE_ONCE(event->aux_paused, 0);
> + event->pmu->start(event, PERF_EF_RESUME);
> + }
> + }
> +}
> +
> +static void perf_event_aux_pause(struct perf_event *event, bool pause)
> +{
> + struct perf_buffer *rb;
> + unsigned long flags;
> +
> + if (WARN_ON_ONCE(!event))
> + return;
> +
> + rb = ring_buffer_get(event);
> + if (!rb)
> + return;
> +
> + local_irq_save(flags);
> + /* Guard against NMI, NMI loses here */
> + if (READ_ONCE(rb->aux_in_pause_resume))
> + goto out_restore;
> + WRITE_ONCE(rb->aux_in_pause_resume, 1);
> + barrier();
> + __perf_event_aux_pause(event, pause);
> + barrier();
> + WRITE_ONCE(rb->aux_in_pause_resume, 0);
> +out_restore:
> + local_irq_restore(flags);
> + ring_buffer_put(rb);
> +}
> +
> static __always_inline int
> __perf_event_output(struct perf_event *event,
> struct perf_sample_data *data,
> @@ -7859,6 +7905,9 @@ __perf_event_output(struct perf_event *event,
> struct perf_event_header header;
> int err;
>
> + if (event->attr.aux_pause)
> + perf_event_aux_pause(event->aux_event, true);
> +
> /* protect the callchain buffers */
> rcu_read_lock();
>
> @@ -7875,6 +7924,10 @@ __perf_event_output(struct perf_event *event,
>
> exit:
> rcu_read_unlock();
> +
> + if (event->attr.aux_resume)
> + perf_event_aux_pause(event->aux_event, false);
> +
> return err;
> }
>
> @@ -12014,10 +12067,23 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
> }
>
> if (event->attr.aux_output &&
> - !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
> + (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) ||
> + event->attr.aux_pause || event->attr.aux_resume)) {
> + err = -EOPNOTSUPP;
> + goto err_pmu;
> + }
> +
> + if (event->attr.aux_pause && event->attr.aux_resume) {
> + err = -EINVAL;
> + goto err_pmu;
> + }
> +
> + if (event->attr.aux_start_paused &&
> + !(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) {
> err = -EOPNOTSUPP;
> goto err_pmu;
> }
> + event->aux_paused = event->attr.aux_start_paused;
>
> if (cgroup_fd != -1) {
> err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
> @@ -12814,7 +12880,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
> * Grouping is not supported for kernel events, neither is 'AUX',
> * make sure the caller's intentions are adjusted.
> */
> - if (attr->aux_output)
> + if (attr->aux_output || attr->aux_action)
> return ERR_PTR(-EINVAL);
>
> event = perf_event_alloc(attr, cpu, task, NULL, NULL,
> diff --git a/kernel/events/internal.h b/kernel/events/internal.h
> index 5150d5f84c03..3320f78117dc 100644
> --- a/kernel/events/internal.h
> +++ b/kernel/events/internal.h
> @@ -51,6 +51,7 @@ struct perf_buffer {
> void (*free_aux)(void *);
> refcount_t aux_refcount;
> int aux_in_sampling;
> + int aux_in_pause_resume;
> void **aux_pages;
> void *aux_priv;
>
> --
> 2.34.1
>
On 19/01/24 23:40, Namhyung Kim wrote:
> On Thu, Jan 11, 2024 at 12:19 AM Adrian Hunter <adrian.hunter@intel.com> wrote:
>>
>> Hardware traces, such as instruction traces, can produce a vast amount of
>> trace data, so being able to reduce tracing to more specific circumstances
>> can be useful.
>>
>> The ability to pause or resume tracing when another event happens, can do
>> that.
>>
>> Add ability for an event to "pause" or "resume" AUX area tracing.
>>
>> Add aux_pause bit to perf_event_attr to indicate that, if the event
>> happens, the associated AUX area tracing should be paused. Ditto
>> aux_resume. Do not allow aux_pause and aux_resume to be set together.
>>
>> Add aux_start_paused bit to perf_event_attr to indicate to an AUX area
>> event that it should start in a "paused" state.
>>
>> Add aux_paused to struct perf_event for AUX area events to keep track of
>> the "paused" state. aux_paused is initialized to aux_start_paused.
>>
>> Add PERF_EF_PAUSE and PERF_EF_RESUME modes for ->stop() and ->start()
>> callbacks. Call as needed, during __perf_event_output(). Add
>> aux_in_pause_resume to struct perf_buffer to prevent races with the NMI
>> handler. Pause/resume in NMI context will miss out if it coincides with
>> another pause/resume.
>>
>> To use aux_pause or aux_resume, an event must be in a group with the AUX
>> area event as the group leader.
>>
>> Example (requires Intel PT and tools patches also):
>>
>> $ perf record --kcore -e intel_pt/aux-action=start-paused/k,syscalls:sys_enter_newuname/aux-action=resume/,syscalls:sys_exit_newuname/aux-action=pause/ uname
>> Linux
>> [ perf record: Woken up 1 times to write data ]
>> [ perf record: Captured and wrote 0.043 MB perf.data ]
>> $ perf script --call-trace
>> uname 30805 [000] 24001.058782799: name: 0x7ffc9c1865b0
>> uname 30805 [000] 24001.058784424: psb offs: 0
>> uname 30805 [000] 24001.058784424: cbr: 39 freq: 3904 MHz (139%)
>> uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) debug_smp_processor_id
>> uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) __x64_sys_newuname
>> uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) down_read
>> uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) __cond_resched
>> uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) preempt_count_add
>> uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) in_lock_functions
>> uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) preempt_count_sub
>> uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) up_read
>> uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) preempt_count_add
>> uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) in_lock_functions
>> uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) preempt_count_sub
>> uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) _copy_to_user
>> uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) syscall_exit_to_user_mode
>> uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) syscall_exit_work
>> uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) perf_syscall_exit
>> uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) debug_smp_processor_id
>> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_trace_buf_alloc
>> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_swevent_get_recursion_context
>> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) debug_smp_processor_id
>> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) debug_smp_processor_id
>> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_tp_event
>> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_trace_buf_update
>> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) tracing_gen_ctx_irq_test
>> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_swevent_event
>> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) __perf_event_account_interrupt
>> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) __this_cpu_preempt_check
>> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_event_output_forward
>> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_event_aux_pause
>> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) ring_buffer_get
>> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) __rcu_read_lock
>> uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) __rcu_read_unlock
>> uname 30805 [000] 24001.058785254: ([kernel.kallsyms]) pt_event_stop
>> uname 30805 [000] 24001.058785254: ([kernel.kallsyms]) debug_smp_processor_id
>> uname 30805 [000] 24001.058785254: ([kernel.kallsyms]) debug_smp_processor_id
>> uname 30805 [000] 24001.058785254: ([kernel.kallsyms]) native_write_msr
>> uname 30805 [000] 24001.058785463: ([kernel.kallsyms]) native_write_msr
>> uname 30805 [000] 24001.058785639: 0x0
>
> Looks great! I think this is very similar to what Kees asked in
>
> https://lore.kernel.org/linux-perf-users/202401091452.B73E21B6C@keescook/
Sometimes a precisely-defined workload is needed, just so that
running it repeatedly does not produce results that vary too much
to tell whether one software version is better than another.
>
> I have a couple of basic questions:
> * Can we do that for regular events too?
That would be much more complicated. The current implementation
can only pause / resume 1 event, the group leader, and it has to
be supported by the PMU callbacks.
> * What's the difference between start/stop and pause/resume?
> (IOW can we do that just using start/stop callbacks?)
It is using start / stop callbacks, albeit with a different mode
parameter. However pause / resume is not allowed unless the event
has been started and not stopped, so it is a different state.
>
> Actually I was thinking about dropping samples using a BPF filter
> outside the target scope (e.g. a syscall) but it'd be nice if we can
> have builtin support for that.
In general, I would have thought that capturing samples does not
produce so much data that it cannot be filtered in post-processing.
Looking at the email thread from above, that seems to be what
Arnaldo has proposed.
AUX area tracing is different in this regard. Intel PT can produce
more trace data than can be written out in time, so data will be
lost for large traces. Also post-processing takes a long time, so
less data captured helps a lot there also.
@@ -291,6 +291,7 @@ struct perf_event_pmu_context;
#define PERF_PMU_CAP_NO_EXCLUDE 0x0040
#define PERF_PMU_CAP_AUX_OUTPUT 0x0080
#define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100
+#define PERF_PMU_CAP_AUX_PAUSE 0x0200
struct perf_output_handle;
@@ -363,6 +364,8 @@ struct pmu {
#define PERF_EF_START 0x01 /* start the counter when adding */
#define PERF_EF_RELOAD 0x02 /* reload the counter when starting */
#define PERF_EF_UPDATE 0x04 /* update the counter when stopping */
+#define PERF_EF_PAUSE 0x08 /* AUX area event, pause tracing */
+#define PERF_EF_RESUME 0x10 /* AUX area event, resume tracing */
/*
* Adds/Removes a counter to/from the PMU, can be done inside a
@@ -402,6 +405,15 @@ struct pmu {
*
* ->start() with PERF_EF_RELOAD will reprogram the counter
* value, must be preceded by a ->stop() with PERF_EF_UPDATE.
+ *
+ * ->stop() with PERF_EF_PAUSE will stop as simply as possible. Will not
+ * overlap another ->stop() with PERF_EF_PAUSE nor ->start() with
+ * PERF_EF_RESUME.
+ *
+ * ->start() with PERF_EF_RESUME will start as simply as possible but
+ * only if the counter is not otherwise stopped. Will not overlap
+ * another ->start() with PERF_EF_RESUME nor ->stop() with
+ * PERF_EF_PAUSE.
*/
void (*start) (struct perf_event *event, int flags);
void (*stop) (struct perf_event *event, int flags);
@@ -798,6 +810,9 @@ struct perf_event {
/* for aux_output events */
struct perf_event *aux_event;
+ /* for AUX area events */
+ unsigned int aux_paused;
+
void (*destroy)(struct perf_event *);
struct rcu_head rcu_head;
@@ -507,7 +507,16 @@ struct perf_event_attr {
__u16 sample_max_stack;
__u16 __reserved_2;
__u32 aux_sample_size;
- __u32 __reserved_3;
+
+ union {
+ __u32 aux_action;
+ struct {
+ __u32 aux_start_paused : 1, /* start AUX area tracing paused */
+ aux_pause : 1, /* on overflow, pause AUX area tracing */
+ aux_resume : 1, /* on overflow, resume AUX area tracing */
+ __reserved_3 : 29;
+ };
+ };
/*
* User provided data if sigtrap=1, passed back to user via
@@ -2097,7 +2097,8 @@ static void perf_put_aux_event(struct perf_event *event)
static bool perf_need_aux_event(struct perf_event *event)
{
- return !!event->attr.aux_output || !!event->attr.aux_sample_size;
+ return event->attr.aux_output || event->attr.aux_sample_size ||
+ event->attr.aux_pause || event->attr.aux_resume;
}
static int perf_get_aux_event(struct perf_event *event,
@@ -2122,6 +2123,10 @@ static int perf_get_aux_event(struct perf_event *event,
!perf_aux_output_match(event, group_leader))
return 0;
+ if ((event->attr.aux_pause || event->attr.aux_resume) &&
+ !(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE))
+ return 0;
+
if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
return 0;
@@ -7846,6 +7851,47 @@ void perf_prepare_header(struct perf_event_header *header,
WARN_ON_ONCE(header->size & 7);
}
+static void __perf_event_aux_pause(struct perf_event *event, bool pause)
+{
+ if (pause) {
+ if (!READ_ONCE(event->aux_paused)) {
+ WRITE_ONCE(event->aux_paused, 1);
+ event->pmu->stop(event, PERF_EF_PAUSE);
+ }
+ } else {
+ if (READ_ONCE(event->aux_paused)) {
+ WRITE_ONCE(event->aux_paused, 0);
+ event->pmu->start(event, PERF_EF_RESUME);
+ }
+ }
+}
+
+static void perf_event_aux_pause(struct perf_event *event, bool pause)
+{
+ struct perf_buffer *rb;
+ unsigned long flags;
+
+ if (WARN_ON_ONCE(!event))
+ return;
+
+ rb = ring_buffer_get(event);
+ if (!rb)
+ return;
+
+ local_irq_save(flags);
+ /* Guard against NMI, NMI loses here */
+ if (READ_ONCE(rb->aux_in_pause_resume))
+ goto out_restore;
+ WRITE_ONCE(rb->aux_in_pause_resume, 1);
+ barrier();
+ __perf_event_aux_pause(event, pause);
+ barrier();
+ WRITE_ONCE(rb->aux_in_pause_resume, 0);
+out_restore:
+ local_irq_restore(flags);
+ ring_buffer_put(rb);
+}
+
static __always_inline int
__perf_event_output(struct perf_event *event,
struct perf_sample_data *data,
@@ -7859,6 +7905,9 @@ __perf_event_output(struct perf_event *event,
struct perf_event_header header;
int err;
+ if (event->attr.aux_pause)
+ perf_event_aux_pause(event->aux_event, true);
+
/* protect the callchain buffers */
rcu_read_lock();
@@ -7875,6 +7924,10 @@ __perf_event_output(struct perf_event *event,
exit:
rcu_read_unlock();
+
+ if (event->attr.aux_resume)
+ perf_event_aux_pause(event->aux_event, false);
+
return err;
}
@@ -12014,10 +12067,23 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
if (event->attr.aux_output &&
- !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
+ (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) ||
+ event->attr.aux_pause || event->attr.aux_resume)) {
+ err = -EOPNOTSUPP;
+ goto err_pmu;
+ }
+
+ if (event->attr.aux_pause && event->attr.aux_resume) {
+ err = -EINVAL;
+ goto err_pmu;
+ }
+
+ if (event->attr.aux_start_paused &&
+ !(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) {
err = -EOPNOTSUPP;
goto err_pmu;
}
+ event->aux_paused = event->attr.aux_start_paused;
if (cgroup_fd != -1) {
err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
@@ -12814,7 +12880,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
* Grouping is not supported for kernel events, neither is 'AUX',
* make sure the caller's intentions are adjusted.
*/
- if (attr->aux_output)
+ if (attr->aux_output || attr->aux_action)
return ERR_PTR(-EINVAL);
event = perf_event_alloc(attr, cpu, task, NULL, NULL,
@@ -51,6 +51,7 @@ struct perf_buffer {
void (*free_aux)(void *);
refcount_t aux_refcount;
int aux_in_sampling;
+ int aux_in_pause_resume;
void **aux_pages;
void *aux_priv;