[RFC,04/10] perf: Introduce deferred user callchains

Message ID d5def69b0c88bcbe2a85d0e1fd6cfca62b472ed4.1699487758.git.jpoimboe@kernel.org
State New
Series perf: user space sframe unwinding

Commit Message

Josh Poimboeuf Nov. 9, 2023, 12:41 a.m. UTC
  Instead of attempting to unwind user space from the NMI handler, defer
it to run in task context by sending a self-IPI and then scheduling the
unwind to run in the IRQ's exit task work before returning to user space.

This allows the user stack page to be paged in if needed, avoids
duplicate unwinds for kernel-bound workloads, and prepares for SFrame
unwinding (so .sframe sections can be paged in on demand).

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 arch/Kconfig                    |  3 ++
 include/linux/perf_event.h      | 22 ++++++--
 include/uapi/linux/perf_event.h |  1 +
 kernel/bpf/stackmap.c           |  5 +-
 kernel/events/callchain.c       |  7 ++-
 kernel/events/core.c            | 90 ++++++++++++++++++++++++++++++---
 6 files changed, 115 insertions(+), 13 deletions(-)
  

Comments

Namhyung Kim Nov. 11, 2023, 6:57 a.m. UTC | #1
On Wed, Nov 8, 2023 at 4:43 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
>
> Instead of attempting to unwind user space from the NMI handler, defer
> it to run in task context by sending a self-IPI and then scheduling the
> unwind to run in the IRQ's exit task work before returning to user space.
>
> This allows the user stack page to be paged in if needed, avoids
> duplicate unwinds for kernel-bound workloads, and prepares for SFrame
> unwinding (so .sframe sections can be paged in on demand).
>
> Suggested-by: Steven Rostedt <rostedt@goodmis.org>
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
> ---
>  arch/Kconfig                    |  3 ++
>  include/linux/perf_event.h      | 22 ++++++--
>  include/uapi/linux/perf_event.h |  1 +
>  kernel/bpf/stackmap.c           |  5 +-
>  kernel/events/callchain.c       |  7 ++-
>  kernel/events/core.c            | 90 ++++++++++++++++++++++++++++++---
>  6 files changed, 115 insertions(+), 13 deletions(-)
>
> diff --git a/arch/Kconfig b/arch/Kconfig
> index f4b210ab0612..690c82212224 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -425,6 +425,9 @@ config HAVE_HARDLOCKUP_DETECTOR_ARCH
>           It uses the same command line parameters, and sysctl interface,
>           as the generic hardlockup detectors.
>
> +config HAVE_PERF_CALLCHAIN_DEFERRED
> +       bool
> +
>  config HAVE_PERF_REGS
>         bool
>         help
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 2d8fa253b9df..2f232111dff2 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -786,6 +786,7 @@ struct perf_event {
>         struct irq_work                 pending_irq;
>         struct callback_head            pending_task;
>         unsigned int                    pending_work;
> +       unsigned int                    pending_unwind;
>
>         atomic_t                        event_limit;
>
> @@ -1113,7 +1114,10 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
>  extern u64 perf_event_read_value(struct perf_event *event,
>                                  u64 *enabled, u64 *running);
>
> -extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs);
> +extern void perf_callchain(struct perf_sample_data *data,
> +                          struct perf_event *event, struct pt_regs *regs);
> +extern void perf_callchain_deferred(struct perf_sample_data *data,
> +                                   struct perf_event *event, struct pt_regs *regs);
>
>  static inline bool branch_sample_no_flags(const struct perf_event *event)
>  {
> @@ -1189,6 +1193,7 @@ struct perf_sample_data {
>         u64                             data_page_size;
>         u64                             code_page_size;
>         u64                             aux_size;
> +       bool                            deferred;
>  } ____cacheline_aligned;
>
>  /* default value for data source */
> @@ -1206,6 +1211,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
>         data->sample_flags = PERF_SAMPLE_PERIOD;
>         data->period = period;
>         data->dyn_size = 0;
> +       data->deferred = false;
>
>         if (addr) {
>                 data->addr = addr;
> @@ -1219,7 +1225,11 @@ static inline void perf_sample_save_callchain(struct perf_sample_data *data,
>  {
>         int size = 1;
>
> -       data->callchain = perf_callchain(event, regs);
> +       if (data->deferred)
> +               perf_callchain_deferred(data, event, regs);
> +       else
> +               perf_callchain(data, event, regs);
> +
>         size += data->callchain->nr;
>
>         data->dyn_size += size * sizeof(u64);
> @@ -1534,12 +1544,18 @@ extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct p
>  extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
>  extern struct perf_callchain_entry *
>  get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
> -                  u32 max_stack, bool add_mark);
> +                  u32 max_stack, bool add_mark, bool defer_user);
>  extern int get_callchain_buffers(int max_stack);
>  extern void put_callchain_buffers(void);
>  extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
>  extern void put_callchain_entry(int rctx);
>
> +#ifdef CONFIG_HAVE_PERF_CALLCHAIN_DEFERRED
> +extern void perf_callchain_user_deferred(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
> +#else
> +static inline void perf_callchain_user_deferred(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) {}
> +#endif
> +
>  extern int sysctl_perf_event_max_stack;
>  extern int sysctl_perf_event_max_contexts_per_stack;
>
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index 39c6a250dd1b..9a1127af4cda 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -1237,6 +1237,7 @@ enum perf_callchain_context {
>         PERF_CONTEXT_HV                 = (__u64)-32,
>         PERF_CONTEXT_KERNEL             = (__u64)-128,
>         PERF_CONTEXT_USER               = (__u64)-512,
> +       PERF_CONTEXT_USER_DEFERRED      = (__u64)-640,
>
>         PERF_CONTEXT_GUEST              = (__u64)-2048,
>         PERF_CONTEXT_GUEST_KERNEL       = (__u64)-2176,
> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> index e4827ca5378d..fcdd26715b12 100644
> --- a/kernel/bpf/stackmap.c
> +++ b/kernel/bpf/stackmap.c
> @@ -294,8 +294,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
>         if (max_depth > sysctl_perf_event_max_stack)
>                 max_depth = sysctl_perf_event_max_stack;
>
> -       trace = get_perf_callchain(regs, kernel, user, max_depth, false);
> -
> +       trace = get_perf_callchain(regs, kernel, user, max_depth, false, false);
>         if (unlikely(!trace))
>                 /* couldn't fetch the stack trace */
>                 return -EFAULT;
> @@ -420,7 +419,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
>                 trace = get_callchain_entry_for_task(task, max_depth);
>         else
>                 trace = get_perf_callchain(regs, kernel, user, max_depth,
> -                                          false);
> +                                          false, false);

Hmm.. BPF is not supported.  Any plans?


>         if (unlikely(!trace))
>                 goto err_fault;
>
> diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
> index 2bee8b6fda0e..16571c8d6771 100644
> --- a/kernel/events/callchain.c
> +++ b/kernel/events/callchain.c
> @@ -178,7 +178,7 @@ put_callchain_entry(int rctx)
>
>  struct perf_callchain_entry *
>  get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
> -                  u32 max_stack, bool add_mark)
> +                  u32 max_stack, bool add_mark, bool defer_user)
>  {
>         struct perf_callchain_entry *entry;
>         struct perf_callchain_entry_ctx ctx;
> @@ -207,6 +207,11 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
>                         regs = task_pt_regs(current);
>                 }
>
> +               if (defer_user) {
> +                       perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED);
> +                       goto exit_put;

Hmm.. ok.  So now the perf tools need to stitch the call stacks
from two (or more?) samples.


> +               }
> +
>                 if (add_mark)
>                         perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 5e41a3b70bcd..290e06b0071c 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6751,6 +6751,12 @@ static void perf_pending_irq(struct irq_work *entry)
>         struct perf_event *event = container_of(entry, struct perf_event, pending_irq);
>         int rctx;
>
> +       if (!is_software_event(event)) {
> +               if (event->pending_unwind)
> +                       task_work_add(current, &event->pending_task, TWA_RESUME);

I'm not familiar with this code.  Is it possible that the current task
is preempted before returning to user space (and runs
perf_pending_task_unwind)?


> +               return;
> +       }
> +
>         /*
>          * If we 'fail' here, that's OK, it means recursion is already disabled
>          * and we won't recurse 'further'.
> @@ -6772,11 +6778,57 @@ static void perf_pending_irq(struct irq_work *entry)
>                 perf_swevent_put_recursion_context(rctx);
>  }
>
> +static void perf_pending_task_unwind(struct perf_event *event)
> +{
> +       struct pt_regs *regs = task_pt_regs(current);
> +       struct perf_output_handle handle;
> +       struct perf_event_header header;
> +       struct perf_sample_data data;
> +       struct perf_callchain_entry *callchain;
> +
> +       callchain = kmalloc(sizeof(struct perf_callchain_entry) +
> +                           (sizeof(__u64) * event->attr.sample_max_stack) +
> +                           (sizeof(__u64) * 1) /* one context */,
> +                           GFP_KERNEL);

Any chance it can reuse get_perf_callchain() instead of
allocating the callchains every time?


> +       if (!callchain)
> +               return;
> +
> +       callchain->nr = 0;
> +       data.callchain = callchain;
> +
> +       perf_sample_data_init(&data, 0, event->hw.last_period);

It would double count the period for a sample.
I guess we want to set the period to 0.

> +
> +       data.deferred = true;
> +
> +       perf_prepare_sample(&data, event, regs);

I don't think this would work properly.  Not to mention it duplicates
sample data unnecessarily, some data needs special handling to
avoid inefficient (full) data copy like for (user) stack, regs and aux.

Anyway I'm not sure it can support these additional samples for
deferred callchains without breaking the existing perf tools.
Anyway it doesn't know PERF_CONTEXT_USER_DEFERRED at least.
I think this should be controlled by a new feature bit in the
perf_event_attr.

Then we might add a breaking change to have a special
sample record for the deferred callchains and sample ID only.

> +
> +       perf_prepare_header(&header, &data, event, regs);
> +
> +       if (perf_output_begin(&handle, &data, event, header.size))
> +               goto done;
> +
> +       perf_output_sample(&handle, &header, &data, event);
> +
> +       perf_output_end(&handle);
> +
> +done:
> +       kfree(callchain);
> +}
> +
> +
>  static void perf_pending_task(struct callback_head *head)
>  {
>         struct perf_event *event = container_of(head, struct perf_event, pending_task);
>         int rctx;
>
> +       if (!is_software_event(event)) {
> +               if (event->pending_unwind) {
> +                       perf_pending_task_unwind(event);
> +                       event->pending_unwind = 0;
> +               }
> +               return;
> +       }
> +
>         /*
>          * If we 'fail' here, that's OK, it means recursion is already disabled
>          * and we won't recurse 'further'.
> @@ -7587,22 +7639,48 @@ static u64 perf_get_page_size(unsigned long addr)
>
>  static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
>
> -struct perf_callchain_entry *
> -perf_callchain(struct perf_event *event, struct pt_regs *regs)
> +void perf_callchain(struct perf_sample_data *data, struct perf_event *event,
> +                   struct pt_regs *regs)
>  {
>         bool kernel = !event->attr.exclude_callchain_kernel;
>         bool user   = !event->attr.exclude_callchain_user;
>         const u32 max_stack = event->attr.sample_max_stack;
> -       struct perf_callchain_entry *callchain;
> +       bool defer_user = IS_ENABLED(CONFIG_HAVE_PERF_CALLCHAIN_DEFERRED);

Is it always enabled depending on the kernel config?
It could be controlled by event->attr.something..

Thanks,
Namhyung

>
>         /* Disallow cross-task user callchains. */
>         user &= !event->ctx->task || event->ctx->task == current;
>
>         if (!kernel && !user)
> -               return &__empty_callchain;
> +               goto empty;
>
> -       callchain = get_perf_callchain(regs, kernel, user, max_stack, true);
> -       return callchain ?: &__empty_callchain;
> +       data->callchain = get_perf_callchain(regs, kernel, user, max_stack, true, defer_user);
> +       if (!data->callchain)
> +               goto empty;
> +
> +       if (user && defer_user && !event->pending_unwind) {
> +               event->pending_unwind = 1;
> +               irq_work_queue(&event->pending_irq);
> +       }
> +
> +       return;
> +
> +empty:
> +       data->callchain = &__empty_callchain;
> +}
> +
> +void perf_callchain_deferred(struct perf_sample_data *data,
> +                            struct perf_event *event, struct pt_regs *regs)
> +{
> +       struct perf_callchain_entry_ctx ctx;
> +
> +       ctx.entry               = data->callchain;
> +       ctx.max_stack           = event->attr.sample_max_stack;
> +       ctx.nr                  = 0;
> +       ctx.contexts            = 0;
> +       ctx.contexts_maxed      = false;
> +
> +       perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
> +       perf_callchain_user_deferred(&ctx, regs);
>  }
>
>  static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d)
> --
> 2.41.0
>
>
  
Josh Poimboeuf Nov. 11, 2023, 6:49 p.m. UTC | #2
On Fri, Nov 10, 2023 at 10:57:58PM -0800, Namhyung Kim wrote:
> On Wed, Nov 8, 2023 at 4:43 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
> > +++ b/kernel/bpf/stackmap.c
> > @@ -294,8 +294,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
> >         if (max_depth > sysctl_perf_event_max_stack)
> >                 max_depth = sysctl_perf_event_max_stack;
> >
> > -       trace = get_perf_callchain(regs, kernel, user, max_depth, false);
> > -
> > +       trace = get_perf_callchain(regs, kernel, user, max_depth, false, false);
> >         if (unlikely(!trace))
> >                 /* couldn't fetch the stack trace */
> >                 return -EFAULT;
> > @@ -420,7 +419,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
> >                 trace = get_callchain_entry_for_task(task, max_depth);
> >         else
> >                 trace = get_perf_callchain(regs, kernel, user, max_depth,
> > -                                          false);
> > +                                          false, false);
> 
> Hmm.. BPF is not supported.  Any plans?

I'm not sure whether this feature makes sense for BPF.  If the BPF
program runs in atomic context then it would have to defer the unwind
until right before exiting to user space.  But then the program is no
longer running.

> > +++ b/kernel/events/callchain.c
> > @@ -178,7 +178,7 @@ put_callchain_entry(int rctx)
> >
> >  struct perf_callchain_entry *
> >  get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
> > -                  u32 max_stack, bool add_mark)
> > +                  u32 max_stack, bool add_mark, bool defer_user)
> >  {
> >         struct perf_callchain_entry *entry;
> >         struct perf_callchain_entry_ctx ctx;
> > @@ -207,6 +207,11 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
> >                         regs = task_pt_regs(current);
> >                 }
> >
> > +               if (defer_user) {
> > +                       perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED);
> > +                       goto exit_put;
> 
> Hmm.. ok.  So now the perf tools need to stitch the call stacks
> from two (or more?) samples.

Yes.  And it could be more than two samples if there were multiple NMIs
before returning to user space.  In that case there would be a single
user sample which needs to be stitched to each of the kernel samples.
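
If it helps, I'd imagine the tool side doing something roughly like
this (completely untested sketch -- the pending_sample list and helper
are made up just to illustrate the matching):

/*
 * Untested sketch: queue up kernel-only samples that end in
 * PERF_CONTEXT_USER_DEFERRED, then append the single deferred user
 * callchain to each queued sample from the same tid once it arrives.
 */
struct pending_sample {
	struct pending_sample	*next;
	u32			tid;
	u64			nr;
	u64			ips[2 * PERF_MAX_STACK_DEPTH];
};

static void stitch_deferred_callchain(struct pending_sample *pending, u32 tid,
				      u64 user_nr, const u64 *user_ips)
{
	struct pending_sample *p;

	for (p = pending; p; p = p->next) {
		u64 n = user_nr;

		if (p->tid != tid)
			continue;
		if (p->nr + n > ARRAY_SIZE(p->ips))
			n = ARRAY_SIZE(p->ips) - p->nr;
		memcpy(&p->ips[p->nr], user_ips, n * sizeof(u64));
		p->nr += n;
		/* this sample's callchain is now complete */
	}
}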

> > +               }
> > +
> >                 if (add_mark)
> >                         perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
> >
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index 5e41a3b70bcd..290e06b0071c 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -6751,6 +6751,12 @@ static void perf_pending_irq(struct irq_work *entry)
> >         struct perf_event *event = container_of(entry, struct perf_event, pending_irq);
> >         int rctx;
> >
> > +       if (!is_software_event(event)) {
> > +               if (event->pending_unwind)
> > +                       task_work_add(current, &event->pending_task, TWA_RESUME);
> 
> I'm not familiar with this code.  Is it possible that the current task
> is preempted before returning to user space (and runs
> perf_pending_task_unwind)?

Yes, the task work (perf_pending_task_unwind) is preemptible.

> > +static void perf_pending_task_unwind(struct perf_event *event)
> > +{
> > +       struct pt_regs *regs = task_pt_regs(current);
> > +       struct perf_output_handle handle;
> > +       struct perf_event_header header;
> > +       struct perf_sample_data data;
> > +       struct perf_callchain_entry *callchain;
> > +
> > +       callchain = kmalloc(sizeof(struct perf_callchain_entry) +
> > +                           (sizeof(__u64) * event->attr.sample_max_stack) +
> > +                           (sizeof(__u64) * 1) /* one context */,
> > +                           GFP_KERNEL);
> 
> Any chance it can reuse get_perf_callchain() instead of
> allocating the callchains every time?

I don't think so, because if it gets preempted, the new task might also
need to do an unwind.  But there's only one task-context callchain per
CPU.

The allocation is less than ideal.  I could just allocate it on the
stack, and keep the number of entries bounded to 128 entries or so.
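
Something like this (untested sketch):

/* Untested: bound the deferred callchain and keep it on the stack. */
#define DEFERRED_MAX_ENTRIES	128

struct deferred_callchain {
	u64	nr;
	u64	ips[DEFERRED_MAX_ENTRIES + 1];	/* +1 for PERF_CONTEXT_USER */
};

static void perf_pending_task_unwind(struct perf_event *event)
{
	struct deferred_callchain chain = { .nr = 0 };
	struct perf_sample_data data;

	/* layout-compatible with struct perf_callchain_entry */
	data.callchain = (struct perf_callchain_entry *)&chain;

	/* ... rest as above, minus the kmalloc()/kfree() ... */
}

Though that's ~1k of extra stack on top of perf_sample_data, so maybe
a smaller bound or a per-task buffer would be saner.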

> > +       if (!callchain)
> > +               return;
> > +
> > +       callchain->nr = 0;
> > +       data.callchain = callchain;
> > +
> > +       perf_sample_data_init(&data, 0, event->hw.last_period);
> 
> It would double count the period for a sample.
> I guess we want to set the period to 0.

Ok.

> > +
> > +       data.deferred = true;
> > +
> > +       perf_prepare_sample(&data, event, regs);
> 
> I don't think this would work properly.  Not to mention it duplicates
> sample data unnecessarily, some data needs special handling to
> avoid inefficient (full) data copy like for (user) stack, regs and aux.

Yeah.  I tried sending only the sample ID and callchain, but perf tool
didn't appreciate that ;-) So for the RFC I gave up and did this.

> Anyway I'm not sure it can support these additional samples for
> deferred callchains without breaking the existing perf tools.
> Anyway it doesn't know PERF_CONTEXT_USER_DEFERRED at least.
> I think this should be controlled by a new feature bit in the
> perf_event_attr.
> 
> Then we might add a breaking change to have a special
> sample record for the deferred callchains and sample ID only.

Sounds like a good idea.  I'll need to study the code to figure out how
to do that on the perf tool side.  Or would you care to write a patch?

> > -struct perf_callchain_entry *
> > -perf_callchain(struct perf_event *event, struct pt_regs *regs)
> > +void perf_callchain(struct perf_sample_data *data, struct perf_event *event,
> > +                   struct pt_regs *regs)
> >  {
> >         bool kernel = !event->attr.exclude_callchain_kernel;
> >         bool user   = !event->attr.exclude_callchain_user;
> >         const u32 max_stack = event->attr.sample_max_stack;
> > -       struct perf_callchain_entry *callchain;
> > +       bool defer_user = IS_ENABLED(CONFIG_HAVE_PERF_CALLCHAIN_DEFERRED);
> 
> Is it always enabled depending on the kernel config?
> It could be controlled by event->attr.something..

Possibly, depending on what y'all think.  Also, fp vs sframe could be an
attr (though sframe implies deferred).

Thanks for the review!
  
Josh Poimboeuf Nov. 11, 2023, 6:54 p.m. UTC | #3
On Sat, Nov 11, 2023 at 10:49:10AM -0800, Josh Poimboeuf wrote:
> On Fri, Nov 10, 2023 at 10:57:58PM -0800, Namhyung Kim wrote:
> > > +static void perf_pending_task_unwind(struct perf_event *event)
> > > +{
> > > +       struct pt_regs *regs = task_pt_regs(current);
> > > +       struct perf_output_handle handle;
> > > +       struct perf_event_header header;
> > > +       struct perf_sample_data data;
> > > +       struct perf_callchain_entry *callchain;
> > > +
> > > +       callchain = kmalloc(sizeof(struct perf_callchain_entry) +
> > > +                           (sizeof(__u64) * event->attr.sample_max_stack) +
> > > +                           (sizeof(__u64) * 1) /* one context */,
> > > +                           GFP_KERNEL);
> > 
> > Any chance it can reuse get_perf_callchain() instead of
> > allocating the callchains every time?
> 
> I don't think so, because if it gets preempted, the new task might also
> need to do an unwind.  But there's only one task-context callchain per
> CPU.

BTW it's not just preemption, this code can also block when the unwinder
tries to copy from user space.  So disabling preemption isn't an option.
  
Namhyung Kim Nov. 13, 2023, 4:56 p.m. UTC | #4
On Sat, Nov 11, 2023 at 10:49 AM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
>
> On Fri, Nov 10, 2023 at 10:57:58PM -0800, Namhyung Kim wrote:
> > On Wed, Nov 8, 2023 at 4:43 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
> > > +++ b/kernel/bpf/stackmap.c
> > > @@ -294,8 +294,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
> > >         if (max_depth > sysctl_perf_event_max_stack)
> > >                 max_depth = sysctl_perf_event_max_stack;
> > >
> > > -       trace = get_perf_callchain(regs, kernel, user, max_depth, false);
> > > -
> > > +       trace = get_perf_callchain(regs, kernel, user, max_depth, false, false);
> > >         if (unlikely(!trace))
> > >                 /* couldn't fetch the stack trace */
> > >                 return -EFAULT;
> > > @@ -420,7 +419,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
> > >                 trace = get_callchain_entry_for_task(task, max_depth);
> > >         else
> > >                 trace = get_perf_callchain(regs, kernel, user, max_depth,
> > > -                                          false);
> > > +                                          false, false);
> >
> > Hmm.. BPF is not supported.  Any plans?
>
> I'm not sure whether this feature makes sense for BPF.  If the BPF
> program runs in atomic context then it would have to defer the unwind
> until right before exiting to user space.  But then the program is no
> longer running.

Ok, then BPF gets no user call stacks even with sframes.

>
> > > +++ b/kernel/events/callchain.c
> > > @@ -178,7 +178,7 @@ put_callchain_entry(int rctx)
> > >
> > >  struct perf_callchain_entry *
> > >  get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
> > > -                  u32 max_stack, bool add_mark)
> > > +                  u32 max_stack, bool add_mark, bool defer_user)
> > >  {
> > >         struct perf_callchain_entry *entry;
> > >         struct perf_callchain_entry_ctx ctx;
> > > @@ -207,6 +207,11 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
> > >                         regs = task_pt_regs(current);
> > >                 }
> > >
> > > +               if (defer_user) {
> > > +                       perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED);
> > > +                       goto exit_put;
> >
> > Hmm.. ok.  So now the perf tools need to stitch the call stacks
> > from two (or more?) samples.
>
> Yes.  And it could be more than two samples if there were multiple NMIs
> before returning to user space.  In that case there would be a single
> user sample which needs to be stitched to each of the kernel samples.

Ok, but ...

>
> > > +               }
> > > +
> > >                 if (add_mark)
> > >                         perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
> > >
> > > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > > index 5e41a3b70bcd..290e06b0071c 100644
> > > --- a/kernel/events/core.c
> > > +++ b/kernel/events/core.c
> > > @@ -6751,6 +6751,12 @@ static void perf_pending_irq(struct irq_work *entry)
> > >         struct perf_event *event = container_of(entry, struct perf_event, pending_irq);
> > >         int rctx;
> > >
> > > +       if (!is_software_event(event)) {
> > > +               if (event->pending_unwind)
> > > +                       task_work_add(current, &event->pending_task, TWA_RESUME);
> >
> > I'm not familiar with this code.  Is it possible that the current task
> > is preempted before returning to user space (and runs
> > perf_pending_task_unwind)?
>
> Yes, the task work (perf_pending_task_unwind) is preemptible.

If the task is preempted, the call stack would be from another
task (if it also has the pending call stacks) and we need to
check which user call stack matches which kernel call stack.
So there's no guarantee we can just use adjacent samples.

>
> > > +static void perf_pending_task_unwind(struct perf_event *event)
> > > +{
> > > +       struct pt_regs *regs = task_pt_regs(current);
> > > +       struct perf_output_handle handle;
> > > +       struct perf_event_header header;
> > > +       struct perf_sample_data data;
> > > +       struct perf_callchain_entry *callchain;
> > > +
> > > +       callchain = kmalloc(sizeof(struct perf_callchain_entry) +
> > > +                           (sizeof(__u64) * event->attr.sample_max_stack) +
> > > +                           (sizeof(__u64) * 1) /* one context */,
> > > +                           GFP_KERNEL);
> >
> > Any chance it can reuse get_perf_callchain() instead of
> > allocating the callchains every time?
>
> I don't think so, because if it gets preempted, the new task might also
> need to do an unwind.  But there's only one task-context callchain per
> CPU.
>
> The allocation is less than ideal.  I could just allocate it on the
> stack, and keep the number of entries bounded to 128 entries or so.

Yeah, probably that's better.

>
> > > +       if (!callchain)
> > > +               return;
> > > +
> > > +       callchain->nr = 0;
> > > +       data.callchain = callchain;
> > > +
> > > +       perf_sample_data_init(&data, 0, event->hw.last_period);
> >
> > It would double count the period for a sample.
> > I guess we want to set the period to 0.
>
> Ok.
>
> > > +
> > > +       data.deferred = true;
> > > +
> > > +       perf_prepare_sample(&data, event, regs);
> >
> > I don't think this would work properly.  Not to mention it duplicates
> > sample data unnecessarily, some data needs special handling to
> > avoid inefficient (full) data copy like for (user) stack, regs and aux.
>
> Yeah.  I tried sending only the sample ID and callchain, but perf tool
> didn't appreciate that ;-) So for the RFC I gave up and did this.

Right, it should have some compatible sample ID header fields.
It's dynamic and depends on the attr but at least it should have a
PID to match callchains.
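
For reference, this is the sample_id trailer on the existing records
(from the comment in the uapi header, each field conditional on
attr.sample_type):

	struct sample_id {
		{ u32	pid, tid;  }	&& PERF_SAMPLE_TID
		{ u64	time;      }	&& PERF_SAMPLE_TIME
		{ u64	id;        }	&& PERF_SAMPLE_ID
		{ u64	stream_id; }	&& PERF_SAMPLE_STREAM_ID
		{ u32	cpu, res;  }	&& PERF_SAMPLE_CPU
		{ u64	id;        }	&& PERF_SAMPLE_IDENTIFIER
	} && perf_event_attr::sample_id_all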

>
> > Anyway I'm not sure it can support these additional samples for
> > deferred callchains without breaking the existing perf tools.
> > Anyway it doesn't know PERF_CONTEXT_USER_DEFERRED at least.
> > I think this should be controlled by a new feature bit in the
> > perf_event_attr.
> >
> > Then we might add a breaking change to have a special
> > sample record for the deferred callchains and sample ID only.
>
> Sounds like a good idea.  I'll need to study the code to figure out how
> to do that on the perf tool side.  Or would you care to write a patch?

Sure, I'd be happy to write one.

>
> > > -struct perf_callchain_entry *
> > > -perf_callchain(struct perf_event *event, struct pt_regs *regs)
> > > +void perf_callchain(struct perf_sample_data *data, struct perf_event *event,
> > > +                   struct pt_regs *regs)
> > >  {
> > >         bool kernel = !event->attr.exclude_callchain_kernel;
> > >         bool user   = !event->attr.exclude_callchain_user;
> > >         const u32 max_stack = event->attr.sample_max_stack;
> > > -       struct perf_callchain_entry *callchain;
> > > +       bool defer_user = IS_ENABLED(CONFIG_HAVE_PERF_CALLCHAIN_DEFERRED);
> >
> > Is it always enabled depending on the kernel config?
> > It could be controlled by event->attr.something..
>
> Possibly, depending on what y'all think.  Also, fp vs sframe could be an
> attr (though sframe implies deferred).
>
> Thanks for the review!

Thanks for your work,
Namhyung
  
Peter Zijlstra Nov. 13, 2023, 5:21 p.m. UTC | #5
On Mon, Nov 13, 2023 at 08:56:39AM -0800, Namhyung Kim wrote:

> Ok, then BPF gets no user call stacks even with sframes.

Well, you could obviously run another BPF program at return to user and
stitch there.

> Ok, but ...

> If the task is preempted, the call stack would be from another
> task (if it also has the pending call stacks) and we need to
> check which user call stack matches which kernel call stack.
> So there's no guarantee we can just use adjacent samples.

So upon requesting a user backtrace for the task it could request a
token (from eg a global atomic u64 in the most crude case) and place
this in the task_struct. The backtrace will emit this as forward
reference for the user trace.

Once the task_work generates the user stacktrace, it will again dump
this token along with the user unwind, after which it will reset the
token for this task (to 0).
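
Something like so (entirely untested; ->unwind_token would be a new
task_struct field):

/* Crude, untested sketch of the token idea. */
static atomic64_t perf_unwind_token = ATOMIC64_INIT(1);

/* At sample time (NMI): tag the kernel-only sample with a forward reference. */
static u64 perf_request_deferred_unwind(void)
{
	if (!current->unwind_token)
		current->unwind_token = atomic64_inc_return(&perf_unwind_token);

	return current->unwind_token;
}

/* In the task_work: emit the user callchain with the same token, then clear it. */
static u64 perf_take_deferred_token(void)
{
	u64 token = current->unwind_token;

	current->unwind_token = 0;
	return token;
}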
  
Namhyung Kim Nov. 13, 2023, 5:48 p.m. UTC | #6
On Mon, Nov 13, 2023 at 9:21 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Nov 13, 2023 at 08:56:39AM -0800, Namhyung Kim wrote:
>
> > Ok, then BPF gets no user call stacks even with sframes.
>
> Well, you could obviously run another BPF program at return to user and
> stitch there.

Right, maybe some more work is needed for bpf_get_stackid()
but it seems possible.

>
> > Ok, but ...
>
> > If the task is preempted, the call stack would be from another
> > task (if it also has the pending call stacks) and we need to
> > check which user call stack matches which kernel call stack.
> > So there's no guarantee we can just use adjacent samples.
>
> So upon requesting a user backtrace for the task it could request a
> token (from eg a global atomic u64 in the most crude case) and place
> this in the task_struct. The backtrace will emit this as forward
> reference for the user trace.
>
> Once the task_work generates the user stacktrace, it will again dump
> this token along with the user unwind, after which it will reset the
> token for this task (to 0).

Yeah, I thought something like this first, but then I thought
"can we just use PID for this?"

Thanks,
Namhyung
  
Peter Zijlstra Nov. 13, 2023, 6:49 p.m. UTC | #7
On Mon, Nov 13, 2023 at 09:48:32AM -0800, Namhyung Kim wrote:

> Yeah, I thought something like this first, but then I thought
> "can we just use PID for this?"

TID, and assuming things are otherwise time ordered, yes.
  
Namhyung Kim Nov. 13, 2023, 7:16 p.m. UTC | #8
On Mon, Nov 13, 2023 at 10:50 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Nov 13, 2023 at 09:48:32AM -0800, Namhyung Kim wrote:
>
> > Yeah, I thought something like this first, but then I thought
> > "can we just use PID for this?"
>
> TID, and assuming things are otherwise time ordered, yes.

Right, I meant that, not TGID.

At least, the perf tools handle events in time order.

Thanks,
Namhyung
  
Namhyung Kim Nov. 15, 2023, 4:13 p.m. UTC | #9
On Mon, Nov 13, 2023 at 08:56:39AM -0800, Namhyung Kim wrote:
> On Sat, Nov 11, 2023 at 10:49 AM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
> >
> > On Fri, Nov 10, 2023 at 10:57:58PM -0800, Namhyung Kim wrote:
> > > Anyway I'm not sure it can support these additional samples for
> > > deferred callchains without breaking the existing perf tools.
> > > Anyway it doesn't know PERF_CONTEXT_USER_DEFERRED at least.
> > > I think this should be controlled by a new feature bit in the
> > > perf_event_attr.
> > >
> > > Then we might add a breaking change to have a special
> > > sample record for the deferred callchains and sample ID only.
> >
> > Sounds like a good idea.  I'll need to study the code to figure out how
> > to do that on the perf tool side.  Or would you care to write a patch?
> 
> Sure, I'd be happy to write one.

I think we can start with something like the below.
The sample id (attr.sample_type) should have
IDENTIFIER | TID | TIME to enable defer_callchain
in order to match sample and callchain records.

Thanks,
Namhyung


---8<---
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 39c6a250dd1b..a3765ff59798 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -456,7 +456,8 @@ struct perf_event_attr {
 				inherit_thread :  1, /* children only inherit if cloned with CLONE_THREAD */
 				remove_on_exec :  1, /* event is removed from task on exec */
 				sigtrap        :  1, /* send synchronous SIGTRAP on event */
-				__reserved_1   : 26;
+				defer_callchain:  1, /* generate DEFERRED_CALLCHAINS records for userspace */
+				__reserved_1   : 25;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -1207,6 +1208,20 @@ enum perf_event_type {
 	 */
 	PERF_RECORD_AUX_OUTPUT_HW_ID		= 21,
 
+	/*
+	 * Deferred user stack callchains (for SFrame).  Previous samples would
+	 * have kernel callchains only and they need to be stitched with this
+	 * to make full callchains.
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u64				nr;
+	 *	u64				ips[nr];
+	 *	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_DEFERRED_CALLCHAINS		= 22,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
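
Then the tool would open the event with something like this (just a
sketch, assuming the bit and record type above land as-is):

	struct perf_event_attr attr = {
		.size		 = sizeof(attr),
		.type		 = PERF_TYPE_HARDWARE,
		.config		 = PERF_COUNT_HW_CPU_CYCLES,
		.sample_period	 = 100000,
		.sample_type	 = PERF_SAMPLE_IDENTIFIER | PERF_SAMPLE_TID |
				   PERF_SAMPLE_TIME | PERF_SAMPLE_CALLCHAIN,
		.sample_id_all	 = 1,
		.defer_callchain = 1,
	};

and stitch each PERF_RECORD_DEFERRED_CALLCHAINS back to the matching
samples by (tid, time).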
  
Peter Zijlstra Nov. 20, 2023, 2:03 p.m. UTC | #10
On Wed, Nov 15, 2023 at 08:13:31AM -0800, Namhyung Kim wrote:

> ---8<---
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index 39c6a250dd1b..a3765ff59798 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -456,7 +456,8 @@ struct perf_event_attr {
>  				inherit_thread :  1, /* children only inherit if cloned with CLONE_THREAD */
>  				remove_on_exec :  1, /* event is removed from task on exec */
>  				sigtrap        :  1, /* send synchronous SIGTRAP on event */
> -				__reserved_1   : 26;
> +				defer_callchain:  1, /* generate DEFERRED_CALLCHAINS records for userspace */
> +				__reserved_1   : 25;
>  
>  	union {
>  		__u32		wakeup_events;	  /* wakeup every n events */
> @@ -1207,6 +1208,20 @@ enum perf_event_type {
>  	 */
>  	PERF_RECORD_AUX_OUTPUT_HW_ID		= 21,
>  
> +	/*
> +	 * Deferred user stack callchains (for SFrame).  Previous samples would

Possibly also useful for ShadowStack based unwinders. And by virtue of
it possibly saving work when multiple consecutive samples hit
the same kernel section, for everything.

> +	 * have kernel callchains only and they need to be stitched with this
> +	 * to make full callchains.
> +	 *
> +	 * struct {
> +	 *	struct perf_event_header	header;
> +	 *	u64				nr;
> +	 *	u64				ips[nr];
> +	 *	struct sample_id		sample_id;
> +	 * };
> +	 */
> +	PERF_RECORD_DEFERRED_CALLCHAINS		= 22,
> +
>  	PERF_RECORD_MAX,			/* non-ABI */
>  };
>  

Anyway, yeah, that should do I suppose.
  

Patch

diff --git a/arch/Kconfig b/arch/Kconfig
index f4b210ab0612..690c82212224 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -425,6 +425,9 @@  config HAVE_HARDLOCKUP_DETECTOR_ARCH
 	  It uses the same command line parameters, and sysctl interface,
 	  as the generic hardlockup detectors.
 
+config HAVE_PERF_CALLCHAIN_DEFERRED
+	bool
+
 config HAVE_PERF_REGS
 	bool
 	help
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2d8fa253b9df..2f232111dff2 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -786,6 +786,7 @@  struct perf_event {
 	struct irq_work			pending_irq;
 	struct callback_head		pending_task;
 	unsigned int			pending_work;
+	unsigned int			pending_unwind;
 
 	atomic_t			event_limit;
 
@@ -1113,7 +1114,10 @@  int perf_event_read_local(struct perf_event *event, u64 *value,
 extern u64 perf_event_read_value(struct perf_event *event,
 				 u64 *enabled, u64 *running);
 
-extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs);
+extern void perf_callchain(struct perf_sample_data *data,
+			   struct perf_event *event, struct pt_regs *regs);
+extern void perf_callchain_deferred(struct perf_sample_data *data,
+				    struct perf_event *event, struct pt_regs *regs);
 
 static inline bool branch_sample_no_flags(const struct perf_event *event)
 {
@@ -1189,6 +1193,7 @@  struct perf_sample_data {
 	u64				data_page_size;
 	u64				code_page_size;
 	u64				aux_size;
+	bool				deferred;
 } ____cacheline_aligned;
 
 /* default value for data source */
@@ -1206,6 +1211,7 @@  static inline void perf_sample_data_init(struct perf_sample_data *data,
 	data->sample_flags = PERF_SAMPLE_PERIOD;
 	data->period = period;
 	data->dyn_size = 0;
+	data->deferred = false;
 
 	if (addr) {
 		data->addr = addr;
@@ -1219,7 +1225,11 @@  static inline void perf_sample_save_callchain(struct perf_sample_data *data,
 {
 	int size = 1;
 
-	data->callchain = perf_callchain(event, regs);
+	if (data->deferred)
+		perf_callchain_deferred(data, event, regs);
+	else
+		perf_callchain(data, event, regs);
+
 	size += data->callchain->nr;
 
 	data->dyn_size += size * sizeof(u64);
@@ -1534,12 +1544,18 @@  extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct p
 extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
 extern struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
-		   u32 max_stack, bool add_mark);
+		   u32 max_stack, bool add_mark, bool defer_user);
 extern int get_callchain_buffers(int max_stack);
 extern void put_callchain_buffers(void);
 extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
 extern void put_callchain_entry(int rctx);
 
+#ifdef CONFIG_HAVE_PERF_CALLCHAIN_DEFERRED
+extern void perf_callchain_user_deferred(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
+#else
+static inline void perf_callchain_user_deferred(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) {}
+#endif
+
 extern int sysctl_perf_event_max_stack;
 extern int sysctl_perf_event_max_contexts_per_stack;
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 39c6a250dd1b..9a1127af4cda 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -1237,6 +1237,7 @@  enum perf_callchain_context {
 	PERF_CONTEXT_HV			= (__u64)-32,
 	PERF_CONTEXT_KERNEL		= (__u64)-128,
 	PERF_CONTEXT_USER		= (__u64)-512,
+	PERF_CONTEXT_USER_DEFERRED	= (__u64)-640,
 
 	PERF_CONTEXT_GUEST		= (__u64)-2048,
 	PERF_CONTEXT_GUEST_KERNEL	= (__u64)-2176,
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index e4827ca5378d..fcdd26715b12 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -294,8 +294,7 @@  BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	if (max_depth > sysctl_perf_event_max_stack)
 		max_depth = sysctl_perf_event_max_stack;
 
-	trace = get_perf_callchain(regs, kernel, user, max_depth, false);
-
+	trace = get_perf_callchain(regs, kernel, user, max_depth, false, false);
 	if (unlikely(!trace))
 		/* couldn't fetch the stack trace */
 		return -EFAULT;
@@ -420,7 +419,7 @@  static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 		trace = get_callchain_entry_for_task(task, max_depth);
 	else
 		trace = get_perf_callchain(regs, kernel, user, max_depth,
-					   false);
+					   false, false);
 	if (unlikely(!trace))
 		goto err_fault;
 
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 2bee8b6fda0e..16571c8d6771 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -178,7 +178,7 @@  put_callchain_entry(int rctx)
 
 struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
-		   u32 max_stack, bool add_mark)
+		   u32 max_stack, bool add_mark, bool defer_user)
 {
 	struct perf_callchain_entry *entry;
 	struct perf_callchain_entry_ctx ctx;
@@ -207,6 +207,11 @@  get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
 			regs = task_pt_regs(current);
 		}
 
+		if (defer_user) {
+			perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED);
+			goto exit_put;
+		}
+
 		if (add_mark)
 			perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5e41a3b70bcd..290e06b0071c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6751,6 +6751,12 @@  static void perf_pending_irq(struct irq_work *entry)
 	struct perf_event *event = container_of(entry, struct perf_event, pending_irq);
 	int rctx;
 
+	if (!is_software_event(event)) {
+		if (event->pending_unwind)
+			task_work_add(current, &event->pending_task, TWA_RESUME);
+		return;
+	}
+
 	/*
 	 * If we 'fail' here, that's OK, it means recursion is already disabled
 	 * and we won't recurse 'further'.
@@ -6772,11 +6778,57 @@  static void perf_pending_irq(struct irq_work *entry)
 		perf_swevent_put_recursion_context(rctx);
 }
 
+static void perf_pending_task_unwind(struct perf_event *event)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+	struct perf_output_handle handle;
+	struct perf_event_header header;
+	struct perf_sample_data data;
+	struct perf_callchain_entry *callchain;
+
+	callchain = kmalloc(sizeof(struct perf_callchain_entry) +
+			    (sizeof(__u64) * event->attr.sample_max_stack) +
+			    (sizeof(__u64) * 1) /* one context */,
+			    GFP_KERNEL);
+	if (!callchain)
+		return;
+
+	callchain->nr = 0;
+	data.callchain = callchain;
+
+	perf_sample_data_init(&data, 0, event->hw.last_period);
+
+	data.deferred = true;
+
+	perf_prepare_sample(&data, event, regs);
+
+	perf_prepare_header(&header, &data, event, regs);
+
+	if (perf_output_begin(&handle, &data, event, header.size))
+		goto done;
+
+	perf_output_sample(&handle, &header, &data, event);
+
+	perf_output_end(&handle);
+
+done:
+	kfree(callchain);
+}
+
+
 static void perf_pending_task(struct callback_head *head)
 {
 	struct perf_event *event = container_of(head, struct perf_event, pending_task);
 	int rctx;
 
+	if (!is_software_event(event)) {
+		if (event->pending_unwind) {
+			perf_pending_task_unwind(event);
+			event->pending_unwind = 0;
+		}
+		return;
+	}
+
 	/*
 	 * If we 'fail' here, that's OK, it means recursion is already disabled
 	 * and we won't recurse 'further'.
@@ -7587,22 +7639,48 @@  static u64 perf_get_page_size(unsigned long addr)
 
 static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
 
-struct perf_callchain_entry *
-perf_callchain(struct perf_event *event, struct pt_regs *regs)
+void perf_callchain(struct perf_sample_data *data, struct perf_event *event,
+		    struct pt_regs *regs)
 {
 	bool kernel = !event->attr.exclude_callchain_kernel;
 	bool user   = !event->attr.exclude_callchain_user;
 	const u32 max_stack = event->attr.sample_max_stack;
-	struct perf_callchain_entry *callchain;
+	bool defer_user = IS_ENABLED(CONFIG_HAVE_PERF_CALLCHAIN_DEFERRED);
 
 	/* Disallow cross-task user callchains. */
 	user &= !event->ctx->task || event->ctx->task == current;
 
 	if (!kernel && !user)
-		return &__empty_callchain;
+		goto empty;
 
-	callchain = get_perf_callchain(regs, kernel, user, max_stack, true);
-	return callchain ?: &__empty_callchain;
+	data->callchain = get_perf_callchain(regs, kernel, user, max_stack, true, defer_user);
+	if (!data->callchain)
+		goto empty;
+
+	if (user && defer_user && !event->pending_unwind) {
+		event->pending_unwind = 1;
+		irq_work_queue(&event->pending_irq);
+	}
+
+	return;
+
+empty:
+	data->callchain = &__empty_callchain;
+}
+
+void perf_callchain_deferred(struct perf_sample_data *data,
+			     struct perf_event *event, struct pt_regs *regs)
+{
+	struct perf_callchain_entry_ctx ctx;
+
+	ctx.entry		= data->callchain;
+	ctx.max_stack		= event->attr.sample_max_stack;
+	ctx.nr			= 0;
+	ctx.contexts		= 0;
+	ctx.contexts_maxed	= false;
+
+	perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
+	perf_callchain_user_deferred(&ctx, regs);
 }
 
 static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d)