[v2] perf lock contention: Account contending locks too

Message ID 20240228053335.312776-1-namhyung@kernel.org
State New
Series [v2] perf lock contention: Account contending locks too

Commit Message

Namhyung Kim Feb. 28, 2024, 5:33 a.m. UTC
Currently it accounts the contention using the delta between the
timestamps in the lock:contention_begin and lock:contention_end
tracepoints.  But that means both events must be seen during the
monitoring period.

Actually there are 4 cases relative to the monitoring period:

                monitoring period
            /                       \
            |                       |
 1:  B------+-----------------------+--------E
 2:    B----+-------------E         |
 3:         |           B-----------+----E
 4:         |     B-------------E   |
            |                       |
            t0                      t1

where B and E mean contention BEGIN and END, respectively.  So it only
accounts case 4 for now.  There seems to be no way to handle case 1.
Case 2 might be handled if it saved the timestamp (t0), but it lacks
the information from B, notably the flags which show the lock type.
It could also be a nested lock, which is currently ignored.  So I
think we should ignore case 2 as well.

However, case 3 can be handled if we save the timestamp (t1) at the
end of the period.  Then it can iterate the map entries in userspace
and update the lock stat accordingly.
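
In outline, the userspace side of this looks like the sketch below.  It
is a condensed rendition of the change that follows (single aggregation
mode only, no per-cpu tstamp map, no owner handling, no error output);
the struct names are the ones used by the patch, while the function
name is made up for the sketch.

  #include <bpf/bpf.h>               /* bpf_map_* wrappers */
  #include "bpf_skel/lock_data.h"    /* tstamp_data, contention_{key,data} */

  static void account_in_flight(int ts_fd, int stat_fd, __u64 end_ts)
  {
          int *prev_key = NULL, key;
          struct tstamp_data ts;
          struct contention_key stat_key = {};
          struct contention_data stat;

          while (!bpf_map_get_next_key(ts_fd, prev_key, &key)) {
                  /* An entry still in the tstamp map means contention
                   * began but no lock:contention_end was seen (case 3).
                   */
                  if (bpf_map_lookup_elem(ts_fd, &key, &ts) == 0 &&
                      ts.timestamp < end_ts) {
                          __u64 delta = end_ts - ts.timestamp;

                          stat_key.stack_id = ts.stack_id;
                          if (bpf_map_lookup_elem(stat_fd, &stat_key, &stat) == 0) {
                                  stat.total_time += delta;
                                  stat.count++;
                                  if (delta > stat.max_time)
                                          stat.max_time = delta;
                                  if (delta < stat.min_time)
                                          stat.min_time = delta;
                                  bpf_map_update_elem(stat_fd, &stat_key,
                                                      &stat, BPF_EXIST);
                          }
                  }
                  prev_key = &key;
          }
  }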

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
v2: add a comment on mark_end_timestamp  (Ian)

 tools/perf/util/bpf_lock_contention.c         | 120 ++++++++++++++++++
 .../perf/util/bpf_skel/lock_contention.bpf.c  |  16 ++-
 tools/perf/util/bpf_skel/lock_data.h          |   7 +
 3 files changed, 136 insertions(+), 7 deletions(-)
  

Comments

Ian Rogers Feb. 28, 2024, 6:32 a.m. UTC | #1
On Tue, Feb 27, 2024 at 9:33 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> Currently it accounts the contention using delta between timestamps in
> lock:contention_begin and lock:contention_end tracepoints.  But it means
> the lock should see the both events during the monitoring period.
>
> Actually there are 4 cases that happen with the monitoring:
>
>                 monitoring period
>             /                       \
>             |                       |
>  1:  B------+-----------------------+--------E
>  2:    B----+-------------E         |
>  3:         |           B-----------+----E
>  4:         |     B-------------E   |
>             |                       |
>             t0                      t1
>
> where B and E mean contention BEGIN and END, respectively.  So it only
> accounts the case 4 for now.  It seems there's no way to handle the case
> 1.  The case 2 might be handled if it saved the timestamp (t0), but it
> lacks the information from the B notably the flags which shows the lock
> types.  Also it could be a nested lock which it currently ignores.  So
> I think we should ignore the case 2.
>
> However we can handle the case 3 if we save the timestamp (t1) at the
> end of the period.  And then it can iterate the map entries in the
> userspace and update the lock stat accordingly.
>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>

Reviewed-by: Ian Rogers <irogers@google.com>

Thanks,
Ian

> ---
> v2: add a comment on mark_end_timestamp  (Ian)
>
>  tools/perf/util/bpf_lock_contention.c         | 120 ++++++++++++++++++
>  .../perf/util/bpf_skel/lock_contention.bpf.c  |  16 ++-
>  tools/perf/util/bpf_skel/lock_data.h          |   7 +
>  3 files changed, 136 insertions(+), 7 deletions(-)
>
> diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
> index 31ff19afc20c..9af76c6b2543 100644
> --- a/tools/perf/util/bpf_lock_contention.c
> +++ b/tools/perf/util/bpf_lock_contention.c
> @@ -179,6 +179,123 @@ int lock_contention_prepare(struct lock_contention *con)
>         return 0;
>  }
>
> +/*
> + * Run the BPF program directly using BPF_PROG_TEST_RUN to update the end
> + * timestamp in ktime so that it can calculate delta easily.
> + */
> +static void mark_end_timestamp(void)
> +{
> +       DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
> +               .flags = BPF_F_TEST_RUN_ON_CPU,
> +       );
> +       int prog_fd = bpf_program__fd(skel->progs.end_timestamp);
> +
> +       bpf_prog_test_run_opts(prog_fd, &opts);
> +}
> +
> +static void update_lock_stat(int map_fd, int pid, u64 end_ts,
> +                            enum lock_aggr_mode aggr_mode,
> +                            struct tstamp_data *ts_data)
> +{
> +       u64 delta;
> +       struct contention_key stat_key = {};
> +       struct contention_data stat_data;
> +
> +       if (ts_data->timestamp >= end_ts)
> +               return;
> +
> +       delta = end_ts - ts_data->timestamp;
> +
> +       switch (aggr_mode) {
> +       case LOCK_AGGR_CALLER:
> +               stat_key.stack_id = ts_data->stack_id;
> +               break;
> +       case LOCK_AGGR_TASK:
> +               stat_key.pid = pid;
> +               break;
> +       case LOCK_AGGR_ADDR:
> +               stat_key.lock_addr_or_cgroup = ts_data->lock;
> +               break;
> +       case LOCK_AGGR_CGROUP:
> +               /* TODO */
> +               return;
> +       default:
> +               return;
> +       }
> +
> +       if (bpf_map_lookup_elem(map_fd, &stat_key, &stat_data) < 0)
> +               return;
> +
> +       stat_data.total_time += delta;
> +       stat_data.count++;
> +
> +       if (delta > stat_data.max_time)
> +               stat_data.max_time = delta;
> +       if (delta < stat_data.min_time)
> +               stat_data.min_time = delta;
> +
> +       bpf_map_update_elem(map_fd, &stat_key, &stat_data, BPF_EXIST);
> +}
> +
> +/*
> + * Account entries in the tstamp map (which didn't see the corresponding
> + * lock:contention_end tracepoint) using end_ts.
> + */
> +static void account_end_timestamp(struct lock_contention *con)
> +{
> +       int ts_fd, stat_fd;
> +       int *prev_key, key;
> +       u64 end_ts = skel->bss->end_ts;
> +       int total_cpus;
> +       enum lock_aggr_mode aggr_mode = con->aggr_mode;
> +       struct tstamp_data ts_data, *cpu_data;
> +
> +       /* Iterate per-task tstamp map (key = TID) */
> +       ts_fd = bpf_map__fd(skel->maps.tstamp);
> +       stat_fd = bpf_map__fd(skel->maps.lock_stat);
> +
> +       prev_key = NULL;
> +       while (!bpf_map_get_next_key(ts_fd, prev_key, &key)) {
> +               if (bpf_map_lookup_elem(ts_fd, &key, &ts_data) == 0) {
> +                       int pid = key;
> +
> +                       if (aggr_mode == LOCK_AGGR_TASK && con->owner)
> +                               pid = ts_data.flags;
> +
> +                       update_lock_stat(stat_fd, pid, end_ts, aggr_mode,
> +                                        &ts_data);
> +               }
> +
> +               prev_key = &key;
> +       }
> +
> +       /* Now it'll check per-cpu tstamp map which doesn't have TID. */
> +       if (aggr_mode == LOCK_AGGR_TASK || aggr_mode == LOCK_AGGR_CGROUP)
> +               return;
> +
> +       total_cpus = cpu__max_cpu().cpu;
> +       ts_fd = bpf_map__fd(skel->maps.tstamp_cpu);
> +
> +       cpu_data = calloc(total_cpus, sizeof(*cpu_data));
> +       if (cpu_data == NULL)
> +               return;
> +
> +       prev_key = NULL;
> +       while (!bpf_map_get_next_key(ts_fd, prev_key, &key)) {
> +               if (bpf_map_lookup_elem(ts_fd, &key, cpu_data) < 0)
> +                       goto next;
> +
> +               for (int i = 0; i < total_cpus; i++) {
> +                       update_lock_stat(stat_fd, -1, end_ts, aggr_mode,
> +                                        &cpu_data[i]);
> +               }
> +
> +next:
> +               prev_key = &key;
> +       }
> +       free(cpu_data);
> +}
> +
>  int lock_contention_start(void)
>  {
>         skel->bss->enabled = 1;
> @@ -188,6 +305,7 @@ int lock_contention_start(void)
>  int lock_contention_stop(void)
>  {
>         skel->bss->enabled = 0;
> +       mark_end_timestamp();
>         return 0;
>  }
>
> @@ -301,6 +419,8 @@ int lock_contention_read(struct lock_contention *con)
>         if (stack_trace == NULL)
>                 return -1;
>
> +       account_end_timestamp(con);
> +
>         if (con->aggr_mode == LOCK_AGGR_TASK) {
>                 struct thread *idle = __machine__findnew_thread(machine,
>                                                                 /*pid=*/0,
> diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> index 95cd8414f6ef..fb54bd38e7d0 100644
> --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
> +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> @@ -19,13 +19,6 @@
>  #define LCB_F_PERCPU   (1U << 4)
>  #define LCB_F_MUTEX    (1U << 5)
>
> -struct tstamp_data {
> -       __u64 timestamp;
> -       __u64 lock;
> -       __u32 flags;
> -       __s32 stack_id;
> -};
> -
>  /* callstack storage  */
>  struct {
>         __uint(type, BPF_MAP_TYPE_STACK_TRACE);
> @@ -140,6 +133,8 @@ int perf_subsys_id = -1;
>  /* determine the key of lock stat */
>  int aggr_mode;
>
> +__u64 end_ts;
> +
>  /* error stat */
>  int task_fail;
>  int stack_fail;
> @@ -559,4 +554,11 @@ int BPF_PROG(collect_lock_syms)
>         return 0;
>  }
>
> +SEC("raw_tp/bpf_test_finish")
> +int BPF_PROG(end_timestamp)
> +{
> +       end_ts = bpf_ktime_get_ns();
> +       return 0;
> +}
> +
>  char LICENSE[] SEC("license") = "Dual BSD/GPL";
> diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h
> index 08482daf61be..36af11faad03 100644
> --- a/tools/perf/util/bpf_skel/lock_data.h
> +++ b/tools/perf/util/bpf_skel/lock_data.h
> @@ -3,6 +3,13 @@
>  #ifndef UTIL_BPF_SKEL_LOCK_DATA_H
>  #define UTIL_BPF_SKEL_LOCK_DATA_H
>
> +struct tstamp_data {
> +       u64 timestamp;
> +       u64 lock;
> +       u32 flags;
> +       u32 stack_id;
> +};
> +
>  struct contention_key {
>         u32 stack_id;
>         u32 pid;
> --
> 2.44.0.rc1.240.g4c46232300-goog
>
  
Arnaldo Carvalho de Melo Feb. 28, 2024, 12:22 p.m. UTC | #2
On Tue, Feb 27, 2024 at 09:33:35PM -0800, Namhyung Kim wrote:
> Currently it accounts the contention using delta between timestamps in
> lock:contention_begin and lock:contention_end tracepoints.  But it means
> the lock should see the both events during the monitoring period.
> 
> Actually there are 4 cases that happen with the monitoring:
> 
>                 monitoring period
>             /                       \
>             |                       |
>  1:  B------+-----------------------+--------E
>  2:    B----+-------------E         |
>  3:         |           B-----------+----E
>  4:         |     B-------------E   |
>             |                       |
>             t0                      t1
> 
> where B and E mean contention BEGIN and END, respectively.  So it only
> accounts the case 4 for now.  It seems there's no way to handle the case
> 1.  The case 2 might be handled if it saved the timestamp (t0), but it
> lacks the information from the B notably the flags which shows the lock
> types.  Also it could be a nested lock which it currently ignores.  So
> I think we should ignore the case 2.

Perhaps have a separate output listing locks that were found to be
contended for at least tE - t0, perhaps with a backtrace at that END
time?

With that we wouldn't miss that info, however incomplete it is, and the
user could try running again, perhaps for a longer time, or start
monitoring before the observed workload starts, etc.

Anyway:

Reviewed-by: Arnaldo Carvalho de Melo <acme@redhat.com>

- Arnaldo
 
> However we can handle the case 3 if we save the timestamp (t1) at the
> end of the period.  And then it can iterate the map entries in the
> userspace and update the lock stat accordingly.
> 
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
> v2: add a comment on mark_end_timestamp  (Ian)
> 
>  tools/perf/util/bpf_lock_contention.c         | 120 ++++++++++++++++++
>  .../perf/util/bpf_skel/lock_contention.bpf.c  |  16 ++-
>  tools/perf/util/bpf_skel/lock_data.h          |   7 +
>  3 files changed, 136 insertions(+), 7 deletions(-)
> 
> diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
> index 31ff19afc20c..9af76c6b2543 100644
> --- a/tools/perf/util/bpf_lock_contention.c
> +++ b/tools/perf/util/bpf_lock_contention.c
> @@ -179,6 +179,123 @@ int lock_contention_prepare(struct lock_contention *con)
>  	return 0;
>  }
>  
> +/*
> + * Run the BPF program directly using BPF_PROG_TEST_RUN to update the end
> + * timestamp in ktime so that it can calculate delta easily.
> + */
> +static void mark_end_timestamp(void)
> +{
> +	DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
> +		.flags = BPF_F_TEST_RUN_ON_CPU,
> +	);
> +	int prog_fd = bpf_program__fd(skel->progs.end_timestamp);
> +
> +	bpf_prog_test_run_opts(prog_fd, &opts);
> +}
> +
> +static void update_lock_stat(int map_fd, int pid, u64 end_ts,
> +			     enum lock_aggr_mode aggr_mode,
> +			     struct tstamp_data *ts_data)
> +{
> +	u64 delta;
> +	struct contention_key stat_key = {};
> +	struct contention_data stat_data;
> +
> +	if (ts_data->timestamp >= end_ts)
> +		return;
> +
> +	delta = end_ts - ts_data->timestamp;
> +
> +	switch (aggr_mode) {
> +	case LOCK_AGGR_CALLER:
> +		stat_key.stack_id = ts_data->stack_id;
> +		break;
> +	case LOCK_AGGR_TASK:
> +		stat_key.pid = pid;
> +		break;
> +	case LOCK_AGGR_ADDR:
> +		stat_key.lock_addr_or_cgroup = ts_data->lock;
> +		break;
> +	case LOCK_AGGR_CGROUP:
> +		/* TODO */
> +		return;
> +	default:
> +		return;
> +	}
> +
> +	if (bpf_map_lookup_elem(map_fd, &stat_key, &stat_data) < 0)
> +		return;
> +
> +	stat_data.total_time += delta;
> +	stat_data.count++;
> +
> +	if (delta > stat_data.max_time)
> +		stat_data.max_time = delta;
> +	if (delta < stat_data.min_time)
> +		stat_data.min_time = delta;
> +
> +	bpf_map_update_elem(map_fd, &stat_key, &stat_data, BPF_EXIST);
> +}
> +
> +/*
> + * Account entries in the tstamp map (which didn't see the corresponding
> + * lock:contention_end tracepoint) using end_ts.
> + */
> +static void account_end_timestamp(struct lock_contention *con)
> +{
> +	int ts_fd, stat_fd;
> +	int *prev_key, key;
> +	u64 end_ts = skel->bss->end_ts;
> +	int total_cpus;
> +	enum lock_aggr_mode aggr_mode = con->aggr_mode;
> +	struct tstamp_data ts_data, *cpu_data;
> +
> +	/* Iterate per-task tstamp map (key = TID) */
> +	ts_fd = bpf_map__fd(skel->maps.tstamp);
> +	stat_fd = bpf_map__fd(skel->maps.lock_stat);
> +
> +	prev_key = NULL;
> +	while (!bpf_map_get_next_key(ts_fd, prev_key, &key)) {
> +		if (bpf_map_lookup_elem(ts_fd, &key, &ts_data) == 0) {
> +			int pid = key;
> +
> +			if (aggr_mode == LOCK_AGGR_TASK && con->owner)
> +				pid = ts_data.flags;
> +
> +			update_lock_stat(stat_fd, pid, end_ts, aggr_mode,
> +					 &ts_data);
> +		}
> +
> +		prev_key = &key;
> +	}
> +
> +	/* Now it'll check per-cpu tstamp map which doesn't have TID. */
> +	if (aggr_mode == LOCK_AGGR_TASK || aggr_mode == LOCK_AGGR_CGROUP)
> +		return;
> +
> +	total_cpus = cpu__max_cpu().cpu;
> +	ts_fd = bpf_map__fd(skel->maps.tstamp_cpu);
> +
> +	cpu_data = calloc(total_cpus, sizeof(*cpu_data));
> +	if (cpu_data == NULL)
> +		return;
> +
> +	prev_key = NULL;
> +	while (!bpf_map_get_next_key(ts_fd, prev_key, &key)) {
> +		if (bpf_map_lookup_elem(ts_fd, &key, cpu_data) < 0)
> +			goto next;
> +
> +		for (int i = 0; i < total_cpus; i++) {
> +			update_lock_stat(stat_fd, -1, end_ts, aggr_mode,
> +					 &cpu_data[i]);
> +		}
> +
> +next:
> +		prev_key = &key;
> +	}
> +	free(cpu_data);
> +}
> +
>  int lock_contention_start(void)
>  {
>  	skel->bss->enabled = 1;
> @@ -188,6 +305,7 @@ int lock_contention_start(void)
>  int lock_contention_stop(void)
>  {
>  	skel->bss->enabled = 0;
> +	mark_end_timestamp();
>  	return 0;
>  }
>  
> @@ -301,6 +419,8 @@ int lock_contention_read(struct lock_contention *con)
>  	if (stack_trace == NULL)
>  		return -1;
>  
> +	account_end_timestamp(con);
> +
>  	if (con->aggr_mode == LOCK_AGGR_TASK) {
>  		struct thread *idle = __machine__findnew_thread(machine,
>  								/*pid=*/0,
> diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> index 95cd8414f6ef..fb54bd38e7d0 100644
> --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
> +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> @@ -19,13 +19,6 @@
>  #define LCB_F_PERCPU	(1U << 4)
>  #define LCB_F_MUTEX	(1U << 5)
>  
> -struct tstamp_data {
> -	__u64 timestamp;
> -	__u64 lock;
> -	__u32 flags;
> -	__s32 stack_id;
> -};
> -
>  /* callstack storage  */
>  struct {
>  	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
> @@ -140,6 +133,8 @@ int perf_subsys_id = -1;
>  /* determine the key of lock stat */
>  int aggr_mode;
>  
> +__u64 end_ts;
> +
>  /* error stat */
>  int task_fail;
>  int stack_fail;
> @@ -559,4 +554,11 @@ int BPF_PROG(collect_lock_syms)
>  	return 0;
>  }
>  
> +SEC("raw_tp/bpf_test_finish")
> +int BPF_PROG(end_timestamp)
> +{
> +	end_ts = bpf_ktime_get_ns();
> +	return 0;
> +}
> +
>  char LICENSE[] SEC("license") = "Dual BSD/GPL";
> diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h
> index 08482daf61be..36af11faad03 100644
> --- a/tools/perf/util/bpf_skel/lock_data.h
> +++ b/tools/perf/util/bpf_skel/lock_data.h
> @@ -3,6 +3,13 @@
>  #ifndef UTIL_BPF_SKEL_LOCK_DATA_H
>  #define UTIL_BPF_SKEL_LOCK_DATA_H
>  
> +struct tstamp_data {
> +	u64 timestamp;
> +	u64 lock;
> +	u32 flags;
> +	u32 stack_id;
> +};
> +
>  struct contention_key {
>  	u32 stack_id;
>  	u32 pid;
> -- 
> 2.44.0.rc1.240.g4c46232300-goog
  
Namhyung Kim Feb. 28, 2024, 8:01 p.m. UTC | #3
On Wed, Feb 28, 2024 at 4:22 AM Arnaldo Carvalho de Melo
<acme@kernel.org> wrote:
>
> On Tue, Feb 27, 2024 at 09:33:35PM -0800, Namhyung Kim wrote:
> > Currently it accounts the contention using delta between timestamps in
> > lock:contention_begin and lock:contention_end tracepoints.  But it means
> > the lock should see the both events during the monitoring period.
> >
> > Actually there are 4 cases that happen with the monitoring:
> >
> >                 monitoring period
> >             /                       \
> >             |                       |
> >  1:  B------+-----------------------+--------E
> >  2:    B----+-------------E         |
> >  3:         |           B-----------+----E
> >  4:         |     B-------------E   |
> >             |                       |
> >             t0                      t1
> >
> > where B and E mean contention BEGIN and END, respectively.  So it only
> > accounts the case 4 for now.  It seems there's no way to handle the case
> > 1.  The case 2 might be handled if it saved the timestamp (t0), but it
> > lacks the information from the B notably the flags which shows the lock
> > types.  Also it could be a nested lock which it currently ignores.  So
> > I think we should ignore the case 2.
>
> Perhaps have a separate output listing locks that were found to be with
> at least tE - t0 time, with perhaps a backtrace at that END time?

Do you mean long contentions in case 3?  I'm not sure what you mean
by tE, but they started after t0 so they cannot be longer than the
monitoring period.  Maybe we can try with, say, 90% of the period
but we can still miss something.

And collecting the backtrace of another task would be racy as it may
not be contending anymore.

>
> With that we wouldn't miss that info, however incomplete it is and the
> user would try running again, perhaps for a longer time, or start
> monitoring before the observed workload starts, etc.

Yeah, it can be useful.  Let me think about it more.

>
> Anyway:
>
> Reviewed-by: Arnaldo Carvalho de Melo <acme@redhat.com>

Thanks for your review!
Namhyung
  
Arnaldo Carvalho de Melo Feb. 28, 2024, 8:16 p.m. UTC | #4
On Wed, Feb 28, 2024 at 12:01:55PM -0800, Namhyung Kim wrote:
> On Wed, Feb 28, 2024 at 4:22 AM Arnaldo Carvalho de Melo
> <acme@kernel.org> wrote:
> >
> > On Tue, Feb 27, 2024 at 09:33:35PM -0800, Namhyung Kim wrote:
> > > Currently it accounts the contention using delta between timestamps in
> > > lock:contention_begin and lock:contention_end tracepoints.  But it means
> > > the lock should see the both events during the monitoring period.
> > >
> > > Actually there are 4 cases that happen with the monitoring:
> > >
> > >                 monitoring period
> > >             /                       \
> > >             |                       |
> > >  1:  B------+-----------------------+--------E
> > >  2:    B----+-------------E         |
> > >  3:         |           B-----------+----E
> > >  4:         |     B-------------E   |
> > >             |                       |
> > >             t0                      t1
> > >
> > > where B and E mean contention BEGIN and END, respectively.  So it only
> > > accounts the case 4 for now.  It seems there's no way to handle the case
> > > 1.  The case 2 might be handled if it saved the timestamp (t0), but it
> > > lacks the information from the B notably the flags which shows the lock
> > > types.  Also it could be a nested lock which it currently ignores.  So
> > > I think we should ignore the case 2.
> >
> > Perhaps have a separate output listing locks that were found to be with
> > at least tE - t0 time, with perhaps a backtrace at that END time?
> 
> Do you mean long contentions in case 3?  I'm not sure what do
> you mean by tE, but they started after t0 so cannot be greater

case 2

                monitoring period
            /                       \
            |                       |
 2:    B----+-------------E         |
            |             |         |
            t0            tE        t1

We get a notification for event E, right? We don't have one for B,
because it happened before we were monitoring.

> than or equal to the monitoring period.  Maybe we can try with
> say, 90% of period but we can still miss something.
> 
> And collecting backtrace of other task would be racy as the it
> may not contend anymore.
> 
> > With that we wouldn't miss that info, however incomplete it is and the
> > user would try running again, perhaps for a longer time, or start
> > monitoring before the observed workload starts, etc.
> 
> Yeah, it can be useful.  Let me think about it more.
> 
> >
> > Anyway:
> >
> > > Reviewed-by: Arnaldo Carvalho de Melo <acme@redhat.com>
> 
> Thanks for your review!
> Namhyung
>
  
Namhyung Kim Feb. 28, 2024, 9:19 p.m. UTC | #5
On Wed, Feb 28, 2024 at 12:16 PM Arnaldo Carvalho de Melo
<acme@kernel.org> wrote:
>
> On Wed, Feb 28, 2024 at 12:01:55PM -0800, Namhyung Kim wrote:
> > On Wed, Feb 28, 2024 at 4:22 AM Arnaldo Carvalho de Melo
> > <acme@kernel.org> wrote:
> > >
> > > On Tue, Feb 27, 2024 at 09:33:35PM -0800, Namhyung Kim wrote:
> > > > Currently it accounts the contention using delta between timestamps in
> > > > lock:contention_begin and lock:contention_end tracepoints.  But it means
> > > > the lock should see the both events during the monitoring period.
> > > >
> > > > Actually there are 4 cases that happen with the monitoring:
> > > >
> > > >                 monitoring period
> > > >             /                       \
> > > >             |                       |
> > > >  1:  B------+-----------------------+--------E
> > > >  2:    B----+-------------E         |
> > > >  3:         |           B-----------+----E
> > > >  4:         |     B-------------E   |
> > > >             |                       |
> > > >             t0                      t1
> > > >
> > > > where B and E mean contention BEGIN and END, respectively.  So it only
> > > > accounts the case 4 for now.  It seems there's no way to handle the case
> > > > 1.  The case 2 might be handled if it saved the timestamp (t0), but it
> > > > lacks the information from the B notably the flags which shows the lock
> > > > types.  Also it could be a nested lock which it currently ignores.  So
> > > > I think we should ignore the case 2.
> > >
> > > Perhaps have a separate output listing locks that were found to be with
> > > at least tE - t0 time, with perhaps a backtrace at that END time?
> >
> > Do you mean long contentions in case 3?  I'm not sure what do
> > you mean by tE, but they started after t0 so cannot be greater
>
> case 2
>
>                 monitoring period
>             /                       \
>             |                       |
>  2:    B----+-------------E         |
>             |             |         |
>             t0            tE        t1
>
> We get a notification for event E, right? We don't have one for B,
> because it happened before we were monitoring.

Ah, ok.  But there could be too many events in case 2 and
I don't think users want to see them all.  And they don't have
flags.  But maybe we can update the flags when it sees exactly
the same callstack later.

Thanks,
Namhyung
  
Arnaldo Carvalho de Melo Feb. 29, 2024, 5:23 p.m. UTC | #6
On Wed, Feb 28, 2024 at 01:19:12PM -0800, Namhyung Kim wrote:
> On Wed, Feb 28, 2024 at 12:16 PM Arnaldo Carvalho de Melo <acme@kernel.org> wrote:
> > On Wed, Feb 28, 2024 at 12:01:55PM -0800, Namhyung Kim wrote:
> > > On Wed, Feb 28, 2024 at 4:22 AM Arnaldo Carvalho de Melo <acme@kernel.org> wrote:
> > > > On Tue, Feb 27, 2024 at 09:33:35PM -0800, Namhyung Kim wrote:
> > > > > Currently it accounts the contention using delta between timestamps in
> > > > > lock:contention_begin and lock:contention_end tracepoints.  But it means
> > > > > the lock should see the both events during the monitoring period.

> > > > > Actually there are 4 cases that happen with the monitoring:

> > > > >                 monitoring period
> > > > >             /                       \
> > > > >             |                       |
> > > > >  1:  B------+-----------------------+--------E
> > > > >  2:    B----+-------------E         |
> > > > >  3:         |           B-----------+----E
> > > > >  4:         |     B-------------E   |
> > > > >             |                       |
> > > > >             t0                      t1

> > > > > where B and E mean contention BEGIN and END, respectively.  So it only
> > > > > accounts the case 4 for now.  It seems there's no way to handle the case
> > > > > 1.  The case 2 might be handled if it saved the timestamp (t0), but it
> > > > > lacks the information from the B notably the flags which shows the lock
> > > > > types.  Also it could be a nested lock which it currently ignores.  So
> > > > > I think we should ignore the case 2.

> > > > Perhaps have a separate output listing locks that were found to be with
> > > > at least tE - t0 time, with perhaps a backtrace at that END time?

> > > Do you mean long contentions in case 3?  I'm not sure what do
> > > you mean by tE, but they started after t0 so cannot be greater

> > case 2

> >                 monitoring period
> >             /                       \
> >             |                       |
> >  2:    B----+-------------E         |
> >             |             |         |
> >             t0            tE        t1
> >
> > We get a notification for event E, right? We don't have one for B,
> > because it happened before we were monitoring.
> 
> Ah, ok.  But there should be too many events in case 2 and
> I don't think users want to see them all.  And they don't have

So maybe a summary, something like:

  N locks that were locked before this session started have been
  released, no further info besides this histogram of in-session
  durations:

    0-N units of time: ++
  N+1-M units of time: ++++
    ...
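
To make that concrete, a minimal BPF-side fragment could look like the
following.  This is purely illustrative and not part of this patch: the
session timestamp, the histogram array, the log2 bucketing and the
helper name are all made up, and it assumes the fragment would sit in
lock_contention.bpf.c next to the existing contention_end handling.

  /* Case 2: lock:contention_end fires but no begin timestamp was
   * recorded, so only the in-session duration (tE - t0) is known.
   */
  __u64 session_start_ts;          /* t0, set when monitoring starts */
  __u64 unmatched_end_hist[32];    /* log2 buckets of tE - t0 */

  static inline void account_unmatched_end(void)
  {
          __u64 dur = bpf_ktime_get_ns() - session_start_ts;
          int slot = 0;

          while ((dur >>= 1) && slot < 31)   /* poor man's log2 */
                  slot++;

          __sync_fetch_and_add(&unmatched_end_hist[slot], 1);
  }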

> flags.  But maybe we can update the flag when it sees exactly
> the same callstack later.

  The callstack, if it goes all the way to userspace, may include the
workload targeted on the command line (some pid, tid, CPU, etc.) and
thus would point to things the user is probably more interested in
than some other lock that affects it only indirectly.

- Arnaldo
  
Namhyung Kim Feb. 29, 2024, 9:53 p.m. UTC | #7
On Thu, Feb 29, 2024 at 9:23 AM Arnaldo Carvalho de Melo
<acme@kernel.org> wrote:
>
> On Wed, Feb 28, 2024 at 01:19:12PM -0800, Namhyung Kim wrote:
> > On Wed, Feb 28, 2024 at 12:16 PM Arnaldo Carvalho de Melo <acme@kernel.org> wrote:
> > > On Wed, Feb 28, 2024 at 12:01:55PM -0800, Namhyung Kim wrote:
> > > > On Wed, Feb 28, 2024 at 4:22 AM Arnaldo Carvalho de Melo <acme@kernel.org> wrote:
> > > > > On Tue, Feb 27, 2024 at 09:33:35PM -0800, Namhyung Kim wrote:
> > > > > > Currently it accounts the contention using delta between timestamps in
> > > > > > lock:contention_begin and lock:contention_end tracepoints.  But it means
> > > > > > the lock should see the both events during the monitoring period.
>
> > > > > > Actually there are 4 cases that happen with the monitoring:
>
> > > > > >                 monitoring period
> > > > > >             /                       \
> > > > > >             |                       |
> > > > > >  1:  B------+-----------------------+--------E
> > > > > >  2:    B----+-------------E         |
> > > > > >  3:         |           B-----------+----E
> > > > > >  4:         |     B-------------E   |
> > > > > >             |                       |
> > > > > >             t0                      t1
>
> > > > > > where B and E mean contention BEGIN and END, respectively.  So it only
> > > > > > accounts the case 4 for now.  It seems there's no way to handle the case
> > > > > > 1.  The case 2 might be handled if it saved the timestamp (t0), but it
> > > > > > lacks the information from the B notably the flags which shows the lock
> > > > > > types.  Also it could be a nested lock which it currently ignores.  So
> > > > > > I think we should ignore the case 2.
>
> > > > > Perhaps have a separate output listing locks that were found to be with
> > > > > at least tE - t0 time, with perhaps a backtrace at that END time?
>
> > > > Do you mean long contentions in case 3?  I'm not sure what do
> > > > you mean by tE, but they started after t0 so cannot be greater
>
> > > case 2
>
> > >                 monitoring period
> > >             /                       \
> > >             |                       |
> > >  2:    B----+-------------E         |
> > >             |             |         |
> > >             t0            tE        t1
> > >
> > > We get a notification for event E, right? We don't have one for B,
> > > because it happened before we were monitoring.
> >
> > Ah, ok.  But there should be too many events in case 2 and
> > I don't think users want to see them all.  And they don't have
>
> So maybe a summary, something like:
>
>   N locks that were locked before this session started have been
>   released, no further info besides this histogram of in-session
>   durations:
>
>     0-N units of time: ++
>   N+1-M units of time: ++++
>     ...

A summary could work.  But I'm not sure about the histogram
since different locks would behave differently - spinlock
vs. mutex/semaphore.  Maybe it's more meaningful when
you have filters or separate histograms for each lock.


>
> > flags.  But maybe we can update the flag when it sees exactly
> > the same callstack later.
>
>   The callstack, if going all the way to userspace may have the workload
> targeted in the command line ( some pid, tid, CPU, etc) and thus would
> point for things the user probably is interested than some other lock
> that may affect it but indirectly.

It doesn't collect user callstacks yet since that requires recording the
memory address space information of every process - basically FORK and
MMAP events.  Maybe we can use callstacks with build-ID and offsets but
that also requires a mapping from build-ID to binary somewhere.

Anyway, it's good to add more features to it.  Let me process this
patch first and think more about it later. :)

Thanks,
Namhyung
  
Namhyung Kim March 1, 2024, 7:30 p.m. UTC | #8
On Tue, 27 Feb 2024 21:33:35 -0800, Namhyung Kim wrote:
> Currently it accounts the contention using delta between timestamps in
> lock:contention_begin and lock:contention_end tracepoints.  But it means
> the lock should see the both events during the monitoring period.
> 
> Actually there are 4 cases that happen with the monitoring:
> 
>                 monitoring period
>             /                       \
>             |                       |
>  1:  B------+-----------------------+--------E
>  2:    B----+-------------E         |
>  3:         |           B-----------+----E
>  4:         |     B-------------E   |
>             |                       |
>             t0                      t1
> 
> [...]

Applied to perf-tools-next, thanks!

Best regards,
  

Patch

diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
index 31ff19afc20c..9af76c6b2543 100644
--- a/tools/perf/util/bpf_lock_contention.c
+++ b/tools/perf/util/bpf_lock_contention.c
@@ -179,6 +179,123 @@  int lock_contention_prepare(struct lock_contention *con)
 	return 0;
 }
 
+/*
+ * Run the BPF program directly using BPF_PROG_TEST_RUN to update the end
+ * timestamp in ktime so that it can calculate delta easily.
+ */
+static void mark_end_timestamp(void)
+{
+	DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
+		.flags = BPF_F_TEST_RUN_ON_CPU,
+	);
+	int prog_fd = bpf_program__fd(skel->progs.end_timestamp);
+
+	bpf_prog_test_run_opts(prog_fd, &opts);
+}
+
+static void update_lock_stat(int map_fd, int pid, u64 end_ts,
+			     enum lock_aggr_mode aggr_mode,
+			     struct tstamp_data *ts_data)
+{
+	u64 delta;
+	struct contention_key stat_key = {};
+	struct contention_data stat_data;
+
+	if (ts_data->timestamp >= end_ts)
+		return;
+
+	delta = end_ts - ts_data->timestamp;
+
+	switch (aggr_mode) {
+	case LOCK_AGGR_CALLER:
+		stat_key.stack_id = ts_data->stack_id;
+		break;
+	case LOCK_AGGR_TASK:
+		stat_key.pid = pid;
+		break;
+	case LOCK_AGGR_ADDR:
+		stat_key.lock_addr_or_cgroup = ts_data->lock;
+		break;
+	case LOCK_AGGR_CGROUP:
+		/* TODO */
+		return;
+	default:
+		return;
+	}
+
+	if (bpf_map_lookup_elem(map_fd, &stat_key, &stat_data) < 0)
+		return;
+
+	stat_data.total_time += delta;
+	stat_data.count++;
+
+	if (delta > stat_data.max_time)
+		stat_data.max_time = delta;
+	if (delta < stat_data.min_time)
+		stat_data.min_time = delta;
+
+	bpf_map_update_elem(map_fd, &stat_key, &stat_data, BPF_EXIST);
+}
+
+/*
+ * Account entries in the tstamp map (which didn't see the corresponding
+ * lock:contention_end tracepoint) using end_ts.
+ */
+static void account_end_timestamp(struct lock_contention *con)
+{
+	int ts_fd, stat_fd;
+	int *prev_key, key;
+	u64 end_ts = skel->bss->end_ts;
+	int total_cpus;
+	enum lock_aggr_mode aggr_mode = con->aggr_mode;
+	struct tstamp_data ts_data, *cpu_data;
+
+	/* Iterate per-task tstamp map (key = TID) */
+	ts_fd = bpf_map__fd(skel->maps.tstamp);
+	stat_fd = bpf_map__fd(skel->maps.lock_stat);
+
+	prev_key = NULL;
+	while (!bpf_map_get_next_key(ts_fd, prev_key, &key)) {
+		if (bpf_map_lookup_elem(ts_fd, &key, &ts_data) == 0) {
+			int pid = key;
+
+			if (aggr_mode == LOCK_AGGR_TASK && con->owner)
+				pid = ts_data.flags;
+
+			update_lock_stat(stat_fd, pid, end_ts, aggr_mode,
+					 &ts_data);
+		}
+
+		prev_key = &key;
+	}
+
+	/* Now it'll check per-cpu tstamp map which doesn't have TID. */
+	if (aggr_mode == LOCK_AGGR_TASK || aggr_mode == LOCK_AGGR_CGROUP)
+		return;
+
+	total_cpus = cpu__max_cpu().cpu;
+	ts_fd = bpf_map__fd(skel->maps.tstamp_cpu);
+
+	cpu_data = calloc(total_cpus, sizeof(*cpu_data));
+	if (cpu_data == NULL)
+		return;
+
+	prev_key = NULL;
+	while (!bpf_map_get_next_key(ts_fd, prev_key, &key)) {
+		if (bpf_map_lookup_elem(ts_fd, &key, cpu_data) < 0)
+			goto next;
+
+		for (int i = 0; i < total_cpus; i++) {
+			update_lock_stat(stat_fd, -1, end_ts, aggr_mode,
+					 &cpu_data[i]);
+		}
+
+next:
+		prev_key = &key;
+	}
+	free(cpu_data);
+}
+
 int lock_contention_start(void)
 {
 	skel->bss->enabled = 1;
@@ -188,6 +305,7 @@  int lock_contention_start(void)
 int lock_contention_stop(void)
 {
 	skel->bss->enabled = 0;
+	mark_end_timestamp();
 	return 0;
 }
 
@@ -301,6 +419,8 @@  int lock_contention_read(struct lock_contention *con)
 	if (stack_trace == NULL)
 		return -1;
 
+	account_end_timestamp(con);
+
 	if (con->aggr_mode == LOCK_AGGR_TASK) {
 		struct thread *idle = __machine__findnew_thread(machine,
 								/*pid=*/0,
diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
index 95cd8414f6ef..fb54bd38e7d0 100644
--- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
+++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
@@ -19,13 +19,6 @@ 
 #define LCB_F_PERCPU	(1U << 4)
 #define LCB_F_MUTEX	(1U << 5)
 
-struct tstamp_data {
-	__u64 timestamp;
-	__u64 lock;
-	__u32 flags;
-	__s32 stack_id;
-};
-
 /* callstack storage  */
 struct {
 	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
@@ -140,6 +133,8 @@  int perf_subsys_id = -1;
 /* determine the key of lock stat */
 int aggr_mode;
 
+__u64 end_ts;
+
 /* error stat */
 int task_fail;
 int stack_fail;
@@ -559,4 +554,11 @@  int BPF_PROG(collect_lock_syms)
 	return 0;
 }
 
+SEC("raw_tp/bpf_test_finish")
+int BPF_PROG(end_timestamp)
+{
+	end_ts = bpf_ktime_get_ns();
+	return 0;
+}
+
 char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h
index 08482daf61be..36af11faad03 100644
--- a/tools/perf/util/bpf_skel/lock_data.h
+++ b/tools/perf/util/bpf_skel/lock_data.h
@@ -3,6 +3,13 @@ 
 #ifndef UTIL_BPF_SKEL_LOCK_DATA_H
 #define UTIL_BPF_SKEL_LOCK_DATA_H
 
+struct tstamp_data {
+	u64 timestamp;
+	u64 lock;
+	u32 flags;
+	u32 stack_id;
+};
+
 struct contention_key {
 	u32 stack_id;
 	u32 pid;