[v4,05/10] riscv: Prepare for user-space perf event mmap support

Message ID 20230703124647.215952-6-alexghiti@rivosinc.com
State New
Series riscv: Allow userspace to directly access perf counters

Commit Message

Alexandre Ghiti July 3, 2023, 12:46 p.m. UTC
Provide all the necessary bits in the generic riscv pmu driver to be
able to mmap perf events in userspace: the heavy lifting lies in the
driver backend, namely the legacy and sbi implementations.

Note that arch_perf_update_userpage is almost a copy of arm64 code.

Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
---
 drivers/perf/riscv_pmu.c       | 105 +++++++++++++++++++++++++++++++++
 include/linux/perf/riscv_pmu.h |   4 ++
 2 files changed, 109 insertions(+)
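
For context, a rough sketch of the userspace side this enables (illustrative
only, not part of the patch): mmap the event's first page, then read the
counter directly through the riscv counter CSRs, retrying while the seqlock
changes. The CSR selection below assumes index - 1 is the offset from the
cycle CSR, as the sbi backend's csr_index() callback arranges later in the
series; error handling and enabling direct user access (the perf_user_access
sysctl added later in the series) are omitted.

#include <linux/perf_event.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static uint64_t read_counter_csr(uint32_t idx)
{
	uint64_t val = 0;

	/* Assumption: index - 1 is the CSR offset from the cycle CSR */
	switch (idx - 1) {
	case 0:	/* cycle */
		asm volatile("csrr %0, cycle" : "=r" (val));
		break;
	case 2:	/* instret */
		asm volatile("csrr %0, instret" : "=r" (val));
		break;
	}
	return val;
}

int main(void)
{
	struct perf_event_attr attr = {
		.type = PERF_TYPE_HARDWARE,
		.config = PERF_COUNT_HW_INSTRUCTIONS,
		.size = sizeof(attr),
	};
	struct perf_event_mmap_page *pc;
	uint64_t count;
	uint32_t seq, idx;
	int fd;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	pc = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED, fd, 0);

	do {
		seq = pc->lock;
		asm volatile("fence r, r" ::: "memory");
		idx = pc->index;	/* csr_index(event) + 1, 0 if not usable */
		count = pc->offset;
		if (pc->cap_user_rdpmc && idx)
			count += read_counter_csr(idx);
		asm volatile("fence r, r" ::: "memory");
	} while (pc->lock != seq);

	munmap(pc, sysconf(_SC_PAGESIZE));
	close(fd);
	return 0;
}

Because this patch advertises pmc_width = 64, no sign extension of the raw
CSR value is needed before adding it to pc->offset.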
  

Comments

Atish Patra July 14, 2023, 8:03 a.m. UTC | #1
On Mon, Jul 3, 2023 at 5:51 AM Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
>
> Provide all the necessary bits in the generic riscv pmu driver to be
> able to mmap perf events in userspace: the heavy lifting lies in the
> driver backend, namely the legacy and sbi implementations.
>
> Note that arch_perf_update_userpage is almost a copy of arm64 code.
>
> Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
> Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
> ---
> [...]

Reviewed-by: Atish Patra <atishp@rivosinc.com>
  

Patch

diff --git a/drivers/perf/riscv_pmu.c b/drivers/perf/riscv_pmu.c
index ebca5eab9c9b..432ad2e80ce3 100644
--- a/drivers/perf/riscv_pmu.c
+++ b/drivers/perf/riscv_pmu.c
@@ -14,9 +14,73 @@
 #include <linux/perf/riscv_pmu.h>
 #include <linux/printk.h>
 #include <linux/smp.h>
+#include <linux/sched_clock.h>
 
 #include <asm/sbi.h>
 
+static bool riscv_perf_user_access(struct perf_event *event)
+{
+	return ((event->attr.type == PERF_TYPE_HARDWARE) ||
+		(event->attr.type == PERF_TYPE_HW_CACHE) ||
+		(event->attr.type == PERF_TYPE_RAW)) &&
+		!!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT);
+}
+
+void arch_perf_update_userpage(struct perf_event *event,
+			       struct perf_event_mmap_page *userpg, u64 now)
+{
+	struct clock_read_data *rd;
+	unsigned int seq;
+	u64 ns;
+
+	userpg->cap_user_time = 0;
+	userpg->cap_user_time_zero = 0;
+	userpg->cap_user_time_short = 0;
+	userpg->cap_user_rdpmc = riscv_perf_user_access(event);
+
+	userpg->pmc_width = 64;
+
+	do {
+		rd = sched_clock_read_begin(&seq);
+
+		userpg->time_mult = rd->mult;
+		userpg->time_shift = rd->shift;
+		userpg->time_zero = rd->epoch_ns;
+		userpg->time_cycles = rd->epoch_cyc;
+		userpg->time_mask = rd->sched_clock_mask;
+
+		/*
+		 * Subtract the cycle base, such that software that
+		 * doesn't know about cap_user_time_short still 'works'
+		 * assuming no wraps.
+		 */
+		ns = mul_u64_u32_shr(rd->epoch_cyc, rd->mult, rd->shift);
+		userpg->time_zero -= ns;
+
+	} while (sched_clock_read_retry(seq));
+
+	userpg->time_offset = userpg->time_zero - now;
+
+	/*
+	 * time_shift is not expected to be greater than 31 due to
+	 * the original published conversion algorithm shifting a
+	 * 32-bit value (now specifies a 64-bit value) - refer
+	 * perf_event_mmap_page documentation in perf_event.h.
+	 */
+	if (userpg->time_shift == 32) {
+		userpg->time_shift = 31;
+		userpg->time_mult >>= 1;
+	}
+
+	/*
+	 * Internal timekeeping for enabled/running/stopped times
+	 * is always computed with the sched_clock.
+	 */
+	userpg->cap_user_time = 1;
+	userpg->cap_user_time_zero = 1;
+	userpg->cap_user_time_short = 1;
+}
+
 static unsigned long csr_read_num(int csr_num)
 {
 #define switchcase_csr_read(__csr_num, __val)		{\
@@ -171,6 +235,8 @@ int riscv_pmu_event_set_period(struct perf_event *event)
 
 	local64_set(&hwc->prev_count, (u64)-left);
 
+	perf_event_update_userpage(event);
+
 	return overflow;
 }
 
@@ -267,6 +333,9 @@ static int riscv_pmu_event_init(struct perf_event *event)
 	hwc->idx = -1;
 	hwc->event_base = mapped_event;
 
+	if (rvpmu->event_init)
+		rvpmu->event_init(event);
+
 	if (!is_sampling_event(event)) {
 		/*
 		 * For non-sampling runs, limit the sample_period to half
@@ -283,6 +352,39 @@ static int riscv_pmu_event_init(struct perf_event *event)
 	return 0;
 }
 
+static int riscv_pmu_event_idx(struct perf_event *event)
+{
+	struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
+
+	if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
+		return 0;
+
+	if (rvpmu->csr_index)
+		return rvpmu->csr_index(event) + 1;
+
+	return 0;
+}
+
+static void riscv_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
+{
+	struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
+
+	if (rvpmu->event_mapped) {
+		rvpmu->event_mapped(event, mm);
+		perf_event_update_userpage(event);
+	}
+}
+
+static void riscv_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
+{
+	struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
+
+	if (rvpmu->event_unmapped) {
+		rvpmu->event_unmapped(event, mm);
+		perf_event_update_userpage(event);
+	}
+}
+
 struct riscv_pmu *riscv_pmu_alloc(void)
 {
 	struct riscv_pmu *pmu;
@@ -307,6 +409,9 @@ struct riscv_pmu *riscv_pmu_alloc(void)
 	}
 	pmu->pmu = (struct pmu) {
 		.event_init	= riscv_pmu_event_init,
+		.event_mapped	= riscv_pmu_event_mapped,
+		.event_unmapped	= riscv_pmu_event_unmapped,
+		.event_idx	= riscv_pmu_event_idx,
 		.add		= riscv_pmu_add,
 		.del		= riscv_pmu_del,
 		.start		= riscv_pmu_start,
diff --git a/include/linux/perf/riscv_pmu.h b/include/linux/perf/riscv_pmu.h
index 5deeea0be7cb..43282e22ebe1 100644
--- a/include/linux/perf/riscv_pmu.h
+++ b/include/linux/perf/riscv_pmu.h
@@ -55,6 +55,10 @@ struct riscv_pmu {
 	void		(*ctr_start)(struct perf_event *event, u64 init_val);
 	void		(*ctr_stop)(struct perf_event *event, unsigned long flag);
 	int		(*event_map)(struct perf_event *event, u64 *config);
+	void		(*event_init)(struct perf_event *event);
+	void		(*event_mapped)(struct perf_event *event, struct mm_struct *mm);
+	void		(*event_unmapped)(struct perf_event *event, struct mm_struct *mm);
+	uint8_t		(*csr_index)(struct perf_event *event);
 
 	struct cpu_hw_events	__percpu *hw_events;
 	struct hlist_node	node;
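
For reference, the conversion userspace performs with the time fields
arch_perf_update_userpage() publishes amounts to the sketch below, per the
perf_event_mmap_page documentation in include/uapi/linux/perf_event.h. It
belongs inside the same seqlock retry loop as the counter read above, and
reading rdtime here is an assumption that the riscv timer is the registered
sched_clock source.

#include <linux/perf_event.h>
#include <stdint.h>

/*
 * Illustrative sketch, not part of the series: scale a raw sched_clock
 * cycle reading to nanoseconds using the fields that
 * arch_perf_update_userpage() fills in.
 */
static uint64_t user_time_ns(const struct perf_event_mmap_page *pc)
{
	uint64_t cyc;

	/* Assumption: rdtime is the sched_clock source on this system */
	asm volatile("rdtime %0" : "=r" (cyc));
	/* cap_user_time_short: only the delta against time_cycles is valid */
	cyc = pc->time_cycles + ((cyc - pc->time_cycles) & pc->time_mask);
	/* time_zero + mul_u64_u32_shr(cyc, time_mult, time_shift) */
	return pc->time_zero +
	       (uint64_t)(((__uint128_t)cyc * pc->time_mult) >> pc->time_shift);
}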