[v4,09/10] tools: lib: perf: Implement riscv mmap support

Message ID 20230727141428.962286-10-alexghiti@rivosinc.com
State New
Headers
Series riscv: Allow userspace to directly access perf counters |

Commit Message

Alexandre Ghiti July 27, 2023, 2:14 p.m. UTC
  riscv now supports mmaping hardware counters so add what's needed to
take advantage of that in libperf.

Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Reviewed-by: Atish Patra <atishp@rivosinc.com>
---
 tools/lib/perf/mmap.c | 65 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)
  

Comments

Ian Rogers July 28, 2023, 5:52 p.m. UTC | #1
On Thu, Jul 27, 2023 at 7:28 AM Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
>
> riscv now supports mmaping hardware counters so add what's needed to
> take advantage of that in libperf.
>
> Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
> Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
> Reviewed-by: Atish Patra <atishp@rivosinc.com>
> ---
>  tools/lib/perf/mmap.c | 65 +++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 65 insertions(+)
>
> diff --git a/tools/lib/perf/mmap.c b/tools/lib/perf/mmap.c
> index 0d1634cedf44..378a163f0554 100644
> --- a/tools/lib/perf/mmap.c
> +++ b/tools/lib/perf/mmap.c
> @@ -392,6 +392,71 @@ static u64 read_perf_counter(unsigned int counter)
>
>  static u64 read_timestamp(void) { return read_sysreg(cntvct_el0); }
>
> +#elif __riscv_xlen == 64

This is something of an odd guard, perhaps:
#elif defined(__riscv) && __riscv_xlen == 64

That way it is more intention revealing that this is riscv code. Could
you add a comment relating to the __riscv_xlen ?

> +
> +/* TODO: implement rv32 support */
> +
> +#define CSR_CYCLE      0xc00
> +#define CSR_TIME       0xc01
> +
> +#define csr_read(csr)                                          \
> +({                                                             \
> +       register unsigned long __v;                             \
> +               __asm__ __volatile__ ("csrr %0, " #csr          \
> +                : "=r" (__v) :                                 \
> +                : "memory");                                   \

To avoid the macro pasting that could potentially go weird, could this be:

__asm__ __volatile__ ("csrr %0, %1",
  : "=r"(__v) /* outputs */
  : "i"(csr) /* inputs */
  : "memory" /* clobbers */)

Also, why is this clobbering memory? Worth adding a comment.

Thanks,
Ian

> +                __v;                                           \
> +})
> +
> +static unsigned long csr_read_num(int csr_num)
> +{
> +#define switchcase_csr_read(__csr_num, __val)           {\
> +       case __csr_num:                                 \
> +               __val = csr_read(__csr_num);            \
> +               break; }
> +#define switchcase_csr_read_2(__csr_num, __val)         {\
> +       switchcase_csr_read(__csr_num + 0, __val)        \
> +       switchcase_csr_read(__csr_num + 1, __val)}
> +#define switchcase_csr_read_4(__csr_num, __val)         {\
> +       switchcase_csr_read_2(__csr_num + 0, __val)      \
> +       switchcase_csr_read_2(__csr_num + 2, __val)}
> +#define switchcase_csr_read_8(__csr_num, __val)         {\
> +       switchcase_csr_read_4(__csr_num + 0, __val)      \
> +       switchcase_csr_read_4(__csr_num + 4, __val)}
> +#define switchcase_csr_read_16(__csr_num, __val)        {\
> +       switchcase_csr_read_8(__csr_num + 0, __val)      \
> +       switchcase_csr_read_8(__csr_num + 8, __val)}
> +#define switchcase_csr_read_32(__csr_num, __val)        {\
> +       switchcase_csr_read_16(__csr_num + 0, __val)     \
> +       switchcase_csr_read_16(__csr_num + 16, __val)}
> +
> +       unsigned long ret = 0;
> +
> +       switch (csr_num) {
> +       switchcase_csr_read_32(CSR_CYCLE, ret)
> +       default:
> +               break;
> +       }
> +
> +       return ret;
> +#undef switchcase_csr_read_32
> +#undef switchcase_csr_read_16
> +#undef switchcase_csr_read_8
> +#undef switchcase_csr_read_4
> +#undef switchcase_csr_read_2
> +#undef switchcase_csr_read
> +}
> +
> +static u64 read_perf_counter(unsigned int counter)
> +{
> +       return csr_read_num(CSR_CYCLE + counter);
> +}
> +
> +static u64 read_timestamp(void)
> +{
> +       return csr_read_num(CSR_TIME);
> +}
> +
>  #else
>  static u64 read_perf_counter(unsigned int counter __maybe_unused) { return 0; }
>  static u64 read_timestamp(void) { return 0; }
> --
> 2.39.2
>
  
Alexandre Ghiti July 31, 2023, 10:15 a.m. UTC | #2
Hi Ian,

On Fri, Jul 28, 2023 at 7:53 PM Ian Rogers <irogers@google.com> wrote:
>
> On Thu, Jul 27, 2023 at 7:28 AM Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
> >
> > riscv now supports mmaping hardware counters so add what's needed to
> > take advantage of that in libperf.
> >
> > Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
> > Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
> > Reviewed-by: Atish Patra <atishp@rivosinc.com>
> > ---
> >  tools/lib/perf/mmap.c | 65 +++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 65 insertions(+)
> >
> > diff --git a/tools/lib/perf/mmap.c b/tools/lib/perf/mmap.c
> > index 0d1634cedf44..378a163f0554 100644
> > --- a/tools/lib/perf/mmap.c
> > +++ b/tools/lib/perf/mmap.c
> > @@ -392,6 +392,71 @@ static u64 read_perf_counter(unsigned int counter)
> >
> >  static u64 read_timestamp(void) { return read_sysreg(cntvct_el0); }
> >
> > +#elif __riscv_xlen == 64
>
> This is something of an odd guard, perhaps:
> #elif defined(__riscv) && __riscv_xlen == 64
>
> That way it is more intention revealing that this is riscv code. Could
> you add a comment relating to the __riscv_xlen ?

I guess Andrew answered that already.

>
> > +
> > +/* TODO: implement rv32 support */
> > +
> > +#define CSR_CYCLE      0xc00
> > +#define CSR_TIME       0xc01
> > +
> > +#define csr_read(csr)                                          \
> > +({                                                             \
> > +       register unsigned long __v;                             \
> > +               __asm__ __volatile__ ("csrr %0, " #csr          \
> > +                : "=r" (__v) :                                 \
> > +                : "memory");                                   \
>
> To avoid the macro pasting that could potentially go weird, could this be:
>
> __asm__ __volatile__ ("csrr %0, %1",
>   : "=r"(__v) /* outputs */
>   : "i"(csr) /* inputs */
>   : "memory" /* clobbers */)
>
> Also, why is this clobbering memory? Worth adding a comment.

No idea, I see that it is also done this way in
arch/riscv/include/asm/csr.h. @Atish Kumar Patra , @Palmer Dabbelt ?

Thanks for your comments!

Alex

>
> Thanks,
> Ian
>
> > +                __v;                                           \
> > +})
> > +
> > +static unsigned long csr_read_num(int csr_num)
> > +{
> > +#define switchcase_csr_read(__csr_num, __val)           {\
> > +       case __csr_num:                                 \
> > +               __val = csr_read(__csr_num);            \
> > +               break; }
> > +#define switchcase_csr_read_2(__csr_num, __val)         {\
> > +       switchcase_csr_read(__csr_num + 0, __val)        \
> > +       switchcase_csr_read(__csr_num + 1, __val)}
> > +#define switchcase_csr_read_4(__csr_num, __val)         {\
> > +       switchcase_csr_read_2(__csr_num + 0, __val)      \
> > +       switchcase_csr_read_2(__csr_num + 2, __val)}
> > +#define switchcase_csr_read_8(__csr_num, __val)         {\
> > +       switchcase_csr_read_4(__csr_num + 0, __val)      \
> > +       switchcase_csr_read_4(__csr_num + 4, __val)}
> > +#define switchcase_csr_read_16(__csr_num, __val)        {\
> > +       switchcase_csr_read_8(__csr_num + 0, __val)      \
> > +       switchcase_csr_read_8(__csr_num + 8, __val)}
> > +#define switchcase_csr_read_32(__csr_num, __val)        {\
> > +       switchcase_csr_read_16(__csr_num + 0, __val)     \
> > +       switchcase_csr_read_16(__csr_num + 16, __val)}
> > +
> > +       unsigned long ret = 0;
> > +
> > +       switch (csr_num) {
> > +       switchcase_csr_read_32(CSR_CYCLE, ret)
> > +       default:
> > +               break;
> > +       }
> > +
> > +       return ret;
> > +#undef switchcase_csr_read_32
> > +#undef switchcase_csr_read_16
> > +#undef switchcase_csr_read_8
> > +#undef switchcase_csr_read_4
> > +#undef switchcase_csr_read_2
> > +#undef switchcase_csr_read
> > +}
> > +
> > +static u64 read_perf_counter(unsigned int counter)
> > +{
> > +       return csr_read_num(CSR_CYCLE + counter);
> > +}
> > +
> > +static u64 read_timestamp(void)
> > +{
> > +       return csr_read_num(CSR_TIME);
> > +}
> > +
> >  #else
> >  static u64 read_perf_counter(unsigned int counter __maybe_unused) { return 0; }
> >  static u64 read_timestamp(void) { return 0; }
> > --
> > 2.39.2
> >
  
Alexandre Ghiti July 31, 2023, 10:27 a.m. UTC | #3
On Mon, Jul 31, 2023 at 12:15 PM Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
>
> Hi Ian,
>
> On Fri, Jul 28, 2023 at 7:53 PM Ian Rogers <irogers@google.com> wrote:
> >
> > On Thu, Jul 27, 2023 at 7:28 AM Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
> > >
> > > riscv now supports mmaping hardware counters so add what's needed to
> > > take advantage of that in libperf.
> > >
> > > Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
> > > Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
> > > Reviewed-by: Atish Patra <atishp@rivosinc.com>
> > > ---
> > >  tools/lib/perf/mmap.c | 65 +++++++++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 65 insertions(+)
> > >
> > > diff --git a/tools/lib/perf/mmap.c b/tools/lib/perf/mmap.c
> > > index 0d1634cedf44..378a163f0554 100644
> > > --- a/tools/lib/perf/mmap.c
> > > +++ b/tools/lib/perf/mmap.c
> > > @@ -392,6 +392,71 @@ static u64 read_perf_counter(unsigned int counter)
> > >
> > >  static u64 read_timestamp(void) { return read_sysreg(cntvct_el0); }
> > >
> > > +#elif __riscv_xlen == 64
> >
> > This is something of an odd guard, perhaps:
> > #elif defined(__riscv) && __riscv_xlen == 64
> >
> > That way it is more intention revealing that this is riscv code. Could
> > you add a comment relating to the __riscv_xlen ?
>
> I guess Andrew answered that already.
>
> >
> > > +
> > > +/* TODO: implement rv32 support */
> > > +
> > > +#define CSR_CYCLE      0xc00
> > > +#define CSR_TIME       0xc01
> > > +
> > > +#define csr_read(csr)                                          \
> > > +({                                                             \
> > > +       register unsigned long __v;                             \
> > > +               __asm__ __volatile__ ("csrr %0, " #csr          \
> > > +                : "=r" (__v) :                                 \
> > > +                : "memory");                                   \
> >
> > To avoid the macro pasting that could potentially go weird, could this be:
> >
> > __asm__ __volatile__ ("csrr %0, %1",
> >   : "=r"(__v) /* outputs */
> >   : "i"(csr) /* inputs */
> >   : "memory" /* clobbers */)

Forgot to answer this one: it compiles, but I have to admit that I
don't understand the difference and if that's correct (all macros in
arch/riscv/include/asm/csr.h use # to do this) and what benefits it
brings. Can you elaborate more on things that could "go weird"?

Thanks again,

Alex

> >
> > Also, why is this clobbering memory? Worth adding a comment.
>
> No idea, I see that it is also done this way in
> arch/riscv/include/asm/csr.h. @Atish Kumar Patra , @Palmer Dabbelt ?
>
> Thanks for your comments!
>
> Alex
>
> >
> > Thanks,
> > Ian
> >
> > > +                __v;                                           \
> > > +})
> > > +
> > > +static unsigned long csr_read_num(int csr_num)
> > > +{
> > > +#define switchcase_csr_read(__csr_num, __val)           {\
> > > +       case __csr_num:                                 \
> > > +               __val = csr_read(__csr_num);            \
> > > +               break; }
> > > +#define switchcase_csr_read_2(__csr_num, __val)         {\
> > > +       switchcase_csr_read(__csr_num + 0, __val)        \
> > > +       switchcase_csr_read(__csr_num + 1, __val)}
> > > +#define switchcase_csr_read_4(__csr_num, __val)         {\
> > > +       switchcase_csr_read_2(__csr_num + 0, __val)      \
> > > +       switchcase_csr_read_2(__csr_num + 2, __val)}
> > > +#define switchcase_csr_read_8(__csr_num, __val)         {\
> > > +       switchcase_csr_read_4(__csr_num + 0, __val)      \
> > > +       switchcase_csr_read_4(__csr_num + 4, __val)}
> > > +#define switchcase_csr_read_16(__csr_num, __val)        {\
> > > +       switchcase_csr_read_8(__csr_num + 0, __val)      \
> > > +       switchcase_csr_read_8(__csr_num + 8, __val)}
> > > +#define switchcase_csr_read_32(__csr_num, __val)        {\
> > > +       switchcase_csr_read_16(__csr_num + 0, __val)     \
> > > +       switchcase_csr_read_16(__csr_num + 16, __val)}
> > > +
> > > +       unsigned long ret = 0;
> > > +
> > > +       switch (csr_num) {
> > > +       switchcase_csr_read_32(CSR_CYCLE, ret)
> > > +       default:
> > > +               break;
> > > +       }
> > > +
> > > +       return ret;
> > > +#undef switchcase_csr_read_32
> > > +#undef switchcase_csr_read_16
> > > +#undef switchcase_csr_read_8
> > > +#undef switchcase_csr_read_4
> > > +#undef switchcase_csr_read_2
> > > +#undef switchcase_csr_read
> > > +}
> > > +
> > > +static u64 read_perf_counter(unsigned int counter)
> > > +{
> > > +       return csr_read_num(CSR_CYCLE + counter);
> > > +}
> > > +
> > > +static u64 read_timestamp(void)
> > > +{
> > > +       return csr_read_num(CSR_TIME);
> > > +}
> > > +
> > >  #else
> > >  static u64 read_perf_counter(unsigned int counter __maybe_unused) { return 0; }
> > >  static u64 read_timestamp(void) { return 0; }
> > > --
> > > 2.39.2
> > >
  
Ian Rogers July 31, 2023, 3:10 p.m. UTC | #4
On Mon, Jul 31, 2023 at 3:27 AM Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
>
> On Mon, Jul 31, 2023 at 12:15 PM Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
> >
> > Hi Ian,
> >
> > On Fri, Jul 28, 2023 at 7:53 PM Ian Rogers <irogers@google.com> wrote:
> > >
> > > On Thu, Jul 27, 2023 at 7:28 AM Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
> > > >
> > > > riscv now supports mmaping hardware counters so add what's needed to
> > > > take advantage of that in libperf.
> > > >
> > > > Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
> > > > Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
> > > > Reviewed-by: Atish Patra <atishp@rivosinc.com>
> > > > ---
> > > >  tools/lib/perf/mmap.c | 65 +++++++++++++++++++++++++++++++++++++++++++
> > > >  1 file changed, 65 insertions(+)
> > > >
> > > > diff --git a/tools/lib/perf/mmap.c b/tools/lib/perf/mmap.c
> > > > index 0d1634cedf44..378a163f0554 100644
> > > > --- a/tools/lib/perf/mmap.c
> > > > +++ b/tools/lib/perf/mmap.c
> > > > @@ -392,6 +392,71 @@ static u64 read_perf_counter(unsigned int counter)
> > > >
> > > >  static u64 read_timestamp(void) { return read_sysreg(cntvct_el0); }
> > > >
> > > > +#elif __riscv_xlen == 64
> > >
> > > This is something of an odd guard, perhaps:
> > > #elif defined(__riscv) && __riscv_xlen == 64
> > >
> > > That way it is more intention revealing that this is riscv code. Could
> > > you add a comment relating to the __riscv_xlen ?
> >
> > I guess Andrew answered that already.
> >

Not sure. I still think it looks weird:
...
#if defined(__i386__) || defined(__x86_64__)
...
#elif defined(__aarch64__)
...
#elif __riscv_xlen == 64
...
#else
static u64 read_perf_counter(unsigned int counter __maybe_unused) { return 0; }
static u64 read_timestamp(void) { return 0; }
#endif

The first two are clearly #ifdef-ing architecture specific assembly
code, under what conditions I get RISC-V code  ¯\(ツ)/¯ At least worth
a comment like "csrr is only available when you have xlens of 64
because ..."

> > >
> > > > +
> > > > +/* TODO: implement rv32 support */
> > > > +
> > > > +#define CSR_CYCLE      0xc00
> > > > +#define CSR_TIME       0xc01
> > > > +
> > > > +#define csr_read(csr)                                          \
> > > > +({                                                             \
> > > > +       register unsigned long __v;                             \
> > > > +               __asm__ __volatile__ ("csrr %0, " #csr          \
> > > > +                : "=r" (__v) :                                 \
> > > > +                : "memory");                                   \
> > >
> > > To avoid the macro pasting that could potentially go weird, could this be:
> > >
> > > __asm__ __volatile__ ("csrr %0, %1",
> > >   : "=r"(__v) /* outputs */
> > >   : "i"(csr) /* inputs */
> > >   : "memory" /* clobbers */)
>
> Forgot to answer this one: it compiles, but I have to admit that I
> don't understand the difference and if that's correct (all macros in
> arch/riscv/include/asm/csr.h use # to do this) and what benefits it
> brings. Can you elaborate more on things that could "go weird"?

So rather than use an input constraint for the asm block you are using
the C preprocessor to paste in the csr argument. If csr is something
like "1" then it looks good and you'll get "csrr %0,1". If you pass
something like "1 << 31" then that will be pasted as "csrr %0, 1 <<
31" and that starts to get weird in the context of being in the
assembler where it is unlikely the C operators work. Using the input
constraint avoids this, causes the C compiler to check the type of the
argument and you'll probably get more intelligible error messages as a
consequence.

>
> Thanks again,
>
> Alex
>
> > >
> > > Also, why is this clobbering memory? Worth adding a comment.
> >
> > No idea, I see that it is also done this way in
> > arch/riscv/include/asm/csr.h. @Atish Kumar Patra , @Palmer Dabbelt ?

It would seem to make sense then not to have a memory constraint until
we know why we're doing it?

Thanks,
Ian

> >
> > Thanks for your comments!
> >
> > Alex
> >
> > >
> > > Thanks,
> > > Ian
> > >
> > > > +                __v;                                           \
> > > > +})
> > > > +
> > > > +static unsigned long csr_read_num(int csr_num)
> > > > +{
> > > > +#define switchcase_csr_read(__csr_num, __val)           {\
> > > > +       case __csr_num:                                 \
> > > > +               __val = csr_read(__csr_num);            \
> > > > +               break; }
> > > > +#define switchcase_csr_read_2(__csr_num, __val)         {\
> > > > +       switchcase_csr_read(__csr_num + 0, __val)        \
> > > > +       switchcase_csr_read(__csr_num + 1, __val)}
> > > > +#define switchcase_csr_read_4(__csr_num, __val)         {\
> > > > +       switchcase_csr_read_2(__csr_num + 0, __val)      \
> > > > +       switchcase_csr_read_2(__csr_num + 2, __val)}
> > > > +#define switchcase_csr_read_8(__csr_num, __val)         {\
> > > > +       switchcase_csr_read_4(__csr_num + 0, __val)      \
> > > > +       switchcase_csr_read_4(__csr_num + 4, __val)}
> > > > +#define switchcase_csr_read_16(__csr_num, __val)        {\
> > > > +       switchcase_csr_read_8(__csr_num + 0, __val)      \
> > > > +       switchcase_csr_read_8(__csr_num + 8, __val)}
> > > > +#define switchcase_csr_read_32(__csr_num, __val)        {\
> > > > +       switchcase_csr_read_16(__csr_num + 0, __val)     \
> > > > +       switchcase_csr_read_16(__csr_num + 16, __val)}
> > > > +
> > > > +       unsigned long ret = 0;
> > > > +
> > > > +       switch (csr_num) {
> > > > +       switchcase_csr_read_32(CSR_CYCLE, ret)
> > > > +       default:
> > > > +               break;
> > > > +       }
> > > > +
> > > > +       return ret;
> > > > +#undef switchcase_csr_read_32
> > > > +#undef switchcase_csr_read_16
> > > > +#undef switchcase_csr_read_8
> > > > +#undef switchcase_csr_read_4
> > > > +#undef switchcase_csr_read_2
> > > > +#undef switchcase_csr_read
> > > > +}
> > > > +
> > > > +static u64 read_perf_counter(unsigned int counter)
> > > > +{
> > > > +       return csr_read_num(CSR_CYCLE + counter);
> > > > +}
> > > > +
> > > > +static u64 read_timestamp(void)
> > > > +{
> > > > +       return csr_read_num(CSR_TIME);
> > > > +}
> > > > +
> > > >  #else
> > > >  static u64 read_perf_counter(unsigned int counter __maybe_unused) { return 0; }
> > > >  static u64 read_timestamp(void) { return 0; }
> > > > --
> > > > 2.39.2
> > > >
  
Alexandre Ghiti July 31, 2023, 4:06 p.m. UTC | #5
On Mon, Jul 31, 2023 at 5:10 PM Ian Rogers <irogers@google.com> wrote:
>
> On Mon, Jul 31, 2023 at 3:27 AM Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
> >
> > On Mon, Jul 31, 2023 at 12:15 PM Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
> > >
> > > Hi Ian,
> > >
> > > On Fri, Jul 28, 2023 at 7:53 PM Ian Rogers <irogers@google.com> wrote:
> > > >
> > > > On Thu, Jul 27, 2023 at 7:28 AM Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
> > > > >
> > > > > riscv now supports mmaping hardware counters so add what's needed to
> > > > > take advantage of that in libperf.
> > > > >
> > > > > Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
> > > > > Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
> > > > > Reviewed-by: Atish Patra <atishp@rivosinc.com>
> > > > > ---
> > > > >  tools/lib/perf/mmap.c | 65 +++++++++++++++++++++++++++++++++++++++++++
> > > > >  1 file changed, 65 insertions(+)
> > > > >
> > > > > diff --git a/tools/lib/perf/mmap.c b/tools/lib/perf/mmap.c
> > > > > index 0d1634cedf44..378a163f0554 100644
> > > > > --- a/tools/lib/perf/mmap.c
> > > > > +++ b/tools/lib/perf/mmap.c
> > > > > @@ -392,6 +392,71 @@ static u64 read_perf_counter(unsigned int counter)
> > > > >
> > > > >  static u64 read_timestamp(void) { return read_sysreg(cntvct_el0); }
> > > > >
> > > > > +#elif __riscv_xlen == 64
> > > >
> > > > This is something of an odd guard, perhaps:
> > > > #elif defined(__riscv) && __riscv_xlen == 64
> > > >
> > > > That way it is more intention revealing that this is riscv code. Could
> > > > you add a comment relating to the __riscv_xlen ?
> > >
> > > I guess Andrew answered that already.
> > >
>
> Not sure. I still think it looks weird:
> ...
> #if defined(__i386__) || defined(__x86_64__)
> ...
> #elif defined(__aarch64__)
> ...
> #elif __riscv_xlen == 64
> ...
> #else
> static u64 read_perf_counter(unsigned int counter __maybe_unused) { return 0; }
> static u64 read_timestamp(void) { return 0; }
> #endif
>
> The first two are clearly #ifdef-ing architecture specific assembly
> code, under what conditions I get RISC-V code  ¯\(ツ)/¯ At least worth
> a comment like "csrr is only available when you have xlens of 64
> because ..."

__riscv_xlen indicates riscv64, which is the only target of this
patchset. But if you prefer, I don't mind adding back the
defined(__riscv) if I re-spin a new version.

>
> > > >
> > > > > +
> > > > > +/* TODO: implement rv32 support */
> > > > > +
> > > > > +#define CSR_CYCLE      0xc00
> > > > > +#define CSR_TIME       0xc01
> > > > > +
> > > > > +#define csr_read(csr)                                          \
> > > > > +({                                                             \
> > > > > +       register unsigned long __v;                             \
> > > > > +               __asm__ __volatile__ ("csrr %0, " #csr          \
> > > > > +                : "=r" (__v) :                                 \
> > > > > +                : "memory");                                   \
> > > >
> > > > To avoid the macro pasting that could potentially go weird, could this be:
> > > >
> > > > __asm__ __volatile__ ("csrr %0, %1",
> > > >   : "=r"(__v) /* outputs */
> > > >   : "i"(csr) /* inputs */
> > > >   : "memory" /* clobbers */)
> >
> > Forgot to answer this one: it compiles, but I have to admit that I
> > don't understand the difference and if that's correct (all macros in
> > arch/riscv/include/asm/csr.h use # to do this) and what benefits it
> > brings. Can you elaborate more on things that could "go weird"?
>
> So rather than use an input constraint for the asm block you are using
> the C preprocessor to paste in the csr argument. If csr is something
> like "1" then it looks good and you'll get "csrr %0,1". If you pass
> something like "1 << 31" then that will be pasted as "csrr %0, 1 <<
> 31" and that starts to get weird in the context of being in the
> assembler where it is unlikely the C operators work. Using the input
> constraint avoids this, causes the C compiler to check the type of the
> argument and you'll probably get more intelligible error messages as a
> consequence.
>

Thanks. So if I'm not mistaken, in this exact context, given we only
use csr_read() through the csr_read_num() function, it seems ok right?

> >
> > Thanks again,
> >
> > Alex
> >
> > > >
> > > > Also, why is this clobbering memory? Worth adding a comment.
> > >
> > > No idea, I see that it is also done this way in
> > > arch/riscv/include/asm/csr.h. @Atish Kumar Patra , @Palmer Dabbelt ?
>
> It would seem to make sense then not to have a memory constraint until
> we know why we're doing it?
>

I have just had the answer internally (thanks to @Brendan Sweeney):
csr modifications can alter how the memory is accessed (satp which
changes the address space, sum which allows/disallows userspace
access...), so we need the memory barrier to make sure the compiler
does not reorder the memory accesses.

Thanks,

Alex

> Thanks,
> Ian
>
> > >
> > > Thanks for your comments!
> > >
> > > Alex
> > >
> > > >
> > > > Thanks,
> > > > Ian
> > > >
> > > > > +                __v;                                           \
> > > > > +})
> > > > > +
> > > > > +static unsigned long csr_read_num(int csr_num)
> > > > > +{
> > > > > +#define switchcase_csr_read(__csr_num, __val)           {\
> > > > > +       case __csr_num:                                 \
> > > > > +               __val = csr_read(__csr_num);            \
> > > > > +               break; }
> > > > > +#define switchcase_csr_read_2(__csr_num, __val)         {\
> > > > > +       switchcase_csr_read(__csr_num + 0, __val)        \
> > > > > +       switchcase_csr_read(__csr_num + 1, __val)}
> > > > > +#define switchcase_csr_read_4(__csr_num, __val)         {\
> > > > > +       switchcase_csr_read_2(__csr_num + 0, __val)      \
> > > > > +       switchcase_csr_read_2(__csr_num + 2, __val)}
> > > > > +#define switchcase_csr_read_8(__csr_num, __val)         {\
> > > > > +       switchcase_csr_read_4(__csr_num + 0, __val)      \
> > > > > +       switchcase_csr_read_4(__csr_num + 4, __val)}
> > > > > +#define switchcase_csr_read_16(__csr_num, __val)        {\
> > > > > +       switchcase_csr_read_8(__csr_num + 0, __val)      \
> > > > > +       switchcase_csr_read_8(__csr_num + 8, __val)}
> > > > > +#define switchcase_csr_read_32(__csr_num, __val)        {\
> > > > > +       switchcase_csr_read_16(__csr_num + 0, __val)     \
> > > > > +       switchcase_csr_read_16(__csr_num + 16, __val)}
> > > > > +
> > > > > +       unsigned long ret = 0;
> > > > > +
> > > > > +       switch (csr_num) {
> > > > > +       switchcase_csr_read_32(CSR_CYCLE, ret)
> > > > > +       default:
> > > > > +               break;
> > > > > +       }
> > > > > +
> > > > > +       return ret;
> > > > > +#undef switchcase_csr_read_32
> > > > > +#undef switchcase_csr_read_16
> > > > > +#undef switchcase_csr_read_8
> > > > > +#undef switchcase_csr_read_4
> > > > > +#undef switchcase_csr_read_2
> > > > > +#undef switchcase_csr_read
> > > > > +}
> > > > > +
> > > > > +static u64 read_perf_counter(unsigned int counter)
> > > > > +{
> > > > > +       return csr_read_num(CSR_CYCLE + counter);
> > > > > +}
> > > > > +
> > > > > +static u64 read_timestamp(void)
> > > > > +{
> > > > > +       return csr_read_num(CSR_TIME);
> > > > > +}
> > > > > +
> > > > >  #else
> > > > >  static u64 read_perf_counter(unsigned int counter __maybe_unused) { return 0; }
> > > > >  static u64 read_timestamp(void) { return 0; }
> > > > > --
> > > > > 2.39.2
> > > > >
  
Ian Rogers July 31, 2023, 4:46 p.m. UTC | #6
On Mon, Jul 31, 2023 at 9:06 AM Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
>
> On Mon, Jul 31, 2023 at 5:10 PM Ian Rogers <irogers@google.com> wrote:
> >
> > On Mon, Jul 31, 2023 at 3:27 AM Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
> > >
> > > On Mon, Jul 31, 2023 at 12:15 PM Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
> > > >
> > > > Hi Ian,
> > > >
> > > > On Fri, Jul 28, 2023 at 7:53 PM Ian Rogers <irogers@google.com> wrote:
> > > > >
> > > > > On Thu, Jul 27, 2023 at 7:28 AM Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
> > > > > >
> > > > > > riscv now supports mmaping hardware counters so add what's needed to
> > > > > > take advantage of that in libperf.
> > > > > >
> > > > > > Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
> > > > > > Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
> > > > > > Reviewed-by: Atish Patra <atishp@rivosinc.com>
> > > > > > ---
> > > > > >  tools/lib/perf/mmap.c | 65 +++++++++++++++++++++++++++++++++++++++++++
> > > > > >  1 file changed, 65 insertions(+)
> > > > > >
> > > > > > diff --git a/tools/lib/perf/mmap.c b/tools/lib/perf/mmap.c
> > > > > > index 0d1634cedf44..378a163f0554 100644
> > > > > > --- a/tools/lib/perf/mmap.c
> > > > > > +++ b/tools/lib/perf/mmap.c
> > > > > > @@ -392,6 +392,71 @@ static u64 read_perf_counter(unsigned int counter)
> > > > > >
> > > > > >  static u64 read_timestamp(void) { return read_sysreg(cntvct_el0); }
> > > > > >
> > > > > > +#elif __riscv_xlen == 64
> > > > >
> > > > > This is something of an odd guard, perhaps:
> > > > > #elif defined(__riscv) && __riscv_xlen == 64
> > > > >
> > > > > That way it is more intention revealing that this is riscv code. Could
> > > > > you add a comment relating to the __riscv_xlen ?
> > > >
> > > > I guess Andrew answered that already.
> > > >
> >
> > Not sure. I still think it looks weird:
> > ...
> > #if defined(__i386__) || defined(__x86_64__)
> > ...
> > #elif defined(__aarch64__)
> > ...
> > #elif __riscv_xlen == 64
> > ...
> > #else
> > static u64 read_perf_counter(unsigned int counter __maybe_unused) { return 0; }
> > static u64 read_timestamp(void) { return 0; }
> > #endif
> >
> > The first two are clearly #ifdef-ing architecture specific assembly
> > code, under what conditions I get RISC-V code  ¯\(ツ)/¯ At least worth
> > a comment like "csrr is only available when you have xlens of 64
> > because ..."
>
> __riscv_xlen indicates riscv64, which is the only target of this
> patchset. But if you prefer, I don't mind adding back the
> defined(__riscv) if I re-spin a new version.

This kind of begs the question as to why there is no __riscv64 ifdef.
The issue with xlen is it isn't intention revealing so for regular
people trying to understand the code it would be nice to document it.

> >
> > > > >
> > > > > > +
> > > > > > +/* TODO: implement rv32 support */
> > > > > > +
> > > > > > +#define CSR_CYCLE      0xc00
> > > > > > +#define CSR_TIME       0xc01
> > > > > > +
> > > > > > +#define csr_read(csr)                                          \
> > > > > > +({                                                             \
> > > > > > +       register unsigned long __v;                             \
> > > > > > +               __asm__ __volatile__ ("csrr %0, " #csr          \
> > > > > > +                : "=r" (__v) :                                 \
> > > > > > +                : "memory");                                   \
> > > > >
> > > > > To avoid the macro pasting that could potentially go weird, could this be:
> > > > >
> > > > > __asm__ __volatile__ ("csrr %0, %1",
> > > > >   : "=r"(__v) /* outputs */
> > > > >   : "i"(csr) /* inputs */
> > > > >   : "memory" /* clobbers */)
> > >
> > > Forgot to answer this one: it compiles, but I have to admit that I
> > > don't understand the difference and if that's correct (all macros in
> > > arch/riscv/include/asm/csr.h use # to do this) and what benefits it
> > > brings. Can you elaborate more on things that could "go weird"?
> >
> > So rather than use an input constraint for the asm block you are using
> > the C preprocessor to paste in the csr argument. If csr is something
> > like "1" then it looks good and you'll get "csrr %0,1". If you pass
> > something like "1 << 31" then that will be pasted as "csrr %0, 1 <<
> > 31" and that starts to get weird in the context of being in the
> > assembler where it is unlikely the C operators work. Using the input
> > constraint avoids this, causes the C compiler to check the type of the
> > argument and you'll probably get more intelligible error messages as a
> > consequence.
> >
>
> Thanks. So if I'm not mistaken, in this exact context, given we only
> use csr_read() through the csr_read_num() function, it seems ok right?

So you've formed a cargo cult and the justification is not wanting to
stop a copy-paste chain from somewhere else. This code itself will be
copy-pasted and we go to some ends to encourage that by placing parts
of it in include/uapi/linux/perf_event.h. It seems better to catch
this issue early rather than propagate it.

> > >
> > > Thanks again,
> > >
> > > Alex
> > >
> > > > >
> > > > > Also, why is this clobbering memory? Worth adding a comment.
> > > >
> > > > No idea, I see that it is also done this way in
> > > > arch/riscv/include/asm/csr.h. @Atish Kumar Patra , @Palmer Dabbelt ?
> >
> > It would seem to make sense then not to have a memory constraint until
> > we know why we're doing it?
> >
>
> I have just had the answer internally (thanks to @Brendan Sweeney):
> csr modifications can alter how the memory is accessed (satp which
> changes the address space, sum which allows/disallows userspace
> access...), so we need the memory barrier to make sure the compiler
> does not reorder the memory accesses.

The conditions you mention shouldn't apply here though? Even if you
add a memory barrier for the compiler what is stopping the hardware
reordering loads and stores? If it absolutely has to be there then a
comment something like "There is a bug is riscv where the csrr
instruction can clobber memory breaking future reads and some how this
constraint fixes it by ... ".

Thanks,
Ian

> Thanks,
>
> Alex
>
> > Thanks,
> > Ian
> >
> > > >
> > > > Thanks for your comments!
> > > >
> > > > Alex
> > > >
> > > > >
> > > > > Thanks,
> > > > > Ian
> > > > >
> > > > > > +                __v;                                           \
> > > > > > +})
> > > > > > +
> > > > > > +static unsigned long csr_read_num(int csr_num)
> > > > > > +{
> > > > > > +#define switchcase_csr_read(__csr_num, __val)           {\
> > > > > > +       case __csr_num:                                 \
> > > > > > +               __val = csr_read(__csr_num);            \
> > > > > > +               break; }
> > > > > > +#define switchcase_csr_read_2(__csr_num, __val)         {\
> > > > > > +       switchcase_csr_read(__csr_num + 0, __val)        \
> > > > > > +       switchcase_csr_read(__csr_num + 1, __val)}
> > > > > > +#define switchcase_csr_read_4(__csr_num, __val)         {\
> > > > > > +       switchcase_csr_read_2(__csr_num + 0, __val)      \
> > > > > > +       switchcase_csr_read_2(__csr_num + 2, __val)}
> > > > > > +#define switchcase_csr_read_8(__csr_num, __val)         {\
> > > > > > +       switchcase_csr_read_4(__csr_num + 0, __val)      \
> > > > > > +       switchcase_csr_read_4(__csr_num + 4, __val)}
> > > > > > +#define switchcase_csr_read_16(__csr_num, __val)        {\
> > > > > > +       switchcase_csr_read_8(__csr_num + 0, __val)      \
> > > > > > +       switchcase_csr_read_8(__csr_num + 8, __val)}
> > > > > > +#define switchcase_csr_read_32(__csr_num, __val)        {\
> > > > > > +       switchcase_csr_read_16(__csr_num + 0, __val)     \
> > > > > > +       switchcase_csr_read_16(__csr_num + 16, __val)}
> > > > > > +
> > > > > > +       unsigned long ret = 0;
> > > > > > +
> > > > > > +       switch (csr_num) {
> > > > > > +       switchcase_csr_read_32(CSR_CYCLE, ret)
> > > > > > +       default:
> > > > > > +               break;
> > > > > > +       }
> > > > > > +
> > > > > > +       return ret;
> > > > > > +#undef switchcase_csr_read_32
> > > > > > +#undef switchcase_csr_read_16
> > > > > > +#undef switchcase_csr_read_8
> > > > > > +#undef switchcase_csr_read_4
> > > > > > +#undef switchcase_csr_read_2
> > > > > > +#undef switchcase_csr_read
> > > > > > +}
> > > > > > +
> > > > > > +static u64 read_perf_counter(unsigned int counter)
> > > > > > +{
> > > > > > +       return csr_read_num(CSR_CYCLE + counter);
> > > > > > +}
> > > > > > +
> > > > > > +static u64 read_timestamp(void)
> > > > > > +{
> > > > > > +       return csr_read_num(CSR_TIME);
> > > > > > +}
> > > > > > +
> > > > > >  #else
> > > > > >  static u64 read_perf_counter(unsigned int counter __maybe_unused) { return 0; }
> > > > > >  static u64 read_timestamp(void) { return 0; }
> > > > > > --
> > > > > > 2.39.2
> > > > > >
  
Jessica Clarke July 31, 2023, 7:37 p.m. UTC | #7
On 31 Jul 2023, at 17:06, Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
> 
> On Mon, Jul 31, 2023 at 5:10 PM Ian Rogers <irogers@google.com> wrote:
>> 
>> On Mon, Jul 31, 2023 at 3:27 AM Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
>>> 
>>> On Mon, Jul 31, 2023 at 12:15 PM Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
>>>> 
>>>> Hi Ian,
>>>> 
>>>> On Fri, Jul 28, 2023 at 7:53 PM Ian Rogers <irogers@google.com> wrote:
>>>>> 
>>>>> On Thu, Jul 27, 2023 at 7:28 AM Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
>>>>>> 
>>>>>> riscv now supports mmaping hardware counters so add what's needed to
>>>>>> take advantage of that in libperf.
>>>>>> 
>>>>>> Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
>>>>>> Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
>>>>>> Reviewed-by: Atish Patra <atishp@rivosinc.com>
>>>>>> ---
>>>>>> tools/lib/perf/mmap.c | 65 +++++++++++++++++++++++++++++++++++++++++++
>>>>>> 1 file changed, 65 insertions(+)
>>>>>> 
>>>>>> diff --git a/tools/lib/perf/mmap.c b/tools/lib/perf/mmap.c
>>>>>> index 0d1634cedf44..378a163f0554 100644
>>>>>> --- a/tools/lib/perf/mmap.c
>>>>>> +++ b/tools/lib/perf/mmap.c
>>>>>> @@ -392,6 +392,71 @@ static u64 read_perf_counter(unsigned int counter)
>>>>>> 
>>>>>> static u64 read_timestamp(void) { return read_sysreg(cntvct_el0); }
>>>>>> 
>>>>>> +#elif __riscv_xlen == 64
>>>>> 
>>>>> This is something of an odd guard, perhaps:
>>>>> #elif defined(__riscv) && __riscv_xlen == 64
>>>>> 
>>>>> That way it is more intention revealing that this is riscv code. Could
>>>>> you add a comment relating to the __riscv_xlen ?
>>>> 
>>>> I guess Andrew answered that already.
>>>> 
>> 
>> Not sure. I still think it looks weird:
>> ...
>> #if defined(__i386__) || defined(__x86_64__)
>> ...
>> #elif defined(__aarch64__)
>> ...
>> #elif __riscv_xlen == 64
>> ...
>> #else
>> static u64 read_perf_counter(unsigned int counter __maybe_unused) { return 0; }
>> static u64 read_timestamp(void) { return 0; }
>> #endif
>> 
>> The first two are clearly #ifdef-ing architecture specific assembly
>> code, under what conditions I get RISC-V code  ¯\(ツ)/¯ At least worth
>> a comment like "csrr is only available when you have xlens of 64
>> because ..."
> 
> __riscv_xlen indicates riscv64, which is the only target of this
> patchset. But if you prefer, I don't mind adding back the
> defined(__riscv) if I re-spin a new version.

I mean, -Wundef is a thing that will scream at you if you evaluate a
macro that’s undefined and get an implicit 0, and parts of the linux
build seem to enable it. So I would strongly recommend against
(ab)using that feature of the preprocessor, especially when it aligns
with greater clarity.

Jess
  
Matthew Wilcox July 31, 2023, 7:47 p.m. UTC | #8
On Mon, Jul 31, 2023 at 09:46:07AM -0700, Ian Rogers wrote:
> On Mon, Jul 31, 2023 at 9:06 AM Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
> > I have just had the answer internally (thanks to @Brendan Sweeney):
> > csr modifications can alter how the memory is accessed (satp which
> > changes the address space, sum which allows/disallows userspace
> > access...), so we need the memory barrier to make sure the compiler
> > does not reorder the memory accesses.
> 
> The conditions you mention shouldn't apply here though? Even if you
> add a memory barrier for the compiler what is stopping the hardware
> reordering loads and stores? If it absolutely has to be there then a
> comment something like "There is a bug is riscv where the csrr
> instruction can clobber memory breaking future reads and some how this
> constraint fixes it by ... ".

If the hardware doesn't know that writing to a csr can change how memory
accesses are done and reorders memory accesses around that csr write,
you've got a really broken piece of hardware on your hands ...

I know nothing about risc-v, and maybe the definition says that you need
to put a memory barrier before and/or after it in the instruction stream,
but on all hardware I'm familiar with, writing to a CSR is an implicitly
serialising operation.
  
Jessica Clarke July 31, 2023, 9:07 p.m. UTC | #9
On 31 Jul 2023, at 20:47, Matthew Wilcox <willy@infradead.org> wrote:
> 
> On Mon, Jul 31, 2023 at 09:46:07AM -0700, Ian Rogers wrote:
>> On Mon, Jul 31, 2023 at 9:06 AM Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
>>> I have just had the answer internally (thanks to @Brendan Sweeney):
>>> csr modifications can alter how the memory is accessed (satp which
>>> changes the address space, sum which allows/disallows userspace
>>> access...), so we need the memory barrier to make sure the compiler
>>> does not reorder the memory accesses.
>> 
>> The conditions you mention shouldn't apply here though? Even if you
>> add a memory barrier for the compiler what is stopping the hardware
>> reordering loads and stores? If it absolutely has to be there then a
>> comment something like "There is a bug is riscv where the csrr
>> instruction can clobber memory breaking future reads and some how this
>> constraint fixes it by ... ".
> 
> If the hardware doesn't know that writing to a csr can change how memory
> accesses are done and reorders memory accesses around that csr write,
> you've got a really broken piece of hardware on your hands ...
> 
> I know nothing about risc-v, and maybe the definition says that you need
> to put a memory barrier before and/or after it in the instruction stream,
> but on all hardware I'm familiar with, writing to a CSR is an implicitly
> serialising operation.

satp has some special rules due to the interaction with TLBs. Enabling
/ disabling translation by toggling between Bare and non-Bare modes
doesn’t require any kind of fence, nor does changing the ASID (if not
recycling), but any other changes to satp (changing between
Sv39/Sv48/Sv57 or changing the page table base address) do require
fences (not really sure why to be honest, maybe something about having
separate kernel and user page tables?..). §5.2.1 Supervisor
Memory-Management Fence Instruction of the privileged spec (the one I’m
looking at is version 20211203 but dated October 4, 2022) details this.

Jess
  
Alexandre Ghiti Aug. 1, 2023, 7:09 a.m. UTC | #10
On Mon, Jul 31, 2023 at 6:46 PM Ian Rogers <irogers@google.com> wrote:
>
> On Mon, Jul 31, 2023 at 9:06 AM Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
> >
> > On Mon, Jul 31, 2023 at 5:10 PM Ian Rogers <irogers@google.com> wrote:
> > >
> > > On Mon, Jul 31, 2023 at 3:27 AM Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
> > > >
> > > > On Mon, Jul 31, 2023 at 12:15 PM Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
> > > > >
> > > > > Hi Ian,
> > > > >
> > > > > On Fri, Jul 28, 2023 at 7:53 PM Ian Rogers <irogers@google.com> wrote:
> > > > > >
> > > > > > On Thu, Jul 27, 2023 at 7:28 AM Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
> > > > > > >
> > > > > > > riscv now supports mmaping hardware counters so add what's needed to
> > > > > > > take advantage of that in libperf.
> > > > > > >
> > > > > > > Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
> > > > > > > Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
> > > > > > > Reviewed-by: Atish Patra <atishp@rivosinc.com>
> > > > > > > ---
> > > > > > >  tools/lib/perf/mmap.c | 65 +++++++++++++++++++++++++++++++++++++++++++
> > > > > > >  1 file changed, 65 insertions(+)
> > > > > > >
> > > > > > > diff --git a/tools/lib/perf/mmap.c b/tools/lib/perf/mmap.c
> > > > > > > index 0d1634cedf44..378a163f0554 100644
> > > > > > > --- a/tools/lib/perf/mmap.c
> > > > > > > +++ b/tools/lib/perf/mmap.c
> > > > > > > @@ -392,6 +392,71 @@ static u64 read_perf_counter(unsigned int counter)
> > > > > > >
> > > > > > >  static u64 read_timestamp(void) { return read_sysreg(cntvct_el0); }
> > > > > > >
> > > > > > > +#elif __riscv_xlen == 64
> > > > > >
> > > > > > This is something of an odd guard, perhaps:
> > > > > > #elif defined(__riscv) && __riscv_xlen == 64
> > > > > >
> > > > > > That way it is more intention revealing that this is riscv code. Could
> > > > > > you add a comment relating to the __riscv_xlen ?
> > > > >
> > > > > I guess Andrew answered that already.
> > > > >
> > >
> > > Not sure. I still think it looks weird:
> > > ...
> > > #if defined(__i386__) || defined(__x86_64__)
> > > ...
> > > #elif defined(__aarch64__)
> > > ...
> > > #elif __riscv_xlen == 64
> > > ...
> > > #else
> > > static u64 read_perf_counter(unsigned int counter __maybe_unused) { return 0; }
> > > static u64 read_timestamp(void) { return 0; }
> > > #endif
> > >
> > > The first two are clearly #ifdef-ing architecture specific assembly
> > > code, under what conditions I get RISC-V code  ¯\(ツ)/¯ At least worth
> > > a comment like "csrr is only available when you have xlens of 64
> > > because ..."
> >
> > __riscv_xlen indicates riscv64, which is the only target of this
> > patchset. But if you prefer, I don't mind adding back the
> > defined(__riscv) if I re-spin a new version.
>
> This kind of begs the question as to why there is no __riscv64 ifdef.
> The issue with xlen is it isn't intention revealing so for regular
> people trying to understand the code it would be nice to document it.

I understand, I'll add the defined(__riscv) and a comment to explain
the __riscv_xlen.

>
> > >
> > > > > >
> > > > > > > +
> > > > > > > +/* TODO: implement rv32 support */
> > > > > > > +
> > > > > > > +#define CSR_CYCLE      0xc00
> > > > > > > +#define CSR_TIME       0xc01
> > > > > > > +
> > > > > > > +#define csr_read(csr)                                          \
> > > > > > > +({                                                             \
> > > > > > > +       register unsigned long __v;                             \
> > > > > > > +               __asm__ __volatile__ ("csrr %0, " #csr          \
> > > > > > > +                : "=r" (__v) :                                 \
> > > > > > > +                : "memory");                                   \
> > > > > >
> > > > > > To avoid the macro pasting that could potentially go weird, could this be:
> > > > > >
> > > > > > __asm__ __volatile__ ("csrr %0, %1",
> > > > > >   : "=r"(__v) /* outputs */
> > > > > >   : "i"(csr) /* inputs */
> > > > > >   : "memory" /* clobbers */)
> > > >
> > > > Forgot to answer this one: it compiles, but I have to admit that I
> > > > don't understand the difference and if that's correct (all macros in
> > > > arch/riscv/include/asm/csr.h use # to do this) and what benefits it
> > > > brings. Can you elaborate more on things that could "go weird"?
> > >
> > > So rather than use an input constraint for the asm block you are using
> > > the C preprocessor to paste in the csr argument. If csr is something
> > > like "1" then it looks good and you'll get "csrr %0,1". If you pass
> > > something like "1 << 31" then that will be pasted as "csrr %0, 1 <<
> > > 31" and that starts to get weird in the context of being in the
> > > assembler where it is unlikely the C operators work. Using the input
> > > constraint avoids this, causes the C compiler to check the type of the
> > > argument and you'll probably get more intelligible error messages as a
> > > consequence.
> > >
> >
> > Thanks. So if I'm not mistaken, in this exact context, given we only
> > use csr_read() through the csr_read_num() function, it seems ok right?
>
> So you've formed a cargo cult and the justification is not wanting to
> stop a copy-paste chain from somewhere else. This code itself will be
> copy-pasted and we go to some ends to encourage that by placing parts
> of it in include/uapi/linux/perf_event.h. It seems better to catch
> this issue early rather than propagate it.

OK, nothing to argue here, I'll use your first proposition in the next version.

>
> > > >
> > > > Thanks again,
> > > >
> > > > Alex
> > > >
> > > > > >
> > > > > > Also, why is this clobbering memory? Worth adding a comment.
> > > > >
> > > > > No idea, I see that it is also done this way in
> > > > > arch/riscv/include/asm/csr.h. @Atish Kumar Patra , @Palmer Dabbelt ?
> > >
> > > It would seem to make sense then not to have a memory constraint until
> > > we know why we're doing it?
> > >
> >
> > I have just had the answer internally (thanks to @Brendan Sweeney):
> > csr modifications can alter how the memory is accessed (satp which
> > changes the address space, sum which allows/disallows userspace
> > access...), so we need the memory barrier to make sure the compiler
> > does not reorder the memory accesses.
>
> The conditions you mention shouldn't apply here though? Even if you
> add a memory barrier for the compiler what is stopping the hardware
> reordering loads and stores? If it absolutely has to be there then a
> comment something like "There is a bug is riscv where the csrr
> instruction can clobber memory breaking future reads and some how this
> constraint fixes it by ... ".

You're right, it does not apply here, so I can remove this memory
barrier. Regarding the hardware that could speculate across a csr
instruction, that's dealt with in the riscv spec:

"In particular, a CSR access is performed after the execution of any
prior instructions in program order whose behavior modifies or is
modified by the CSR state and before the execution of any subsequent
instructions in program order whose behavior modifies or is modified
by the CSR state"

Thanks for your comments,

Alex

>
> Thanks,
> Ian
>
> > Thanks,
> >
> > Alex
> >
> > > Thanks,
> > > Ian
> > >
> > > > >
> > > > > Thanks for your comments!
> > > > >
> > > > > Alex
> > > > >
> > > > > >
> > > > > > Thanks,
> > > > > > Ian
> > > > > >
> > > > > > > +                __v;                                           \
> > > > > > > +})
> > > > > > > +
> > > > > > > +static unsigned long csr_read_num(int csr_num)
> > > > > > > +{
> > > > > > > +#define switchcase_csr_read(__csr_num, __val)           {\
> > > > > > > +       case __csr_num:                                 \
> > > > > > > +               __val = csr_read(__csr_num);            \
> > > > > > > +               break; }
> > > > > > > +#define switchcase_csr_read_2(__csr_num, __val)         {\
> > > > > > > +       switchcase_csr_read(__csr_num + 0, __val)        \
> > > > > > > +       switchcase_csr_read(__csr_num + 1, __val)}
> > > > > > > +#define switchcase_csr_read_4(__csr_num, __val)         {\
> > > > > > > +       switchcase_csr_read_2(__csr_num + 0, __val)      \
> > > > > > > +       switchcase_csr_read_2(__csr_num + 2, __val)}
> > > > > > > +#define switchcase_csr_read_8(__csr_num, __val)         {\
> > > > > > > +       switchcase_csr_read_4(__csr_num + 0, __val)      \
> > > > > > > +       switchcase_csr_read_4(__csr_num + 4, __val)}
> > > > > > > +#define switchcase_csr_read_16(__csr_num, __val)        {\
> > > > > > > +       switchcase_csr_read_8(__csr_num + 0, __val)      \
> > > > > > > +       switchcase_csr_read_8(__csr_num + 8, __val)}
> > > > > > > +#define switchcase_csr_read_32(__csr_num, __val)        {\
> > > > > > > +       switchcase_csr_read_16(__csr_num + 0, __val)     \
> > > > > > > +       switchcase_csr_read_16(__csr_num + 16, __val)}
> > > > > > > +
> > > > > > > +       unsigned long ret = 0;
> > > > > > > +
> > > > > > > +       switch (csr_num) {
> > > > > > > +       switchcase_csr_read_32(CSR_CYCLE, ret)
> > > > > > > +       default:
> > > > > > > +               break;
> > > > > > > +       }
> > > > > > > +
> > > > > > > +       return ret;
> > > > > > > +#undef switchcase_csr_read_32
> > > > > > > +#undef switchcase_csr_read_16
> > > > > > > +#undef switchcase_csr_read_8
> > > > > > > +#undef switchcase_csr_read_4
> > > > > > > +#undef switchcase_csr_read_2
> > > > > > > +#undef switchcase_csr_read
> > > > > > > +}
> > > > > > > +
> > > > > > > +static u64 read_perf_counter(unsigned int counter)
> > > > > > > +{
> > > > > > > +       return csr_read_num(CSR_CYCLE + counter);
> > > > > > > +}
> > > > > > > +
> > > > > > > +static u64 read_timestamp(void)
> > > > > > > +{
> > > > > > > +       return csr_read_num(CSR_TIME);
> > > > > > > +}
> > > > > > > +
> > > > > > >  #else
> > > > > > >  static u64 read_perf_counter(unsigned int counter __maybe_unused) { return 0; }
> > > > > > >  static u64 read_timestamp(void) { return 0; }
> > > > > > > --
> > > > > > > 2.39.2
> > > > > > >
  

Patch

diff --git a/tools/lib/perf/mmap.c b/tools/lib/perf/mmap.c
index 0d1634cedf44..378a163f0554 100644
--- a/tools/lib/perf/mmap.c
+++ b/tools/lib/perf/mmap.c
@@ -392,6 +392,71 @@  static u64 read_perf_counter(unsigned int counter)
 
 static u64 read_timestamp(void) { return read_sysreg(cntvct_el0); }
 
+#elif __riscv_xlen == 64
+
+/* TODO: implement rv32 support */
+
+#define CSR_CYCLE	0xc00
+#define CSR_TIME	0xc01
+
+#define csr_read(csr)						\
+({								\
+	register unsigned long __v;				\
+		__asm__ __volatile__ ("csrr %0, " #csr		\
+		 : "=r" (__v) :					\
+		 : "memory");					\
+		 __v;						\
+})
+
+static unsigned long csr_read_num(int csr_num)
+{
+#define switchcase_csr_read(__csr_num, __val)           {\
+	case __csr_num:                                 \
+		__val = csr_read(__csr_num);            \
+		break; }
+#define switchcase_csr_read_2(__csr_num, __val)         {\
+	switchcase_csr_read(__csr_num + 0, __val)        \
+	switchcase_csr_read(__csr_num + 1, __val)}
+#define switchcase_csr_read_4(__csr_num, __val)         {\
+	switchcase_csr_read_2(__csr_num + 0, __val)      \
+	switchcase_csr_read_2(__csr_num + 2, __val)}
+#define switchcase_csr_read_8(__csr_num, __val)         {\
+	switchcase_csr_read_4(__csr_num + 0, __val)      \
+	switchcase_csr_read_4(__csr_num + 4, __val)}
+#define switchcase_csr_read_16(__csr_num, __val)        {\
+	switchcase_csr_read_8(__csr_num + 0, __val)      \
+	switchcase_csr_read_8(__csr_num + 8, __val)}
+#define switchcase_csr_read_32(__csr_num, __val)        {\
+	switchcase_csr_read_16(__csr_num + 0, __val)     \
+	switchcase_csr_read_16(__csr_num + 16, __val)}
+
+	unsigned long ret = 0;
+
+	switch (csr_num) {
+	switchcase_csr_read_32(CSR_CYCLE, ret)
+	default:
+		break;
+	}
+
+	return ret;
+#undef switchcase_csr_read_32
+#undef switchcase_csr_read_16
+#undef switchcase_csr_read_8
+#undef switchcase_csr_read_4
+#undef switchcase_csr_read_2
+#undef switchcase_csr_read
+}
+
+static u64 read_perf_counter(unsigned int counter)
+{
+	return csr_read_num(CSR_CYCLE + counter);
+}
+
+static u64 read_timestamp(void)
+{
+	return csr_read_num(CSR_TIME);
+}
+
 #else
 static u64 read_perf_counter(unsigned int counter __maybe_unused) { return 0; }
 static u64 read_timestamp(void) { return 0; }