[V11,05/10] arm64/perf: Add branch stack support in ARMV8 PMU

Message ID 20230531040428.501523-6-anshuman.khandual@arm.com
State New
Series: arm64/perf: Enable branch stack sampling

Commit Message

Anshuman Khandual May 31, 2023, 4:04 a.m. UTC
  This enables support for branch stack sampling events in the ARMV8 PMU by checking
has_branch_stack() on the event inside 'struct arm_pmu' callbacks, although
these branch stack helpers armv8pmu_branch_XXXXX() are just dummy functions
for now. While here, this also defines arm_pmu's sched_task() callback with
armv8pmu_sched_task(), which resets the branch record buffer on a sched_in.

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Tested-by: James Clark <james.clark@arm.com>
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
 arch/arm64/include/asm/perf_event.h | 33 +++++++++++++
 drivers/perf/arm_pmuv3.c            | 76 ++++++++++++++++++++---------
 2 files changed, 86 insertions(+), 23 deletions(-)
  

Comments

Namhyung Kim June 2, 2023, 2:33 a.m. UTC | #1
On Tue, May 30, 2023 at 9:27 PM Anshuman Khandual
<anshuman.khandual@arm.com> wrote:
>
> This enables support for branch stack sampling event in ARMV8 PMU, checking
> has_branch_stack() on the event inside 'struct arm_pmu' callbacks. Although
> these branch stack helpers armv8pmu_branch_XXXXX() are just dummy functions
> for now. While here, this also defines arm_pmu's sched_task() callback with
> armv8pmu_sched_task(), which resets the branch record buffer on a sched_in.
>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: linux-arm-kernel@lists.infradead.org
> Cc: linux-kernel@vger.kernel.org
> Tested-by: James Clark <james.clark@arm.com>
> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> ---
>  arch/arm64/include/asm/perf_event.h | 33 +++++++++++++
>  drivers/perf/arm_pmuv3.c            | 76 ++++++++++++++++++++---------
>  2 files changed, 86 insertions(+), 23 deletions(-)
>
> diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h
> index eb7071c9eb34..7548813783ba 100644
> --- a/arch/arm64/include/asm/perf_event.h
> +++ b/arch/arm64/include/asm/perf_event.h
> @@ -24,4 +24,37 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs);
>         (regs)->pstate = PSR_MODE_EL1h; \
>  }
>
> +struct pmu_hw_events;
> +struct arm_pmu;
> +struct perf_event;
> +
> +#ifdef CONFIG_PERF_EVENTS
> +static inline bool has_branch_stack(struct perf_event *event);
> +
> +static inline void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
> +{
> +       WARN_ON_ONCE(!has_branch_stack(event));
> +}
> +
> +static inline bool armv8pmu_branch_valid(struct perf_event *event)
> +{
> +       WARN_ON_ONCE(!has_branch_stack(event));
> +       return false;
> +}
> +
> +static inline void armv8pmu_branch_enable(struct perf_event *event)
> +{
> +       WARN_ON_ONCE(!has_branch_stack(event));
> +}
> +
> +static inline void armv8pmu_branch_disable(struct perf_event *event)
> +{
> +       WARN_ON_ONCE(!has_branch_stack(event));
> +}
> +
> +static inline void armv8pmu_branch_probe(struct arm_pmu *arm_pmu) { }
> +static inline void armv8pmu_branch_reset(void) { }
> +static inline int armv8pmu_private_alloc(struct arm_pmu *arm_pmu) { return 0; }
> +static inline void armv8pmu_private_free(struct arm_pmu *arm_pmu) { }
> +#endif
>  #endif
> diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
> index c98e4039386d..86d803ff1ae3 100644
> --- a/drivers/perf/arm_pmuv3.c
> +++ b/drivers/perf/arm_pmuv3.c
> @@ -705,38 +705,21 @@ static void armv8pmu_enable_event(struct perf_event *event)
>          * Enable counter and interrupt, and set the counter to count
>          * the event that we're interested in.
>          */
> -
> -       /*
> -        * Disable counter
> -        */
>         armv8pmu_disable_event_counter(event);
> -
> -       /*
> -        * Set event.
> -        */
>         armv8pmu_write_event_type(event);
> -
> -       /*
> -        * Enable interrupt for this counter
> -        */
>         armv8pmu_enable_event_irq(event);
> -
> -       /*
> -        * Enable counter
> -        */
>         armv8pmu_enable_event_counter(event);
> +
> +       if (has_branch_stack(event))
> +               armv8pmu_branch_enable(event);
>  }
>
>  static void armv8pmu_disable_event(struct perf_event *event)
>  {
> -       /*
> -        * Disable counter
> -        */
> -       armv8pmu_disable_event_counter(event);
> +       if (has_branch_stack(event))
> +               armv8pmu_branch_disable(event);
>
> -       /*
> -        * Disable interrupt for this counter
> -        */
> +       armv8pmu_disable_event_counter(event);
>         armv8pmu_disable_event_irq(event);
>  }
>
> @@ -814,6 +797,11 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>                 if (!armpmu_event_set_period(event))
>                         continue;
>
> +               if (has_branch_stack(event) && !WARN_ON(!cpuc->branches)) {
> +                       armv8pmu_branch_read(cpuc, event);
> +                       perf_sample_save_brstack(&data, event, &cpuc->branches->branch_stack);
> +               }
> +
>                 /*
>                  * Perf event overflow will queue the processing of the event as
>                  * an irq_work which will be taken care of in the handling of
> @@ -912,6 +900,14 @@ static int armv8pmu_user_event_idx(struct perf_event *event)
>         return event->hw.idx;
>  }
>
> +static void armv8pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
> +{
> +       struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
> +
> +       if (sched_in && arm_pmu_branch_stack_supported(armpmu))
> +               armv8pmu_branch_reset();
> +}
> +
>  /*
>   * Add an event filter to a given event.
>   */
> @@ -982,6 +978,9 @@ static void armv8pmu_reset(void *info)
>                 pmcr |= ARMV8_PMU_PMCR_LP;
>
>         armv8pmu_pmcr_write(pmcr);
> +
> +       if (arm_pmu_branch_stack_supported(cpu_pmu))
> +               armv8pmu_branch_reset();
>  }
>
>  static int __armv8_pmuv3_map_event_id(struct arm_pmu *armpmu,
> @@ -1019,6 +1018,9 @@ static int __armv8_pmuv3_map_event(struct perf_event *event,
>
>         hw_event_id = __armv8_pmuv3_map_event_id(armpmu, event);
>
> +       if (has_branch_stack(event) && !armv8pmu_branch_valid(event))
> +               return -EOPNOTSUPP;
> +
>         /*
>          * CHAIN events only work when paired with an adjacent counter, and it
>          * never makes sense for a user to open one in isolation, as they'll be
> @@ -1135,6 +1137,21 @@ static void __armv8pmu_probe_pmu(void *info)
>                 cpu_pmu->reg_pmmir = read_pmmir();
>         else
>                 cpu_pmu->reg_pmmir = 0;
> +       armv8pmu_branch_probe(cpu_pmu);
> +}
> +
> +static int branch_records_alloc(struct arm_pmu *armpmu)
> +{
> +       struct pmu_hw_events *events;
> +       int cpu;
> +
> +       for_each_possible_cpu(cpu) {
> +               events = per_cpu_ptr(armpmu->hw_events, cpu);
> +               events->branches = kzalloc(sizeof(struct branch_records), GFP_KERNEL);
> +               if (!events->branches)
> +                       return -ENOMEM;
> +       }
> +       return 0;
>  }
>
>  static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
> @@ -1145,12 +1162,24 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>         };
>         int ret;
>
> +       ret = armv8pmu_private_alloc(cpu_pmu);
> +       if (ret)
> +               return ret;

Wouldn't it be better to move it under the if statement below
if it's only needed for branch stack?

> +
>         ret = smp_call_function_any(&cpu_pmu->supported_cpus,
>                                     __armv8pmu_probe_pmu,
>                                     &probe, 1);
>         if (ret)
>                 return ret;

Otherwise you might need to free it here.

>
> +       if (arm_pmu_branch_stack_supported(cpu_pmu)) {
> +               ret = branch_records_alloc(cpu_pmu);
> +               if (ret)
> +                       return ret;

And here too.

Thanks,
Namhyung


> +       } else {
> +               armv8pmu_private_free(cpu_pmu);
> +       }
> +
>         return probe.present ? 0 : -ENODEV;
>  }
>
> @@ -1214,6 +1243,7 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
>         cpu_pmu->set_event_filter       = armv8pmu_set_event_filter;
>
>         cpu_pmu->pmu.event_idx          = armv8pmu_user_event_idx;
> +       cpu_pmu->sched_task             = armv8pmu_sched_task;
>
>         cpu_pmu->name                   = name;
>         cpu_pmu->map_event              = map_event;
> --
> 2.25.1
>
  
Anshuman Khandual June 5, 2023, 2:43 a.m. UTC | #2
Hello Namhyung,

On 6/2/23 08:03, Namhyung Kim wrote:
> On Tue, May 30, 2023 at 9:27 PM Anshuman Khandual
> <anshuman.khandual@arm.com> wrote:
>> This enables support for branch stack sampling event in ARMV8 PMU, checking
>> has_branch_stack() on the event inside 'struct arm_pmu' callbacks. Although
>> these branch stack helpers armv8pmu_branch_XXXXX() are just dummy functions
>> for now. While here, this also defines arm_pmu's sched_task() callback with
>> armv8pmu_sched_task(), which resets the branch record buffer on a sched_in.
>>
>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>> Cc: Will Deacon <will@kernel.org>
>> Cc: Mark Rutland <mark.rutland@arm.com>
>> Cc: linux-arm-kernel@lists.infradead.org
>> Cc: linux-kernel@vger.kernel.org
>> Tested-by: James Clark <james.clark@arm.com>
>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>> ---
>>  arch/arm64/include/asm/perf_event.h | 33 +++++++++++++
>>  drivers/perf/arm_pmuv3.c            | 76 ++++++++++++++++++++---------
>>  2 files changed, 86 insertions(+), 23 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h
>> index eb7071c9eb34..7548813783ba 100644
>> --- a/arch/arm64/include/asm/perf_event.h
>> +++ b/arch/arm64/include/asm/perf_event.h
>> @@ -24,4 +24,37 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs);
>>         (regs)->pstate = PSR_MODE_EL1h; \
>>  }
>>
>> +struct pmu_hw_events;
>> +struct arm_pmu;
>> +struct perf_event;
>> +
>> +#ifdef CONFIG_PERF_EVENTS
>> +static inline bool has_branch_stack(struct perf_event *event);
>> +
>> +static inline void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
>> +{
>> +       WARN_ON_ONCE(!has_branch_stack(event));
>> +}
>> +
>> +static inline bool armv8pmu_branch_valid(struct perf_event *event)
>> +{
>> +       WARN_ON_ONCE(!has_branch_stack(event));
>> +       return false;
>> +}
>> +
>> +static inline void armv8pmu_branch_enable(struct perf_event *event)
>> +{
>> +       WARN_ON_ONCE(!has_branch_stack(event));
>> +}
>> +
>> +static inline void armv8pmu_branch_disable(struct perf_event *event)
>> +{
>> +       WARN_ON_ONCE(!has_branch_stack(event));
>> +}
>> +
>> +static inline void armv8pmu_branch_probe(struct arm_pmu *arm_pmu) { }
>> +static inline void armv8pmu_branch_reset(void) { }
>> +static inline int armv8pmu_private_alloc(struct arm_pmu *arm_pmu) { return 0; }
>> +static inline void armv8pmu_private_free(struct arm_pmu *arm_pmu) { }
>> +#endif
>>  #endif
>> diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
>> index c98e4039386d..86d803ff1ae3 100644
>> --- a/drivers/perf/arm_pmuv3.c
>> +++ b/drivers/perf/arm_pmuv3.c
>> @@ -705,38 +705,21 @@ static void armv8pmu_enable_event(struct perf_event *event)
>>          * Enable counter and interrupt, and set the counter to count
>>          * the event that we're interested in.
>>          */
>> -
>> -       /*
>> -        * Disable counter
>> -        */
>>         armv8pmu_disable_event_counter(event);
>> -
>> -       /*
>> -        * Set event.
>> -        */
>>         armv8pmu_write_event_type(event);
>> -
>> -       /*
>> -        * Enable interrupt for this counter
>> -        */
>>         armv8pmu_enable_event_irq(event);
>> -
>> -       /*
>> -        * Enable counter
>> -        */
>>         armv8pmu_enable_event_counter(event);
>> +
>> +       if (has_branch_stack(event))
>> +               armv8pmu_branch_enable(event);
>>  }
>>
>>  static void armv8pmu_disable_event(struct perf_event *event)
>>  {
>> -       /*
>> -        * Disable counter
>> -        */
>> -       armv8pmu_disable_event_counter(event);
>> +       if (has_branch_stack(event))
>> +               armv8pmu_branch_disable(event);
>>
>> -       /*
>> -        * Disable interrupt for this counter
>> -        */
>> +       armv8pmu_disable_event_counter(event);
>>         armv8pmu_disable_event_irq(event);
>>  }
>>
>> @@ -814,6 +797,11 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>                 if (!armpmu_event_set_period(event))
>>                         continue;
>>
>> +               if (has_branch_stack(event) && !WARN_ON(!cpuc->branches)) {
>> +                       armv8pmu_branch_read(cpuc, event);
>> +                       perf_sample_save_brstack(&data, event, &cpuc->branches->branch_stack);
>> +               }
>> +
>>                 /*
>>                  * Perf event overflow will queue the processing of the event as
>>                  * an irq_work which will be taken care of in the handling of
>> @@ -912,6 +900,14 @@ static int armv8pmu_user_event_idx(struct perf_event *event)
>>         return event->hw.idx;
>>  }
>>
>> +static void armv8pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
>> +{
>> +       struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
>> +
>> +       if (sched_in && arm_pmu_branch_stack_supported(armpmu))
>> +               armv8pmu_branch_reset();
>> +}
>> +
>>  /*
>>   * Add an event filter to a given event.
>>   */
>> @@ -982,6 +978,9 @@ static void armv8pmu_reset(void *info)
>>                 pmcr |= ARMV8_PMU_PMCR_LP;
>>
>>         armv8pmu_pmcr_write(pmcr);
>> +
>> +       if (arm_pmu_branch_stack_supported(cpu_pmu))
>> +               armv8pmu_branch_reset();
>>  }
>>
>>  static int __armv8_pmuv3_map_event_id(struct arm_pmu *armpmu,
>> @@ -1019,6 +1018,9 @@ static int __armv8_pmuv3_map_event(struct perf_event *event,
>>
>>         hw_event_id = __armv8_pmuv3_map_event_id(armpmu, event);
>>
>> +       if (has_branch_stack(event) && !armv8pmu_branch_valid(event))
>> +               return -EOPNOTSUPP;
>> +
>>         /*
>>          * CHAIN events only work when paired with an adjacent counter, and it
>>          * never makes sense for a user to open one in isolation, as they'll be
>> @@ -1135,6 +1137,21 @@ static void __armv8pmu_probe_pmu(void *info)
>>                 cpu_pmu->reg_pmmir = read_pmmir();
>>         else
>>                 cpu_pmu->reg_pmmir = 0;
>> +       armv8pmu_branch_probe(cpu_pmu);
>> +}
>> +
>> +static int branch_records_alloc(struct arm_pmu *armpmu)
>> +{
>> +       struct pmu_hw_events *events;
>> +       int cpu;
>> +
>> +       for_each_possible_cpu(cpu) {
>> +               events = per_cpu_ptr(armpmu->hw_events, cpu);
>> +               events->branches = kzalloc(sizeof(struct branch_records), GFP_KERNEL);
>> +               if (!events->branches)
>> +                       return -ENOMEM;
>> +       }
>> +       return 0;
>>  }
>>
>>  static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>> @@ -1145,12 +1162,24 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>>         };
>>         int ret;
>>
>> +       ret = armv8pmu_private_alloc(cpu_pmu);
>> +       if (ret)
>> +               return ret;
> Wouldn't it be better to move it under the if statement below
> if it's only needed for branch stack?

armv8pmu_private_alloc() allocates arm_pmu's private structure, which stores
the BRBE HW attributes during armv8pmu_branch_probe(), called from the SMP
callback __armv8pmu_probe_pmu(). Hence without that structure being allocated
and assigned first, the following smp_call_function_any() cannot execute
successfully.

armv8pmu_private_alloc()
	{
		......
		Allocates arm_pmu->private as single 'struct brbe_hw_attr'
		Allocates arm_pmu->pmu.task_ctx_cache
		......
	}

__armv8pmu_probe_pmu()
	armv8pmu_branch_probe()
		brbe_attributes_probe()
		{
			......
			brbe_attr->brbe_version = brbe;
			brbe_attr->brbe_format = brbe_get_format(brbidr);
        		brbe_attr->brbe_cc = brbe_get_cc_bits(brbidr);
        		brbe_attr->brbe_nr = brbe_get_numrec(brbidr);
			......
		}

armv8pmu_private_alloc() cannot be moved inside armv8pmu_branch_probe(),
because no memory allocation can be done while in SMP call context.
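
For reference, a minimal sketch of what these two helpers end up doing in the
BRBE driver (illustrative only; the real definitions come with the next patch,
and the brbe_hw_attr layout is just assumed from the outline above):

static int armv8pmu_private_alloc(struct arm_pmu *arm_pmu)
{
	struct brbe_hw_attr *brbe_attr = kzalloc(sizeof(*brbe_attr), GFP_KERNEL);

	if (!brbe_attr)
		return -ENOMEM;

	/* Stashed here so armv8pmu_branch_probe() can fill it in SMP call context */
	arm_pmu->private = brbe_attr;
	return 0;
}

static void armv8pmu_private_free(struct arm_pmu *arm_pmu)
{
	kfree(arm_pmu->private);
	arm_pmu->private = NULL;
}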

> 
>> +
>>         ret = smp_call_function_any(&cpu_pmu->supported_cpus,
>>                                     __armv8pmu_probe_pmu,
>>                                     &probe, 1);
>>         if (ret)
>>                 return ret;
> Otherwise you might need to free it here.
> 
>> +       if (arm_pmu_branch_stack_supported(cpu_pmu)) {
>> +               ret = branch_records_alloc(cpu_pmu);
>> +               if (ret)
>> +                       return ret;
> And here too.

Not freeing the arm_pmu's private data might not be a problem in cases
where either the PMU does not support BRBE or the PMU probe itself fails. But
for completeness, will change as follows.

diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index 9725a53d6799..fdbe52913cc7 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -1198,13 +1198,17 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
        ret = smp_call_function_any(&cpu_pmu->supported_cpus,
                                    __armv8pmu_probe_pmu,
                                    &probe, 1);
-       if (ret)
+       if (ret) {
+               armv8pmu_private_free(cpu_pmu);
                return ret;
+       }
 
        if (arm_pmu_branch_stack_supported(cpu_pmu)) {
                ret = branch_records_alloc(cpu_pmu);
-               if (ret)
+               if (ret) {
+                       armv8pmu_private_free(cpu_pmu);
                        return ret;
+               }
        } else {
                armv8pmu_private_free(cpu_pmu);
        }
  
Mark Rutland June 5, 2023, 12:05 p.m. UTC | #3
On Wed, May 31, 2023 at 09:34:23AM +0530, Anshuman Khandual wrote:
> This enables support for branch stack sampling event in ARMV8 PMU, checking
> has_branch_stack() on the event inside 'struct arm_pmu' callbacks. Although
> these branch stack helpers armv8pmu_branch_XXXXX() are just dummy functions
> for now. While here, this also defines arm_pmu's sched_task() callback with
> armv8pmu_sched_task(), which resets the branch record buffer on a sched_in.

This generally looks good, but I have a few comments below.

[...]

> +static inline bool armv8pmu_branch_valid(struct perf_event *event)
> +{
> +	WARN_ON_ONCE(!has_branch_stack(event));
> +	return false;
> +}

IIUC this is for validating the attr, so could we please name this
armv8pmu_branch_attr_valid() ?
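
i.e. just a rename, with the stub and its call site becoming (sketch only,
nothing else changes):

static inline bool armv8pmu_branch_attr_valid(struct perf_event *event)
{
	WARN_ON_ONCE(!has_branch_stack(event));
	return false;
}

	if (has_branch_stack(event) && !armv8pmu_branch_attr_valid(event))
		return -EOPNOTSUPP;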

[...]

> +static int branch_records_alloc(struct arm_pmu *armpmu)
> +{
> +	struct pmu_hw_events *events;
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu) {
> +		events = per_cpu_ptr(armpmu->hw_events, cpu);
> +		events->branches = kzalloc(sizeof(struct branch_records), GFP_KERNEL);
> +		if (!events->branches)
> +			return -ENOMEM;
> +	}
> +	return 0;

This leaks memory if any allocation fails, and the next patch replaces this
code entirely.

Please add this once in a working state. Either use the percpu allocation
trick in the next patch from the start, or have this kzalloc() with a
corresponding kfree() in an error path.
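
For the kzalloc() option, a sketch with the matching error-path kfree()
(illustrative only) would be:

static int branch_records_alloc(struct arm_pmu *armpmu)
{
	struct pmu_hw_events *events;
	int cpu;

	for_each_possible_cpu(cpu) {
		events = per_cpu_ptr(armpmu->hw_events, cpu);
		events->branches = kzalloc(sizeof(struct branch_records), GFP_KERNEL);
		if (!events->branches)
			goto err;
	}
	return 0;
err:
	/* kfree(NULL) is a no-op, so this is safe for the unallocated entries */
	for_each_possible_cpu(cpu) {
		events = per_cpu_ptr(armpmu->hw_events, cpu);
		kfree(events->branches);
		events->branches = NULL;
	}
	return -ENOMEM;
}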

>  }
>  
>  static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
> @@ -1145,12 +1162,24 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>  	};
>  	int ret;
>  
> +	ret = armv8pmu_private_alloc(cpu_pmu);
> +	if (ret)
> +		return ret;
> +
>  	ret = smp_call_function_any(&cpu_pmu->supported_cpus,
>  				    __armv8pmu_probe_pmu,
>  				    &probe, 1);
>  	if (ret)
>  		return ret;
>  
> +	if (arm_pmu_branch_stack_supported(cpu_pmu)) {
> +		ret = branch_records_alloc(cpu_pmu);
> +		if (ret)
> +			return ret;
> +	} else {
> +		armv8pmu_private_free(cpu_pmu);
> +	}

I see from the next patch that "private" is four ints, so please just add that
to struct arm_pmu under an ifdef CONFIG_ARM64_BRBE. That'll simplify this, and
if we end up needing more space in future we can consider factoring it out.
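
i.e. something like the below, with the field names taken from the
brbe_hw_attr outline elsewhere in this thread (the exact types are a guess):

struct arm_pmu {
	...
#ifdef CONFIG_ARM64_BRBE
	/* BRBE attributes probed from the BRBE ID register */
	int		brbe_version;
	int		brbe_format;
	int		brbe_cc;
	int		brbe_nr;
#endif
	...
};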

> +
>  	return probe.present ? 0 : -ENODEV;
>  }

It also seems odd to check probe.present *after* checking
arm_pmu_branch_stack_supported().

With the allocation removed I think this can be written more clearly as:

| static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
| {
|         struct armv8pmu_probe_info probe = {
|                 .pmu = cpu_pmu,
|                 .present = false,
|         };   
|         int ret; 
| 
|         ret = smp_call_function_any(&cpu_pmu->supported_cpus,
|                                     __armv8pmu_probe_pmu,
|                                     &probe, 1);
|         if (ret)
|                 return ret; 
| 
|         if (!probe.present)
|                 return -ENODEV;
| 
|         if (arm_pmu_branch_stack_supported(cpu_pmu))
|                 ret = branch_records_alloc(cpu_pmu);
|              
|         return ret; 
| }

Thanks,
Mark.
  
Anshuman Khandual June 6, 2023, 10:34 a.m. UTC | #4
On 6/5/23 17:35, Mark Rutland wrote:
> On Wed, May 31, 2023 at 09:34:23AM +0530, Anshuman Khandual wrote:
>> This enables support for branch stack sampling event in ARMV8 PMU, checking
>> has_branch_stack() on the event inside 'struct arm_pmu' callbacks. Although
>> these branch stack helpers armv8pmu_branch_XXXXX() are just dummy functions
>> for now. While here, this also defines arm_pmu's sched_task() callback with
>> armv8pmu_sched_task(), which resets the branch record buffer on a sched_in.
> 
> This generally looks good, but I have a few comments below.
> 
> [...]
> 
>> +static inline bool armv8pmu_branch_valid(struct perf_event *event)
>> +{
>> +	WARN_ON_ONCE(!has_branch_stack(event));
>> +	return false;
>> +}
> 
> IIUC this is for validating the attr, so could we please name this
> armv8pmu_branch_attr_valid() ?

Sure, will change the name and update the call sites.

> 
> [...]
> 
>> +static int branch_records_alloc(struct arm_pmu *armpmu)
>> +{
>> +	struct pmu_hw_events *events;
>> +	int cpu;
>> +
>> +	for_each_possible_cpu(cpu) {
>> +		events = per_cpu_ptr(armpmu->hw_events, cpu);
>> +		events->branches = kzalloc(sizeof(struct branch_records), GFP_KERNEL);
>> +		if (!events->branches)
>> +			return -ENOMEM;
>> +	}
>> +	return 0;
> 
> This leaks memory if any allocation fails, and the next patch replaces this
> code entirely.

Okay.

> 
> Please add this once in a working state. Either use the percpu allocation
> trick in the next patch from the start, or have this kzalloc() with a
> corresponding kfree() in an error path.

I will change branch_records_alloc() as suggested in the next patch's thread
and fold those changes here in this patch.

> 
>>  }
>>  
>>  static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>> @@ -1145,12 +1162,24 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>>  	};
>>  	int ret;
>>  
>> +	ret = armv8pmu_private_alloc(cpu_pmu);
>> +	if (ret)
>> +		return ret;
>> +
>>  	ret = smp_call_function_any(&cpu_pmu->supported_cpus,
>>  				    __armv8pmu_probe_pmu,
>>  				    &probe, 1);
>>  	if (ret)
>>  		return ret;
>>  
>> +	if (arm_pmu_branch_stack_supported(cpu_pmu)) {
>> +		ret = branch_records_alloc(cpu_pmu);
>> +		if (ret)
>> +			return ret;
>> +	} else {
>> +		armv8pmu_private_free(cpu_pmu);
>> +	}
> 
> I see from the next patch that "private" is four ints, so please just add that
> to struct arm_pmu under an ifdef CONFIG_ARM64_BRBE. That'll simplify this, and
> if we end up needing more space in future we can consider factoring it out.

struct arm_pmu {
	........................................
        /* Implementation specific attributes */
        void            *private;
}

The private pointer here creates an abstraction for a given PMU implementation
to hide attribute details without making them known to the core arm_pmu layer.
Although adding an ifdef CONFIG_ARM64_BRBE solves the problem as mentioned
above, it does break that abstraction. Currently the arm_pmu layer is aware
of 'branch records' but not of BRBE in particular, which the driver
adds later on. I suggest we should not break that abstraction.

Instead a global 'static struct brbe_hw_attr' in drivers/perf/arm_brbe.c
can be initialized into arm_pmu->private during armv8pmu_branch_probe(),
which will also solve the allocation-free problem. Similar helpers
armv8pmu_task_ctx_alloc()/free() could also be defined to manage the task
context cache, i.e. arm_pmu->pmu.task_ctx_cache, independently.

But then armv8pmu_task_ctx_alloc() can be called only after the PMU probe
confirms arm_pmu->has_branch_stack.
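
Roughly like this (sketch of the proposal only; SYS_BRBIDR0_EL1 and the
brbe_get_*() helpers are what the BRBE driver patch introduces later):

/* drivers/perf/arm_brbe.c */
static struct brbe_hw_attr brbe_hw_attr;

void armv8pmu_branch_probe(struct arm_pmu *arm_pmu)
{
	u64 brbidr = read_sysreg_s(SYS_BRBIDR0_EL1);

	brbe_hw_attr.brbe_format = brbe_get_format(brbidr);
	brbe_hw_attr.brbe_cc = brbe_get_cc_bits(brbidr);
	brbe_hw_attr.brbe_nr = brbe_get_numrec(brbidr);

	/* No allocation needed here, hence safe in SMP call context */
	arm_pmu->private = &brbe_hw_attr;
}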

> 
>> +
>>  	return probe.present ? 0 : -ENODEV;
>>  }
> 
> It also seems odd to ceck probe.present *after* checking
> arm_pmu_branch_stack_supported().

I will reorganize as suggested below.

> 
> With the allocation removed I think this can be written more clearly as:
> 
> | static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
> | {
> |         struct armv8pmu_probe_info probe = {
> |                 .pmu = cpu_pmu,
> |                 .present = false,
> |         };   
> |         int ret; 
> | 
> |         ret = smp_call_function_any(&cpu_pmu->supported_cpus,
> |                                     __armv8pmu_probe_pmu,
> |                                     &probe, 1);
> |         if (ret)
> |                 return ret;
> |
> |         if (!probe.present)
> |                 return -ENODEV;
> | 
> |         if (arm_pmu_branch_stack_supported(cpu_pmu))
> |                 ret = branch_records_alloc(cpu_pmu);
> |              
> |         return ret; 
> | }
  
Mark Rutland June 6, 2023, 10:41 a.m. UTC | #5
On Tue, Jun 06, 2023 at 04:04:25PM +0530, Anshuman Khandual wrote:
> On 6/5/23 17:35, Mark Rutland wrote:
> > On Wed, May 31, 2023 at 09:34:23AM +0530, Anshuman Khandual wrote:
> >>  static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
> >> @@ -1145,12 +1162,24 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
> >>  	};
> >>  	int ret;
> >>  
> >> +	ret = armv8pmu_private_alloc(cpu_pmu);
> >> +	if (ret)
> >> +		return ret;
> >> +
> >>  	ret = smp_call_function_any(&cpu_pmu->supported_cpus,
> >>  				    __armv8pmu_probe_pmu,
> >>  				    &probe, 1);
> >>  	if (ret)
> >>  		return ret;
> >>  
> >> +	if (arm_pmu_branch_stack_supported(cpu_pmu)) {
> >> +		ret = branch_records_alloc(cpu_pmu);
> >> +		if (ret)
> >> +			return ret;
> >> +	} else {
> >> +		armv8pmu_private_free(cpu_pmu);
> >> +	}
> > 
> > I see from the next patch that "private" is four ints, so please just add that
> > to struct arm_pmu under an ifdef CONFIG_ARM64_BRBE. That'll simplify this, and
> > if we end up needing more space in future we can consider factoring it out.
> 
> struct arm_pmu {
> 	........................................
>         /* Implementation specific attributes */
>         void            *private;
> }
> 
> private pointer here creates an abstraction for given pmu implementation
> to hide attribute details without making it known to core arm pmu layer.
> Although adding ifdef CONFIG_ARM64_BRBE solves the problem as mentioned
> above, it does break that abstraction. Currently arm_pmu layer is aware
> about 'branch records' but not about BRBE in particular which the driver
> adds later on. I suggest we should not break that abstraction.

I understand the rationale, but I think it's simpler for now to break that
abstraction. We can always refactor it later.

> Instead a global 'static struct brbe_hw_attr' in drivers/perf/arm_brbe.c
> can be initialized into arm_pmu->private during armv8pmu_branch_probe(),
> which will also solve the allocation-free problem. 

IIUC that's not going to work for big.LITTLE systems where the BRBE support
varies, as we need this data per arm_pmu.

> Also similar helpers armv8pmu_task_ctx_alloc()/free() could be defined to
> manage task context cache i.e arm_pmu->pmu.task_ctx_cache independently.
> 
> But now armv8pmu_task_ctx_alloc() can be called after pmu probe confirms
> to have arm_pmu->has_branch_stack.

I think those are different, and should be kept.

Thanks,
Mark.
  
Suzuki K Poulose June 8, 2023, 10:13 a.m. UTC | #6
On 06/06/2023 11:34, Anshuman Khandual wrote:
> 
> 
> On 6/5/23 17:35, Mark Rutland wrote:
>> On Wed, May 31, 2023 at 09:34:23AM +0530, Anshuman Khandual wrote:
>>> This enables support for branch stack sampling event in ARMV8 PMU, checking
>>> has_branch_stack() on the event inside 'struct arm_pmu' callbacks. Although
>>> these branch stack helpers armv8pmu_branch_XXXXX() are just dummy functions
>>> for now. While here, this also defines arm_pmu's sched_task() callback with
>>> armv8pmu_sched_task(), which resets the branch record buffer on a sched_in.
>>
>> This generally looks good, but I have a few comments below.
>>
>> [...]
>>
>>> +static inline bool armv8pmu_branch_valid(struct perf_event *event)
>>> +{
>>> +	WARN_ON_ONCE(!has_branch_stack(event));
>>> +	return false;
>>> +}
>>
>> IIUC this is for validating the attr, so could we please name this
>> armv8pmu_branch_attr_valid() ?
> 
> Sure, will change the name and updated call sites.
> 
>>
>> [...]
>>
>>> +static int branch_records_alloc(struct arm_pmu *armpmu)
>>> +{
>>> +	struct pmu_hw_events *events;
>>> +	int cpu;
>>> +
>>> +	for_each_possible_cpu(cpu) {

Shouldn't this be supported_cpus ? i.e.
	for_each_cpu(cpu, &armpmu->supported_cpus) {


>>> +		events = per_cpu_ptr(armpmu->hw_events, cpu);
>>> +		events->branches = kzalloc(sizeof(struct branch_records), GFP_KERNEL);
>>> +		if (!events->branches)
>>> +			return -ENOMEM;

Do we need to free the already-allocated branches here ?

>>> +	}


May be:
	int ret = 0;

	for_each_cpu(cpu, &armpmu->supported_cpus) {
		events = per_cpu_ptr(armpmu->hw_events, cpu);
		events->branches = kzalloc(sizeof(struct branch_records), GFP_KERNEL);
		
		if (!events->branches) {
			ret = -ENOMEM;
			break;
		}
	}

	if (!ret)
		return 0;

	for_each_cpu(cpu, &armpmu->supported_cpus) {
		events = per_cpu_ptr(armpmu->hw_events, cpu);
		if (!events->branches)
			break;
		kfree(events->branches);
	}
	return ret;
	
>>> +	return 0;
>>
>> This leaks memory if any allocation fails, and the next patch replaces this
>> code entirely.
> 
> Okay.
> 
>>
>> Please add this once in a working state. Either use the percpu allocation
>> trick in the next patch from the start, or have this kzalloc() with a
>> corresponding kfree() in an error path.
> 
> I will change branch_records_alloc() as suggested in the next patch's thread
> and fold those changes here in this patch.
> 
>>
>>>   }
>>>   
>>>   static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>>> @@ -1145,12 +1162,24 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>>>   	};
>>>   	int ret;
>>>   
>>> +	ret = armv8pmu_private_alloc(cpu_pmu);
>>> +	if (ret)
>>> +		return ret;
>>> +
>>>   	ret = smp_call_function_any(&cpu_pmu->supported_cpus,
>>>   				    __armv8pmu_probe_pmu,
>>>   				    &probe, 1);
>>>   	if (ret)
>>>   		return ret;
>>>   
>>> +	if (arm_pmu_branch_stack_supported(cpu_pmu)) {
>>> +		ret = branch_records_alloc(cpu_pmu);
>>> +		if (ret)
>>> +			return ret;
>>> +	} else {
>>> +		armv8pmu_private_free(cpu_pmu);
>>> +	}
>>
>> I see from the next patch that "private" is four ints, so please just add that
>> to struct arm_pmu under an ifdef CONFIG_ARM64_BRBE. That'll simplify this, and
>> if we end up needing more space in future we can consider factoring it out.
> 
> struct arm_pmu {
> 	........................................
>          /* Implementation specific attributes */
>          void            *private;
> }
> 
> private pointer here creates an abstraction for given pmu implementation
> to hide attribute details without making it known to core arm pmu layer.
> Although adding ifdef CONFIG_ARM64_BRBE solves the problem as mentioned
> above, it does break that abstraction. Currently arm_pmu layer is aware
> about 'branch records' but not about BRBE in particular which the driver
> adds later on. I suggest we should not break that abstraction.
> 
> Instead a global 'static struct brbe_hw_attr' in drivers/perf/arm_brbe.c
> can be initialized into arm_pmu->private during armv8pmu_branch_probe(),
> which will also solve the allocation-free problem. Also similar helpers
> armv8pmu_task_ctx_alloc()/free() could be defined to manage task context
> cache i.e arm_pmu->pmu.task_ctx_cache independently.
> 
> But now armv8pmu_task_ctx_alloc() can be called after pmu probe confirms
> to have arm_pmu->has_branch_stack.
> 
>>
>>> +
>>>   	return probe.present ? 0 : -ENODEV;
>>>   }
>>
>> It also seems odd to check probe.present *after* checking
>> arm_pmu_branch_stack_supported().
> 
> I will reorganize as suggested below.
> 
>>
>> With the allocation removed I think this can be written more clearly as:
>>
>> | static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>> | {
>> |         struct armv8pmu_probe_info probe = {
>> |                 .pmu = cpu_pmu,
>> |                 .present = false,
>> |         };
>> |         int ret;
>> |
>> |         ret = smp_call_function_any(&cpu_pmu->supported_cpus,
>> |                                     __armv8pmu_probe_pmu,
>> |                                     &probe, 1);
>> |         if (ret)
>> |                 return ret;
>> |
>> |         if (!probe.present)
>> |                 return -ENODEV;
>> |
>> |         if (arm_pmu_branch_stack_supported(cpu_pmu))
>> |                 ret = branch_records_alloc(cpu_pmu);
>> |
>> |         return ret;
>> | }

Could we not simplify this as below and keep the abstraction, since we
already have it ?

 >> | static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
 >> | {
 >> |         struct armv8pmu_probe_info probe = {
 >> |                 .pmu = cpu_pmu,
 >> |                 .present = false,
 >> |         };
 >> |         int ret;
 >> |
 >> |         ret = smp_call_function_any(&cpu_pmu->supported_cpus,
 >> |                                     __armv8pmu_probe_pmu,
 >> |                                     &probe, 1);
 >> |         if (ret)
 >> |                 return ret;
 >> |         if (!probe.present)
 >> |                 return -ENODEV;
 >> |
 >> |  	     if (!arm_pmu_branch_stack_supported(cpu_pmu))
 >> |		     return 0;
 >> |
 >> |	     ret = armv8pmu_private_alloc(cpu_pmu);
 >> |	     if (ret)
 >> |		 return ret;
 >> |		
 >> |	      ret = branch_records_alloc(cpu_pmu);
 >> |	      if (ret)
 >> |		  armv8pmu_private_free(cpu_pmu);
 >> |		
 >> |        return ret;
 >> | }


Suzuki
  
Anshuman Khandual June 9, 2023, 4 a.m. UTC | #7
On 6/8/23 15:43, Suzuki K Poulose wrote:
> On 06/06/2023 11:34, Anshuman Khandual wrote:
>>
>>
>> On 6/5/23 17:35, Mark Rutland wrote:
>>> On Wed, May 31, 2023 at 09:34:23AM +0530, Anshuman Khandual wrote:
>>>> This enables support for branch stack sampling event in ARMV8 PMU, checking
>>>> has_branch_stack() on the event inside 'struct arm_pmu' callbacks. Although
>>>> these branch stack helpers armv8pmu_branch_XXXXX() are just dummy functions
>>>> for now. While here, this also defines arm_pmu's sched_task() callback with
>>>> armv8pmu_sched_task(), which resets the branch record buffer on a sched_in.
>>>
>>> This generally looks good, but I have a few comments below.
>>>
>>> [...]
>>>
>>>> +static inline bool armv8pmu_branch_valid(struct perf_event *event)
>>>> +{
>>>> +    WARN_ON_ONCE(!has_branch_stack(event));
>>>> +    return false;
>>>> +}
>>>
>>> IIUC this is for validating the attr, so could we please name this
>>> armv8pmu_branch_attr_valid() ?
>>
>> Sure, will change the name and updated call sites.
>>
>>>
>>> [...]
>>>
>>>> +static int branch_records_alloc(struct arm_pmu *armpmu)
>>>> +{
>>>> +    struct pmu_hw_events *events;
>>>> +    int cpu;
>>>> +
>>>> +    for_each_possible_cpu(cpu) {
> 
> Shouldn't this be supported_pmus ? i.e.
>     for_each_cpu(cpu, &armpmu->supported_cpus) {
> 
> 
>>>> +        events = per_cpu_ptr(armpmu->hw_events, cpu);
>>>> +        events->branches = kzalloc(sizeof(struct branch_records), GFP_KERNEL);
>>>> +        if (!events->branches)
>>>> +            return -ENOMEM;
> 
> Do we need to free the allocated branches already ?

This gets fixed in the next patch via per-cpu allocation. I will
move and fold that code block in here. The updated function will look
like the following.

static int branch_records_alloc(struct arm_pmu *armpmu)
{
        struct branch_records __percpu *records;
        int cpu;

        records = alloc_percpu_gfp(struct branch_records, GFP_KERNEL);
        if (!records)
                return -ENOMEM;

        /*
         * FIXME: Memory allocated via records gets completely
         * consumed here, never required to be freed up later. Hence
         * losing access to on stack 'records' is acceptable.
         * Otherwise this alloc handle has to be saved some where.
         */
        for_each_possible_cpu(cpu) {
                struct pmu_hw_events *events_cpu;
                struct branch_records *records_cpu;

                events_cpu = per_cpu_ptr(armpmu->hw_events, cpu);
                records_cpu = per_cpu_ptr(records, cpu);
                events_cpu->branches = records_cpu;
        }
        return 0;
}

Regarding the cpumask argument in for_each_cpu().

- hw_events is a __percpu pointer in struct arm_pmu

	- pmu->hw_events = alloc_percpu_gfp(struct pmu_hw_events, GFP_KERNEL)


- 'records' above is being allocated via alloc_percpu_gfp()

	- records = alloc_percpu_gfp(struct branch_records, GFP_KERNEL)

If the 'armpmu->supported_cpus' mask gets used instead of the possible cpu mask,
wouldn't there be some dangling per-cpu branch_record allocated areas that
remain unassigned ? Assigning all of them back into hw_events should
be harmless.

> 
>>>> +    }
> 
> 
> May be:
>     int ret = 0;
> 
>     for_each_cpu(cpu, &armpmu->supported_cpus) {
>         events = per_cpu_ptr(armpmu->hw_events, cpu);
>         events->branches = kzalloc(sizeof(struct         branch_records), GFP_KERNEL);
>        
>         if (!events->branches) {
>             ret = -ENOMEM;
>             break;
>         }
>     }
> 
>     if (!ret)
>         return 0;
> 
>     for_each_cpu(cpu, &armpmu->supported_cpus) {
>         events = per_cpu_ptr(armpmu->hw_events, cpu);
>         if (!events->branches)
>             break;
>         kfree(events->branches);
>     }
>     return ret;
>     
>>>> +    return 0;
>>>
>>> This leaks memory if any allocation fails, and the next patch replaces this
>>> code entirely.
>>
>> Okay.
>>
>>>
>>> Please add this once in a working state. Either use the percpu allocation
>>> trick in the next patch from the start, or have this kzalloc() with a
>>> corresponding kfree() in an error path.
>>
>> I will change branch_records_alloc() as suggested in the next patch's thread
>> and fold those changes here in this patch.
>>
>>>
>>>>   }
>>>>     static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>>>> @@ -1145,12 +1162,24 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>>>>       };
>>>>       int ret;
>>>>   +    ret = armv8pmu_private_alloc(cpu_pmu);
>>>> +    if (ret)
>>>> +        return ret;
>>>> +
>>>>       ret = smp_call_function_any(&cpu_pmu->supported_cpus,
>>>>                       __armv8pmu_probe_pmu,
>>>>                       &probe, 1);
>>>>       if (ret)
>>>>           return ret;
>>>>   +    if (arm_pmu_branch_stack_supported(cpu_pmu)) {
>>>> +        ret = branch_records_alloc(cpu_pmu);
>>>> +        if (ret)
>>>> +            return ret;
>>>> +    } else {
>>>> +        armv8pmu_private_free(cpu_pmu);
>>>> +    }
>>>
>>> I see from the next patch that "private" is four ints, so please just add that
>>> to struct arm_pmu under an ifdef CONFIG_ARM64_BRBE. That'll simplify this, and
>>> if we end up needing more space in future we can consider factoring it out.
>>
>> struct arm_pmu {
>>     ........................................
>>          /* Implementation specific attributes */
>>          void            *private;
>> }
>>
>> private pointer here creates an abstraction for given pmu implementation
>> to hide attribute details without making it known to core arm pmu layer.
>> Although adding ifdef CONFIG_ARM64_BRBE solves the problem as mentioned
>> above, it does break that abstraction. Currently arm_pmu layer is aware
>> about 'branch records' but not about BRBE in particular which the driver
>> adds later on. I suggest we should not break that abstraction.
>>
>> Instead a global 'static struct brbe_hw_attr' in drivers/perf/arm_brbe.c
>> can be initialized into arm_pmu->private during armv8pmu_branch_probe(),
>> which will also solve the allocation-free problem. Also similar helpers
>> armv8pmu_task_ctx_alloc()/free() could be defined to manage task context
>> cache i.e arm_pmu->pmu.task_ctx_cache independently.
>>
>> But now armv8pmu_task_ctx_alloc() can be called after pmu probe confirms
>> to have arm_pmu->has_branch_stack.
>>
>>>
>>>> +
>>>>       return probe.present ? 0 : -ENODEV;
>>>>   }
>>>
>>> It also seems odd to check probe.present *after* checking
>>> arm_pmu_branch_stack_supported().
>>
>> I will reorganize as suggested below.
>>
>>>
>>> With the allocation removed I think this can be written more clearly as:
>>>
>>> | static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>>> | {
>>> |         struct armv8pmu_probe_info probe = {
>>> |                 .pmu = cpu_pmu,
>>> |                 .present = false,
>>> |         };
>>> |         int ret;
>>> |
>>> |         ret = smp_call_function_any(&cpu_pmu->supported_cpus,
>>> |                                     __armv8pmu_probe_pmu,
>>> |                                     &probe, 1);
>>> |         if (ret)
>>> |                 return ret;
>>> |
>>> |         if (!probe.present)
>>> |                 return -ENODEV;
>>> |
>>> |         if (arm_pmu_branch_stack_supported(cpu_pmu))
>>> |                 ret = branch_records_alloc(cpu_pmu);
>>> |
>>> |         return ret;
>>> | }
> 
> Could we not simplify this as below and keep the abstraction, since we
> already have it ?

No, there is an allocation dependency before the smp call context.
 
> 
>>> | static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>>> | {
>>> |         struct armv8pmu_probe_info probe = {
>>> |                 .pmu = cpu_pmu,
>>> |                 .present = false,
>>> |         };
>>> |         int ret;
>>> |
>>> |         ret = smp_call_function_any(&cpu_pmu->supported_cpus,
>>> |                                     __armv8pmu_probe_pmu,
>>> |                                     &probe, 1);
>>> |         if (ret)
>>> |                 return ret;
>>> |         if (!probe.present)
>>> |                 return -ENODEV;
>>> |
>>> |           if (!arm_pmu_branch_stack_supported(cpu_pmu))
>>> |             return 0;
>>> |
>>> |         ret = armv8pmu_private_alloc(cpu_pmu);

This needs to be allocated before each supported PMU gets probed via
__armv8pmu_probe_pmu() inside smp_call_function_any() callbacks that
unfortunately cannot do memory allocation.

>>> |         if (ret)
>>> |         return ret;
>>> |       
>>> |          ret = branch_records_alloc(cpu_pmu);
>>> |          if (ret)
>>> |          armv8pmu_private_free(cpu_pmu);
>>> |       
>>> |        return ret;
>>> | }

Changing the abstraction will cause too much code churn this late in
the development phase, which should be avoided IMHO.
  
Anshuman Khandual June 9, 2023, 7:14 a.m. UTC | #8
[..]

On 6/8/23 15:43, Suzuki K Poulose wrote:
>>> | static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>>> | {
>>> |         struct armv8pmu_probe_info probe = {
>>> |                 .pmu = cpu_pmu,
>>> |                 .present = false,
>>> |         };
>>> |         int ret;
>>> |
>>> |         ret = smp_call_function_any(&cpu_pmu->supported_cpus,
>>> |                                     __armv8pmu_probe_pmu,
>>> |                                     &probe, 1);
>>> |         if (ret)
>>> |                 return ret;
>>> |         if (!probe.present)
>>> |                 return -ENODEV;
>>> |
>>> |           if (!arm_pmu_branch_stack_supported(cpu_pmu))
>>> |             return 0;
>>> |
>>> |         ret = armv8pmu_private_alloc(cpu_pmu);
>>> |         if (ret)
>>> |         return ret;
>>> |       
>>> |          ret = branch_records_alloc(cpu_pmu);
>>> |          if (ret)
>>> |          armv8pmu_private_free(cpu_pmu);
>>> |       
>>> |        return ret;
>>> | }


After splitting the task ctx cache management from the pmu private data
management, the above function will look something like this, taking
care of all error-path freeing as well.

static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
{
        struct armv8pmu_probe_info probe = {
                .pmu = cpu_pmu,
                .present = false,
        };
        int ret;

        ret = armv8pmu_private_alloc(cpu_pmu);
        if (ret)
                return ret;

        ret = smp_call_function_any(&cpu_pmu->supported_cpus,
                                    __armv8pmu_probe_pmu,
                                    &probe, 1);
        if (ret)
                goto probe_err;

        if (!probe.present) {
                ret = -ENODEV;
                goto probe_err;
        }

        if (cpu_pmu->has_branch_stack) {
                ret = armv8pmu_task_ctx_cache_alloc(cpu_pmu);
                if (ret)
                        goto probe_err;

                ret = branch_records_alloc(cpu_pmu);
                if (ret) {
                        armv8pmu_task_ctx_cache_free(cpu_pmu);
                        goto probe_err;
                }
                return 0;
        }
        armv8pmu_private_free(cpu_pmu);
        return 0;

probe_err:
        armv8pmu_private_free(cpu_pmu);
        return ret;
}
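
where armv8pmu_task_ctx_cache_alloc()/free() would be thin wrappers along
these lines (the names and the task context structure are illustrative only):

static int armv8pmu_task_ctx_cache_alloc(struct arm_pmu *cpu_pmu)
{
	/* arm64_perf_task_context is a placeholder for the per-task branch state */
	cpu_pmu->pmu.task_ctx_cache =
		kmem_cache_create("arm64_brbe_task_ctx",
				  sizeof(struct arm64_perf_task_context),
				  0, 0, NULL);
	if (!cpu_pmu->pmu.task_ctx_cache)
		return -ENOMEM;
	return 0;
}

static void armv8pmu_task_ctx_cache_free(struct arm_pmu *cpu_pmu)
{
	kmem_cache_destroy(cpu_pmu->pmu.task_ctx_cache);
	cpu_pmu->pmu.task_ctx_cache = NULL;
}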
  
Suzuki K Poulose June 9, 2023, 9:54 a.m. UTC | #9
On 09/06/2023 05:00, Anshuman Khandual wrote:
> 
> 
> On 6/8/23 15:43, Suzuki K Poulose wrote:
>> On 06/06/2023 11:34, Anshuman Khandual wrote:
>>>
>>>
>>> On 6/5/23 17:35, Mark Rutland wrote:
>>>> On Wed, May 31, 2023 at 09:34:23AM +0530, Anshuman Khandual wrote:
>>>>> This enables support for branch stack sampling event in ARMV8 PMU, checking
>>>>> has_branch_stack() on the event inside 'struct arm_pmu' callbacks. Although
>>>>> these branch stack helpers armv8pmu_branch_XXXXX() are just dummy functions
>>>>> for now. While here, this also defines arm_pmu's sched_task() callback with
>>>>> armv8pmu_sched_task(), which resets the branch record buffer on a sched_in.
>>>>
>>>> This generally looks good, but I have a few comments below.
>>>>
>>>> [...]
>>>>
>>>>> +static inline bool armv8pmu_branch_valid(struct perf_event *event)
>>>>> +{
>>>>> +    WARN_ON_ONCE(!has_branch_stack(event));
>>>>> +    return false;
>>>>> +}
>>>>
>>>> IIUC this is for validating the attr, so could we please name this
>>>> armv8pmu_branch_attr_valid() ?
>>>
>>> Sure, will change the name and updated call sites.
>>>
>>>>
>>>> [...]
>>>>
>>>>> +static int branch_records_alloc(struct arm_pmu *armpmu)
>>>>> +{
>>>>> +    struct pmu_hw_events *events;
>>>>> +    int cpu;
>>>>> +
>>>>> +    for_each_possible_cpu(cpu) {
>>
>> Shouldn't this be supported_pmus ? i.e.
>>      for_each_cpu(cpu, &armpmu->supported_cpus) {
>>
>>
>>>>> +        events = per_cpu_ptr(armpmu->hw_events, cpu);
>>>>> +        events->branches = kzalloc(sizeof(struct branch_records), GFP_KERNEL);
>>>>> +        if (!events->branches)
>>>>> +            return -ENOMEM;
>>
>> Do we need to free the allocated branches already ?
> 
> This gets fixed in the next patch via per-cpu allocation. I will
> move and fold the code block in here. Updated function will look
> like the following.
> 
> static int branch_records_alloc(struct arm_pmu *armpmu)
> {
>          struct branch_records __percpu *records;
>          int cpu;
> 
>          records = alloc_percpu_gfp(struct branch_records, GFP_KERNEL);
>          if (!records)
>                  return -ENOMEM;
> 
>          /*
>           * FIXME: Memory allocated via records gets completely
>           * consumed here, never required to be freed up later. Hence
>           * losing access to on stack 'records' is acceptable.
>           * Otherwise this alloc handle has to be saved some where.
>           */
>          for_each_possible_cpu(cpu) {
>                  struct pmu_hw_events *events_cpu;
>                  struct branch_records *records_cpu;
> 
>                  events_cpu = per_cpu_ptr(armpmu->hw_events, cpu);
>                  records_cpu = per_cpu_ptr(records, cpu);
>                  events_cpu->branches = records_cpu;
>          }
>          return 0;
> }
> 
> Regarding the cpumask argument in for_each_cpu().
> 
> - hw_events is a __percpu pointer in struct arm_pmu
> 
> 	- pmu->hw_events = alloc_percpu_gfp(struct pmu_hw_events, GFP_KERNEL)
> 
> 
> - 'records' above is being allocated via alloc_percpu_gfp()
> 
> 	- records = alloc_percpu_gfp(struct branch_records, GFP_KERNEL)


> 
> If 'armpmu->supported_cpus' mask gets used instead of possible cpu mask,
> would not there be some dangling per-cpu branch_record allocated areas,
> that remain unsigned ? Assigning all of them back into hw_events should
> be harmless.

That's because you are using alloc_percpu for records ? With the currently
proposed code, if there are two different arm_pmus on the system, you
would end up wasting 1x per-cpu branch_records ? And if there are 3,
2x per-cpu gets wasted ?

> 
>>
>>>>> +    }
>>
>>
>> May be:
>>      int ret = 0;
>>
>>      for_each_cpu(cpu, &armpmu->supported_cpus) {
>>          events = per_cpu_ptr(armpmu->hw_events, cpu);
>>          events->branches = kzalloc(sizeof(struct branch_records), GFP_KERNEL);
>>         
>>          if (!events->branches) {
>>              ret = -ENOMEM;
>>              break;
>>          }
>>      }
>>
>>      if (!ret)
>>          return 0;
>>
>>      for_each_cpu(cpu, &armpmu->supported_cpus) {
>>          events = per_cpu_ptr(armpmu->hw_events, cpu);
>>          if (!events->branches)
>>              break;
>>          kfree(events->branches);
>>      }
>>      return ret;
>>      
>>>>> +    return 0;
>>>>
>>>> This leaks memory if any allocation fails, and the next patch replaces this
>>>> code entirely.
>>>
>>> Okay.
>>>
>>>>
>>>> Please add this once in a working state. Either use the percpu allocation
>>>> trick in the next patch from the start, or have this kzalloc() with a
>>>> corresponding kfree() in an error path.
>>>
>>> I will change branch_records_alloc() as suggested in the next patch's thread
>>> and fold those changes here in this patch.
>>>
>>>>
>>>>>    }
>>>>>      static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>>>>> @@ -1145,12 +1162,24 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>>>>>        };
>>>>>        int ret;
>>>>>    +    ret = armv8pmu_private_alloc(cpu_pmu);
>>>>> +    if (ret)
>>>>> +        return ret;
>>>>> +
>>>>>        ret = smp_call_function_any(&cpu_pmu->supported_cpus,
>>>>>                        __armv8pmu_probe_pmu,
>>>>>                        &probe, 1);
>>>>>        if (ret)
>>>>>            return ret;
>>>>>    +    if (arm_pmu_branch_stack_supported(cpu_pmu)) {
>>>>> +        ret = branch_records_alloc(cpu_pmu);
>>>>> +        if (ret)
>>>>> +            return ret;
>>>>> +    } else {
>>>>> +        armv8pmu_private_free(cpu_pmu);
>>>>> +    }
>>>>
>>>> I see from the next patch that "private" is four ints, so please just add that
>>>> to struct arm_pmu under an ifdef CONFIG_ARM64_BRBE. That'll simplify this, and
>>>> if we end up needing more space in future we can consider factoring it out.
>>>
>>> struct arm_pmu {
>>>      ........................................
>>>           /* Implementation specific attributes */
>>>           void            *private;
>>> }
>>>
>>> private pointer here creates an abstraction for given pmu implementation
>>> to hide attribute details without making it known to core arm pmu layer.
>>> Although adding ifdef CONFIG_ARM64_BRBE solves the problem as mentioned
>>> above, it does break that abstraction. Currently arm_pmu layer is aware
>>> about 'branch records' but not about BRBE in particular which the driver
>>> adds later on. I suggest we should not break that abstraction.
>>>
>>> Instead a global 'static struct brbe_hw_attr' in drivers/perf/arm_brbe.c
>>> can be initialized into arm_pmu->private during armv8pmu_branch_probe(),
>>> which will also solve the allocation-free problem. Also similar helpers
>>> armv8pmu_task_ctx_alloc()/free() could be defined to manage task context
>>> cache i.e arm_pmu->pmu.task_ctx_cache independently.
>>>
>>> But now armv8pmu_task_ctx_alloc() can be called after pmu probe confirms
>>> to have arm_pmu->has_branch_stack.
>>>
>>>>
>>>>> +
>>>>>        return probe.present ? 0 : -ENODEV;
>>>>>    }
>>>>
>>>> It also seems odd to check probe.present *after* checking
>>>> arm_pmu_branch_stack_supported().
>>>
>>> I will reorganize as suggested below.
>>>
>>>>
>>>> With the allocation removed I think this can be written more clearly as:
>>>>
>>>> | static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>>>> | {
>>>> |         struct armv8pmu_probe_info probe = {
>>>> |                 .pmu = cpu_pmu,
>>>> |                 .present = false,
>>>> |         };
>>>> |         int ret;
>>>> |
>>>> |         ret = smp_call_function_any(&cpu_pmu->supported_cpus,
>>>> |                                     __armv8pmu_probe_pmu,
>>>> |                                     &probe, 1);
>>>> |         if (ret)
>>>> |                 return ret;
>>>> |
>>>> |         if (!probe.present)
>>>> |                 return -ENODEV;
>>>> |
>>>> |         if (arm_pmu_branch_stack_supported(cpu_pmu))
>>>> |                 ret = branch_records_alloc(cpu_pmu);
>>>> |
>>>> |         return ret;
>>>> | }
>>
>> Could we not simplify this as below and keep the abstraction, since we
>> already have it ?
> 
> No, there is an allocation dependency before the smp call context.

Ok, I wasn't aware of that. Could we not read whatever we need to know
about the brbe in armv8pmu_probe_info and process it at the caller here?
And then do the private_alloc etc. as we need ?
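
Something along these lines, perhaps (sketch only; the brbidr field and the
armv8pmu_branch_setup() helper are made up for illustration):

struct armv8pmu_probe_info {
	struct arm_pmu *pmu;
	bool present;
	u64 brbidr;	/* raw BRBE ID register value, read in SMP call context */
};

static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
{
	...
	ret = smp_call_function_any(&cpu_pmu->supported_cpus,
				    __armv8pmu_probe_pmu, &probe, 1);
	if (ret)
		return ret;
	if (!probe.present)
		return -ENODEV;

	/* All allocation and decoding happens here, outside SMP call context */
	if (probe.brbidr) {
		ret = armv8pmu_private_alloc(cpu_pmu);
		if (ret)
			return ret;
		armv8pmu_branch_setup(cpu_pmu, probe.brbidr);
		ret = branch_records_alloc(cpu_pmu);
		if (ret)
			armv8pmu_private_free(cpu_pmu);
	}
	return ret;
}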

Suzuki
  

Patch

diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h
index eb7071c9eb34..7548813783ba 100644
--- a/arch/arm64/include/asm/perf_event.h
+++ b/arch/arm64/include/asm/perf_event.h
@@ -24,4 +24,37 @@  extern unsigned long perf_misc_flags(struct pt_regs *regs);
 	(regs)->pstate = PSR_MODE_EL1h;	\
 }
 
+struct pmu_hw_events;
+struct arm_pmu;
+struct perf_event;
+
+#ifdef CONFIG_PERF_EVENTS
+static inline bool has_branch_stack(struct perf_event *event);
+
+static inline void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
+{
+	WARN_ON_ONCE(!has_branch_stack(event));
+}
+
+static inline bool armv8pmu_branch_valid(struct perf_event *event)
+{
+	WARN_ON_ONCE(!has_branch_stack(event));
+	return false;
+}
+
+static inline void armv8pmu_branch_enable(struct perf_event *event)
+{
+	WARN_ON_ONCE(!has_branch_stack(event));
+}
+
+static inline void armv8pmu_branch_disable(struct perf_event *event)
+{
+	WARN_ON_ONCE(!has_branch_stack(event));
+}
+
+static inline void armv8pmu_branch_probe(struct arm_pmu *arm_pmu) { }
+static inline void armv8pmu_branch_reset(void) { }
+static inline int armv8pmu_private_alloc(struct arm_pmu *arm_pmu) { return 0; }
+static inline void armv8pmu_private_free(struct arm_pmu *arm_pmu) { }
+#endif
 #endif
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index c98e4039386d..86d803ff1ae3 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -705,38 +705,21 @@  static void armv8pmu_enable_event(struct perf_event *event)
 	 * Enable counter and interrupt, and set the counter to count
 	 * the event that we're interested in.
 	 */
-
-	/*
-	 * Disable counter
-	 */
 	armv8pmu_disable_event_counter(event);
-
-	/*
-	 * Set event.
-	 */
 	armv8pmu_write_event_type(event);
-
-	/*
-	 * Enable interrupt for this counter
-	 */
 	armv8pmu_enable_event_irq(event);
-
-	/*
-	 * Enable counter
-	 */
 	armv8pmu_enable_event_counter(event);
+
+	if (has_branch_stack(event))
+		armv8pmu_branch_enable(event);
 }
 
 static void armv8pmu_disable_event(struct perf_event *event)
 {
-	/*
-	 * Disable counter
-	 */
-	armv8pmu_disable_event_counter(event);
+	if (has_branch_stack(event))
+		armv8pmu_branch_disable(event);
 
-	/*
-	 * Disable interrupt for this counter
-	 */
+	armv8pmu_disable_event_counter(event);
 	armv8pmu_disable_event_irq(event);
 }
 
@@ -814,6 +797,11 @@  static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
 		if (!armpmu_event_set_period(event))
 			continue;
 
+		if (has_branch_stack(event) && !WARN_ON(!cpuc->branches)) {
+			armv8pmu_branch_read(cpuc, event);
+			perf_sample_save_brstack(&data, event, &cpuc->branches->branch_stack);
+		}
+
 		/*
 		 * Perf event overflow will queue the processing of the event as
 		 * an irq_work which will be taken care of in the handling of
@@ -912,6 +900,14 @@  static int armv8pmu_user_event_idx(struct perf_event *event)
 	return event->hw.idx;
 }
 
+static void armv8pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
+{
+	struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
+
+	if (sched_in && arm_pmu_branch_stack_supported(armpmu))
+		armv8pmu_branch_reset();
+}
+
 /*
  * Add an event filter to a given event.
  */
@@ -982,6 +978,9 @@  static void armv8pmu_reset(void *info)
 		pmcr |= ARMV8_PMU_PMCR_LP;
 
 	armv8pmu_pmcr_write(pmcr);
+
+	if (arm_pmu_branch_stack_supported(cpu_pmu))
+		armv8pmu_branch_reset();
 }
 
 static int __armv8_pmuv3_map_event_id(struct arm_pmu *armpmu,
@@ -1019,6 +1018,9 @@  static int __armv8_pmuv3_map_event(struct perf_event *event,
 
 	hw_event_id = __armv8_pmuv3_map_event_id(armpmu, event);
 
+	if (has_branch_stack(event) && !armv8pmu_branch_valid(event))
+		return -EOPNOTSUPP;
+
 	/*
 	 * CHAIN events only work when paired with an adjacent counter, and it
 	 * never makes sense for a user to open one in isolation, as they'll be
@@ -1135,6 +1137,21 @@  static void __armv8pmu_probe_pmu(void *info)
 		cpu_pmu->reg_pmmir = read_pmmir();
 	else
 		cpu_pmu->reg_pmmir = 0;
+	armv8pmu_branch_probe(cpu_pmu);
+}
+
+static int branch_records_alloc(struct arm_pmu *armpmu)
+{
+	struct pmu_hw_events *events;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		events = per_cpu_ptr(armpmu->hw_events, cpu);
+		events->branches = kzalloc(sizeof(struct branch_records), GFP_KERNEL);
+		if (!events->branches)
+			return -ENOMEM;
+	}
+	return 0;
 }
 
 static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
@@ -1145,12 +1162,24 @@  static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
 	};
 	int ret;
 
+	ret = armv8pmu_private_alloc(cpu_pmu);
+	if (ret)
+		return ret;
+
 	ret = smp_call_function_any(&cpu_pmu->supported_cpus,
 				    __armv8pmu_probe_pmu,
 				    &probe, 1);
 	if (ret)
 		return ret;
 
+	if (arm_pmu_branch_stack_supported(cpu_pmu)) {
+		ret = branch_records_alloc(cpu_pmu);
+		if (ret)
+			return ret;
+	} else {
+		armv8pmu_private_free(cpu_pmu);
+	}
+
 	return probe.present ? 0 : -ENODEV;
 }
 
@@ -1214,6 +1243,7 @@  static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
 	cpu_pmu->set_event_filter	= armv8pmu_set_event_filter;
 
 	cpu_pmu->pmu.event_idx		= armv8pmu_user_event_idx;
+	cpu_pmu->sched_task		= armv8pmu_sched_task;
 
 	cpu_pmu->name			= name;
 	cpu_pmu->map_event		= map_event;