[V11,06/10] arm64/perf: Enable branch stack events via FEAT_BRBE

Message ID 20230531040428.501523-7-anshuman.khandual@arm.com
State New
Series arm64/perf: Enable branch stack sampling

Commit Message

Anshuman Khandual May 31, 2023, 4:04 a.m. UTC
  This enables branch stack sampling events in the ARMv8 PMU, via the
architecture feature FEAT_BRBE aka the Branch Record Buffer Extension. This
defines the required branch helper functions armv8pmu_branch_xxxxx() and the
implementation here is wrapped with a new config option CONFIG_ARM64_BRBE.

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Tested-by: James Clark <james.clark@arm.com>
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
 arch/arm64/include/asm/perf_event.h |  11 +
 drivers/perf/Kconfig                |  11 +
 drivers/perf/Makefile               |   1 +
 drivers/perf/arm_brbe.c             | 571 ++++++++++++++++++++++++++++
 drivers/perf/arm_brbe.h             | 257 +++++++++++++
 drivers/perf/arm_pmuv3.c            |  21 +-
 6 files changed, 869 insertions(+), 3 deletions(-)
 create mode 100644 drivers/perf/arm_brbe.c
 create mode 100644 drivers/perf/arm_brbe.h
  

Comments

Namhyung Kim June 2, 2023, 1:45 a.m. UTC | #1
Hello,

On Tue, May 30, 2023 at 9:21 PM Anshuman Khandual
<anshuman.khandual@arm.com> wrote:
>
> This enables branch stack sampling events in the ARMv8 PMU, via the
> architecture feature FEAT_BRBE aka the Branch Record Buffer Extension. This
> defines the required branch helper functions armv8pmu_branch_xxxxx() and the
> implementation here is wrapped with a new config option CONFIG_ARM64_BRBE.
>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: linux-arm-kernel@lists.infradead.org
> Cc: linux-kernel@vger.kernel.org
> Tested-by: James Clark <james.clark@arm.com>
> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> ---

[SNIP]
> +void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
> +{
> +       struct brbe_hw_attr *brbe_attr = (struct brbe_hw_attr *)cpuc->percpu_pmu->private;
> +       u64 brbfcr, brbcr;
> +       int idx, loop1_idx1, loop1_idx2, loop2_idx1, loop2_idx2, count;
> +
> +       brbcr = read_sysreg_s(SYS_BRBCR_EL1);
> +       brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
> +
> +       /* Ensure pause on PMU interrupt is enabled */
> +       WARN_ON_ONCE(!(brbcr & BRBCR_EL1_FZP));
> +
> +       /* Pause the buffer */
> +       write_sysreg_s(brbfcr | BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
> +       isb();
> +
> +       /* Determine the indices for each loop */
> +       loop1_idx1 = BRBE_BANK0_IDX_MIN;
> +       if (brbe_attr->brbe_nr <= BRBE_BANK_MAX_ENTRIES) {
> +               loop1_idx2 = brbe_attr->brbe_nr - 1;
> +               loop2_idx1 = BRBE_BANK1_IDX_MIN;
> +               loop2_idx2 = BRBE_BANK0_IDX_MAX;

Is this to disable bank 1?  Maybe it needs a comment.


> +       } else {
> +               loop1_idx2 = BRBE_BANK0_IDX_MAX;
> +               loop2_idx1 = BRBE_BANK1_IDX_MIN;
> +               loop2_idx2 = brbe_attr->brbe_nr - 1;
> +       }

The loop2_idx1 is the same for both cases.  Maybe better
to move it out of the if statement.

Thanks,
Namhyung


> +
> +       /* Loop through bank 0 */
> +       select_brbe_bank(BRBE_BANK_IDX_0);
> +       for (idx = 0, count = loop1_idx1; count <= loop1_idx2; idx++, count++) {
> +               if (!capture_branch_entry(cpuc, event, idx))
> +                       goto skip_bank_1;
> +       }
> +
> +       /* Loop through bank 1 */
> +       select_brbe_bank(BRBE_BANK_IDX_1);
> +       for (count = loop2_idx1; count <= loop2_idx2; idx++, count++) {
> +               if (!capture_branch_entry(cpuc, event, idx))
> +                       break;
> +       }
> +
> +skip_bank_1:
> +       cpuc->branches->branch_stack.nr = idx;
> +       cpuc->branches->branch_stack.hw_idx = -1ULL;
> +       process_branch_aborts(cpuc);
> +
> +       /* Unpause the buffer */
> +       write_sysreg_s(brbfcr & ~BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
> +       isb();
> +       armv8pmu_branch_reset();
> +}
  
Anshuman Khandual June 5, 2023, 3 a.m. UTC | #2
On 6/2/23 07:15, Namhyung Kim wrote:
> Hello,
> 
> On Tue, May 30, 2023 at 9:21 PM Anshuman Khandual
> <anshuman.khandual@arm.com> wrote:
>>
>> This enables branch stack sampling events in the ARMv8 PMU, via the
>> architecture feature FEAT_BRBE aka the Branch Record Buffer Extension. This
>> defines the required branch helper functions armv8pmu_branch_xxxxx() and the
>> implementation here is wrapped with a new config option CONFIG_ARM64_BRBE.
>>
>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>> Cc: Will Deacon <will@kernel.org>
>> Cc: Mark Rutland <mark.rutland@arm.com>
>> Cc: linux-arm-kernel@lists.infradead.org
>> Cc: linux-kernel@vger.kernel.org
>> Tested-by: James Clark <james.clark@arm.com>
>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>> ---
> 
> [SNIP]
>> +void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
>> +{
>> +       struct brbe_hw_attr *brbe_attr = (struct brbe_hw_attr *)cpuc->percpu_pmu->private;
>> +       u64 brbfcr, brbcr;
>> +       int idx, loop1_idx1, loop1_idx2, loop2_idx1, loop2_idx2, count;
>> +
>> +       brbcr = read_sysreg_s(SYS_BRBCR_EL1);
>> +       brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
>> +
>> +       /* Ensure pause on PMU interrupt is enabled */
>> +       WARN_ON_ONCE(!(brbcr & BRBCR_EL1_FZP));
>> +
>> +       /* Pause the buffer */
>> +       write_sysreg_s(brbfcr | BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
>> +       isb();
>> +
>> +       /* Determine the indices for each loop */
>> +       loop1_idx1 = BRBE_BANK0_IDX_MIN;
>> +       if (brbe_attr->brbe_nr <= BRBE_BANK_MAX_ENTRIES) {
>> +               loop1_idx2 = brbe_attr->brbe_nr - 1;
>> +               loop2_idx1 = BRBE_BANK1_IDX_MIN;
>> +               loop2_idx2 = BRBE_BANK0_IDX_MAX;
> 
> Is this to disable bank 1?  Maybe it needs a comment.

Sure, will add a comment.

> 
> 
>> +       } else {
>> +               loop1_idx2 = BRBE_BANK0_IDX_MAX;
>> +               loop2_idx1 = BRBE_BANK1_IDX_MIN;
>> +               loop2_idx2 = brbe_attr->brbe_nr - 1;
>> +       }
> 
> The loop2_idx1 is the same for both cases.  Maybe better
> to move it out of the if statement.

Sure, will do the following change as suggested, but wondering whether the
change should be implemented from this patch onwards or in the later patch
that adds capture_brbe_regset().
 
--- a/drivers/perf/arm_brbe.c
+++ b/drivers/perf/arm_brbe.c
@@ -56,13 +56,14 @@ static int capture_brbe_regset(struct brbe_hw_attr *brbe_attr, struct brbe_regse
        int idx, count;
 
        loop1_idx1 = BRBE_BANK0_IDX_MIN;
+       loop2_idx1 = BRBE_BANK1_IDX_MIN;
        if (brbe_attr->brbe_nr <= BRBE_BANK_MAX_ENTRIES) {
                loop1_idx2 = brbe_attr->brbe_nr - 1;
-               loop2_idx1 = BRBE_BANK1_IDX_MIN;
+
+               /* Disable capturing bank 1 */
                loop2_idx2 = BRBE_BANK0_IDX_MAX;
        } else {
                loop1_idx2 = BRBE_BANK0_IDX_MAX;
-               loop2_idx1 = BRBE_BANK1_IDX_MIN;
                loop2_idx2 = brbe_attr->brbe_nr - 1;
        }
  
Mark Rutland June 5, 2023, 1:43 p.m. UTC | #3
On Wed, May 31, 2023 at 09:34:24AM +0530, Anshuman Khandual wrote:
> This enables branch stack sampling events in the ARMv8 PMU, via the
> architecture feature FEAT_BRBE aka the Branch Record Buffer Extension. This
> defines the required branch helper functions armv8pmu_branch_xxxxx() and the
> implementation here is wrapped with a new config option CONFIG_ARM64_BRBE.

[...]

> +int armv8pmu_private_alloc(struct arm_pmu *arm_pmu)
> +{
> +	struct brbe_hw_attr *brbe_attr = kzalloc(sizeof(struct brbe_hw_attr), GFP_KERNEL);
> +
> +	if (!brbe_attr)
> +		return -ENOMEM;
> +
> +	arm_pmu->private = brbe_attr;
> +	return 0;
> +}
> +
> +void armv8pmu_private_free(struct arm_pmu *arm_pmu)
> +{
> +	kfree(arm_pmu->private);
> +}

As on the previous patch, I think these should go for now.

[...]

> +static int brbe_attributes_probe(struct arm_pmu *armpmu, u32 brbe)
> +{
> +	struct brbe_hw_attr *brbe_attr = (struct brbe_hw_attr *)armpmu->private;
> +	u64 brbidr = read_sysreg_s(SYS_BRBIDR0_EL1);
> +
> +	brbe_attr->brbe_version = brbe;
> +	brbe_attr->brbe_format = brbe_get_format(brbidr);
> +	brbe_attr->brbe_cc = brbe_get_cc_bits(brbidr);
> +	brbe_attr->brbe_nr = brbe_get_numrec(brbidr);

I think we can store the BRBIDR0_EL1 value directly in arm_pmu as a single
value, and extract the fields as required, like we do for PMMIR.
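
A minimal sketch of that approach (the reg_brbidr field name and the
generated BRBIDR0_EL1 field macros are assumptions here, mirroring how
reg_pmmir is handled):

| static int brbe_num_branch_records(const struct arm_pmu *armpmu)
| {
| 	/* Decode BRBIDR0_EL1.NUMREC on demand instead of caching it */
| 	return SYS_FIELD_GET(BRBIDR0_EL1, NUMREC, armpmu->reg_brbidr);
| }
|
| static int brbe_attributes_probe(struct arm_pmu *armpmu, u32 brbe)
| {
| 	/* Stash the whole ID register; extract fields as required */
| 	armpmu->reg_brbidr = read_sysreg_s(SYS_BRBIDR0_EL1);
| 	return 0;
| }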

[...]

> +static u64 branch_type_to_brbcr(int branch_type)
> +{
> +	u64 brbcr = BRBCR_EL1_DEFAULT_TS;
> +
> +	/*
> +	 * BRBE need not be paused on PMU interrupt while tracing only
> +	 * the user space, bcause it will automatically be inside the
> +	 * prohibited region. But even after PMU overflow occurs, the
> +	 * interrupt could still take much more cycles, before it can
> +	 * be taken and by that time BRBE will have been overwritten.
> +	 * Let's enable pause on PMU interrupt mechanism even for user
> +	 * only traces.
> +	 */
> +	brbcr |= BRBCR_EL1_FZP;

I think this is trying to say that we *should* use FZP when sampling the
kernel (due to IRQ latency), and *can* safely use it when sampling userspace,
so it would be good to explain it that way around.

It's a bit unfortunate, because where this matters we'll always be losing some
branches either way, but I guess we don't have much say in the matter.

[...]

> +/*
> + * A branch record with BRBINFx_EL1.LASTFAILED set implies that all
> + * preceding consecutive branch records that were in a transaction
> + * (i.e. their BRBINFx_EL1.TX set) have been aborted.
> + *
> + * Similarly, BRBFCR_EL1.LASTFAILED set indicates that all preceding
> + * consecutive branch records up to the last record, which were in a
> + * transaction (i.e. their BRBINFx_EL1.TX set), have been aborted.
> + *
> + * --------------------------------- -------------------
> + * | 00 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX success]
> + * --------------------------------- -------------------
> + * | 01 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX success]
> + * --------------------------------- -------------------
> + * | 02 | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 0 |
> + * --------------------------------- -------------------
> + * | 03 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
> + * --------------------------------- -------------------
> + * | 04 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
> + * --------------------------------- -------------------
> + * | 05 | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 1 |
> + * --------------------------------- -------------------
> + * | .. | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 0 |
> + * --------------------------------- -------------------
> + * | 61 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
> + * --------------------------------- -------------------
> + * | 62 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
> + * --------------------------------- -------------------
> + * | 63 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
> + * --------------------------------- -------------------
> + *
> + * BRBFCR_EL1.LASTFAILED == 1
> + *
> + * BRBFCR_EL1.LASTFAILED fails all those consecutive in-transaction
> + * branch records near the end of the BRBE buffer.
> + *
> + * The architecture does not guarantee a non-transaction (TX = 0) branch
> + * record between two different transactions. So it is possible that
> + * a subsequent lastfailed record (TX = 0, LF = 1) might erroneously
> + * mark more transactions than required as aborted.
> + */

Linux doesn't currently support TME (and IIUC no-one has built it), so can't we
delete the transaction handling for now? We can add a comment with something like:

/*
 * TODO: add transaction handling for TME.
 */

Assuming no-one has built TME, we might also be able to get an architectural
fix to disambiguate the boundary between two transactions, and avoid the
problem described above.

[...]

> +void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
> +{
> +	struct brbe_hw_attr *brbe_attr = (struct brbe_hw_attr *)cpuc->percpu_pmu->private;
> +	u64 brbfcr, brbcr;
> +	int idx, loop1_idx1, loop1_idx2, loop2_idx1, loop2_idx2, count;
> +
> +	brbcr = read_sysreg_s(SYS_BRBCR_EL1);
> +	brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
> +
> +	/* Ensure pause on PMU interrupt is enabled */
> +	WARN_ON_ONCE(!(brbcr & BRBCR_EL1_FZP));
> +
> +	/* Pause the buffer */
> +	write_sysreg_s(brbfcr | BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
> +	isb();
> +
> +	/* Determine the indices for each loop */
> +	loop1_idx1 = BRBE_BANK0_IDX_MIN;
> +	if (brbe_attr->brbe_nr <= BRBE_BANK_MAX_ENTRIES) {
> +		loop1_idx2 = brbe_attr->brbe_nr - 1;
> +		loop2_idx1 = BRBE_BANK1_IDX_MIN;
> +		loop2_idx2 = BRBE_BANK0_IDX_MAX;
> +	} else {
> +		loop1_idx2 = BRBE_BANK0_IDX_MAX;
> +		loop2_idx1 = BRBE_BANK1_IDX_MIN;
> +		loop2_idx2 = brbe_attr->brbe_nr - 1;
> +	}
> +
> +	/* Loop through bank 0 */
> +	select_brbe_bank(BRBE_BANK_IDX_0);
> +	for (idx = 0, count = loop1_idx1; count <= loop1_idx2; idx++, count++) {
> +		if (!capture_branch_entry(cpuc, event, idx))
> +			goto skip_bank_1;
> +	}
> +
> +	/* Loop through bank 1 */
> +	select_brbe_bank(BRBE_BANK_IDX_1);
> +	for (count = loop2_idx1; count <= loop2_idx2; idx++, count++) {
> +		if (!capture_branch_entry(cpuc, event, idx))
> +			break;
> +	}
> +
> +skip_bank_1:
> +	cpuc->branches->branch_stack.nr = idx;
> +	cpuc->branches->branch_stack.hw_idx = -1ULL;
> +	process_branch_aborts(cpuc);
> +
> +	/* Unpause the buffer */
> +	write_sysreg_s(brbfcr & ~BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
> +	isb();
> +	armv8pmu_branch_reset();
> +}

The loop indices are rather difficult to follow, and I think those can be made
quite a lot simpler if split out, e.g.

| int __armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
| {
| 	struct brbe_hw_attr *brbe_attr = (struct brbe_hw_attr *)cpuc->percpu_pmu->private;
| 	int nr_hw_entries = brbe_attr->brbe_nr;
| 	int idx;
| 
| 	select_brbe_bank(BRBE_BANK_IDX_0);
| 	while (idx < nr_hw_entries && idx < BRBE_BANK0_IDX_MAX) {
| 		if (!capture_branch_entry(cpuc, event, idx))
| 			return idx;
| 		idx++;
| 	}
| 
| 	select_brbe_bank(BRBE_BANK_IDX_1);
| 	while (idx < nr_hw_entries && idx < BRBE_BANK1_IDX_MAX) {
| 		if (!capture_branch_entry(cpuc, event, idx))
| 			return idx;
| 		idx++;
| 	}
| 
| 	return idx;
| }
| 
| void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
| {
| 	u64 brbfcr, brbcr;
| 	int nr;
| 
| 	brbcr = read_sysreg_s(SYS_BRBCR_EL1);
| 	brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
| 
| 	/* Ensure pause on PMU interrupt is enabled */
| 	WARN_ON_ONCE(!(brbcr & BRBCR_EL1_FZP));
| 
| 	/* Pause the buffer */
| 	write_sysreg_s(brbfcr | BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
| 	isb();
| 
| 	nr = __armv8pmu_branch_read(cpuc, event);
| 
| 	cpuc->branches->branch_stack.nr = nr;
| 	cpuc->branches->branch_stack.hw_idx = -1ULL;
| 	process_branch_aborts(cpuc);
| 
| 	/* Unpause the buffer */
| 	write_sysreg_s(brbfcr & ~BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
| 	isb();
| 	armv8pmu_branch_reset();
| }
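
For reference, the select_brbe_bank() helper used above just moves the
register window by programming BRBFCR_EL1.BANK; roughly as defined earlier
in this series (sketched here for context, relying on the BRBFCR_EL1 field
macros that the series adds):

| static void select_brbe_bank(int bank)
| {
| 	u64 brbfcr;
|
| 	WARN_ON(bank > BRBE_BANK_IDX_1);
| 	brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
| 	brbfcr &= ~BRBFCR_EL1_BANK_MASK;
| 	/* Each bank exposes one window of BRBSRC/BRBTGT/BRBINF registers */
| 	brbfcr |= SYS_FIELD_PREP(BRBFCR_EL1, BANK, bank);
| 	write_sysreg_s(brbfcr, SYS_BRBFCR_EL1);
| 	isb();
| }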

Looking at <linux/perf_event.h> I see:

| /*
|  * branch stack layout:
|  *  nr: number of taken branches stored in entries[]
|  *  hw_idx: The low level index of raw branch records
|  *          for the most recent branch.
|  *          -1ULL means invalid/unknown.
|  *
|  * Note that nr can vary from sample to sample
|  * branches (to, from) are stored from most recent
|  * to least recent, i.e., entries[0] contains the most
|  * recent branch.
|  * The entries[] is an abstraction of raw branch records,
|  * which may not be stored in age order in HW, e.g. Intel LBR.
|  * The hw_idx is to expose the low level index of raw
|  * branch record for the most recent branch aka entries[0].
|  * The hw_idx index is between -1 (unknown) and max depth,
|  * which can be retrieved in /sys/devices/cpu/caps/branches.
|  * For the architectures whose raw branch records are
|  * already stored in age order, the hw_idx should be 0.
|  */
| struct perf_branch_stack {
|         __u64                           nr;  
|         __u64                           hw_idx;
|         struct perf_branch_entry        entries[];
| };

... which seems to indicate we should be setting hw_idx to 0, since IIUC our
records are in age order.

[...]

> @@ -1142,14 +1146,25 @@ static void __armv8pmu_probe_pmu(void *info)
>  
>  static int branch_records_alloc(struct arm_pmu *armpmu)
>  {
> +	struct branch_records __percpu *tmp_alloc_ptr;
> +	struct branch_records *records;
>  	struct pmu_hw_events *events;
>  	int cpu;
>  
> +	tmp_alloc_ptr = alloc_percpu_gfp(struct branch_records, GFP_KERNEL);
> +	if (!tmp_alloc_ptr)
> +		return -ENOMEM;
> +
> +	/*
> +	 * FIXME: Memory allocated via tmp_alloc_ptr gets completely
> +	 * consumed here, and never needs to be freed up later. Hence
> +	 * losing access to the on-stack 'tmp_alloc_ptr' is acceptable.
> +	 * Otherwise this alloc handle would have to be saved somewhere.
> +	 */
>  	for_each_possible_cpu(cpu) {
>  		events = per_cpu_ptr(armpmu->hw_events, cpu);
> -		events->branches = kzalloc(sizeof(struct branch_records), GFP_KERNEL);
> -		if (!events->branches)
> -			return -ENOMEM;
> +		records = per_cpu_ptr(tmp_alloc_ptr, cpu);
> +		events->branches = records;
>  	}
>  	return 0;
>  }

As on a prior patch, I think either this should be the approach from the start,
or we should have cleanup for the kzalloc, and either way this should not be a
part of this patch.

If you use the approach in this patch, please rename "tmp_alloc_ptr" for
clarity, and move the temporaries into the loop, e.g.

| static int branch_records_alloc(struct arm_pmu *armpmu)
| {
| 	struct branch_records __percpu *records;
| 	int cpu;
| 
| 	records = alloc_percpu_gfp(struct branch_records, GFP_KERNEL);
| 	if (!records)
| 		return -ENOMEM;
| 
| 	for_each_possible_cpu(cpu) {
| 		struct pmu_hw_events *events_cpu;
| 		struct branch_records *records_cpu;
| 
| 		events_cpu = per_cpu_ptr(armpmu->hw_events, cpu);
| 		records_cpu = per_cpu_ptr(records, cpu);
| 		events_cpu->branches = records_cpu;
| 	}
|	
| 	return 0;
| }
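
Alternatively, if the per-CPU kzalloc() approach were kept, the missing
cleanup might look something like this (the branch_records_free() helper is
an assumption, not part of the posted series):

| static void branch_records_free(struct arm_pmu *armpmu)
| {
| 	int cpu;
|
| 	for_each_possible_cpu(cpu) {
| 		struct pmu_hw_events *events = per_cpu_ptr(armpmu->hw_events, cpu);
|
| 		/* kfree(NULL) is safe for CPUs we never reached */
| 		kfree(events->branches);
| 		events->branches = NULL;
| 	}
| }
|
| static int branch_records_alloc(struct arm_pmu *armpmu)
| {
| 	int cpu;
|
| 	for_each_possible_cpu(cpu) {
| 		struct pmu_hw_events *events = per_cpu_ptr(armpmu->hw_events, cpu);
|
| 		events->branches = kzalloc(sizeof(struct branch_records), GFP_KERNEL);
| 		if (!events->branches) {
| 			/* Unwind the allocations made so far */
| 			branch_records_free(armpmu);
| 			return -ENOMEM;
| 		}
| 	}
| 	return 0;
| }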

Thanks,
Mark.
  
Anshuman Khandual June 9, 2023, 4:30 a.m. UTC | #4
On 6/5/23 19:13, Mark Rutland wrote:
>> +static u64 branch_type_to_brbcr(int branch_type)
>> +{
>> +	u64 brbcr = BRBCR_EL1_DEFAULT_TS;
>> +
>> +	/*
>> +	 * BRBE need not be paused on PMU interrupt while tracing only
>> +	 * the user space, because it will automatically be inside the
>> +	 * prohibited region. But even after PMU overflow occurs, the
>> +	 * interrupt could still take much more cycles, before it can
>> +	 * be taken and by that time BRBE will have been overwritten.
>> +	 * Let's enable pause on PMU interrupt mechanism even for user
>> +	 * only traces.
>> +	 */
>> +	brbcr |= BRBCR_EL1_FZP;
> I think this is trying to say that we *should* use FZP when sampling the
> kernel (due to IRQ latency), and *can* safely use it when sampling userspace,
> so it would be good to explain it that way around.

Agreed, the following updated comment explains why we should enable FZP
when sampling the kernel, as otherwise BRBE will capture unwanted records.
It also explains why we should enable FZP even when sampling only user
space, due to IRQ latency.

        /*
         * BRBE should be paused on PMU interrupt while tracing kernel
         * space to stop capturing further branch records. Otherwise
         * interrupt handler branch records might get into the samples
         * which is not desired.
         *
         * BRBE need not be paused on PMU interrupt while tracing only
         * the user space, because it will automatically be inside the
         * prohibited region. But even after PMU overflow occurs, the
         * interrupt could still take much more cycles, before it can
         * be taken and by that time BRBE will have been overwritten.
         * Hence enable pause on PMU interrupt mechanism even for user
         * only traces as well.
         */
        brbcr |= BRBCR_EL1_FZP;

> 
> It's a bit unfortunate, because where this matters we'll always be losing some
> branches either way, but I guess we don't have much say in the matter.
  
Anshuman Khandual June 9, 2023, 4:47 a.m. UTC | #5
On 6/5/23 19:13, Mark Rutland wrote:
>> +/*
>> + * A branch record with BRBINFx_EL1.LASTFAILED set implies that all
>> + * preceding consecutive branch records that were in a transaction
>> + * (i.e. their BRBINFx_EL1.TX set) have been aborted.
>> + *
>> + * Similarly, BRBFCR_EL1.LASTFAILED set indicates that all preceding
>> + * consecutive branch records up to the last record, which were in a
>> + * transaction (i.e. their BRBINFx_EL1.TX set), have been aborted.
>> + *
>> + * --------------------------------- -------------------
>> + * | 00 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX success]
>> + * --------------------------------- -------------------
>> + * | 01 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX success]
>> + * --------------------------------- -------------------
>> + * | 02 | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 0 |
>> + * --------------------------------- -------------------
>> + * | 03 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
>> + * --------------------------------- -------------------
>> + * | 04 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
>> + * --------------------------------- -------------------
>> + * | 05 | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 1 |
>> + * --------------------------------- -------------------
>> + * | .. | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 0 |
>> + * --------------------------------- -------------------
>> + * | 61 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
>> + * --------------------------------- -------------------
>> + * | 62 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
>> + * --------------------------------- -------------------
>> + * | 63 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
>> + * --------------------------------- -------------------
>> + *
>> + * BRBFCR_EL1.LASTFAILED == 1
>> + *
>> + * BRBFCR_EL1.LASTFAILED fails all those consecutive in-transaction
>> + * branch records near the end of the BRBE buffer.
>> + *
>> + * The architecture does not guarantee a non-transaction (TX = 0) branch
>> + * record between two different transactions. So it is possible that
>> + * a subsequent lastfailed record (TX = 0, LF = 1) might erroneously
>> + * mark more transactions than required as aborted.
>> + */
> Linux doesn't currently support TME (and IIUC no-one has built it), so can't we
> delete the transaction handling for now? We can add a comment with something like:
> 
> /*
>  * TODO: add transaction handling for TME.
>  */
> 
> Assuming no-one has built TME, we might also be able to get an architectural
> fix to disambiguate the boundary between two transactions, and avoid the
> problem described above.
> 
> [...]
> 

Or we can leave this unchanged for now, then update it if and when the relevant
architectural fix comes in. The current TME branch record handling here is
as per the current architectural specification.
  
Anshuman Khandual June 9, 2023, 5:22 a.m. UTC | #6
[...]

On 6/5/23 19:13, Mark Rutland wrote:
>> +void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
>> +{
>> +	struct brbe_hw_attr *brbe_attr = (struct brbe_hw_attr *)cpuc->percpu_pmu->private;
>> +	u64 brbfcr, brbcr;
>> +	int idx, loop1_idx1, loop1_idx2, loop2_idx1, loop2_idx2, count;
>> +
>> +	brbcr = read_sysreg_s(SYS_BRBCR_EL1);
>> +	brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
>> +
>> +	/* Ensure pause on PMU interrupt is enabled */
>> +	WARN_ON_ONCE(!(brbcr & BRBCR_EL1_FZP));
>> +
>> +	/* Pause the buffer */
>> +	write_sysreg_s(brbfcr | BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
>> +	isb();
>> +
>> +	/* Determine the indices for each loop */
>> +	loop1_idx1 = BRBE_BANK0_IDX_MIN;
>> +	if (brbe_attr->brbe_nr <= BRBE_BANK_MAX_ENTRIES) {
>> +		loop1_idx2 = brbe_attr->brbe_nr - 1;
>> +		loop2_idx1 = BRBE_BANK1_IDX_MIN;
>> +		loop2_idx2 = BRBE_BANK0_IDX_MAX;
>> +	} else {
>> +		loop1_idx2 = BRBE_BANK0_IDX_MAX;
>> +		loop2_idx1 = BRBE_BANK1_IDX_MIN;
>> +		loop2_idx2 = brbe_attr->brbe_nr - 1;
>> +	}
>> +
>> +	/* Loop through bank 0 */
>> +	select_brbe_bank(BRBE_BANK_IDX_0);
>> +	for (idx = 0, count = loop1_idx1; count <= loop1_idx2; idx++, count++) {
>> +		if (!capture_branch_entry(cpuc, event, idx))
>> +			goto skip_bank_1;
>> +	}
>> +
>> +	/* Loop through bank 1 */
>> +	select_brbe_bank(BRBE_BANK_IDX_1);
>> +	for (count = loop2_idx1; count <= loop2_idx2; idx++, count++) {
>> +		if (!capture_branch_entry(cpuc, event, idx))
>> +			break;
>> +	}
>> +
>> +skip_bank_1:
>> +	cpuc->branches->branch_stack.nr = idx;
>> +	cpuc->branches->branch_stack.hw_idx = -1ULL;
>> +	process_branch_aborts(cpuc);
>> +
>> +	/* Unpause the buffer */
>> +	write_sysreg_s(brbfcr & ~BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
>> +	isb();
>> +	armv8pmu_branch_reset();
>> +}
> The loop indices are rather difficult to follow, and I think those can be made
> quite a lot simpler if split out, e.g.
> 
> | int __armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
> | {
> | 	struct brbe_hw_attr *brbe_attr = (struct brbe_hw_attr *)cpuc->percpu_pmu->private;
> | 	int nr_hw_entries = brbe_attr->brbe_nr;
> | 	int idx;

I guess idx needs an init to 0.

> | 
> | 	select_brbe_bank(BRBE_BANK_IDX_0);
> | 	while (idx < nr_hw_entries && idx < BRBE_BANK0_IDX_MAX) {
> | 		if (!capture_branch_entry(cpuc, event, idx))
> | 			return idx;
> | 		idx++;
> | 	}
> | 
> | 	select_brbe_bank(BRBE_BANK_IDX_1);
> | 	while (idx < nr_hw_entries && idx < BRBE_BANK1_IDX_MAX) {
> | 		if (!capture_branch_entry(cpuc, event, idx))
> | 			return idx;
> | 		idx++;
> | 	}
> | 
> | 	return idx;
> | }

These loops are better than the proposed one with indices, will update.

> | 
> | void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
> | {
> | 	u64 brbfcr, brbcr;
> | 	int nr;
> | 
> | 	brbcr = read_sysreg_s(SYS_BRBCR_EL1);
> | 	brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
> | 
> | 	/* Ensure pause on PMU interrupt is enabled */
> | 	WARN_ON_ONCE(!(brbcr & BRBCR_EL1_FZP));
> | 
> | 	/* Pause the buffer */
> | 	write_sysreg_s(brbfcr | BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
> | 	isb();
> | 
> | 	nr = __armv8pmu_branch_read(cpuc, event);
> | 
> | 	cpuc->branches->branch_stack.nr = nr;
> | 	cpuc->branches->branch_stack.hw_idx = -1ULL;
> | 	process_branch_aborts(cpuc);
> | 
> | 	/* Unpause the buffer */
> | 	write_sysreg_s(brbfcr & ~BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
> | 	isb();
> | 	armv8pmu_branch_reset();
> | }
> 
> Looking at <linux/perf_event.h> I see:
> 
> | /*
> |  * branch stack layout:
> |  *  nr: number of taken branches stored in entries[]
> |  *  hw_idx: The low level index of raw branch records
> |  *          for the most recent branch.
> |  *          -1ULL means invalid/unknown.
> |  *
> |  * Note that nr can vary from sample to sample
> |  * branches (to, from) are stored from most recent
> |  * to least recent, i.e., entries[0] contains the most
> |  * recent branch.
> |  * The entries[] is an abstraction of raw branch records,
> |  * which may not be stored in age order in HW, e.g. Intel LBR.
> |  * The hw_idx is to expose the low level index of raw
> |  * branch record for the most recent branch aka entries[0].
> |  * The hw_idx index is between -1 (unknown) and max depth,
> |  * which can be retrieved in /sys/devices/cpu/caps/branches.
> |  * For the architectures whose raw branch records are
> |  * already stored in age order, the hw_idx should be 0.
> |  */
> | struct perf_branch_stack {
> |         __u64                           nr;  
> |         __u64                           hw_idx;
> |         struct perf_branch_entry        entries[];
> | };
> 
> ... which seems to indicate we should be setting hw_idx to 0, since IIUC our
> records are in age order.
Branch records are indeed in age order; sure, will change hw_idx to 0. Earlier
I figured that there was no need for hw_idx and hence marked it as -1ULL, similar
to other platforms like powerpc.
  
Mark Rutland June 9, 2023, 12:37 p.m. UTC | #7
On Fri, Jun 09, 2023 at 10:00:09AM +0530, Anshuman Khandual wrote:
> >> +static u64 branch_type_to_brbcr(int branch_type)
> >> +{
> >> +	u64 brbcr = BRBCR_EL1_DEFAULT_TS;
> >> +
> >> +	/*
> >> +	 * BRBE need not be paused on PMU interrupt while tracing only
> >> +	 * the user space, because it will automatically be inside the
> >> +	 * prohibited region. But even after PMU overflow occurs, the
> >> +	 * interrupt could still take much more cycles, before it can
> >> +	 * be taken and by that time BRBE will have been overwritten.
> >> +	 * Let's enable pause on PMU interrupt mechanism even for user
> >> +	 * only traces.
> >> +	 */
> >> +	brbcr |= BRBCR_EL1_FZP;
> > I think this is trying to say that we *should* use FZP when sampling the
> > kernel (due to IRQ latency), and *can* safely use it when sampling userspace,
> > so it would be good to explain it that way around.
> 
> Agreed, the following updated comment explains why we should enable FZP
> when sampling the kernel, as otherwise BRBE will capture unwanted records.
> It also explains why we should enable FZP even when sampling only user
> space, due to IRQ latency.
> 
>         /*
>          * BRBE should be paused on PMU interrupt while tracing kernel
>          * space to stop capturing further branch records. Otherwise
>          * interrupt handler branch records might get into the samples
>          * which is not desired.
>          *
>          * BRBE need not be paused on PMU interrupt while tracing only
>          * the user space, because it will automatically be inside the
>          * prohibited region. But even after PMU overflow occurs, the
>          * interrupt could still take much more cycles, before it can
>          * be taken and by that time BRBE will have been overwritten.
>          * Hence enable pause on PMU interrupt mechanism even for user
>          * only traces as well.
>          */
>         brbcr |= BRBCR_EL1_FZP;

Thanks; I think that's a lot clearer!

Mark.
  
Mark Rutland June 9, 2023, 12:42 p.m. UTC | #8
On Fri, Jun 09, 2023 at 10:17:19AM +0530, Anshuman Khandual wrote:
> On 6/5/23 19:13, Mark Rutland wrote:
> >> +/*
> >> + * A branch record with BRBINFx_EL1.LASTFAILED set implies that all
> >> + * preceding consecutive branch records that were in a transaction
> >> + * (i.e. their BRBINFx_EL1.TX set) have been aborted.
> >> + *
> >> + * Similarly, BRBFCR_EL1.LASTFAILED set indicates that all preceding
> >> + * consecutive branch records up to the last record, which were in a
> >> + * transaction (i.e. their BRBINFx_EL1.TX set), have been aborted.
> >> + *
> >> + * --------------------------------- -------------------
> >> + * | 00 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX success]
> >> + * --------------------------------- -------------------
> >> + * | 01 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX success]
> >> + * --------------------------------- -------------------
> >> + * | 02 | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 0 |
> >> + * --------------------------------- -------------------
> >> + * | 03 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
> >> + * --------------------------------- -------------------
> >> + * | 04 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
> >> + * --------------------------------- -------------------
> >> + * | 05 | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 1 |
> >> + * --------------------------------- -------------------
> >> + * | .. | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 0 |
> >> + * --------------------------------- -------------------
> >> + * | 61 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
> >> + * --------------------------------- -------------------
> >> + * | 62 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
> >> + * --------------------------------- -------------------
> >> + * | 63 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
> >> + * --------------------------------- -------------------
> >> + *
> >> + * BRBFCR_EL1.LASTFAILED == 1
> >> + *
> >> + * BRBFCR_EL1.LASTFAILED fails all those consecutive in-transaction
> >> + * branch records near the end of the BRBE buffer.
> >> + *
> >> + * The architecture does not guarantee a non-transaction (TX = 0) branch
> >> + * record between two different transactions. So it is possible that
> >> + * a subsequent lastfailed record (TX = 0, LF = 1) might erroneously
> >> + * mark more transactions than required as aborted.
> >> + */
> > Linux doesn't currently support TME (and IIUC no-one has built it), so can't we
> > delete the transaction handling for now? We can add a comment with something like:
> > 
> > /*
> >  * TODO: add transaction handling for TME.
> >  */
> > 
> > Assuming no-one has built TME, we might also be able to get an architectural
> > fix to disambiguate the boundary between two transactions, and avoid the
> > problem described above.
> > 
> > [...]
> > 
> 
> Or we can leave this unchanged for now, then update it if and when the relevant
> architectural fix comes in. The current TME branch record handling here is
> as per the current architectural specification.

My rationale for deleting it is that it cannot be used and cannot be tested,
since the kernel doesn't support TME, and there are no TME implementations out
there. If and when we support TME in the kernel, this is very likely to have
bit-rotted.

I'd be happy to link to the current version, e.g.

/*
 * TODO: add transaction handling for FEAT_TME. See:
 *
 * https://lore.kernel.org/linux-arm-kernel/20230531040428.501523-7-anshuman.khandual@arm.com/
 */

I do appreciate that time and effort has gone into writing this, but IMO it's
more distracting than helpful at present, and deleting it makes this easier to
review and maintain.

Thanks,
Mark.
  
Mark Rutland June 9, 2023, 12:47 p.m. UTC | #9
On Fri, Jun 09, 2023 at 10:52:37AM +0530, Anshuman Khandual wrote:
> [...]
> 
> On 6/5/23 19:13, Mark Rutland wrote:
> >> +void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
> >> +{
> >> +	struct brbe_hw_attr *brbe_attr = (struct brbe_hw_attr *)cpuc->percpu_pmu->private;
> >> +	u64 brbfcr, brbcr;
> >> +	int idx, loop1_idx1, loop1_idx2, loop2_idx1, loop2_idx2, count;
> >> +
> >> +	brbcr = read_sysreg_s(SYS_BRBCR_EL1);
> >> +	brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
> >> +
> >> +	/* Ensure pause on PMU interrupt is enabled */
> >> +	WARN_ON_ONCE(!(brbcr & BRBCR_EL1_FZP));
> >> +
> >> +	/* Pause the buffer */
> >> +	write_sysreg_s(brbfcr | BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
> >> +	isb();
> >> +
> >> +	/* Determine the indices for each loop */
> >> +	loop1_idx1 = BRBE_BANK0_IDX_MIN;
> >> +	if (brbe_attr->brbe_nr <= BRBE_BANK_MAX_ENTRIES) {
> >> +		loop1_idx2 = brbe_attr->brbe_nr - 1;
> >> +		loop2_idx1 = BRBE_BANK1_IDX_MIN;
> >> +		loop2_idx2 = BRBE_BANK0_IDX_MAX;
> >> +	} else {
> >> +		loop1_idx2 = BRBE_BANK0_IDX_MAX;
> >> +		loop2_idx1 = BRBE_BANK1_IDX_MIN;
> >> +		loop2_idx2 = brbe_attr->brbe_nr - 1;
> >> +	}
> >> +
> >> +	/* Loop through bank 0 */
> >> +	select_brbe_bank(BRBE_BANK_IDX_0);
> >> +	for (idx = 0, count = loop1_idx1; count <= loop1_idx2; idx++, count++) {
> >> +		if (!capture_branch_entry(cpuc, event, idx))
> >> +			goto skip_bank_1;
> >> +	}
> >> +
> >> +	/* Loop through bank 1 */
> >> +	select_brbe_bank(BRBE_BANK_IDX_1);
> >> +	for (count = loop2_idx1; count <= loop2_idx2; idx++, count++) {
> >> +		if (!capture_branch_entry(cpuc, event, idx))
> >> +			break;
> >> +	}
> >> +
> >> +skip_bank_1:
> >> +	cpuc->branches->branch_stack.nr = idx;
> >> +	cpuc->branches->branch_stack.hw_idx = -1ULL;
> >> +	process_branch_aborts(cpuc);
> >> +
> >> +	/* Unpause the buffer */
> >> +	write_sysreg_s(brbfcr & ~BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
> >> +	isb();
> >> +	armv8pmu_branch_reset();
> >> +}
> > The loop indices are rather difficult to follow, and I think those can be made
> > quite a lot simpler if split out, e.g.
> > 
> > | int __armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
> > | {
> > | 	struct brbe_hw_attr *brbe_attr = (struct brbe_hw_attr *)cpuc->percpu_pmu->private;
> > | 	int nr_hw_entries = brbe_attr->brbe_nr;
> > | 	int idx;
> 
> I guess idx needs an init to 0.

Yes, sorry, that should have been:

	int idx = 0;

> > | 
> > | 	select_brbe_bank(BRBE_BANK_IDX_0);
> > | 	while (idx < nr_hw_entries && idx < BRBE_BANK0_IDX_MAX) {
> > | 		if (!capture_branch_entry(cpuc, event, idx))
> > | 			return idx;
> > | 		idx++;
> > | 	}
> > | 
> > | 	select_brbe_bank(BRBE_BANK_IDX_1);
> > | 	while (idx < nr_hw_entries && idx < BRBE_BANK1_IDX_MAX) {
> > | 		if (!capture_branch_entry(cpuc, event, idx))
> > | 			return idx;
> > | 		idx++;
> > | 	}
> > | 
> > | 	return idx;
> > | }
> 
> These loops are better than the proposed one with indices, will update.

Great!

> > | 
> > | void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
> > | {
> > | 	u64 brbfcr, brbcr;
> > | 	int nr;
> > | 
> > | 	brbcr = read_sysreg_s(SYS_BRBCR_EL1);
> > | 	brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
> > | 
> > | 	/* Ensure pause on PMU interrupt is enabled */
> > | 	WARN_ON_ONCE(!(brbcr & BRBCR_EL1_FZP));
> > | 
> > | 	/* Pause the buffer */
> > | 	write_sysreg_s(brbfcr | BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
> > | 	isb();
> > | 
> > | 	nr = __armv8pmu_branch_read(cpuc, event);
> > | 
> > | 	cpuc->branches->branch_stack.nr = nr;
> > | 	cpuc->branches->branch_stack.hw_idx = -1ULL;
> > | 	process_branch_aborts(cpuc);
> > | 
> > | 	/* Unpause the buffer */
> > | 	write_sysreg_s(brbfcr & ~BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
> > | 	isb();
> > | 	armv8pmu_branch_reset();
> > | }
> > 
> > Looking at <linux/perf_event.h> I see:
> > 
> > | /*
> > |  * branch stack layout:
> > |  *  nr: number of taken branches stored in entries[]
> > |  *  hw_idx: The low level index of raw branch records
> > |  *          for the most recent branch.
> > |  *          -1ULL means invalid/unknown.
> > |  *
> > |  * Note that nr can vary from sample to sample
> > |  * branches (to, from) are stored from most recent
> > |  * to least recent, i.e., entries[0] contains the most
> > |  * recent branch.
> > |  * The entries[] is an abstraction of raw branch records,
> > |  * which may not be stored in age order in HW, e.g. Intel LBR.
> > |  * The hw_idx is to expose the low level index of raw
> > |  * branch record for the most recent branch aka entries[0].
> > |  * The hw_idx index is between -1 (unknown) and max depth,
> > |  * which can be retrieved in /sys/devices/cpu/caps/branches.
> > |  * For the architectures whose raw branch records are
> > |  * already stored in age order, the hw_idx should be 0.
> > |  */
> > | struct perf_branch_stack {
> > |         __u64                           nr;  
> > |         __u64                           hw_idx;
> > |         struct perf_branch_entry        entries[];
> > | };
> > 
> > ... which seems to indicate we should be setting hw_idx to 0, since IIUC our
> > records are in age order.
> Branch records are indeed in age order; sure, will change hw_idx to 0. Earlier
> I figured that there was no need for hw_idx and hence marked it as -1ULL, similar
> to other platforms like powerpc.

That's fair enough; looking at power_pmu_bhrb_read() in
arch/powerpc/perf/core-book3s.c, I see a comment:

	Branches are read most recent first (ie. mfbhrb 0 is
	the most recent branch).

... which suggests that should be 0 also, or that the documentation is wrong.

Do you know how the perf tool consumes this?

Thanks,
Mark.
  
Mark Rutland June 9, 2023, 1:15 p.m. UTC | #10
On Fri, Jun 09, 2023 at 01:47:18PM +0100, Mark Rutland wrote:
> On Fri, Jun 09, 2023 at 10:52:37AM +0530, Anshuman Khandual wrote:
> > On 6/5/23 19:13, Mark Rutland wrote:
> > > Looking at <linux/perf_event.h> I see:
> > > 
> > > | /*
> > > |  * branch stack layout:
> > > |  *  nr: number of taken branches stored in entries[]
> > > |  *  hw_idx: The low level index of raw branch records
> > > |  *          for the most recent branch.
> > > |  *          -1ULL means invalid/unknown.
> > > |  *
> > > |  * Note that nr can vary from sample to sample
> > > |  * branches (to, from) are stored from most recent
> > > |  * to least recent, i.e., entries[0] contains the most
> > > |  * recent branch.
> > > |  * The entries[] is an abstraction of raw branch records,
> > > |  * which may not be stored in age order in HW, e.g. Intel LBR.
> > > |  * The hw_idx is to expose the low level index of raw
> > > |  * branch record for the most recent branch aka entries[0].
> > > |  * The hw_idx index is between -1 (unknown) and max depth,
> > > |  * which can be retrieved in /sys/devices/cpu/caps/branches.
> > > |  * For the architectures whose raw branch records are
> > > |  * already stored in age order, the hw_idx should be 0.
> > > |  */
> > > | struct perf_branch_stack {
> > > |         __u64                           nr;  
> > > |         __u64                           hw_idx;
> > > |         struct perf_branch_entry        entries[];
> > > | };
> > > 
> > > ... which seems to indicate we should be setting hw_idx to 0, since IIUC our
> > > records are in age order.
> > Branch records are indeed in age order; sure, will change hw_idx to 0. Earlier
> > I figured that there was no need for hw_idx and hence marked it as -1ULL, similar
> > to other platforms like powerpc.
> 
> That's fair enough; looking at power_pmu_bhrb_read() in
> arch/powerpc/perf/core-book3s.c, I see a comment:
> 
> 	Branches are read most recent first (ie. mfbhrb 0 is
> 	the most recent branch).
> 
> ... which suggests that should be 0 also, or that the documentation is wrong.
> 
> Do you know how the perf tool consumes this?


Thinking about this some more: if what this is saying is that entries[0]
must be strictly the last branch, and we've lost branches due to interrupt
latency, then we clearly don't meet that requirement and must report -1ULL
here.

So while it'd be nice to figure this out, I'm happy using -1ULL, and would be a
bit concerned using 0.

Sorry for flip-flopping on this.

Thanks,
Mark.
  
James Clark June 9, 2023, 1:34 p.m. UTC | #11
On 09/06/2023 13:47, Mark Rutland wrote:
> On Fri, Jun 09, 2023 at 10:52:37AM +0530, Anshuman Khandual wrote:
>> [...]
>>
>> On 6/5/23 19:13, Mark Rutland wrote:
>>>> +void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
>>>> +{
>>>> +	struct brbe_hw_attr *brbe_attr = (struct brbe_hw_attr *)cpuc->percpu_pmu->private;
>>>> +	u64 brbfcr, brbcr;
>>>> +	int idx, loop1_idx1, loop1_idx2, loop2_idx1, loop2_idx2, count;
>>>> +
>>>> +	brbcr = read_sysreg_s(SYS_BRBCR_EL1);
>>>> +	brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
>>>> +
>>>> +	/* Ensure pause on PMU interrupt is enabled */
>>>> +	WARN_ON_ONCE(!(brbcr & BRBCR_EL1_FZP));
>>>> +
>>>> +	/* Pause the buffer */
>>>> +	write_sysreg_s(brbfcr | BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
>>>> +	isb();
>>>> +
>>>> +	/* Determine the indices for each loop */
>>>> +	loop1_idx1 = BRBE_BANK0_IDX_MIN;
>>>> +	if (brbe_attr->brbe_nr <= BRBE_BANK_MAX_ENTRIES) {
>>>> +		loop1_idx2 = brbe_attr->brbe_nr - 1;
>>>> +		loop2_idx1 = BRBE_BANK1_IDX_MIN;
>>>> +		loop2_idx2 = BRBE_BANK0_IDX_MAX;
>>>> +	} else {
>>>> +		loop1_idx2 = BRBE_BANK0_IDX_MAX;
>>>> +		loop2_idx1 = BRBE_BANK1_IDX_MIN;
>>>> +		loop2_idx2 = brbe_attr->brbe_nr - 1;
>>>> +	}
>>>> +
>>>> +	/* Loop through bank 0 */
>>>> +	select_brbe_bank(BRBE_BANK_IDX_0);
>>>> +	for (idx = 0, count = loop1_idx1; count <= loop1_idx2; idx++, count++) {
>>>> +		if (!capture_branch_entry(cpuc, event, idx))
>>>> +			goto skip_bank_1;
>>>> +	}
>>>> +
>>>> +	/* Loop through bank 1 */
>>>> +	select_brbe_bank(BRBE_BANK_IDX_1);
>>>> +	for (count = loop2_idx1; count <= loop2_idx2; idx++, count++) {
>>>> +		if (!capture_branch_entry(cpuc, event, idx))
>>>> +			break;
>>>> +	}
>>>> +
>>>> +skip_bank_1:
>>>> +	cpuc->branches->branch_stack.nr = idx;
>>>> +	cpuc->branches->branch_stack.hw_idx = -1ULL;
>>>> +	process_branch_aborts(cpuc);
>>>> +
>>>> +	/* Unpause the buffer */
>>>> +	write_sysreg_s(brbfcr & ~BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
>>>> +	isb();
>>>> +	armv8pmu_branch_reset();
>>>> +}
>>> The loop indices are rather difficult to follow, and I think those can be made
>>> quite a lot simpler if split out, e.g.
>>>
>>> | int __armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
>>> | {
>>> | 	struct brbe_hw_attr *brbe_attr = (struct brbe_hw_attr *)cpuc->percpu_pmu->private;
>>> | 	int nr_hw_entries = brbe_attr->brbe_nr;
>>> | 	int idx;
>>
>> I guess idx needs an init to 0.
> 
> Yes, sorry, that should have been:
> 
> 	int idx = 0;
> 
>>> | 
>>> | 	select_brbe_bank(BRBE_BANK_IDX_0);
>>> | 	while (idx < nr_hw_entries && idx < BRBE_BANK0_IDX_MAX) {
>>> | 		if (!capture_branch_entry(cpuc, event, idx))
>>> | 			return idx;
>>> | 		idx++;
>>> | 	}
>>> | 
>>> | 	select_brbe_bank(BRBE_BANK_IDX_1);
>>> | 	while (idx < nr_hw_entries && idx < BRBE_BANK1_IDX_MAX) {
>>> | 		if (!capture_branch_entry(cpuc, event, idx))
>>> | 			return idx;
>>> | 		idx++;
>>> | 	}
>>> | 
>>> | 	return idx;
>>> | }
>>
>> These loops are better than the proposed one with indices, will update.
> 
> Great!
> 
>>> | 
>>> | void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
>>> | {
>>> | 	u64 brbfcr, brbcr;
>>> | 	int nr;
>>> | 
>>> | 	brbcr = read_sysreg_s(SYS_BRBCR_EL1);
>>> | 	brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
>>> | 
>>> | 	/* Ensure pause on PMU interrupt is enabled */
>>> | 	WARN_ON_ONCE(!(brbcr & BRBCR_EL1_FZP));
>>> | 
>>> | 	/* Pause the buffer */
>>> | 	write_sysreg_s(brbfcr | BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
>>> | 	isb();
>>> | 
>>> | 	nr = __armv8pmu_branch_read(cpuc, event);
>>> | 
>>> | 	cpuc->branches->branch_stack.nr = nr;
>>> | 	cpuc->branches->branch_stack.hw_idx = -1ULL;
>>> | 	process_branch_aborts(cpuc);
>>> | 
>>> | 	/* Unpause the buffer */
>>> | 	write_sysreg_s(brbfcr & ~BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
>>> | 	isb();
>>> | 	armv8pmu_branch_reset();
>>> | }
>>>
>>> Looking at <linux/perf_event.h> I see:
>>>
>>> | /*
>>> |  * branch stack layout:
>>> |  *  nr: number of taken branches stored in entries[]
>>> |  *  hw_idx: The low level index of raw branch records
>>> |  *          for the most recent branch.
>>> |  *          -1ULL means invalid/unknown.
>>> |  *
>>> |  * Note that nr can vary from sample to sample
>>> |  * branches (to, from) are stored from most recent
>>> |  * to least recent, i.e., entries[0] contains the most
>>> |  * recent branch.
>>> |  * The entries[] is an abstraction of raw branch records,
>>> |  * which may not be stored in age order in HW, e.g. Intel LBR.
>>> |  * The hw_idx is to expose the low level index of raw
>>> |  * branch record for the most recent branch aka entries[0].
>>> |  * The hw_idx index is between -1 (unknown) and max depth,
>>> |  * which can be retrieved in /sys/devices/cpu/caps/branches.
>>> |  * For the architectures whose raw branch records are
>>> |  * already stored in age order, the hw_idx should be 0.
>>> |  */
>>> | struct perf_branch_stack {
>>> |         __u64                           nr;  
>>> |         __u64                           hw_idx;
>>> |         struct perf_branch_entry        entries[];
>>> | };
>>>
>>> ... which seems to indicate we should be setting hw_idx to 0, since IIUC our
>>> records are in age order.
>> Branch records are indeed in age order; sure, will change hw_idx to 0. Earlier
>> I figured that there was no need for hw_idx and hence marked it as -1ULL, similar
>> to other platforms like powerpc.
> 
> That's fair enough; looking at power_pmu_bhrb_read() in
> arch/powerpc/perf/core-book3s.c, I see a comment:
> 
> 	Branches are read most recent first (ie. mfbhrb 0 is
> 	the most recent branch).
> 
> ... which suggests that should be 0 also, or that the documentation is wrong.
> 
> Do you know how the perf tool consumes this?
> 
> Thanks,
> Mark.

It looks like it's a unique ID/last position updated in the LBR FIFO and
it's used to stitch callchains together when the stack depth exceeds the
buffer size. Perf takes the previous one that got filled to the limit
and the new one and stitches them together if the hw_idx matches.

There are some options in perf you need to provide to make it happen, so
I think for BRBE it doesn't matter what value is assigned to it. -1
seems to be a 'not used' value which we should probably set in case the
event is opened with PERF_SAMPLE_BRANCH_HW_INDEX.

You could also fail to open the event if PERF_SAMPLE_BRANCH_HW_INDEX is
set, and that would save writing out -1 every time for every branch
stack. Although it's not enabled by default, so maybe that's not necessary.
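
A sketch of that check, were it added to the driver's event validation (the
exact hook is an assumption; the series' armv8pmu_branch_valid() also does
branch filter validation, elided here):

| bool armv8pmu_branch_valid(struct perf_event *event)
| {
| 	u64 branch_type = event->attr.branch_sample_type;
|
| 	/*
| 	 * BRBE cannot report a meaningful hardware index, so refuse
| 	 * events that explicitly request PERF_SAMPLE_BRANCH_HW_INDEX.
| 	 */
| 	if (branch_type & PERF_SAMPLE_BRANCH_HW_INDEX)
| 		return false;
|
| 	return true;
| }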

James
  
Anshuman Khandual June 12, 2023, 8:35 a.m. UTC | #12
On 6/9/23 18:45, Mark Rutland wrote:
> On Fri, Jun 09, 2023 at 01:47:18PM +0100, Mark Rutland wrote:
>> On Fri, Jun 09, 2023 at 10:52:37AM +0530, Anshuman Khandual wrote:
>>> On 6/5/23 19:13, Mark Rutland wrote:
>>>> Looking at <linux/perf_event.h> I see:
>>>>
>>>> | /*
>>>> |  * branch stack layout:
>>>> |  *  nr: number of taken branches stored in entries[]
>>>> |  *  hw_idx: The low level index of raw branch records
>>>> |  *          for the most recent branch.
>>>> |  *          -1ULL means invalid/unknown.
>>>> |  *
>>>> |  * Note that nr can vary from sample to sample
>>>> |  * branches (to, from) are stored from most recent
>>>> |  * to least recent, i.e., entries[0] contains the most
>>>> |  * recent branch.
>>>> |  * The entries[] is an abstraction of raw branch records,
>>>> |  * which may not be stored in age order in HW, e.g. Intel LBR.
>>>> |  * The hw_idx is to expose the low level index of raw
>>>> |  * branch record for the most recent branch aka entries[0].
>>>> |  * The hw_idx index is between -1 (unknown) and max depth,
>>>> |  * which can be retrieved in /sys/devices/cpu/caps/branches.
>>>> |  * For the architectures whose raw branch records are
>>>> |  * already stored in age order, the hw_idx should be 0.
>>>> |  */
>>>> | struct perf_branch_stack {
>>>> |         __u64                           nr;  
>>>> |         __u64                           hw_idx;
>>>> |         struct perf_branch_entry        entries[];
>>>> | };
>>>>
>>>> ... which seems to indicate we should be setting hw_idx to 0, since IIUC our
>>>> records are in age order.
>>> Branch records are indeed in age order; sure, will change hw_idx to 0. Earlier
>>> I figured that there was no need for hw_idx and hence marked it as -1ULL, similar
>>> to other platforms like powerpc.
>>
>> That's fair enough; looking at power_pmu_bhrb_read() in
>> arch/powerpc/perf/core-book3s.c, I see a comment:
>>
>> 	Branches are read most recent first (ie. mfbhrb 0 is
>> 	the most recent branch).
>>
>> ... which suggests that should be 0 also, or that the documentation is wrong.
>>
>> Do you know how the perf tool consumes this?
> 
> 
> Thinking about this some more: if what this is saying is that entries[0]
> must be strictly the last branch, and we've lost branches due to interrupt
> latency, then we clearly don't meet that requirement and must report -1ULL
> here.

'last branch' means relative to the captured records, not in absolute terms.
Losing records due to interrupt latency does not change the relative age
order of the set. Hence '0' might suggest a valid relative, not absolute, age
order for the branch record set.

> 
> So while it'd be nice to figure this out, I'm happy using -1ULL, and would be a
> bit concerned using 0.

Sounds reasonable. If tools are not going to use this anyway, I guess there
is no point in suggesting that each record set has a valid age order with
subtle conditions involved.
  
Anshuman Khandual June 12, 2023, 10:12 a.m. UTC | #13
On 6/9/23 19:04, James Clark wrote:
> 
> 
> On 09/06/2023 13:47, Mark Rutland wrote:
>> On Fri, Jun 09, 2023 at 10:52:37AM +0530, Anshuman Khandual wrote:
>>> [...]
>>>
>>> On 6/5/23 19:13, Mark Rutland wrote:
>>>>> +void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
>>>>> +{
>>>>> +	struct brbe_hw_attr *brbe_attr = (struct brbe_hw_attr *)cpuc->percpu_pmu->private;
>>>>> +	u64 brbfcr, brbcr;
>>>>> +	int idx, loop1_idx1, loop1_idx2, loop2_idx1, loop2_idx2, count;
>>>>> +
>>>>> +	brbcr = read_sysreg_s(SYS_BRBCR_EL1);
>>>>> +	brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
>>>>> +
>>>>> +	/* Ensure pause on PMU interrupt is enabled */
>>>>> +	WARN_ON_ONCE(!(brbcr & BRBCR_EL1_FZP));
>>>>> +
>>>>> +	/* Pause the buffer */
>>>>> +	write_sysreg_s(brbfcr | BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
>>>>> +	isb();
>>>>> +
>>>>> +	/* Determine the indices for each loop */
>>>>> +	loop1_idx1 = BRBE_BANK0_IDX_MIN;
>>>>> +	if (brbe_attr->brbe_nr <= BRBE_BANK_MAX_ENTRIES) {
>>>>> +		loop1_idx2 = brbe_attr->brbe_nr - 1;
>>>>> +		loop2_idx1 = BRBE_BANK1_IDX_MIN;
>>>>> +		loop2_idx2 = BRBE_BANK0_IDX_MAX;
>>>>> +	} else {
>>>>> +		loop1_idx2 = BRBE_BANK0_IDX_MAX;
>>>>> +		loop2_idx1 = BRBE_BANK1_IDX_MIN;
>>>>> +		loop2_idx2 = brbe_attr->brbe_nr - 1;
>>>>> +	}
>>>>> +
>>>>> +	/* Loop through bank 0 */
>>>>> +	select_brbe_bank(BRBE_BANK_IDX_0);
>>>>> +	for (idx = 0, count = loop1_idx1; count <= loop1_idx2; idx++, count++) {
>>>>> +		if (!capture_branch_entry(cpuc, event, idx))
>>>>> +			goto skip_bank_1;
>>>>> +	}
>>>>> +
>>>>> +	/* Loop through bank 1 */
>>>>> +	select_brbe_bank(BRBE_BANK_IDX_1);
>>>>> +	for (count = loop2_idx1; count <= loop2_idx2; idx++, count++) {
>>>>> +		if (!capture_branch_entry(cpuc, event, idx))
>>>>> +			break;
>>>>> +	}
>>>>> +
>>>>> +skip_bank_1:
>>>>> +	cpuc->branches->branch_stack.nr = idx;
>>>>> +	cpuc->branches->branch_stack.hw_idx = -1ULL;
>>>>> +	process_branch_aborts(cpuc);
>>>>> +
>>>>> +	/* Unpause the buffer */
>>>>> +	write_sysreg_s(brbfcr & ~BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
>>>>> +	isb();
>>>>> +	armv8pmu_branch_reset();
>>>>> +}
>>>> The loop indices are rather difficult to follow, and I think those can be made
>>>> quite a lot simpler if split out, e.g.
>>>>
>>>> | int __armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
>>>> | {
>>>> | 	struct brbe_hw_attr *brbe_attr = (struct brbe_hw_attr *)cpuc->percpu_pmu->private;
>>>> | 	int nr_hw_entries = brbe_attr->brbe_nr;
>>>> | 	int idx;
>>>
>>> I guess idx needs an init to 0.
>>
>> Yes, sorry, that should have been:
>>
>> 	int idx = 0;
>>
>>>> | 
>>>> | 	select_brbe_bank(BRBE_BANK_IDX_0);
>>>> | 	while (idx < nr_hw_entries && idx < BRBE_BANK0_IDX_MAX) {
>>>> | 		if (!capture_branch_entry(cpuc, event, idx))
>>>> | 			return idx;
>>>> | 		idx++;
>>>> | 	}
>>>> | 
>>>> | 	select_brbe_bank(BRBE_BANK_IDX_1);
>>>> | 	while (idx < nr_hw_entries && idx < BRBE_BANK1_IDX_MAX) {
>>>> | 		if (!capture_branch_entry(cpuc, event, idx))
>>>> | 			return idx;
>>>> | 		idx++;
>>>> | 	}
>>>> | 
>>>> | 	return idx;
>>>> | }
>>>
>>> These loops are better than the proposed one with indices, will update.
>>
>> Great!
>>
>>>> | 
>>>> | void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
>>>> | {
>>>> | 	u64 brbfcr, brbcr;
>>>> | 	int nr;
>>>> | 
>>>> | 	brbcr = read_sysreg_s(SYS_BRBCR_EL1);
>>>> | 	brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
>>>> | 
>>>> | 	/* Ensure pause on PMU interrupt is enabled */
>>>> | 	WARN_ON_ONCE(!(brbcr & BRBCR_EL1_FZP));
>>>> | 
>>>> | 	/* Pause the buffer */
>>>> | 	write_sysreg_s(brbfcr | BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
>>>> | 	isb();
>>>> | 
>>>> | 	nr = __armv8pmu_branch_read(cpuc, event);
>>>> | 
>>>> | 	cpuc->branches->branch_stack.nr = nr;
>>>> | 	cpuc->branches->branch_stack.hw_idx = -1ULL;
>>>> | 	process_branch_aborts(cpuc);
>>>> | 
>>>> | 	/* Unpause the buffer */
>>>> | 	write_sysreg_s(brbfcr & ~BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
>>>> | 	isb();
>>>> | 	armv8pmu_branch_reset();
>>>> | }
>>>>
>>>> Looking at <linux/perf_event.h> I see:
>>>>
>>>> | /*
>>>> |  * branch stack layout:
>>>> |  *  nr: number of taken branches stored in entries[]
>>>> |  *  hw_idx: The low level index of raw branch records
>>>> |  *          for the most recent branch.
>>>> |  *          -1ULL means invalid/unknown.
>>>> |  *
>>>> |  * Note that nr can vary from sample to sample
>>>> |  * branches (to, from) are stored from most recent
>>>> |  * to least recent, i.e., entries[0] contains the most
>>>> |  * recent branch.
>>>> |  * The entries[] is an abstraction of raw branch records,
>>>> |  * which may not be stored in age order in HW, e.g. Intel LBR.
>>>> |  * The hw_idx is to expose the low level index of raw
>>>> |  * branch record for the most recent branch aka entries[0].
>>>> |  * The hw_idx index is between -1 (unknown) and max depth,
>>>> |  * which can be retrieved in /sys/devices/cpu/caps/branches.
>>>> |  * For the architectures whose raw branch records are
>>>> |  * already stored in age order, the hw_idx should be 0.
>>>> |  */
>>>> | struct perf_branch_stack {
>>>> |         __u64                           nr;  
>>>> |         __u64                           hw_idx;
>>>> |         struct perf_branch_entry        entries[];
>>>> | };
>>>>
>>>> ... which seems to indicate we should be setting hw_idx to 0, since IIUC our
>>>> records are in age order.
>>> Branch records are indeed in age order, sure, will change hw_idx to 0. Earlier
>>> I figured that there was no need for hw_idx and hence marked it as -1UL, similar
>>> to other platforms like powerpc.
>>
>> That's fair enough; looking at power_pmu_bhrb_read() in
>> arch/powerpc/perf/core-book3s.c, I see a comment:
>>
>> 	Branches are read most recent first (ie. mfbhrb 0 is
>> 	the most recent branch).
>>
>> ... which suggests that should be 0 also, or that the documentation is wrong.
>>
>> Do you know how the perf tool consumes this?
>>
>> Thanks,
>> Mark.
> 
> It looks like it's a unique ID/last position updated in the LBR FIFO and
> it's used to stitch callchains together when the stack depth exceeds the
> buffer size. Perf takes the previous one that got filled to the limit
> and the new one and stitches them together if the hw_idx matches.

Right.

> 
> There are some options in perf you need to provide to make it happen, so
> I think for BRBE it doesn't matter what value is assigned to it. -1
> seems to be a 'not used' value which we should probably set in case the
> event is opened with PERF_SAMPLE_BRANCH_HW_INDEX.

-1 indeed did seem like a "not used" option rather than an "unknown" option.
 
> 
> You could also fail to open the event if PERF_SAMPLE_BRANCH_HW_INDEX is
> set, and that would save writing out -1 every time for every branch
> stack. It's not enabled by default though, so maybe that's not necessary.

Yeah blocking events with PERF_SAMPLE_BRANCH_HW_INDEX is not necessary IMHO.
  

Patch

diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h
index 7548813783ba..f071d629c0cf 100644
--- a/arch/arm64/include/asm/perf_event.h
+++ b/arch/arm64/include/asm/perf_event.h
@@ -31,6 +31,16 @@  struct perf_event;
 #ifdef CONFIG_PERF_EVENTS
 static inline bool has_branch_stack(struct perf_event *event);
 
+#ifdef CONFIG_ARM64_BRBE
+void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event);
+bool armv8pmu_branch_valid(struct perf_event *event);
+void armv8pmu_branch_enable(struct perf_event *event);
+void armv8pmu_branch_disable(struct perf_event *event);
+void armv8pmu_branch_probe(struct arm_pmu *arm_pmu);
+void armv8pmu_branch_reset(void);
+int armv8pmu_private_alloc(struct arm_pmu *arm_pmu);
+void armv8pmu_private_free(struct arm_pmu *arm_pmu);
+#else
 static inline void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
 {
 	WARN_ON_ONCE(!has_branch_stack(event));
@@ -58,3 +68,4 @@  static inline int armv8pmu_private_alloc(struct arm_pmu *arm_pmu) { return 0; }
 static inline void armv8pmu_private_free(struct arm_pmu *arm_pmu) { }
 #endif
 #endif
+#endif
diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index 711f82400086..7d07aa79e5b0 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -172,6 +172,17 @@  config ARM_SPE_PMU
 	  Extension, which provides periodic sampling of operations in
 	  the CPU pipeline and reports this via the perf AUX interface.
 
+config ARM64_BRBE
+	bool "Enable support for Branch Record Buffer Extension (BRBE)"
+	depends on PERF_EVENTS && ARM64 && ARM_PMU
+	default y
+	help
+	  Enable perf support for Branch Record Buffer Extension (BRBE) which
+	  records all branches taken in an execution path. This supports
+	  filtering based on certain branch types and privilege levels. It
+	  also captures additional relevant information such as cycle count,
+	  misprediction, branch type and branch privilege level.
+
 config ARM_DMC620_PMU
 	tristate "Enable PMU support for the ARM DMC-620 memory controller"
 	depends on (ARM64 && ACPI) || COMPILE_TEST
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index dabc859540ce..29d256f2deaa 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -17,6 +17,7 @@  obj-$(CONFIG_RISCV_PMU_SBI) += riscv_pmu_sbi.o
 obj-$(CONFIG_THUNDERX2_PMU) += thunderx2_pmu.o
 obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o
 obj-$(CONFIG_ARM_SPE_PMU) += arm_spe_pmu.o
+obj-$(CONFIG_ARM64_BRBE) += arm_brbe.o
 obj-$(CONFIG_ARM_DMC620_PMU) += arm_dmc620_pmu.o
 obj-$(CONFIG_MARVELL_CN10K_TAD_PMU) += marvell_cn10k_tad_pmu.o
 obj-$(CONFIG_MARVELL_CN10K_DDR_PMU) += marvell_cn10k_ddr_pmu.o
diff --git a/drivers/perf/arm_brbe.c b/drivers/perf/arm_brbe.c
new file mode 100644
index 000000000000..34547ad750ad
--- /dev/null
+++ b/drivers/perf/arm_brbe.c
@@ -0,0 +1,571 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Branch Record Buffer Extension Driver.
+ *
+ * Copyright (C) 2022 ARM Limited
+ *
+ * Author: Anshuman Khandual <anshuman.khandual@arm.com>
+ */
+#include "arm_brbe.h"
+
+static bool valid_brbe_nr(int brbe_nr)
+{
+	return brbe_nr == BRBIDR0_EL1_NUMREC_8 ||
+	       brbe_nr == BRBIDR0_EL1_NUMREC_16 ||
+	       brbe_nr == BRBIDR0_EL1_NUMREC_32 ||
+	       brbe_nr == BRBIDR0_EL1_NUMREC_64;
+}
+
+static bool valid_brbe_cc(int brbe_cc)
+{
+	return brbe_cc == BRBIDR0_EL1_CC_20_BIT;
+}
+
+static bool valid_brbe_format(int brbe_format)
+{
+	return brbe_format == BRBIDR0_EL1_FORMAT_0;
+}
+
+static bool valid_brbe_version(int brbe_version)
+{
+	return brbe_version == ID_AA64DFR0_EL1_BRBE_IMP ||
+	       brbe_version == ID_AA64DFR0_EL1_BRBE_BRBE_V1P1;
+}
+
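+/*
+ * Select one of the two BRBE register banks. After the bank switch, the
+ * [BRBSRC, BRBTGT, BRBINF] register accesses with indices [0..31] refer
+ * to records in the selected bank; the ISB ensures the new selection is
+ * in effect before any such access.
+ */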
+static void select_brbe_bank(int bank)
+{
+	u64 brbfcr;
+
+	WARN_ON(bank > BRBE_BANK_IDX_1);
+	brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
+	brbfcr &= ~BRBFCR_EL1_BANK_MASK;
+	brbfcr |= SYS_FIELD_PREP(BRBFCR_EL1, BANK, bank);
+	write_sysreg_s(brbfcr, SYS_BRBFCR_EL1);
+	isb();
+}
+
+/*
+ * Generic perf branch filters supported on BRBE
+ *
+ * New branch filters need to be evaluated for whether they could be supported
+ * on BRBE. This ensures that such branch filters are not silently accepted,
+ * only to fail later. PERF_SAMPLE_BRANCH_HV is a special case which is
+ * supported only on platforms where the kernel runs in hyp mode.
+ */
+#define BRBE_EXCLUDE_BRANCH_FILTERS (PERF_SAMPLE_BRANCH_ABORT_TX	| \
+				     PERF_SAMPLE_BRANCH_IN_TX		| \
+				     PERF_SAMPLE_BRANCH_NO_TX		| \
+				     PERF_SAMPLE_BRANCH_CALL_STACK)
+
+#define BRBE_ALLOWED_BRANCH_FILTERS (PERF_SAMPLE_BRANCH_USER		| \
+				     PERF_SAMPLE_BRANCH_KERNEL		| \
+				     PERF_SAMPLE_BRANCH_HV		| \
+				     PERF_SAMPLE_BRANCH_ANY		| \
+				     PERF_SAMPLE_BRANCH_ANY_CALL	| \
+				     PERF_SAMPLE_BRANCH_ANY_RETURN	| \
+				     PERF_SAMPLE_BRANCH_IND_CALL	| \
+				     PERF_SAMPLE_BRANCH_COND		| \
+				     PERF_SAMPLE_BRANCH_IND_JUMP	| \
+				     PERF_SAMPLE_BRANCH_CALL		| \
+				     PERF_SAMPLE_BRANCH_NO_FLAGS	| \
+				     PERF_SAMPLE_BRANCH_NO_CYCLES	| \
+				     PERF_SAMPLE_BRANCH_TYPE_SAVE	| \
+				     PERF_SAMPLE_BRANCH_HW_INDEX	| \
+				     PERF_SAMPLE_BRANCH_PRIV_SAVE)
+
+#define BRBE_PERF_BRANCH_FILTERS    (BRBE_ALLOWED_BRANCH_FILTERS	| \
+				     BRBE_EXCLUDE_BRANCH_FILTERS)
+
+bool armv8pmu_branch_valid(struct perf_event *event)
+{
+	u64 branch_type = event->attr.branch_sample_type;
+
+	/*
+	 * Ensure both perf branch filter allowed and exclude
+	 * masks are always in sync with the generic perf ABI.
+	 */
+	BUILD_BUG_ON(BRBE_PERF_BRANCH_FILTERS != (PERF_SAMPLE_BRANCH_MAX - 1));
+
+	if (branch_type & ~BRBE_ALLOWED_BRANCH_FILTERS) {
+		pr_debug_once("requested branch filter not supported 0x%llx\n", branch_type);
+		return false;
+	}
+
+	/*
+	 * If the event does not have at least one of the privilege
+	 * branch filters as in PERF_SAMPLE_BRANCH_PLM_ALL, the core
+	 * perf will adjust its value based on perf event's existing
+	 * privilege level via attr.exclude_[user|kernel|hv].
+	 *
+	 * As event->attr.branch_sample_type might have been changed
+	 * by the time the event reaches here, it is not possible to
+	 * figure out whether the event originally requested the HV
+	 * privilege or whether it got added via core perf. Just
+	 * report this situation once and keep ignoring any further
+	 * instances.
+	 */
+	if ((branch_type & PERF_SAMPLE_BRANCH_HV) && !is_kernel_in_hyp_mode())
+		pr_debug_once("hypervisor privilege filter not supported 0x%llx\n", branch_type);
+
+	return true;
+}
+
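+/*
+ * Allocate the per arm_pmu private storage that carries the probed
+ * BRBE attributes; released again via armv8pmu_private_free().
+ */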
+int armv8pmu_private_alloc(struct arm_pmu *arm_pmu)
+{
+	struct brbe_hw_attr *brbe_attr = kzalloc(sizeof(struct brbe_hw_attr), GFP_KERNEL);
+
+	if (!brbe_attr)
+		return -ENOMEM;
+
+	arm_pmu->private = brbe_attr;
+	return 0;
+}
+
+void armv8pmu_private_free(struct arm_pmu *arm_pmu)
+{
+	kfree(arm_pmu->private);
+}
+
+static int brbe_attributes_probe(struct arm_pmu *armpmu, u32 brbe)
+{
+	struct brbe_hw_attr *brbe_attr = (struct brbe_hw_attr *)armpmu->private;
+	u64 brbidr = read_sysreg_s(SYS_BRBIDR0_EL1);
+
+	brbe_attr->brbe_version = brbe;
+	brbe_attr->brbe_format = brbe_get_format(brbidr);
+	brbe_attr->brbe_cc = brbe_get_cc_bits(brbidr);
+	brbe_attr->brbe_nr = brbe_get_numrec(brbidr);
+
+	if (!valid_brbe_version(brbe_attr->brbe_version) ||
+	    !valid_brbe_format(brbe_attr->brbe_format) ||
+	    !valid_brbe_cc(brbe_attr->brbe_cc) ||
+	    !valid_brbe_nr(brbe_attr->brbe_nr))
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
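+/*
+ * Probe the BRBE feature advertised in ID_AA64DFR0_EL1 and mark this
+ * PMU as having a branch stack only when all the probed BRBE
+ * attributes are supported by the driver.
+ */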
+void armv8pmu_branch_probe(struct arm_pmu *armpmu)
+{
+	u64 aa64dfr0 = read_sysreg_s(SYS_ID_AA64DFR0_EL1);
+	u32 brbe;
+
+	brbe = cpuid_feature_extract_unsigned_field(aa64dfr0, ID_AA64DFR0_EL1_BRBE_SHIFT);
+	if (!brbe)
+		return;
+
+	if (brbe_attributes_probe(armpmu, brbe))
+		return;
+
+	armpmu->has_branch_stack = 1;
+}
+
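+/*
+ * Map the generic perf branch sample types onto the BRBE branch filter
+ * controls in BRBFCR_EL1. PERF_SAMPLE_BRANCH_ANY simply enables all the
+ * individual filters at once.
+ */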
+static u64 branch_type_to_brbfcr(int branch_type)
+{
+	u64 brbfcr = 0;
+
+	if (branch_type & PERF_SAMPLE_BRANCH_ANY) {
+		brbfcr |= BRBFCR_EL1_BRANCH_FILTERS;
+		return brbfcr;
+	}
+
+	if (branch_type & PERF_SAMPLE_BRANCH_ANY_CALL) {
+		brbfcr |= BRBFCR_EL1_INDCALL;
+		brbfcr |= BRBFCR_EL1_DIRCALL;
+	}
+
+	if (branch_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
+		brbfcr |= BRBFCR_EL1_RTN;
+
+	if (branch_type & PERF_SAMPLE_BRANCH_IND_CALL)
+		brbfcr |= BRBFCR_EL1_INDCALL;
+
+	if (branch_type & PERF_SAMPLE_BRANCH_COND)
+		brbfcr |= BRBFCR_EL1_CONDDIR;
+
+	if (branch_type & PERF_SAMPLE_BRANCH_IND_JUMP)
+		brbfcr |= BRBFCR_EL1_INDIRECT;
+
+	if (branch_type & PERF_SAMPLE_BRANCH_CALL)
+		brbfcr |= BRBFCR_EL1_DIRCALL;
+
+	return brbfcr;
+}
+
+static u64 branch_type_to_brbcr(int branch_type)
+{
+	u64 brbcr = BRBCR_EL1_DEFAULT_TS;
+
+	/*
+	 * BRBE need not be paused on PMU interrupt while tracing only
+	 * the user space, because it will automatically be inside the
+	 * prohibited region. But even after a PMU overflow occurs, many
+	 * more cycles could elapse before the interrupt is taken, and
+	 * by that time the BRBE records will have been overwritten.
+	 * Hence enable the pause on PMU interrupt mechanism even for
+	 * user only traces.
+	 */
+	brbcr |= BRBCR_EL1_FZP;
+
+	/*
+	 * When running in the hyp mode, writing into BRBCR_EL1
+	 * actually writes into BRBCR_EL2 instead. Field E2BRE
+	 * is also at the same position as E1BRE.
+	 */
+	if (branch_type & PERF_SAMPLE_BRANCH_USER)
+		brbcr |= BRBCR_EL1_E0BRE;
+
+	if (branch_type & PERF_SAMPLE_BRANCH_KERNEL)
+		brbcr |= BRBCR_EL1_E1BRE;
+
+	if (branch_type & PERF_SAMPLE_BRANCH_HV) {
+		if (is_kernel_in_hyp_mode())
+			brbcr |= BRBCR_EL1_E1BRE;
+	}
+
+	if (!(branch_type & PERF_SAMPLE_BRANCH_NO_CYCLES))
+		brbcr |= BRBCR_EL1_CC;
+
+	if (!(branch_type & PERF_SAMPLE_BRANCH_NO_FLAGS))
+		brbcr |= BRBCR_EL1_MPRED;
+
+	/*
+	 * The exception and exception return branches could be
+	 * captured, irrespective of the perf event's privilege.
+	 * If the perf event does not have enough privilege for
+	 * a given exception level, then addresses which fall
+	 * under that exception level will be reported as zero
+	 * for the captured branch record, creating source only
+	 * or target only records.
+	 */
+	if (branch_type & PERF_SAMPLE_BRANCH_ANY) {
+		brbcr |= BRBCR_EL1_EXCEPTION;
+		brbcr |= BRBCR_EL1_ERTN;
+	}
+
+	if (branch_type & PERF_SAMPLE_BRANCH_ANY_CALL)
+		brbcr |= BRBCR_EL1_EXCEPTION;
+
+	if (branch_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
+		brbcr |= BRBCR_EL1_ERTN;
+
+	return brbcr & BRBCR_EL1_DEFAULT_CONFIG;
+}
+
+void armv8pmu_branch_enable(struct perf_event *event)
+{
+	u64 branch_type = event->attr.branch_sample_type;
+	u64 brbfcr, brbcr;
+
+	brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
+	brbfcr &= ~BRBFCR_EL1_DEFAULT_CONFIG;
+	brbfcr |= branch_type_to_brbfcr(branch_type);
+	write_sysreg_s(brbfcr, SYS_BRBFCR_EL1);
+	isb();
+
+	brbcr = read_sysreg_s(SYS_BRBCR_EL1);
+	brbcr &= ~BRBCR_EL1_DEFAULT_CONFIG;
+	brbcr |= branch_type_to_brbcr(branch_type);
+	write_sysreg_s(brbcr, SYS_BRBCR_EL1);
+	isb();
+	armv8pmu_branch_reset();
+}
+
+void armv8pmu_branch_disable(struct perf_event *event)
+{
+	u64 brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
+	u64 brbcr = read_sysreg_s(SYS_BRBCR_EL1);
+
+	brbcr &= ~(BRBCR_EL1_E0BRE | BRBCR_EL1_E1BRE);
+	brbfcr |= BRBFCR_EL1_PAUSED;
+	write_sysreg_s(brbcr, SYS_BRBCR_EL1);
+	write_sysreg_s(brbfcr, SYS_BRBFCR_EL1);
+	isb();
+}
+
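+/* Translate a captured BRBINFx_EL1.TYPE field into generic perf branch types */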
+static void brbe_set_perf_entry_type(struct perf_branch_entry *entry, u64 brbinf)
+{
+	int brbe_type = brbe_get_type(brbinf);
+
+	switch (brbe_type) {
+	case BRBINFx_EL1_TYPE_UNCOND_DIR:
+		entry->type = PERF_BR_UNCOND;
+		break;
+	case BRBINFx_EL1_TYPE_INDIR:
+		entry->type = PERF_BR_IND;
+		break;
+	case BRBINFx_EL1_TYPE_DIR_LINK:
+		entry->type = PERF_BR_CALL;
+		break;
+	case BRBINFx_EL1_TYPE_INDIR_LINK:
+		entry->type = PERF_BR_IND_CALL;
+		break;
+	case BRBINFx_EL1_TYPE_RET_SUB:
+		entry->type = PERF_BR_RET;
+		break;
+	case BRBINFx_EL1_TYPE_COND_DIR:
+		entry->type = PERF_BR_COND;
+		break;
+	case BRBINFx_EL1_TYPE_CALL:
+		entry->type = PERF_BR_CALL;
+		break;
+	case BRBINFx_EL1_TYPE_TRAP:
+		entry->type = PERF_BR_SYSCALL;
+		break;
+	case BRBINFx_EL1_TYPE_RET_EXCPT:
+		entry->type = PERF_BR_ERET;
+		break;
+	case BRBINFx_EL1_TYPE_IRQ:
+		entry->type = PERF_BR_IRQ;
+		break;
+	case BRBINFx_EL1_TYPE_DEBUG_HALT:
+		entry->type = PERF_BR_EXTEND_ABI;
+		entry->new_type = PERF_BR_ARM64_DEBUG_HALT;
+		break;
+	case BRBINFx_EL1_TYPE_SERROR:
+		entry->type = PERF_BR_SERROR;
+		break;
+	case BRBINFx_EL1_TYPE_INST_DEBUG:
+		entry->type = PERF_BR_EXTEND_ABI;
+		entry->new_type = PERF_BR_ARM64_DEBUG_INST;
+		break;
+	case BRBINFx_EL1_TYPE_DATA_DEBUG:
+		entry->type = PERF_BR_EXTEND_ABI;
+		entry->new_type = PERF_BR_ARM64_DEBUG_DATA;
+		break;
+	case BRBINFx_EL1_TYPE_ALGN_FAULT:
+		entry->type = PERF_BR_EXTEND_ABI;
+		entry->new_type = PERF_BR_NEW_FAULT_ALGN;
+		break;
+	case BRBINFx_EL1_TYPE_INST_FAULT:
+		entry->type = PERF_BR_EXTEND_ABI;
+		entry->new_type = PERF_BR_NEW_FAULT_INST;
+		break;
+	case BRBINFx_EL1_TYPE_DATA_FAULT:
+		entry->type = PERF_BR_EXTEND_ABI;
+		entry->new_type = PERF_BR_NEW_FAULT_DATA;
+		break;
+	case BRBINFx_EL1_TYPE_FIQ:
+		entry->type = PERF_BR_EXTEND_ABI;
+		entry->new_type = PERF_BR_ARM64_FIQ;
+		break;
+	case BRBINFx_EL1_TYPE_DEBUG_EXIT:
+		entry->type = PERF_BR_EXTEND_ABI;
+		entry->new_type = PERF_BR_ARM64_DEBUG_EXIT;
+		break;
+	default:
+		pr_warn_once("%d - unknown branch type captured\n", brbe_type);
+		entry->type = PERF_BR_UNKNOWN;
+		break;
+	}
+}
+
+static int brbe_get_perf_priv(u64 brbinf)
+{
+	int brbe_el = brbe_get_el(brbinf);
+
+	switch (brbe_el) {
+	case BRBINFx_EL1_EL_EL0:
+		return PERF_BR_PRIV_USER;
+	case BRBINFx_EL1_EL_EL1:
+		return PERF_BR_PRIV_KERNEL;
+	case BRBINFx_EL1_EL_EL2:
+		if (is_kernel_in_hyp_mode())
+			return PERF_BR_PRIV_KERNEL;
+		return PERF_BR_PRIV_HV;
+	default:
+		pr_warn_once("%d - unknown branch privilege captured\n", brbe_el);
+		return PERF_BR_PRIV_UNKNOWN;
+	}
+}
+
+static void capture_brbe_flags(struct perf_branch_entry *entry, struct perf_event *event,
+			       u64 brbinf)
+{
+	if (branch_sample_type(event))
+		brbe_set_perf_entry_type(entry, brbinf);
+
+	if (!branch_sample_no_cycles(event))
+		entry->cycles = brbe_get_cycles(brbinf);
+
+	if (!branch_sample_no_flags(event)) {
+		/*
+		 * BRBINFx_EL1.LASTFAILED indicates that a TME transaction failed (or
+		 * was cancelled) prior to this record, and that some number of records
+		 * prior to this one may have been generated during an attempt to
+		 * execute the transaction.
+		 *
+		 * We will remove such entries later in process_branch_aborts().
+		 */
+		entry->abort = brbe_get_lastfailed(brbinf);
+
+		/*
+		 * This information (i.e transaction state and mispredicts)
+		 * is only available for source only and complete branch
+		 * records.
+		 */
+		if (brbe_record_is_complete(brbinf) ||
+		    brbe_record_is_source_only(brbinf)) {
+			entry->mispred = brbe_get_mispredict(brbinf);
+			entry->predicted = !entry->mispred;
+			entry->in_tx = brbe_get_in_tx(brbinf);
+		}
+	}
+
+	if (branch_sample_priv(event)) {
+		/*
+		 * This information (i.e branch privilege level) is only
+		 * available for target only and complete branch records.
+		 */
+		if (brbe_record_is_complete(brbinf) ||
+		    brbe_record_is_target_only(brbinf))
+			entry->priv = brbe_get_perf_priv(brbinf);
+	}
+}
+
+/*
+ * A branch record with BRBINFx_EL1.LASTFAILED set implies that all
+ * preceding consecutive branch records that were in a transaction
+ * (i.e had their BRBINFx_EL1.TX set) have been aborted.
+ *
+ * Similarly BRBFCR_EL1.LASTFAILED set indicates that all preceding
+ * consecutive branch records up to the last record, which were in a
+ * transaction (i.e had their BRBINFx_EL1.TX set), have been aborted.
+ *
+ * --------------------------------- -------------------
+ * | 00 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX success]
+ * --------------------------------- -------------------
+ * | 01 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX success]
+ * --------------------------------- -------------------
+ * | 02 | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 0 |
+ * --------------------------------- -------------------
+ * | 03 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
+ * --------------------------------- -------------------
+ * | 04 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
+ * --------------------------------- -------------------
+ * | 05 | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 1 |
+ * --------------------------------- -------------------
+ * | .. | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 0 |
+ * --------------------------------- -------------------
+ * | 61 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
+ * --------------------------------- -------------------
+ * | 62 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
+ * --------------------------------- -------------------
+ * | 63 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
+ * --------------------------------- -------------------
+ *
+ * BRBFCR_EL1.LASTFAILED == 1
+ *
+ * BRBFCR_EL1.LASTFAILED fails all those consecutive in-transaction
+ * branch records near the end of the BRBE buffer.
+ *
+ * The architecture does not guarantee a non-transaction (TX = 0)
+ * branch record between two different transactions. So a subsequent
+ * lastfailed record (TX = 0, LF = 1) might erroneously mark more
+ * transactions as aborted than required.
+ */
+static void process_branch_aborts(struct pmu_hw_events *cpuc)
+{
+	struct brbe_hw_attr *brbe_attr = (struct brbe_hw_attr *)cpuc->percpu_pmu->private;
+	u64 brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
+	bool lastfailed = !!(brbfcr & BRBFCR_EL1_LASTFAILED);
+	int idx = brbe_attr->brbe_nr - 1;
+	struct perf_branch_entry *entry;
+
+	do {
+		entry = &cpuc->branches->branch_entries[idx];
+		if (entry->in_tx) {
+			entry->abort = lastfailed;
+		} else {
+			lastfailed = entry->abort;
+			entry->abort = false;
+		}
+	} while (idx--, idx >= 0);
+}
+
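+/*
+ * Invalidate all branch records in the buffer via the BRB IALL
+ * instruction; the ISB ensures the invalidation completes before
+ * any new records can be captured or read.
+ */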
+void armv8pmu_branch_reset(void)
+{
+	asm volatile(BRB_IALL);
+	isb();
+}
+
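+/*
+ * Capture the branch record at the given buffer index into the percpu
+ * branch entries. Returns false on the first invalid record, which
+ * marks the end of all valid records in the buffer.
+ */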
+static bool capture_branch_entry(struct pmu_hw_events *cpuc,
+				 struct perf_event *event, int idx)
+{
+	struct perf_branch_entry *entry = &cpuc->branches->branch_entries[idx];
+	u64 brbinf = get_brbinf_reg(idx);
+
+	/*
+	 * An invalid record means there are no more valid
+	 * entries in the buffer. Abort the branch record
+	 * processing to save some cycles and also reduce
+	 * the processing load for user space.
+	 */
+	if (brbe_invalid(brbinf))
+		return false;
+
+	perf_clear_branch_entry_bitfields(entry);
+	if (brbe_record_is_complete(brbinf)) {
+		entry->from = get_brbsrc_reg(idx);
+		entry->to = get_brbtgt_reg(idx);
+	} else if (brbe_record_is_source_only(brbinf)) {
+		entry->from = get_brbsrc_reg(idx);
+		entry->to = 0;
+	} else if (brbe_record_is_target_only(brbinf)) {
+		entry->from = 0;
+		entry->to = get_brbtgt_reg(idx);
+	}
+	capture_brbe_flags(entry, event, brbinf);
+	return true;
+}
+
+void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
+{
+	struct brbe_hw_attr *brbe_attr = (struct brbe_hw_attr *)cpuc->percpu_pmu->private;
+	u64 brbfcr, brbcr;
+	int idx, loop1_idx1, loop1_idx2, loop2_idx1, loop2_idx2, count;
+
+	brbcr = read_sysreg_s(SYS_BRBCR_EL1);
+	brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
+
+	/* Ensure pause on PMU interrupt is enabled */
+	WARN_ON_ONCE(!(brbcr & BRBCR_EL1_FZP));
+
+	/* Pause the buffer */
+	write_sysreg_s(brbfcr | BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
+	isb();
+
+	/*
+	 * Determine the indices for each loop. When the core implements
+	 * 32 or fewer records they all live in bank 0, and the bank 1
+	 * loop bounds are set up inverted (loop2_idx1 > loop2_idx2) so
+	 * that the second loop never executes.
+	 */
+	loop1_idx1 = BRBE_BANK0_IDX_MIN;
+	if (brbe_attr->brbe_nr <= BRBE_BANK_MAX_ENTRIES) {
+		loop1_idx2 = brbe_attr->brbe_nr - 1;
+		loop2_idx1 = BRBE_BANK1_IDX_MIN;
+		loop2_idx2 = BRBE_BANK0_IDX_MAX;
+	} else {
+		loop1_idx2 = BRBE_BANK0_IDX_MAX;
+		loop2_idx1 = BRBE_BANK1_IDX_MIN;
+		loop2_idx2 = brbe_attr->brbe_nr - 1;
+	}
+
+	/* Loop through bank 0 */
+	select_brbe_bank(BRBE_BANK_IDX_0);
+	for (idx = 0, count = loop1_idx1; count <= loop1_idx2; idx++, count++) {
+		if (!capture_branch_entry(cpuc, event, idx))
+			goto skip_bank_1;
+	}
+
+	/* Loop through bank 1 */
+	select_brbe_bank(BRBE_BANK_IDX_1);
+	for (count = loop2_idx1; count <= loop2_idx2; idx++, count++) {
+		if (!capture_branch_entry(cpuc, event, idx))
+			break;
+	}
+
+skip_bank_1:
+	cpuc->branches->branch_stack.nr = idx;
+	cpuc->branches->branch_stack.hw_idx = -1ULL;
+	process_branch_aborts(cpuc);
+
+	/* Unpause the buffer */
+	write_sysreg_s(brbfcr & ~BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
+	isb();
+	armv8pmu_branch_reset();
+}
diff --git a/drivers/perf/arm_brbe.h b/drivers/perf/arm_brbe.h
new file mode 100644
index 000000000000..a47480eec070
--- /dev/null
+++ b/drivers/perf/arm_brbe.h
@@ -0,0 +1,257 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Branch Record Buffer Extension Helpers.
+ *
+ * Copyright (C) 2022 ARM Limited
+ *
+ * Author: Anshuman Khandual <anshuman.khandual@arm.com>
+ */
+#define pr_fmt(fmt) "brbe: " fmt
+
+#include <linux/perf/arm_pmu.h>
+
+#define BRBFCR_EL1_BRANCH_FILTERS (BRBFCR_EL1_DIRECT   | \
+				   BRBFCR_EL1_INDIRECT | \
+				   BRBFCR_EL1_RTN      | \
+				   BRBFCR_EL1_INDCALL  | \
+				   BRBFCR_EL1_DIRCALL  | \
+				   BRBFCR_EL1_CONDDIR)
+
+#define BRBFCR_EL1_DEFAULT_CONFIG (BRBFCR_EL1_BANK_MASK | \
+				   BRBFCR_EL1_PAUSED    | \
+				   BRBFCR_EL1_EnI       | \
+				   BRBFCR_EL1_BRANCH_FILTERS)
+
+/*
+ * BRBTS_EL1 is currently not used for branch stack implementation
+ * purposes, but BRBCR_EL1.TS needs to have a valid value from the
+ * available options. BRBCR_EL1_TS_VIRTUAL is selected for this.
+ */
+#define BRBCR_EL1_DEFAULT_TS      FIELD_PREP(BRBCR_EL1_TS_MASK, BRBCR_EL1_TS_VIRTUAL)
+
+#define BRBCR_EL1_DEFAULT_CONFIG  (BRBCR_EL1_EXCEPTION | \
+				   BRBCR_EL1_ERTN      | \
+				   BRBCR_EL1_CC        | \
+				   BRBCR_EL1_MPRED     | \
+				   BRBCR_EL1_E1BRE     | \
+				   BRBCR_EL1_E0BRE     | \
+				   BRBCR_EL1_FZP       | \
+				   BRBCR_EL1_DEFAULT_TS)
+/*
+ * BRBE Instructions
+ *
+ * BRB_IALL : Invalidate the entire buffer
+ * BRB_INJ  : Inject latest branch record derived from [BRBSRCINJ, BRBTGTINJ, BRBINFINJ]
+ */
+#define BRB_IALL __emit_inst(0xD5000000 | sys_insn(1, 1, 7, 2, 4) | (0x1f))
+#define BRB_INJ  __emit_inst(0xD5000000 | sys_insn(1, 1, 7, 2, 5) | (0x1f))
+
+/*
+ * BRBE Buffer Organization
+ *
+ * The BRBE buffer is arranged as multiple banks of 32 branch record
+ * entries each. An individual branch record in a given bank can be
+ * accessed by selecting the bank in BRBFCR_EL1.BANK and then reading
+ * the [BRBSRC, BRBTGT, BRBINF] register set with indices [0..31].
+ *
+ * Bank 0
+ *
+ *	---------------------------------	------
+ *	| 00 | BRBSRC | BRBTGT | BRBINF |	| 00 |
+ *	---------------------------------	------
+ *	| 01 | BRBSRC | BRBTGT | BRBINF |	| 01 |
+ *	---------------------------------	------
+ *	| .. | BRBSRC | BRBTGT | BRBINF |	| .. |
+ *	---------------------------------	------
+ *	| 31 | BRBSRC | BRBTGT | BRBINF |	| 31 |
+ *	---------------------------------	------
+ *
+ * Bank 1
+ *
+ *	---------------------------------	------
+ *	| 32 | BRBSRC | BRBTGT | BRBINF |	| 00 |
+ *	---------------------------------	------
+ *	| 33 | BRBSRC | BRBTGT | BRBINF |	| 01 |
+ *	---------------------------------	------
+ *	| .. | BRBSRC | BRBTGT | BRBINF |	| .. |
+ *	---------------------------------	------
+ *	| 63 | BRBSRC | BRBTGT | BRBINF |	| 31 |
+ *	---------------------------------	------
+ */
+#define BRBE_BANK_MAX_ENTRIES 32
+
+#define BRBE_BANK0_IDX_MIN 0
+#define BRBE_BANK0_IDX_MAX 31
+#define BRBE_BANK1_IDX_MIN 32
+#define BRBE_BANK1_IDX_MAX 63
+
+struct brbe_hw_attr {
+	int	brbe_version;
+	int	brbe_cc;
+	int	brbe_nr;
+	int	brbe_format;
+};
+
+enum brbe_bank_idx {
+	BRBE_BANK_IDX_INVALID = -1,
+	BRBE_BANK_IDX_0,
+	BRBE_BANK_IDX_1,
+	BRBE_BANK_IDX_MAX
+};
+
+#define RETURN_READ_BRBSRCN(n) \
+	read_sysreg_s(SYS_BRBSRC##n##_EL1)
+
+#define RETURN_READ_BRBTGTN(n) \
+	read_sysreg_s(SYS_BRBTGT##n##_EL1)
+
+#define RETURN_READ_BRBINFN(n) \
+	read_sysreg_s(SYS_BRBINF##n##_EL1)
+
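+/*
+ * The BRBE record registers can only be accessed with compile time
+ * constant register names, hence this switch dispatches a runtime
+ * index [0..31] onto the corresponding sysreg accessor.
+ */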
+#define BRBE_REGN_CASE(n, case_macro) \
+	case n: return case_macro(n); break
+
+#define BRBE_REGN_SWITCH(x, case_macro)				\
+	do {							\
+		switch (x) {					\
+		BRBE_REGN_CASE(0, case_macro);			\
+		BRBE_REGN_CASE(1, case_macro);			\
+		BRBE_REGN_CASE(2, case_macro);			\
+		BRBE_REGN_CASE(3, case_macro);			\
+		BRBE_REGN_CASE(4, case_macro);			\
+		BRBE_REGN_CASE(5, case_macro);			\
+		BRBE_REGN_CASE(6, case_macro);			\
+		BRBE_REGN_CASE(7, case_macro);			\
+		BRBE_REGN_CASE(8, case_macro);			\
+		BRBE_REGN_CASE(9, case_macro);			\
+		BRBE_REGN_CASE(10, case_macro);			\
+		BRBE_REGN_CASE(11, case_macro);			\
+		BRBE_REGN_CASE(12, case_macro);			\
+		BRBE_REGN_CASE(13, case_macro);			\
+		BRBE_REGN_CASE(14, case_macro);			\
+		BRBE_REGN_CASE(15, case_macro);			\
+		BRBE_REGN_CASE(16, case_macro);			\
+		BRBE_REGN_CASE(17, case_macro);			\
+		BRBE_REGN_CASE(18, case_macro);			\
+		BRBE_REGN_CASE(19, case_macro);			\
+		BRBE_REGN_CASE(20, case_macro);			\
+		BRBE_REGN_CASE(21, case_macro);			\
+		BRBE_REGN_CASE(22, case_macro);			\
+		BRBE_REGN_CASE(23, case_macro);			\
+		BRBE_REGN_CASE(24, case_macro);			\
+		BRBE_REGN_CASE(25, case_macro);			\
+		BRBE_REGN_CASE(26, case_macro);			\
+		BRBE_REGN_CASE(27, case_macro);			\
+		BRBE_REGN_CASE(28, case_macro);			\
+		BRBE_REGN_CASE(29, case_macro);			\
+		BRBE_REGN_CASE(30, case_macro);			\
+		BRBE_REGN_CASE(31, case_macro);			\
+		default:					\
+			pr_warn("unknown register index\n");	\
+			return -1;				\
+		}						\
+	} while (0)
+
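+/* Convert a linear buffer index [0..63] into a within-bank index [0..31] */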
+static inline int buffer_to_brbe_idx(int buffer_idx)
+{
+	return buffer_idx % BRBE_BANK_MAX_ENTRIES;
+}
+
+static inline u64 get_brbsrc_reg(int buffer_idx)
+{
+	int brbe_idx = buffer_to_brbe_idx(buffer_idx);
+
+	BRBE_REGN_SWITCH(brbe_idx, RETURN_READ_BRBSRCN);
+}
+
+static inline u64 get_brbtgt_reg(int buffer_idx)
+{
+	int brbe_idx = buffer_to_brbe_idx(buffer_idx);
+
+	BRBE_REGN_SWITCH(brbe_idx, RETURN_READ_BRBTGTN);
+}
+
+static inline u64 get_brbinf_reg(int buffer_idx)
+{
+	int brbe_idx = buffer_to_brbe_idx(buffer_idx);
+
+	BRBE_REGN_SWITCH(brbe_idx, RETURN_READ_BRBINFN);
+}
+
+static inline u64 brbe_record_valid(u64 brbinf)
+{
+	return FIELD_GET(BRBINFx_EL1_VALID_MASK, brbinf);
+}
+
+static inline bool brbe_invalid(u64 brbinf)
+{
+	return brbe_record_valid(brbinf) == BRBINFx_EL1_VALID_NONE;
+}
+
+static inline bool brbe_record_is_complete(u64 brbinf)
+{
+	return brbe_record_valid(brbinf) == BRBINFx_EL1_VALID_FULL;
+}
+
+static inline bool brbe_record_is_source_only(u64 brbinf)
+{
+	return brbe_record_valid(brbinf) == BRBINFx_EL1_VALID_SOURCE;
+}
+
+static inline bool brbe_record_is_target_only(u64 brbinf)
+{
+	return brbe_record_valid(brbinf) == BRBINFx_EL1_VALID_TARGET;
+}
+
+static inline int brbe_get_in_tx(u64 brbinf)
+{
+	return FIELD_GET(BRBINFx_EL1_T_MASK, brbinf);
+}
+
+static inline int brbe_get_mispredict(u64 brbinf)
+{
+	return FIELD_GET(BRBINFx_EL1_MPRED_MASK, brbinf);
+}
+
+static inline int brbe_get_lastfailed(u64 brbinf)
+{
+	return FIELD_GET(BRBINFx_EL1_LASTFAILED_MASK, brbinf);
+}
+
+static inline int brbe_get_cycles(u64 brbinf)
+{
+	/*
+	 * Captured cycle count is unknown and hence
+	 * should not be passed on to the user space.
+	 */
+	if (brbinf & BRBINFx_EL1_CCU)
+		return 0;
+
+	return FIELD_GET(BRBINFx_EL1_CC_MASK, brbinf);
+}
+
+static inline int brbe_get_type(u64 brbinf)
+{
+	return FIELD_GET(BRBINFx_EL1_TYPE_MASK, brbinf);
+}
+
+static inline int brbe_get_el(u64 brbinf)
+{
+	return FIELD_GET(BRBINFx_EL1_EL_MASK, brbinf);
+}
+
+static inline int brbe_get_numrec(u64 brbidr)
+{
+	return FIELD_GET(BRBIDR0_EL1_NUMREC_MASK, brbidr);
+}
+
+static inline int brbe_get_format(u64 brbidr)
+{
+	return FIELD_GET(BRBIDR0_EL1_FORMAT_MASK, brbidr);
+}
+
+static inline int brbe_get_cc_bits(u64 brbidr)
+{
+	return FIELD_GET(BRBIDR0_EL1_CC_MASK, brbidr);
+}
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index 86d803ff1ae3..fef1bc6067cc 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -797,6 +797,10 @@  static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
 		if (!armpmu_event_set_period(event))
 			continue;
 
+		/*
+		 * PMU IRQ should remain asserted until all branch records
+		 * are captured and processed into struct perf_sample_data.
+		 */
 		if (has_branch_stack(event) && !WARN_ON(!cpuc->branches)) {
 			armv8pmu_branch_read(cpuc, event);
 			perf_sample_save_brstack(&data, event, &cpuc->branches->branch_stack);
@@ -1142,14 +1146,25 @@  static void __armv8pmu_probe_pmu(void *info)
 
 static int branch_records_alloc(struct arm_pmu *armpmu)
 {
+	struct branch_records __percpu *tmp_alloc_ptr;
+	struct branch_records *records;
 	struct pmu_hw_events *events;
 	int cpu;
 
+	tmp_alloc_ptr = alloc_percpu_gfp(struct branch_records, GFP_KERNEL);
+	if (!tmp_alloc_ptr)
+		return -ENOMEM;
+
+	/*
+	 * FIXME: Memory allocated via tmp_alloc_ptr gets completely
+	 * consumed here and is never required to be freed up later.
+	 * Hence losing access to the on stack 'tmp_alloc_ptr' is
+	 * acceptable. Otherwise this alloc handle would have to be
+	 * saved somewhere.
+	 */
 	for_each_possible_cpu(cpu) {
 		events = per_cpu_ptr(armpmu->hw_events, cpu);
-		events->branches = kzalloc(sizeof(struct branch_records), GFP_KERNEL);
-		if (!events->branches)
-			return -ENOMEM;
+		records = per_cpu_ptr(tmp_alloc_ptr, cpu);
+		events->branches = records;
 	}
 	return 0;
 }