[V7,6/6] arm64/perf: Enable branch stack events via FEAT_BRBE
Commit Message
This enables branch stack sampling events in ARMV8 PMU, via an architecture
feature FEAT_BRBE aka branch record buffer extension. This defines required
branch helper functions pmuv8pmu_branch_XXXXX() and the implementation here
is wrapped with a new config option CONFIG_ARM64_BRBE.
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
arch/arm64/Kconfig | 11 +
arch/arm64/include/asm/perf_event.h | 9 +
arch/arm64/kernel/Makefile | 1 +
arch/arm64/kernel/brbe.c | 512 ++++++++++++++++++++++++++++
arch/arm64/kernel/brbe.h | 257 ++++++++++++++
5 files changed, 790 insertions(+)
create mode 100644 arch/arm64/kernel/brbe.c
create mode 100644 arch/arm64/kernel/brbe.h
Comments
On Thu, Jan 05, 2023 at 08:40:39AM +0530, Anshuman Khandual wrote:
> This enables branch stack sampling events in ARMV8 PMU, via an architecture
> feature FEAT_BRBE aka branch record buffer extension. This defines required
> branch helper functions pmuv8pmu_branch_XXXXX() and the implementation here
> is wrapped with a new config option CONFIG_ARM64_BRBE.
>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: linux-arm-kernel@lists.infradead.org
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> ---
> arch/arm64/Kconfig | 11 +
> arch/arm64/include/asm/perf_event.h | 9 +
> arch/arm64/kernel/Makefile | 1 +
> arch/arm64/kernel/brbe.c | 512 ++++++++++++++++++++++++++++
> arch/arm64/kernel/brbe.h | 257 ++++++++++++++
> 5 files changed, 790 insertions(+)
> create mode 100644 arch/arm64/kernel/brbe.c
> create mode 100644 arch/arm64/kernel/brbe.h
>
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 03934808b2ed..915b12709a46 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -1363,6 +1363,17 @@ config HW_PERF_EVENTS
> def_bool y
> depends on ARM_PMU
>
> +config ARM64_BRBE
> + bool "Enable support for Branch Record Buffer Extension (BRBE)"
> + depends on PERF_EVENTS && ARM64 && ARM_PMU
> + default y
> + help
> + Enable perf support for Branch Record Buffer Extension (BRBE) which
> + records all branches taken in an execution path. This supports some
> + branch types and privilege based filtering. It captured additional
> + relevant information such as cycle count, misprediction and branch
> + type, branch privilege level etc.
> +
> # Supported by clang >= 7.0 or GCC >= 12.0.0
> config CC_HAVE_SHADOW_CALL_STACK
> def_bool $(cc-option, -fsanitize=shadow-call-stack -ffixed-x18)
> diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h
> index a038902d6874..cf2e88c7b707 100644
> --- a/arch/arm64/include/asm/perf_event.h
> +++ b/arch/arm64/include/asm/perf_event.h
> @@ -277,6 +277,14 @@ struct pmu_hw_events;
> struct arm_pmu;
> struct perf_event;
>
> +#ifdef CONFIG_ARM64_BRBE
> +void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event);
> +bool armv8pmu_branch_valid(struct perf_event *event);
> +void armv8pmu_branch_enable(struct perf_event *event);
> +void armv8pmu_branch_disable(struct perf_event *event);
> +void armv8pmu_branch_probe(struct arm_pmu *arm_pmu);
> +void armv8pmu_branch_reset(void);
> +#else
> static inline void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event) { }
> static inline bool armv8pmu_branch_valid(struct perf_event *event) { return false; }
> static inline void armv8pmu_branch_enable(struct perf_event *event) { }
> @@ -284,3 +292,4 @@ static inline void armv8pmu_branch_disable(struct perf_event *event) { }
> static inline void armv8pmu_branch_probe(struct arm_pmu *arm_pmu) { }
> static inline void armv8pmu_branch_reset(void) { }
> #endif
> +#endif
> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
> index ceba6792f5b3..6ee7ccb61621 100644
> --- a/arch/arm64/kernel/Makefile
> +++ b/arch/arm64/kernel/Makefile
> @@ -46,6 +46,7 @@ obj-$(CONFIG_MODULES) += module.o
> obj-$(CONFIG_ARM64_MODULE_PLTS) += module-plts.o
> obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o
> obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o
> +obj-$(CONFIG_ARM64_BRBE) += brbe.o
> obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
> obj-$(CONFIG_CPU_PM) += sleep.o suspend.o
> obj-$(CONFIG_CPU_IDLE) += cpuidle.o
> diff --git a/arch/arm64/kernel/brbe.c b/arch/arm64/kernel/brbe.c
> new file mode 100644
> index 000000000000..cd03d3531e04
> --- /dev/null
> +++ b/arch/arm64/kernel/brbe.c
> @@ -0,0 +1,512 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Branch Record Buffer Extension Driver.
> + *
> + * Copyright (C) 2022 ARM Limited
> + *
> + * Author: Anshuman Khandual <anshuman.khandual@arm.com>
> + */
> +#include "brbe.h"
> +
> +static bool valid_brbe_nr(int brbe_nr)
> +{
> + return brbe_nr == BRBIDR0_EL1_NUMREC_8 ||
> + brbe_nr == BRBIDR0_EL1_NUMREC_16 ||
> + brbe_nr == BRBIDR0_EL1_NUMREC_32 ||
> + brbe_nr == BRBIDR0_EL1_NUMREC_64;
> +}
> +
> +static bool valid_brbe_cc(int brbe_cc)
> +{
> + return brbe_cc == BRBIDR0_EL1_CC_20_BIT;
> +}
> +
> +static bool valid_brbe_format(int brbe_format)
> +{
> + return brbe_format == BRBIDR0_EL1_FORMAT_0;
> +}
> +
> +static bool valid_brbe_version(int brbe_version)
> +{
> + return brbe_version == ID_AA64DFR0_EL1_BRBE_IMP ||
> + brbe_version == ID_AA64DFR0_EL1_BRBE_BRBE_V1P1;
> +}
> +
> +static void select_brbe_bank(int bank)
> +{
> + static int brbe_current_bank = BRBE_BANK_IDX_INVALID;
This is a per-cpu peroperty, so I don't understand how this can safely be
stored in a static variable. If this is necessary it needs to go in a per-cpu
variable, but I suspect we don't actually need it.
> + u64 brbfcr;
> +
> + if (brbe_current_bank == bank)
> + return;
It looks like this is just for the same of optimizing redundant changes when
armv8pmu_branch_read() iterates over the records?
It'd be simpler to have armv8pmu_branch_read() iterate over each bank, then
within that iterate over each record within that bank.
> + WARN_ON(bank > BRBE_BANK_IDX_1);
> + brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
> + brbfcr &= ~BRBFCR_EL1_BANK_MASK;
> + brbfcr |= ((bank << BRBFCR_EL1_BANK_SHIFT) & BRBFCR_EL1_BANK_MASK);
You can use SYS_FIELD_PREP() for this:
brbfcr &= ~BRBFCR_EL1_BANK_MASK;
brbfcr |= SYS_FIELD_PREP(BRBFCR_EL1, BANK, bank);
Please use FIELD_PREP for this.
> + write_sysreg_s(brbfcr, SYS_BRBFCR_EL1);
> + isb();
> + brbe_current_bank = bank;
> +}
> +
> +static void select_brbe_bank_index(int buffer_idx)
> +{
> + switch (buffer_idx) {
> + case BRBE_BANK0_IDX_MIN ... BRBE_BANK0_IDX_MAX:
> + select_brbe_bank(BRBE_BANK_IDX_0);
> + break;
> + case BRBE_BANK1_IDX_MIN ... BRBE_BANK1_IDX_MAX:
> + select_brbe_bank(BRBE_BANK_IDX_1);
> + break;
> + default:
> + pr_warn("unsupported BRBE index\n");
It would be worth logging the specific index in case we ever have to debug
this. It's probably worth also making this a WARN_ONCE() or WARN_RATELIMITED().
> + }
> +}
> +
> +static const char branch_filter_error_msg[] = "branch filter not supported";
> +
> +bool armv8pmu_branch_valid(struct perf_event *event)
> +{
> + u64 branch_type = event->attr.branch_sample_type;
> +
> + /*
> + * If the event does not have at least one of the privilege
> + * branch filters as in PERF_SAMPLE_BRANCH_PLM_ALL, the core
> + * perf will adjust its value based on perf event's existing
> + * privilege level via attr.exclude_[user|kernel|hv].
> + *
> + * As event->attr.branch_sample_type might have been changed
> + * when the event reaches here, it is not possible to figure
> + * out whether the event originally had HV privilege request
> + * or got added via the core perf. Just report this situation
> + * once and continue ignoring if there are other instances.
> + */
> + if ((branch_type & PERF_SAMPLE_BRANCH_HV) && !is_kernel_in_hyp_mode())
> + pr_warn_once("%s - hypervisor privilege\n", branch_filter_error_msg);
> +
> + if (branch_type & PERF_SAMPLE_BRANCH_ABORT_TX) {
> + pr_warn_once("%s - aborted transaction\n", branch_filter_error_msg);
> + return false;
> + }
> +
> + if (branch_type & PERF_SAMPLE_BRANCH_NO_TX) {
> + pr_warn_once("%s - no transaction\n", branch_filter_error_msg);
> + return false;
> + }
> +
> + if (branch_type & PERF_SAMPLE_BRANCH_IN_TX) {
> + pr_warn_once("%s - in transaction\n", branch_filter_error_msg);
> + return false;
> + }
> + return true;
> +}
Is this called when validating user input? If so, NAK to printing anything to a
higher leval than debug. If there are constraints the user needs to be aware of
we should expose the relevant information under sysfs, but it seems that these
are just generic perf options that BRBE doesn't support.
It would be better to whitelist what we do support rather than blacklisting
what we don't.
> +
> +static void branch_records_alloc(struct arm_pmu *armpmu)
> +{
> + struct pmu_hw_events *events;
> + int cpu;
> +
> + for_each_possible_cpu(cpu) {
> + events = per_cpu_ptr(armpmu->hw_events, cpu);
> +
> + events->branches = kzalloc(sizeof(struct branch_records), GFP_KERNEL);
> + WARN_ON(!events->branches);
> + }
> +}
It would be simpler for this to be a percpu allocation.
If the allocation fails, we should propogate that error rather than just
WARNing, and fail probing the PMU.
Also, if the generic allocator fails it will print a warning (unless
__GFP_NOWARN was used), so we don't need the warning here.
> +
> +static int brbe_attributes_probe(struct arm_pmu *armpmu, u32 brbe)
> +{
> + struct brbe_hw_attr *brbe_attr = kzalloc(sizeof(struct brbe_hw_attr), GFP_KERNEL);
Same comments as for the failure path in branch_records_alloc().
> + u64 brbidr = read_sysreg_s(SYS_BRBIDR0_EL1);
Which context is this run in? Unless this is affine to a relevant CPU we can't
read the sysreg safely, and if we're in a cross-call we cannot allocate memory,
so this doesn't look right to me.
I suspect CONFIG_DEBUG_ATOMIC_SLEEP=y and/or CONFIG_PROVE_LOCKING=y will complain here.
Please follow the approach of armv8pmu_probe_pmu(), where we use a probe_info
structure that the callee can fill with information. Then we can do the
allocation in the main thread from a non-atomic context.
> +
> + WARN_ON(!brbe_attr);
> + armpmu->private = brbe_attr;
> +
> + brbe_attr->brbe_version = brbe;
> + brbe_attr->brbe_format = brbe_fetch_format(brbidr);
> + brbe_attr->brbe_cc = brbe_fetch_cc_bits(brbidr);
> + brbe_attr->brbe_nr = brbe_fetch_numrec(brbidr);
As a minor thing, could we please s/fetch/get/ ? To me, 'fetch' sounds like a
memory operation, and elsewhere we use 'get' for this sort of getter function.
> +
> + if (!valid_brbe_version(brbe_attr->brbe_version) ||
> + !valid_brbe_format(brbe_attr->brbe_format) ||
> + !valid_brbe_cc(brbe_attr->brbe_cc) ||
> + !valid_brbe_nr(brbe_attr->brbe_nr))
> + return -EOPNOTSUPP;
> +
> + return 0;
> +}
> +
> +void armv8pmu_branch_probe(struct arm_pmu *armpmu)
> +{
> + u64 aa64dfr0 = read_sysreg_s(SYS_ID_AA64DFR0_EL1);
> + u32 brbe;
> +
> + brbe = cpuid_feature_extract_unsigned_field(aa64dfr0, ID_AA64DFR0_EL1_BRBE_SHIFT);
> + if (!brbe)
> + return;
> +
> + if (brbe_attributes_probe(armpmu, brbe))
> + return;
> +
> + branch_records_alloc(armpmu);
> + armpmu->features |= ARM_PMU_BRANCH_STACK;
> +}
> +
> +static u64 branch_type_to_brbfcr(int branch_type)
> +{
> + u64 brbfcr = 0;
> +
> + if (branch_type & PERF_SAMPLE_BRANCH_ANY) {
> + brbfcr |= BRBFCR_EL1_BRANCH_FILTERS;
> + return brbfcr;
> + }
> +
> + if (branch_type & PERF_SAMPLE_BRANCH_ANY_CALL) {
> + brbfcr |= BRBFCR_EL1_INDCALL;
> + brbfcr |= BRBFCR_EL1_DIRCALL;
> + }
> +
> + if (branch_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
> + brbfcr |= BRBFCR_EL1_RTN;
> +
> + if (branch_type & PERF_SAMPLE_BRANCH_IND_CALL)
> + brbfcr |= BRBFCR_EL1_INDCALL;
> +
> + if (branch_type & PERF_SAMPLE_BRANCH_COND)
> + brbfcr |= BRBFCR_EL1_CONDDIR;
> +
> + if (branch_type & PERF_SAMPLE_BRANCH_IND_JUMP)
> + brbfcr |= BRBFCR_EL1_INDIRECT;
> +
> + if (branch_type & PERF_SAMPLE_BRANCH_CALL)
> + brbfcr |= BRBFCR_EL1_DIRCALL;
> +
> + return brbfcr;
> +}
> +
> +static u64 branch_type_to_brbcr(int branch_type)
> +{
> + u64 brbcr = (BRBCR_EL1_FZP | BRBCR_EL1_DEFAULT_TS);
> +
> + if (branch_type & PERF_SAMPLE_BRANCH_USER)
> + brbcr |= BRBCR_EL1_E0BRE;
> +
> + if (branch_type & PERF_SAMPLE_BRANCH_KERNEL)
> + brbcr |= BRBCR_EL1_E1BRE;
> +
> + if (branch_type & PERF_SAMPLE_BRANCH_HV) {
> + if (is_kernel_in_hyp_mode())
> + brbcr |= BRBCR_EL1_E1BRE;
> + }
I assume that in that case we're actually writing to BRBCR_EL2, and this is
actually the E2BRE bit, which is at the same position? If so, I think that's
worth a comment above the USER/KERNEL/HV bits here.
How do the BRB* control registers work with E2H? Is BRBCR_EL1 rewritten to
BRBCR_EL2 by the hardware?
> +
> + if (!(branch_type & PERF_SAMPLE_BRANCH_NO_CYCLES))
> + brbcr |= BRBCR_EL1_CC;
> +
> + if (!(branch_type & PERF_SAMPLE_BRANCH_NO_FLAGS))
> + brbcr |= BRBCR_EL1_MPRED;
> +
> + /*
> + * The exception and exception return branches could be
> + * captured, irrespective of the perf event's privilege.
> + * If the perf event does not have enough privilege for
> + * a given exception level, then addresses which falls
> + * under that exception level will be reported as zero
> + * for the captured branch record, creating source only
> + * or target only records.
> + */
> + if (branch_type & PERF_SAMPLE_BRANCH_ANY) {
> + brbcr |= BRBCR_EL1_EXCEPTION;
> + brbcr |= BRBCR_EL1_ERTN;
> + }
> +
> + if (branch_type & PERF_SAMPLE_BRANCH_ANY_CALL)
> + brbcr |= BRBCR_EL1_EXCEPTION;
> +
> + if (branch_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
> + brbcr |= BRBCR_EL1_ERTN;
> +
> + return brbcr & BRBCR_EL1_DEFAULT_CONFIG;
> +}
> +
> +void armv8pmu_branch_enable(struct perf_event *event)
> +{
> + u64 branch_type = event->attr.branch_sample_type;
> + u64 brbfcr, brbcr;
> +
> + brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
> + brbfcr &= ~BRBFCR_EL1_DEFAULT_CONFIG;
> + brbfcr |= branch_type_to_brbfcr(branch_type);
> + write_sysreg_s(brbfcr, SYS_BRBFCR_EL1);
> + isb();
> +
> + brbcr = read_sysreg_s(SYS_BRBCR_EL1);
> + brbcr &= ~BRBCR_EL1_DEFAULT_CONFIG;
> + brbcr |= branch_type_to_brbcr(branch_type);
> + write_sysreg_s(brbcr, SYS_BRBCR_EL1);
> + isb();
> + armv8pmu_branch_reset();
> +}
> +
> +void armv8pmu_branch_disable(struct perf_event *event)
> +{
> + u64 brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
> + u64 brbcr = read_sysreg_s(SYS_BRBCR_EL1);
> +
> + brbcr &= ~(BRBCR_EL1_E0BRE | BRBCR_EL1_E1BRE);
> + brbfcr |= BRBFCR_EL1_PAUSED;
> + write_sysreg_s(brbcr, SYS_BRBCR_EL1);
> + write_sysreg_s(brbfcr, SYS_BRBFCR_EL1);
> + isb();
> +}
> +
> +static int brbe_fetch_perf_type(u64 brbinf, bool *new_branch_type)
It's a bit confusing to return the type and new_type fields in this way.
I think this would be clearer as a setter function, even if that results in it
being a bit longer, since it keeps all the type and new_type relationships in
one place and has a single path for returning the value:
static void brbe_set_perf_entry_type(struct perf_branch_entry *entry,
u64 brbinf)
{
int brbe_type = brbe_fetch_type(brbinf);
switch (brbe_type) {
case BRBINF_EL1_TYPE_UNCOND_DIR;
entry->type = PERF_BR_UNCOND;
break;
...
case BRBINF_EL1_TYPE_DEBUG_HALT;
entry->type = PERF_BR_EXTEND_ABI;
entry->new_type = PERF_BR_ARM64_DEBUG_HALT;
break;
...
default:
...
}
}
... and in theory that makes it easier to propogate an error in future if we
want to.
> +{
> + int brbe_type = brbe_fetch_type(brbinf);
> + *new_branch_type = false;
> +
> + switch (brbe_type) {
> + case BRBINF_EL1_TYPE_UNCOND_DIR:
> + return PERF_BR_UNCOND;
> + case BRBINF_EL1_TYPE_INDIR:
> + return PERF_BR_IND;
> + case BRBINF_EL1_TYPE_DIR_LINK:
> + return PERF_BR_CALL;
> + case BRBINF_EL1_TYPE_INDIR_LINK:
> + return PERF_BR_IND_CALL;
> + case BRBINF_EL1_TYPE_RET_SUB:
> + return PERF_BR_RET;
> + case BRBINF_EL1_TYPE_COND_DIR:
> + return PERF_BR_COND;
> + case BRBINF_EL1_TYPE_CALL:
> + return PERF_BR_CALL;
> + case BRBINF_EL1_TYPE_TRAP:
> + return PERF_BR_SYSCALL;
> + case BRBINF_EL1_TYPE_RET_EXCPT:
> + return PERF_BR_ERET;
> + case BRBINF_EL1_TYPE_IRQ:
> + return PERF_BR_IRQ;
> + case BRBINF_EL1_TYPE_DEBUG_HALT:
> + *new_branch_type = true;
> + return PERF_BR_ARM64_DEBUG_HALT;
> + case BRBINF_EL1_TYPE_SERROR:
> + return PERF_BR_SERROR;
> + case BRBINF_EL1_TYPE_INST_DEBUG:
> + *new_branch_type = true;
> + return PERF_BR_ARM64_DEBUG_INST;
> + case BRBINF_EL1_TYPE_DATA_DEBUG:
> + *new_branch_type = true;
> + return PERF_BR_ARM64_DEBUG_DATA;
> + case BRBINF_EL1_TYPE_ALGN_FAULT:
> + *new_branch_type = true;
> + return PERF_BR_NEW_FAULT_ALGN;
> + case BRBINF_EL1_TYPE_INST_FAULT:
> + *new_branch_type = true;
> + return PERF_BR_NEW_FAULT_INST;
> + case BRBINF_EL1_TYPE_DATA_FAULT:
> + *new_branch_type = true;
> + return PERF_BR_NEW_FAULT_DATA;
> + case BRBINF_EL1_TYPE_FIQ:
> + *new_branch_type = true;
> + return PERF_BR_ARM64_FIQ;
> + case BRBINF_EL1_TYPE_DEBUG_EXIT:
> + *new_branch_type = true;
> + return PERF_BR_ARM64_DEBUG_EXIT;
> + default:
> + pr_warn("unknown branch type captured\n");
> + return PERF_BR_UNKNOWN;
It would be worth logging the specific value in case we ever have to debug
this. This should also be marked as _ratelimited or _once.
> + }
> +}
> +
> +static int brbe_fetch_perf_priv(u64 brbinf)
> +{
> + int brbe_el = brbe_fetch_el(brbinf);
> +
> + switch (brbe_el) {
> + case BRBINF_EL1_EL_EL0:
> + return PERF_BR_PRIV_USER;
> + case BRBINF_EL1_EL_EL1:
> + return PERF_BR_PRIV_KERNEL;
> + case BRBINF_EL1_EL_EL2:
> + if (is_kernel_in_hyp_mode())
> + return PERF_BR_PRIV_KERNEL;
> + return PERF_BR_PRIV_HV;
> + default:
> + pr_warn("unknown branch privilege captured\n");
> + return PERF_BR_PRIV_UNKNOWN;
It would be worth logging the specific value in case we ever have to debug
this. This should also be marked as _ratelimited or _once.
> + }
> +}
> +
> +static void capture_brbe_flags(struct pmu_hw_events *cpuc, struct perf_event *event,
> + u64 brbinf, u64 brbcr, int idx)
> +{
> + struct perf_branch_entry *entry = &cpuc->branches->branch_entries[idx];
> + bool new_branch_type;
> + int branch_type;
> +
> + if (branch_sample_type(event)) {
> + branch_type = brbe_fetch_perf_type(brbinf, &new_branch_type);
> + if (new_branch_type) {
> + entry->type = PERF_BR_EXTEND_ABI;
> + entry->new_type = branch_type;
> + } else {
> + entry->type = branch_type;
> + }
> + }
With the suggestions bove, this would become:
if (branch_sample_type(event))
brbe_set_perf_entry_type(entry, brbinf);
> + if (!branch_sample_no_cycles(event)) {
> + WARN_ON_ONCE(!(brbcr & BRBCR_EL1_CC));
> + entry->cycles = brbe_fetch_cycles(brbinf);
> + }
> +
> + if (!branch_sample_no_flags(event)) {
> + /*
> + * BRBINF_LASTFAILED does not indicate whether last transaction
> + * got failed or aborted during the current branch record itself.
> + * Rather, this indicates that all the branch records which were
> + * in transaction until the curret branch record have failed. So
> + * the entire BRBE buffer needs to be processed later on to find
> + * all branch records which might have failed.
> + */
This is quite difficult to follow.
I took in the ARM ARM, and it looks like this is all about TME transactions
(which Linux doesn't currently support). Per ARM DDI 0487I.a, page D15-5506:
| R_GVCJH
| When an entire transaction is executed in a BRBE Non-prohibited region and
| the transaction fails or is canceled then BRBFCR_EL1.LASTFAILED is set to
| 1.
| R_KBSZM
| When a Branch record is generated, other than through the injection
| mechanism, the value of BRBFCR_EL1.LASTFAILED is copied to the LASTFAILED
| field in the Branch record and BRBFCR_EL1.LASTFAILED is set to 0.
| I_JBPHS
| When a transaction fails or is canceled, Branch records generated in the
| transaction are not removed from the Branch record buffer.
I think what this is saying is:
/*
* BRBINFx_EL1.LASTFAILED indicates that a TME transaction failed (or
* was cancelled) prior to this record, and some number of records
* prior to this one may have been generated during an attempt to
* execute the transaction.
*
* We will remove such entries later in process_branch_aborts().
*/
Is that right?
> +
> + /*
> + * All these information (i.e transaction state and mispredicts)
> + * are not available for target only branch records.
> + */
> + if (!brbe_target(brbinf)) {
Could we rename these heleprs for clarity, e.g.
brbe_record_is_{target_only,source_only,complete}()
With that, it would also be clearer to have:
/*
* These fields only exist for complete and source-only records.
*/
if (brbe_record_is_complete(brbinf) ||
brbe_record_is_source_only()) {
... and explicilty match the cases we care about[
> + WARN_ON_ONCE(!(brbcr & BRBCR_EL1_MPRED));
Huh? Why does the value of BRBCR matter here?
> + entry->mispred = brbe_fetch_mispredict(brbinf);
> + entry->predicted = !entry->mispred;
> + entry->in_tx = brbe_fetch_in_tx(brbinf);
> + }
> + }
> +
> + if (branch_sample_priv(event)) {
> + /*
> + * All these information (i.e branch privilege level) are not
> + * available for source only branch records.
> + */
> + if (!brbe_source(brbinf))
> + entry->priv = brbe_fetch_perf_priv(brbinf);
Same style comment as above.
> + }
> +}
> +
> +/*
> + * A branch record with BRBINF_EL1.LASTFAILED set, implies that all
> + * preceding consecutive branch records, that were in a transaction
> + * (i.e their BRBINF_EL1.TX set) have been aborted.
> + *
> + * Similarly BRBFCR_EL1.LASTFAILED set, indicate that all preceding
> + * consecutive branch records upto the last record, which were in a
> + * transaction (i.e their BRBINF_EL1.TX set) have been aborted.
> + *
> + * --------------------------------- -------------------
> + * | 00 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX success]
> + * --------------------------------- -------------------
> + * | 01 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX success]
> + * --------------------------------- -------------------
> + * | 02 | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 0 |
> + * --------------------------------- -------------------
> + * | 03 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
> + * --------------------------------- -------------------
> + * | 04 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
> + * --------------------------------- -------------------
> + * | 05 | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 1 |
> + * --------------------------------- -------------------
> + * | .. | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 0 |
> + * --------------------------------- -------------------
> + * | 61 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
> + * --------------------------------- -------------------
> + * | 62 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
> + * --------------------------------- -------------------
> + * | 63 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
> + * --------------------------------- -------------------
Are we guaranteed to have a record between two transactions with TX = 0?
AFAICT you could have a sequence where a TCOMMIT is immediately followed by a
TSTART, and IIUC in that case you could have back-to-back records for distinct
transactions all with TX = 1, where the first transaction could be commited,
and the second might fail/cancel.
... or do TCOMMIT/TCANCEL/TSTART get handled specially?
> + *
> + * BRBFCR_EL1.LASTFAILED == 1
> + *
> + * Here BRBFCR_EL1.LASTFAILED failes all those consecutive and also
> + * in transaction branches near the end of the BRBE buffer.
> + */
> +static void process_branch_aborts(struct pmu_hw_events *cpuc)
> +{
> + struct brbe_hw_attr *brbe_attr = (struct brbe_hw_attr *)cpuc->percpu_pmu->private;
> + u64 brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
> + bool lastfailed = !!(brbfcr & BRBFCR_EL1_LASTFAILED);
> + int idx = brbe_attr->brbe_nr - 1;
> + struct perf_branch_entry *entry;
> +
> + do {
> + entry = &cpuc->branches->branch_entries[idx];
> + if (entry->in_tx) {
> + entry->abort = lastfailed;
> + } else {
> + lastfailed = entry->abort;
> + entry->abort = false;
> + }
> + } while (idx--, idx >= 0);
> +}
> +
> +void armv8pmu_branch_reset(void)
> +{
> + asm volatile(BRB_IALL);
> + isb();
> +}
> +
> +void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
> +{
> + struct brbe_hw_attr *brbe_attr = (struct brbe_hw_attr *)cpuc->percpu_pmu->private;
> + u64 brbinf, brbfcr, brbcr;
> + int idx;
> +
> + brbcr = read_sysreg_s(SYS_BRBCR_EL1);
> + brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
> +
> + /* Ensure pause on PMU interrupt is enabled */
> + WARN_ON_ONCE(!(brbcr & BRBCR_EL1_FZP));
As above, I think this needs commentary in the interrupt handler, since this
presumably needs us to keep the IRQ asserted until we're done
reading/manipulating records in the IRQ handler.
Do we ever read this outside of the IRQ handler? AFAICT we don't, and that
makes it seem like some of this is redundant.
> +
> + /* Save and clear the privilege */
> + write_sysreg_s(brbcr & ~(BRBCR_EL1_E0BRE | BRBCR_EL1_E1BRE), SYS_BRBCR_EL1);
Why? Later on we restore this, and AFAICT we don't modify it.
If it's paused, why do we care about the privilege?
> +
> + /* Pause the buffer */
> + write_sysreg_s(brbfcr | BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
> + isb();
Why? If we're in the IRQ handler it's already paused, and if we're not in the
IRQ handler what prevents us racing with an IRQ?
> +
> + for (idx = 0; idx < brbe_attr->brbe_nr; idx++) {
> + struct perf_branch_entry *entry = &cpuc->branches->branch_entries[idx];
> +
> + select_brbe_bank_index(idx);
> + brbinf = get_brbinf_reg(idx);
> + /*
> + * There are no valid entries anymore on the buffer.
> + * Abort the branch record processing to save some
> + * cycles and also reduce the capture/process load
> + * for the user space as well.
> + */
> + if (brbe_invalid(brbinf))
> + break;
> +
> + perf_clear_branch_entry_bitfields(entry);
> + if (brbe_valid(brbinf)) {
> + entry->from = get_brbsrc_reg(idx);
> + entry->to = get_brbtgt_reg(idx);
> + } else if (brbe_source(brbinf)) {
> + entry->from = get_brbsrc_reg(idx);
> + entry->to = 0;
> + } else if (brbe_target(brbinf)) {
> + entry->from = 0;
> + entry->to = get_brbtgt_reg(idx);
> + }
> + capture_brbe_flags(cpuc, event, brbinf, brbcr, idx);
> + }
> + cpuc->branches->branch_stack.nr = idx;
> + cpuc->branches->branch_stack.hw_idx = -1ULL;
> + process_branch_aborts(cpuc);
> +
> + /* Restore privilege, enable pause on PMU interrupt */
> + write_sysreg_s(brbcr | BRBCR_EL1_FZP, SYS_BRBCR_EL1);
Why do we have to save/restore this?
> +
> + /* Unpause the buffer */
> + write_sysreg_s(brbfcr & ~BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
> + isb();
> + armv8pmu_branch_reset();
> +}
Why do we enable it before we reset it?
Surely it would make sense to reset it first, and ammortize the cost of the ISB?
That said, as above, do we actually need to pause/unpause it? Or is it already
paused by virtue of the IRQ?
Thanks,
Mark.
On 1/12/23 22:21, Mark Rutland wrote:
> On Thu, Jan 05, 2023 at 08:40:39AM +0530, Anshuman Khandual wrote:
>> This enables branch stack sampling events in ARMV8 PMU, via an architecture
>> feature FEAT_BRBE aka branch record buffer extension. This defines required
>> branch helper functions pmuv8pmu_branch_XXXXX() and the implementation here
>> is wrapped with a new config option CONFIG_ARM64_BRBE.
>>
>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>> Cc: Will Deacon <will@kernel.org>
>> Cc: Mark Rutland <mark.rutland@arm.com>
>> Cc: linux-arm-kernel@lists.infradead.org
>> Cc: linux-kernel@vger.kernel.org
>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>> ---
>> arch/arm64/Kconfig | 11 +
>> arch/arm64/include/asm/perf_event.h | 9 +
>> arch/arm64/kernel/Makefile | 1 +
>> arch/arm64/kernel/brbe.c | 512 ++++++++++++++++++++++++++++
>> arch/arm64/kernel/brbe.h | 257 ++++++++++++++
>> 5 files changed, 790 insertions(+)
>> create mode 100644 arch/arm64/kernel/brbe.c
>> create mode 100644 arch/arm64/kernel/brbe.h
>>
>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>> index 03934808b2ed..915b12709a46 100644
>> --- a/arch/arm64/Kconfig
>> +++ b/arch/arm64/Kconfig
>> @@ -1363,6 +1363,17 @@ config HW_PERF_EVENTS
>> def_bool y
>> depends on ARM_PMU
>>
>> +config ARM64_BRBE
>> + bool "Enable support for Branch Record Buffer Extension (BRBE)"
>> + depends on PERF_EVENTS && ARM64 && ARM_PMU
>> + default y
>> + help
>> + Enable perf support for Branch Record Buffer Extension (BRBE) which
>> + records all branches taken in an execution path. This supports some
>> + branch types and privilege based filtering. It captured additional
>> + relevant information such as cycle count, misprediction and branch
>> + type, branch privilege level etc.
>> +
>> # Supported by clang >= 7.0 or GCC >= 12.0.0
>> config CC_HAVE_SHADOW_CALL_STACK
>> def_bool $(cc-option, -fsanitize=shadow-call-stack -ffixed-x18)
>> diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h
>> index a038902d6874..cf2e88c7b707 100644
>> --- a/arch/arm64/include/asm/perf_event.h
>> +++ b/arch/arm64/include/asm/perf_event.h
>> @@ -277,6 +277,14 @@ struct pmu_hw_events;
>> struct arm_pmu;
>> struct perf_event;
>>
>> +#ifdef CONFIG_ARM64_BRBE
>> +void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event);
>> +bool armv8pmu_branch_valid(struct perf_event *event);
>> +void armv8pmu_branch_enable(struct perf_event *event);
>> +void armv8pmu_branch_disable(struct perf_event *event);
>> +void armv8pmu_branch_probe(struct arm_pmu *arm_pmu);
>> +void armv8pmu_branch_reset(void);
>> +#else
>> static inline void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event) { }
>> static inline bool armv8pmu_branch_valid(struct perf_event *event) { return false; }
>> static inline void armv8pmu_branch_enable(struct perf_event *event) { }
>> @@ -284,3 +292,4 @@ static inline void armv8pmu_branch_disable(struct perf_event *event) { }
>> static inline void armv8pmu_branch_probe(struct arm_pmu *arm_pmu) { }
>> static inline void armv8pmu_branch_reset(void) { }
>> #endif
>> +#endif
>> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
>> index ceba6792f5b3..6ee7ccb61621 100644
>> --- a/arch/arm64/kernel/Makefile
>> +++ b/arch/arm64/kernel/Makefile
>> @@ -46,6 +46,7 @@ obj-$(CONFIG_MODULES) += module.o
>> obj-$(CONFIG_ARM64_MODULE_PLTS) += module-plts.o
>> obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o
>> obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o
>> +obj-$(CONFIG_ARM64_BRBE) += brbe.o
>> obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
>> obj-$(CONFIG_CPU_PM) += sleep.o suspend.o
>> obj-$(CONFIG_CPU_IDLE) += cpuidle.o
>> diff --git a/arch/arm64/kernel/brbe.c b/arch/arm64/kernel/brbe.c
>> new file mode 100644
>> index 000000000000..cd03d3531e04
>> --- /dev/null
>> +++ b/arch/arm64/kernel/brbe.c
>> @@ -0,0 +1,512 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/*
>> + * Branch Record Buffer Extension Driver.
>> + *
>> + * Copyright (C) 2022 ARM Limited
>> + *
>> + * Author: Anshuman Khandual <anshuman.khandual@arm.com>
>> + */
>> +#include "brbe.h"
>> +
>> +static bool valid_brbe_nr(int brbe_nr)
>> +{
>> + return brbe_nr == BRBIDR0_EL1_NUMREC_8 ||
>> + brbe_nr == BRBIDR0_EL1_NUMREC_16 ||
>> + brbe_nr == BRBIDR0_EL1_NUMREC_32 ||
>> + brbe_nr == BRBIDR0_EL1_NUMREC_64;
>> +}
>> +
>> +static bool valid_brbe_cc(int brbe_cc)
>> +{
>> + return brbe_cc == BRBIDR0_EL1_CC_20_BIT;
>> +}
>> +
>> +static bool valid_brbe_format(int brbe_format)
>> +{
>> + return brbe_format == BRBIDR0_EL1_FORMAT_0;
>> +}
>> +
>> +static bool valid_brbe_version(int brbe_version)
>> +{
>> + return brbe_version == ID_AA64DFR0_EL1_BRBE_IMP ||
>> + brbe_version == ID_AA64DFR0_EL1_BRBE_BRBE_V1P1;
>> +}
>> +
>> +static void select_brbe_bank(int bank)
>> +{
>> + static int brbe_current_bank = BRBE_BANK_IDX_INVALID;
>
> This is a per-cpu peroperty, so I don't understand how this can safely be
> stored in a static variable. If this is necessary it needs to go in a per-cpu
> variable, but I suspect we don't actually need it.
You are right, we dont need it.
>
>> + u64 brbfcr;
>> +
>> + if (brbe_current_bank == bank)
>> + return;
>
> It looks like this is just for the same of optimizing redundant changes when
> armv8pmu_branch_read() iterates over the records?
Right, it is.
>
> It'd be simpler to have armv8pmu_branch_read() iterate over each bank, then
> within that iterate over each record within that bank.
Sure, will drop this optimization completely. I will split the iteration into two
separate loops, one each for bank 0 and other for bank 1.
>
>> + WARN_ON(bank > BRBE_BANK_IDX_1);
>> + brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
>> + brbfcr &= ~BRBFCR_EL1_BANK_MASK;
>> + brbfcr |= ((bank << BRBFCR_EL1_BANK_SHIFT) & BRBFCR_EL1_BANK_MASK);
>
> You can use SYS_FIELD_PREP() for this:
Sure, will do.
>
> brbfcr &= ~BRBFCR_EL1_BANK_MASK;
> brbfcr |= SYS_FIELD_PREP(BRBFCR_EL1, BANK, bank);
>
> Please use FIELD_PREP for this.
Done.
>
>> + write_sysreg_s(brbfcr, SYS_BRBFCR_EL1);
>> + isb();
>> + brbe_current_bank = bank;
>> +}
>> +
>> +static void select_brbe_bank_index(int buffer_idx)
>> +{
>> + switch (buffer_idx) {
>> + case BRBE_BANK0_IDX_MIN ... BRBE_BANK0_IDX_MAX:
>> + select_brbe_bank(BRBE_BANK_IDX_0);
>> + break;
>> + case BRBE_BANK1_IDX_MIN ... BRBE_BANK1_IDX_MAX:
>> + select_brbe_bank(BRBE_BANK_IDX_1);
>> + break;
>> + default:
>> + pr_warn("unsupported BRBE index\n");
>
> It would be worth logging the specific index in case we ever have to debug
> this. It's probably worth also making this a WARN_ONCE() or WARN_RATELIMITED().
This function will not be required once individual loops based read for each
BRBE bank is implemented, thus reducing number of times select_brbe_bank()
gets called i.e just two times once for bank 0 and other for bank 1.
>
>> + }
>> +}
>> +
>> +static const char branch_filter_error_msg[] = "branch filter not supported";
>> +
>> +bool armv8pmu_branch_valid(struct perf_event *event)
>> +{
>> + u64 branch_type = event->attr.branch_sample_type;
>> +
>> + /*
>> + * If the event does not have at least one of the privilege
>> + * branch filters as in PERF_SAMPLE_BRANCH_PLM_ALL, the core
>> + * perf will adjust its value based on perf event's existing
>> + * privilege level via attr.exclude_[user|kernel|hv].
>> + *
>> + * As event->attr.branch_sample_type might have been changed
>> + * when the event reaches here, it is not possible to figure
>> + * out whether the event originally had HV privilege request
>> + * or got added via the core perf. Just report this situation
>> + * once and continue ignoring if there are other instances.
>> + */
>> + if ((branch_type & PERF_SAMPLE_BRANCH_HV) && !is_kernel_in_hyp_mode())
>> + pr_warn_once("%s - hypervisor privilege\n", branch_filter_error_msg);
>> +
>> + if (branch_type & PERF_SAMPLE_BRANCH_ABORT_TX) {
>> + pr_warn_once("%s - aborted transaction\n", branch_filter_error_msg);
>> + return false;
>> + }
>> +
>> + if (branch_type & PERF_SAMPLE_BRANCH_NO_TX) {
>> + pr_warn_once("%s - no transaction\n", branch_filter_error_msg);
>> + return false;
>> + }
>> +
>> + if (branch_type & PERF_SAMPLE_BRANCH_IN_TX) {
>> + pr_warn_once("%s - in transaction\n", branch_filter_error_msg);
>> + return false;
>> + }
>> + return true;
>> +}
>
> Is this called when validating user input? If so, NAK to printing anything to a
> higher leval than debug. If there are constraints the user needs to be aware of
You mean pr_debug() based prints ?
> we should expose the relevant information under sysfs, but it seems that these
> are just generic perf options that BRBE doesn't support.
Right, these are generic perf options. As you mentioned, will replace these with
pr_debug() instead.
>
> It would be better to whitelist what we do support rather than blacklisting
> what we don't.
But with a negative list, user would know what is not supported via these pr_debug()
based output when enabled ? But I dont have a strong opinion either way.
>
>> +
>> +static void branch_records_alloc(struct arm_pmu *armpmu)
>> +{
>> + struct pmu_hw_events *events;
>> + int cpu;
>> +
>> + for_each_possible_cpu(cpu) {
>> + events = per_cpu_ptr(armpmu->hw_events, cpu);
>> +
>> + events->branches = kzalloc(sizeof(struct branch_records), GFP_KERNEL);
>> + WARN_ON(!events->branches);
>> + }
>> +}
>
> It would be simpler for this to be a percpu allocation.
Could you please be more specific ? alloc_percpu_gfp() cannot be used here
because 'events->branches' is not a __percpu variable unlike its parent
'events' which is derived from armpmu.
>
> If the allocation fails, we should propogate that error rather than just
> WARNing, and fail probing the PMU.
Sure, will change that.
>
> Also, if the generic allocator fails it will print a warning (unless
> __GFP_NOWARN was used), so we don't need the warning here.
Sure, understood.
>
>> +
>> +static int brbe_attributes_probe(struct arm_pmu *armpmu, u32 brbe)
>> +{
>> + struct brbe_hw_attr *brbe_attr = kzalloc(sizeof(struct brbe_hw_attr), GFP_KERNEL);
>
> Same comments as for the failure path in branch_records_alloc().
>
>> + u64 brbidr = read_sysreg_s(SYS_BRBIDR0_EL1);
>
> Which context is this run in? Unless this is affine to a relevant CPU we can't
> read the sysreg safely, and if we're in a cross-call we cannot allocate memory,
> so this doesn't look right to me.
Called from smp_call_function_any() context via __armv8pmu_probe_pmu().
>
> I suspect CONFIG_DEBUG_ATOMIC_SLEEP=y and/or CONFIG_PROVE_LOCKING=y will complain here.
Right, it does. Remember dropping pr_info() during BRBE probe for the exact same
reason here but did not realize we will run into the same problem again.
>
> Please follow the approach of armv8pmu_probe_pmu(), where we use a probe_info
> structure that the callee can fill with information. Then we can do the
> allocation in the main thread from a non-atomic context.
Right, will do that. The only problem being 'struct brbe_hw_attr' which will not be
visible in the main thread, might need an abstraction function to do the allocation
in BRBE implementation. Regardless, a successful BRBE in the preceding function can
be ascertained from armpmu->has_branch_stack().
>
>> +
>> + WARN_ON(!brbe_attr);
>> + armpmu->private = brbe_attr;
>> +
>> + brbe_attr->brbe_version = brbe;
>> + brbe_attr->brbe_format = brbe_fetch_format(brbidr);
>> + brbe_attr->brbe_cc = brbe_fetch_cc_bits(brbidr);
>> + brbe_attr->brbe_nr = brbe_fetch_numrec(brbidr);
>
> As a minor thing, could we please s/fetch/get/ ? To me, 'fetch' sounds like a
> memory operation, and elsewhere we use 'get' for this sort of getter function.
Sure, but shall we change fetch as get for entire BRBE implementation (where ever
there is a determination of field from a register value) or just the above function ?
Default, will change all places.
>
>> +
>> + if (!valid_brbe_version(brbe_attr->brbe_version) ||
>> + !valid_brbe_format(brbe_attr->brbe_format) ||
>> + !valid_brbe_cc(brbe_attr->brbe_cc) ||
>> + !valid_brbe_nr(brbe_attr->brbe_nr))
>> + return -EOPNOTSUPP;
>> +
>> + return 0;
>> +}
>> +
>> +void armv8pmu_branch_probe(struct arm_pmu *armpmu)
>> +{
>> + u64 aa64dfr0 = read_sysreg_s(SYS_ID_AA64DFR0_EL1);
>> + u32 brbe;
>> +
>> + brbe = cpuid_feature_extract_unsigned_field(aa64dfr0, ID_AA64DFR0_EL1_BRBE_SHIFT);
>> + if (!brbe)
>> + return;
>> +
>> + if (brbe_attributes_probe(armpmu, brbe))
>> + return;
>> +
>> + branch_records_alloc(armpmu);
>> + armpmu->features |= ARM_PMU_BRANCH_STACK;
>> +}
>> +
>> +static u64 branch_type_to_brbfcr(int branch_type)
>> +{
>> + u64 brbfcr = 0;
>> +
>> + if (branch_type & PERF_SAMPLE_BRANCH_ANY) {
>> + brbfcr |= BRBFCR_EL1_BRANCH_FILTERS;
>> + return brbfcr;
>> + }
>> +
>> + if (branch_type & PERF_SAMPLE_BRANCH_ANY_CALL) {
>> + brbfcr |= BRBFCR_EL1_INDCALL;
>> + brbfcr |= BRBFCR_EL1_DIRCALL;
>> + }
>> +
>> + if (branch_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
>> + brbfcr |= BRBFCR_EL1_RTN;
>> +
>> + if (branch_type & PERF_SAMPLE_BRANCH_IND_CALL)
>> + brbfcr |= BRBFCR_EL1_INDCALL;
>> +
>> + if (branch_type & PERF_SAMPLE_BRANCH_COND)
>> + brbfcr |= BRBFCR_EL1_CONDDIR;
>> +
>> + if (branch_type & PERF_SAMPLE_BRANCH_IND_JUMP)
>> + brbfcr |= BRBFCR_EL1_INDIRECT;
>> +
>> + if (branch_type & PERF_SAMPLE_BRANCH_CALL)
>> + brbfcr |= BRBFCR_EL1_DIRCALL;
>> +
>> + return brbfcr;
>> +}
>> +
>> +static u64 branch_type_to_brbcr(int branch_type)
>> +{
>> + u64 brbcr = (BRBCR_EL1_FZP | BRBCR_EL1_DEFAULT_TS);
>> +
>> + if (branch_type & PERF_SAMPLE_BRANCH_USER)
>> + brbcr |= BRBCR_EL1_E0BRE;
>> +
>> + if (branch_type & PERF_SAMPLE_BRANCH_KERNEL)
>> + brbcr |= BRBCR_EL1_E1BRE;
>> +
>> + if (branch_type & PERF_SAMPLE_BRANCH_HV) {
>> + if (is_kernel_in_hyp_mode())
>> + brbcr |= BRBCR_EL1_E1BRE;
>> + }
>
> I assume that in that case we're actually writing to BRBCR_EL2, and this is
> actually the E2BRE bit, which is at the same position? If so, I think that's
> worth a comment above the USER/KERNEL/HV bits here.
That is right, will add a comment.
>
> How do the BRB* control registers work with E2H? Is BRBCR_EL1 rewritten to
> BRBCR_EL2 by the hardware?
Right, that is my understanding as well.
With FEAT_VHE and HCR_EL2.E2H = 1, access to BRBCR_EL1 at EL2, accesses BRBCR_EL2
without FEAT_VHE or HCR_EL2.E2H = 0, access to BRBCR_EL1 at EL2, accesses BRBCR_EL1
>
>> +
>> + if (!(branch_type & PERF_SAMPLE_BRANCH_NO_CYCLES))
>> + brbcr |= BRBCR_EL1_CC;
>> +
>> + if (!(branch_type & PERF_SAMPLE_BRANCH_NO_FLAGS))
>> + brbcr |= BRBCR_EL1_MPRED;
>> +
>> + /*
>> + * The exception and exception return branches could be
>> + * captured, irrespective of the perf event's privilege.
>> + * If the perf event does not have enough privilege for
>> + * a given exception level, then addresses which falls
>> + * under that exception level will be reported as zero
>> + * for the captured branch record, creating source only
>> + * or target only records.
>> + */
>> + if (branch_type & PERF_SAMPLE_BRANCH_ANY) {
>> + brbcr |= BRBCR_EL1_EXCEPTION;
>> + brbcr |= BRBCR_EL1_ERTN;
>> + }
>> +
>> + if (branch_type & PERF_SAMPLE_BRANCH_ANY_CALL)
>> + brbcr |= BRBCR_EL1_EXCEPTION;
>> +
>> + if (branch_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
>> + brbcr |= BRBCR_EL1_ERTN;
>> +
>> + return brbcr & BRBCR_EL1_DEFAULT_CONFIG;
>> +}
>> +
>> +void armv8pmu_branch_enable(struct perf_event *event)
>> +{
>> + u64 branch_type = event->attr.branch_sample_type;
>> + u64 brbfcr, brbcr;
>> +
>> + brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
>> + brbfcr &= ~BRBFCR_EL1_DEFAULT_CONFIG;
>> + brbfcr |= branch_type_to_brbfcr(branch_type);
>> + write_sysreg_s(brbfcr, SYS_BRBFCR_EL1);
>> + isb();
>> +
>> + brbcr = read_sysreg_s(SYS_BRBCR_EL1);
>> + brbcr &= ~BRBCR_EL1_DEFAULT_CONFIG;
>> + brbcr |= branch_type_to_brbcr(branch_type);
>> + write_sysreg_s(brbcr, SYS_BRBCR_EL1);
>> + isb();
>> + armv8pmu_branch_reset();
>> +}
>> +
>> +void armv8pmu_branch_disable(struct perf_event *event)
>> +{
>> + u64 brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
>> + u64 brbcr = read_sysreg_s(SYS_BRBCR_EL1);
>> +
>> + brbcr &= ~(BRBCR_EL1_E0BRE | BRBCR_EL1_E1BRE);
>> + brbfcr |= BRBFCR_EL1_PAUSED;
>> + write_sysreg_s(brbcr, SYS_BRBCR_EL1);
>> + write_sysreg_s(brbfcr, SYS_BRBFCR_EL1);
>> + isb();
>> +}
>> +
>> +static int brbe_fetch_perf_type(u64 brbinf, bool *new_branch_type)
>
> It's a bit confusing to return the type and new_type fields in this way.
>
> I think this would be clearer as a setter function, even if that results in it
> being a bit longer, since it keeps all the type and new_type relationships in
> one place and has a single path for returning the value:
Makes sense.
>
> static void brbe_set_perf_entry_type(struct perf_branch_entry *entry,
> u64 brbinf)
> {
> int brbe_type = brbe_fetch_type(brbinf);
>
> switch (brbe_type) {
> case BRBINF_EL1_TYPE_UNCOND_DIR;
> entry->type = PERF_BR_UNCOND;
> break;
> ...
> case BRBINF_EL1_TYPE_DEBUG_HALT;
> entry->type = PERF_BR_EXTEND_ABI;
> entry->new_type = PERF_BR_ARM64_DEBUG_HALT;
> break;
> ...
> default:
> ...
> }
> }
>
> ... and in theory that makes it easier to propogate an error in future if we
> want to.
Sure, will convert this function into brbe_set_perf_entry_type() as suggested.
>
>> +{
>> + int brbe_type = brbe_fetch_type(brbinf);
>> + *new_branch_type = false;
>> +
>> + switch (brbe_type) {
>> + case BRBINF_EL1_TYPE_UNCOND_DIR:
>> + return PERF_BR_UNCOND;
>> + case BRBINF_EL1_TYPE_INDIR:
>> + return PERF_BR_IND;
>> + case BRBINF_EL1_TYPE_DIR_LINK:
>> + return PERF_BR_CALL;
>> + case BRBINF_EL1_TYPE_INDIR_LINK:
>> + return PERF_BR_IND_CALL;
>> + case BRBINF_EL1_TYPE_RET_SUB:
>> + return PERF_BR_RET;
>> + case BRBINF_EL1_TYPE_COND_DIR:
>> + return PERF_BR_COND;
>> + case BRBINF_EL1_TYPE_CALL:
>> + return PERF_BR_CALL;
>> + case BRBINF_EL1_TYPE_TRAP:
>> + return PERF_BR_SYSCALL;
>> + case BRBINF_EL1_TYPE_RET_EXCPT:
>> + return PERF_BR_ERET;
>> + case BRBINF_EL1_TYPE_IRQ:
>> + return PERF_BR_IRQ;
>> + case BRBINF_EL1_TYPE_DEBUG_HALT:
>> + *new_branch_type = true;
>> + return PERF_BR_ARM64_DEBUG_HALT;
>> + case BRBINF_EL1_TYPE_SERROR:
>> + return PERF_BR_SERROR;
>> + case BRBINF_EL1_TYPE_INST_DEBUG:
>> + *new_branch_type = true;
>> + return PERF_BR_ARM64_DEBUG_INST;
>> + case BRBINF_EL1_TYPE_DATA_DEBUG:
>> + *new_branch_type = true;
>> + return PERF_BR_ARM64_DEBUG_DATA;
>> + case BRBINF_EL1_TYPE_ALGN_FAULT:
>> + *new_branch_type = true;
>> + return PERF_BR_NEW_FAULT_ALGN;
>> + case BRBINF_EL1_TYPE_INST_FAULT:
>> + *new_branch_type = true;
>> + return PERF_BR_NEW_FAULT_INST;
>> + case BRBINF_EL1_TYPE_DATA_FAULT:
>> + *new_branch_type = true;
>> + return PERF_BR_NEW_FAULT_DATA;
>> + case BRBINF_EL1_TYPE_FIQ:
>> + *new_branch_type = true;
>> + return PERF_BR_ARM64_FIQ;
>> + case BRBINF_EL1_TYPE_DEBUG_EXIT:
>> + *new_branch_type = true;
>> + return PERF_BR_ARM64_DEBUG_EXIT;
>> + default:
>> + pr_warn("unknown branch type captured\n");
>> + return PERF_BR_UNKNOWN;
>
> It would be worth logging the specific value in case we ever have to debug
> this. This should also be marked as _ratelimited or _once.
Sure, will replace with a pr_warn_once() printing 'branch_type'.
>
>> + }
>> +}
>> +
>> +static int brbe_fetch_perf_priv(u64 brbinf)
>> +{
>> + int brbe_el = brbe_fetch_el(brbinf);
>> +
>> + switch (brbe_el) {
>> + case BRBINF_EL1_EL_EL0:
>> + return PERF_BR_PRIV_USER;
>> + case BRBINF_EL1_EL_EL1:
>> + return PERF_BR_PRIV_KERNEL;
>> + case BRBINF_EL1_EL_EL2:
>> + if (is_kernel_in_hyp_mode())
>> + return PERF_BR_PRIV_KERNEL;
>> + return PERF_BR_PRIV_HV;
>> + default:
>> + pr_warn("unknown branch privilege captured\n");
>> + return PERF_BR_PRIV_UNKNOWN;
>
> It would be worth logging the specific value in case we ever have to debug
> this. This should also be marked as _ratelimited or _once.
Sure, will replace with a pr_warn_once() printing 'brbe_el.
>
>> + }
>> +}
>> +
>> +static void capture_brbe_flags(struct pmu_hw_events *cpuc, struct perf_event *event,
>> + u64 brbinf, u64 brbcr, int idx)
>> +{
>> + struct perf_branch_entry *entry = &cpuc->branches->branch_entries[idx];
>> + bool new_branch_type;
>> + int branch_type;
>> +
>> + if (branch_sample_type(event)) {
>> + branch_type = brbe_fetch_perf_type(brbinf, &new_branch_type);
>> + if (new_branch_type) {
>> + entry->type = PERF_BR_EXTEND_ABI;
>> + entry->new_type = branch_type;
>> + } else {
>> + entry->type = branch_type;
>> + }
>> + }
>
> With the suggestions bove, this would become:
>
> if (branch_sample_type(event))
> brbe_set_perf_entry_type(entry, brbinf);
That's right, will change.
>
>> + if (!branch_sample_no_cycles(event)) {
>> + WARN_ON_ONCE(!(brbcr & BRBCR_EL1_CC));
>> + entry->cycles = brbe_fetch_cycles(brbinf);
>> + }
>> +
>> + if (!branch_sample_no_flags(event)) {
>> + /*
>> + * BRBINF_LASTFAILED does not indicate whether last transaction
>> + * got failed or aborted during the current branch record itself.
>> + * Rather, this indicates that all the branch records which were
>> + * in transaction until the curret branch record have failed. So
>> + * the entire BRBE buffer needs to be processed later on to find
>> + * all branch records which might have failed.
>> + */
>
> This is quite difficult to follow.
>
> I took in the ARM ARM, and it looks like this is all about TME transactions
> (which Linux doesn't currently support). Per ARM DDI 0487I.a, page D15-5506:
>
> | R_GVCJH
> | When an entire transaction is executed in a BRBE Non-prohibited region and
> | the transaction fails or is canceled then BRBFCR_EL1.LASTFAILED is set to
> | 1.
>
> | R_KBSZM
> | When a Branch record is generated, other than through the injection
> | mechanism, the value of BRBFCR_EL1.LASTFAILED is copied to the LASTFAILED
> | field in the Branch record and BRBFCR_EL1.LASTFAILED is set to 0.
>
> | I_JBPHS
> | When a transaction fails or is canceled, Branch records generated in the
> | transaction are not removed from the Branch record buffer.
>
> I think what this is saying is:
>
> /*
> * BRBINFx_EL1.LASTFAILED indicates that a TME transaction failed (or
> * was cancelled) prior to this record, and some number of records
> * prior to this one may have been generated during an attempt to
> * execute the transaction.
> *
> * We will remove such entries later in process_branch_aborts().
> */
>
> Is that right?
Right, will update the comment here.
>
>> +
>> + /*
>> + * All these information (i.e transaction state and mispredicts)
>> + * are not available for target only branch records.
>> + */
>> + if (!brbe_target(brbinf)) {
>
> Could we rename these heleprs for clarity, e.g.
> brbe_record_is_{target_only,source_only,complete}()
Sure, will do.
>
> With that, it would also be clearer to have:
>
> /*
> * These fields only exist for complete and source-only records.
> */
> if (brbe_record_is_complete(brbinf) ||
> brbe_record_is_source_only()) {
>
> ... and explicilty match the cases we care about[
Sure, will invert the check and update the comment here.
>
>
>> + WARN_ON_ONCE(!(brbcr & BRBCR_EL1_MPRED));
>
> Huh? Why does the value of BRBCR matter here?
This is just a code hardening measure here. Before recording branch record
cycles or its flags, ensure BRBCR_EL1 was configured correctly to produce
these additional information along with the branch records.
>
>> + entry->mispred = brbe_fetch_mispredict(brbinf);
>> + entry->predicted = !entry->mispred;
>> + entry->in_tx = brbe_fetch_in_tx(brbinf);
>> + }
>> + }
>> +
>> + if (branch_sample_priv(event)) {
>> + /*
>> + * All these information (i.e branch privilege level) are not
>> + * available for source only branch records.
>> + */
>> + if (!brbe_source(brbinf))
>> + entry->priv = brbe_fetch_perf_priv(brbinf);
>
> Same style comment as above.
Sure, will do.
>
>> + }
>> +}
>> +
>> +/*
>> + * A branch record with BRBINF_EL1.LASTFAILED set, implies that all
>> + * preceding consecutive branch records, that were in a transaction
>> + * (i.e their BRBINF_EL1.TX set) have been aborted.
>> + *
>> + * Similarly BRBFCR_EL1.LASTFAILED set, indicate that all preceding
>> + * consecutive branch records upto the last record, which were in a
>> + * transaction (i.e their BRBINF_EL1.TX set) have been aborted.
>> + *
>> + * --------------------------------- -------------------
>> + * | 00 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX success]
>> + * --------------------------------- -------------------
>> + * | 01 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX success]
>> + * --------------------------------- -------------------
>> + * | 02 | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 0 |
>> + * --------------------------------- -------------------
>> + * | 03 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
>> + * --------------------------------- -------------------
>> + * | 04 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
>> + * --------------------------------- -------------------
>> + * | 05 | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 1 |
>> + * --------------------------------- -------------------
>> + * | .. | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 0 |
>> + * --------------------------------- -------------------
>> + * | 61 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
>> + * --------------------------------- -------------------
>> + * | 62 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
>> + * --------------------------------- -------------------
>> + * | 63 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
>> + * --------------------------------- -------------------
>
> Are we guaranteed to have a record between two transactions with TX = 0?
With TX = 0 i.e no transaction was active, indicates normal sequence of branches
creating their own branch records. How can there be a transaction with TX = 0 ?
Could you please be more specific here ?
>
> AFAICT you could have a sequence where a TCOMMIT is immediately followed by a
> TSTART, and IIUC in that case you could have back-to-back records for distinct
> transactions all with TX = 1, where the first transaction could be commited,
> and the second might fail/cancel.
>
> ... or do TCOMMIT/TCANCEL/TSTART get handled specially?
I guess these are micro-architectural implementation details which unfortunately
BRBINF_EL1/BRBCR_EL1 specifications do not capture in detail. But all it says is
that upon encountering BRBINF_EL1.LASTFAILED or BRBFCR_EL1.LASTFAILED (just for
the last record) all previous in-transaction branch records (BRBINF_EL1.TX = 1)
should be considered aborted for branch record reporting purpose.
>
>> + *
>> + * BRBFCR_EL1.LASTFAILED == 1
>> + *
>> + * Here BRBFCR_EL1.LASTFAILED failes all those consecutive and also
>> + * in transaction branches near the end of the BRBE buffer.
>> + */
>> +static void process_branch_aborts(struct pmu_hw_events *cpuc)
>> +{
>> + struct brbe_hw_attr *brbe_attr = (struct brbe_hw_attr *)cpuc->percpu_pmu->private;
>> + u64 brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
>> + bool lastfailed = !!(brbfcr & BRBFCR_EL1_LASTFAILED);
>> + int idx = brbe_attr->brbe_nr - 1;
>> + struct perf_branch_entry *entry;
>> +
>> + do {
>> + entry = &cpuc->branches->branch_entries[idx];
>> + if (entry->in_tx) {
>> + entry->abort = lastfailed;
>> + } else {
>> + lastfailed = entry->abort;
>> + entry->abort = false;
>> + }
>> + } while (idx--, idx >= 0);
>> +}
>> +
>> +void armv8pmu_branch_reset(void)
>> +{
>> + asm volatile(BRB_IALL);
>> + isb();
>> +}
>> +
>> +void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
>> +{
>> + struct brbe_hw_attr *brbe_attr = (struct brbe_hw_attr *)cpuc->percpu_pmu->private;
>> + u64 brbinf, brbfcr, brbcr;
>> + int idx;
>> +
>> + brbcr = read_sysreg_s(SYS_BRBCR_EL1);
>> + brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
>> +
>> + /* Ensure pause on PMU interrupt is enabled */
>> + WARN_ON_ONCE(!(brbcr & BRBCR_EL1_FZP));
>
> As above, I think this needs commentary in the interrupt handler, since this
> presumably needs us to keep the IRQ asserted until we're done
> reading/manipulating records in the IRQ handler.
The base IRQ handler armv8pmu_handle_irq() is in ARMV8 PMU code inside perf_event.c
which could/should not access BRBE specific details without adding an another new
abstraction function. But I guess adding a comment should be fine.
>
> Do we ever read this outside of the IRQ handler? AFAICT we don't, and that
> makes it seem like some of this is redundant.
>
>> +
>> + /* Save and clear the privilege */
>> + write_sysreg_s(brbcr & ~(BRBCR_EL1_E0BRE | BRBCR_EL1_E1BRE), SYS_BRBCR_EL1);
>
> Why? Later on we restore this, and AFAICT we don't modify it.
>
> If it's paused, why do we care about the privilege?
This disables BRBE completely (not only pause) providing confidence that no
branch record can come in while the existing records are being processed.
>
>> +
>> + /* Pause the buffer */
>> + write_sysreg_s(brbfcr | BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
>> + isb();
>
> Why? If we're in the IRQ handler it's already paused, and if we're not in the
> IRQ handler what prevents us racing with an IRQ?
armv8pmu_branch_read() always gets called from an IRQ context only. The point
here is to force a pause (and also disable, as I had explained earlier) before
reading the buffer.
>
>> +
>> + for (idx = 0; idx < brbe_attr->brbe_nr; idx++) {
>> + struct perf_branch_entry *entry = &cpuc->branches->branch_entries[idx];
>> +
>> + select_brbe_bank_index(idx);
>> + brbinf = get_brbinf_reg(idx);
>> + /*
>> + * There are no valid entries anymore on the buffer.
>> + * Abort the branch record processing to save some
>> + * cycles and also reduce the capture/process load
>> + * for the user space as well.
>> + */
>> + if (brbe_invalid(brbinf))
>> + break;
>> +
>> + perf_clear_branch_entry_bitfields(entry);
>> + if (brbe_valid(brbinf)) {
>> + entry->from = get_brbsrc_reg(idx);
>> + entry->to = get_brbtgt_reg(idx);
>> + } else if (brbe_source(brbinf)) {
>> + entry->from = get_brbsrc_reg(idx);
>> + entry->to = 0;
>> + } else if (brbe_target(brbinf)) {
>> + entry->from = 0;
>> + entry->to = get_brbtgt_reg(idx);
>> + }
>> + capture_brbe_flags(cpuc, event, brbinf, brbcr, idx);
>> + }
>> + cpuc->branches->branch_stack.nr = idx;
>> + cpuc->branches->branch_stack.hw_idx = -1ULL;
>> + process_branch_aborts(cpuc);
>> +
>> + /* Restore privilege, enable pause on PMU interrupt */
>> + write_sysreg_s(brbcr | BRBCR_EL1_FZP, SYS_BRBCR_EL1);
>
> Why do we have to save/restore this?
Yes, this guarantees (more so than the paused state) that BRBE remains disabled in
privilege levels that are relevant, while the contents are being read.
>
>> +
>> + /* Unpause the buffer */
>> + write_sysreg_s(brbfcr & ~BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
>> + isb();
>> + armv8pmu_branch_reset();
>> +}
>
> Why do we enable it before we reset it?
This is the last opportunity for a clean slate start for BRBE buffer before it is
back recording the branches. Basically helps in ensuring a clean start.
>
> Surely it would make sense to reset it first, and ammortize the cost of the ISB?
>
> That said, as above, do we actually need to pause/unpause it? Or is it already
> paused by virtue of the IRQ?
Yes, it should be paused after an IRQ but it is also enforced before reading along
with privilege level disable. Regardless the buffer needs to be un-paused and also
enabled for required privilege levels before exiting from here.
On Thu, Jan 19, 2023 at 08:18:47AM +0530, Anshuman Khandual wrote:
> On 1/12/23 22:21, Mark Rutland wrote:
> > On Thu, Jan 05, 2023 at 08:40:39AM +0530, Anshuman Khandual wrote:
> >> +bool armv8pmu_branch_valid(struct perf_event *event)
> >> +{
> >> + u64 branch_type = event->attr.branch_sample_type;
> >> +
> >> + /*
> >> + * If the event does not have at least one of the privilege
> >> + * branch filters as in PERF_SAMPLE_BRANCH_PLM_ALL, the core
> >> + * perf will adjust its value based on perf event's existing
> >> + * privilege level via attr.exclude_[user|kernel|hv].
> >> + *
> >> + * As event->attr.branch_sample_type might have been changed
> >> + * when the event reaches here, it is not possible to figure
> >> + * out whether the event originally had HV privilege request
> >> + * or got added via the core perf. Just report this situation
> >> + * once and continue ignoring if there are other instances.
> >> + */
> >> + if ((branch_type & PERF_SAMPLE_BRANCH_HV) && !is_kernel_in_hyp_mode())
> >> + pr_warn_once("%s - hypervisor privilege\n", branch_filter_error_msg);
> >> +
> >> + if (branch_type & PERF_SAMPLE_BRANCH_ABORT_TX) {
> >> + pr_warn_once("%s - aborted transaction\n", branch_filter_error_msg);
> >> + return false;
> >> + }
> >> +
> >> + if (branch_type & PERF_SAMPLE_BRANCH_NO_TX) {
> >> + pr_warn_once("%s - no transaction\n", branch_filter_error_msg);
> >> + return false;
> >> + }
> >> +
> >> + if (branch_type & PERF_SAMPLE_BRANCH_IN_TX) {
> >> + pr_warn_once("%s - in transaction\n", branch_filter_error_msg);
> >> + return false;
> >> + }
> >> + return true;
> >> +}
> >
> > Is this called when validating user input? If so, NAK to printing anything to a
> > higher leval than debug. If there are constraints the user needs to be aware of
>
> You mean pr_debug() based prints ?
Yes.
> > It would be better to whitelist what we do support rather than blacklisting
> > what we don't.
>
> But with a negative list, user would know what is not supported via these pr_debug()
> based output when enabled ? But I dont have a strong opinion either way.
With a negative list, when new options are added the driver will erroneously
and silently accept them, which is worse.
>
> >
> >> +
> >> +static void branch_records_alloc(struct arm_pmu *armpmu)
> >> +{
> >> + struct pmu_hw_events *events;
> >> + int cpu;
> >> +
> >> + for_each_possible_cpu(cpu) {
> >> + events = per_cpu_ptr(armpmu->hw_events, cpu);
> >> +
> >> + events->branches = kzalloc(sizeof(struct branch_records), GFP_KERNEL);
> >> + WARN_ON(!events->branches);
> >> + }
> >> +}
> >
> > It would be simpler for this to be a percpu allocation.
>
> Could you please be more specific ? alloc_percpu_gfp() cannot be used here
> because 'events->branches' is not a __percpu variable unlike its parent
> 'events' which is derived from armpmu.
You can allocate it per-cpu, then grab each of the cpu's pointers using
per_cpu() and place those into events->branches.
That way you only make one allocation which can fail, which makes the error
path much simpler.
[...]
> >> +static int brbe_attributes_probe(struct arm_pmu *armpmu, u32 brbe)
> >> +{
> >> + struct brbe_hw_attr *brbe_attr = kzalloc(sizeof(struct brbe_hw_attr), GFP_KERNEL);
> >
> > Same comments as for the failure path in branch_records_alloc().
> >
> >> + u64 brbidr = read_sysreg_s(SYS_BRBIDR0_EL1);
> >
> > Which context is this run in? Unless this is affine to a relevant CPU we can't
> > read the sysreg safely, and if we're in a cross-call we cannot allocate memory,
> > so this doesn't look right to me.
>
> Called from smp_call_function_any() context via __armv8pmu_probe_pmu().
Ok; so the read is safe, but the allocation is not.
[...]
> >> + WARN_ON(!brbe_attr);
> >> + armpmu->private = brbe_attr;
> >> +
> >> + brbe_attr->brbe_version = brbe;
> >> + brbe_attr->brbe_format = brbe_fetch_format(brbidr);
> >> + brbe_attr->brbe_cc = brbe_fetch_cc_bits(brbidr);
> >> + brbe_attr->brbe_nr = brbe_fetch_numrec(brbidr);
> >
> > As a minor thing, could we please s/fetch/get/ ? To me, 'fetch' sounds like a
> > memory operation, and elsewhere we use 'get' for this sort of getter function.
>
> Sure, but shall we change fetch as get for entire BRBE implementation (where ever
> there is a determination of field from a register value) or just the above function ?
> Default, will change all places.
I had meant in all cases, so that's perfect, thanks.
[...]
> >> + WARN_ON_ONCE(!(brbcr & BRBCR_EL1_MPRED));
> >
> > Huh? Why does the value of BRBCR matter here?
>
> This is just a code hardening measure here. Before recording branch record
> cycles or its flags, ensure BRBCR_EL1 was configured correctly to produce
> these additional information along with the branch records.
I don't think that's necessary. Where is brbcr written such that this could be
misconfigured?
At the least, this needs a comment as to why we need to check, and what we're
checking for.
[...]
> >> +/*
> >> + * A branch record with BRBINF_EL1.LASTFAILED set, implies that all
> >> + * preceding consecutive branch records, that were in a transaction
> >> + * (i.e their BRBINF_EL1.TX set) have been aborted.
> >> + *
> >> + * Similarly BRBFCR_EL1.LASTFAILED set, indicate that all preceding
> >> + * consecutive branch records upto the last record, which were in a
> >> + * transaction (i.e their BRBINF_EL1.TX set) have been aborted.
> >> + *
> >> + * --------------------------------- -------------------
> >> + * | 00 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX success]
> >> + * --------------------------------- -------------------
> >> + * | 01 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX success]
> >> + * --------------------------------- -------------------
> >> + * | 02 | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 0 |
> >> + * --------------------------------- -------------------
> >> + * | 03 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
> >> + * --------------------------------- -------------------
> >> + * | 04 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
> >> + * --------------------------------- -------------------
> >> + * | 05 | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 1 |
> >> + * --------------------------------- -------------------
> >> + * | .. | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 0 |
> >> + * --------------------------------- -------------------
> >> + * | 61 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
> >> + * --------------------------------- -------------------
> >> + * | 62 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
> >> + * --------------------------------- -------------------
> >> + * | 63 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
> >> + * --------------------------------- -------------------
> >
> > Are we guaranteed to have a record between two transactions with TX = 0?
>
> With TX = 0 i.e no transaction was active, indicates normal sequence of branches
> creating their own branch records. How can there be a transaction with TX = 0 ?
> Could you please be more specific here ?
Consider a sequence of a successful transaction followed by a cancelled
transation, with not branches between the first transation being commited and
the second transaction starting:
TSTART // TX=1
... // TX=1
TCOMMIT // TX=1
TSTART // TX=1
... // TX=1
<failure>
// TX=0, LF=1
AFAICT, we are not guaranteed to have a record with TX=0 between that
successful TCOMMIT and the subsequent TSTART, and so the LASTFAILED field
doesn't indicate that *all* preceding records with TX set are part of the
failed transaction.
Am I missing something? e.g. does the TCOMMIT get records with TX=0?
> > AFAICT you could have a sequence where a TCOMMIT is immediately followed by a
> > TSTART, and IIUC in that case you could have back-to-back records for distinct
> > transactions all with TX = 1, where the first transaction could be commited,
> > and the second might fail/cancel.
> >
> > ... or do TCOMMIT/TCANCEL/TSTART get handled specially?
>
> I guess these are micro-architectural implementation details which unfortunately
> BRBINF_EL1/BRBCR_EL1 specifications do not capture in detail. But all it says is
> that upon encountering BRBINF_EL1.LASTFAILED or BRBFCR_EL1.LASTFAILED (just for
> the last record) all previous in-transaction branch records (BRBINF_EL1.TX = 1)
> should be considered aborted for branch record reporting purpose.
Ok, so we're throwing away data?
If we're going to do that, it would be good to at least have a comment
explaining why we're forced to do so. Ideally we'd get the architecture
clarified/fixed, since AFAIK no-one has actually built TME yet, and it might be
a simple fix (as above).
[...]
> >> +void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
> >> +{
> >> + struct brbe_hw_attr *brbe_attr = (struct brbe_hw_attr *)cpuc->percpu_pmu->private;
> >> + u64 brbinf, brbfcr, brbcr;
> >> + int idx;
> >> +
> >> + brbcr = read_sysreg_s(SYS_BRBCR_EL1);
> >> + brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
> >> +
> >> + /* Ensure pause on PMU interrupt is enabled */
> >> + WARN_ON_ONCE(!(brbcr & BRBCR_EL1_FZP));
> >
> > As above, I think this needs commentary in the interrupt handler, since this
> > presumably needs us to keep the IRQ asserted until we're done
> > reading/manipulating records in the IRQ handler.
>
> The base IRQ handler armv8pmu_handle_irq() is in ARMV8 PMU code inside perf_event.c
> which could/should not access BRBE specific details without adding an another new
> abstraction function. But I guess adding a comment should be fine.
I think it's fine to have a comment there saying that we *must not* do
something that woukd break BRBE.
> >> +
> >> + /* Save and clear the privilege */
> >> + write_sysreg_s(brbcr & ~(BRBCR_EL1_E0BRE | BRBCR_EL1_E1BRE), SYS_BRBCR_EL1);
> >
> > Why? Later on we restore this, and AFAICT we don't modify it.
> >
> > If it's paused, why do we care about the privilege?
>
> This disables BRBE completely (not only pause) providing confidence that no
> branch record can come in while the existing records are being processed.
I thought from earlier that it was automatically paused by HW upon raising the
IRQ. Have I misunderstood, and we *must* stop it, or is this a belt-and-braces
additional disable?
Is that not the case, or do we not trust the pause for some reason?
Regardless, the comment should expalin *why* we're doing this (i.e. that this
is about ensuring BRBE does not create new records while we're manipulating
it).
> >> + /* Unpause the buffer */
> >> + write_sysreg_s(brbfcr & ~BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
> >> + isb();
> >> + armv8pmu_branch_reset();
> >> +}
> >
> > Why do we enable it before we reset it?
>
> This is the last opportunity for a clean slate start for BRBE buffer before it is
> back recording the branches. Basically helps in ensuring a clean start.
My point is why do we start if *before* resetting it, rather than restting it
first? Why give it the opportunity to create records that we're going to
discard immediately thereafter?
> > Surely it would make sense to reset it first, and ammortize the cost of the ISB?
> >
> > That said, as above, do we actually need to pause/unpause it? Or is it already
> > paused by virtue of the IRQ?
>
> Yes, it should be paused after an IRQ but it is also enforced before reading along
> with privilege level disable.
I'm very confused as to why we're not trusting the HW to remain paused. Why do
we need to enforce what th e hardware should already be doing?
> Regardless the buffer needs to be un-paused and also
> enabled for required privilege levels before exiting from here.
I agree this needs to be balanced, it just seems to me that we're doing
redundant work here.
Thanks,
Mark.
On 2/9/23 01:33, Mark Rutland wrote:
> On Thu, Jan 19, 2023 at 08:18:47AM +0530, Anshuman Khandual wrote:
>> On 1/12/23 22:21, Mark Rutland wrote:
>>> On Thu, Jan 05, 2023 at 08:40:39AM +0530, Anshuman Khandual wrote:
>>>> +bool armv8pmu_branch_valid(struct perf_event *event)
>>>> +{
>>>> + u64 branch_type = event->attr.branch_sample_type;
>>>> +
>>>> + /*
>>>> + * If the event does not have at least one of the privilege
>>>> + * branch filters as in PERF_SAMPLE_BRANCH_PLM_ALL, the core
>>>> + * perf will adjust its value based on perf event's existing
>>>> + * privilege level via attr.exclude_[user|kernel|hv].
>>>> + *
>>>> + * As event->attr.branch_sample_type might have been changed
>>>> + * when the event reaches here, it is not possible to figure
>>>> + * out whether the event originally had HV privilege request
>>>> + * or got added via the core perf. Just report this situation
>>>> + * once and continue ignoring if there are other instances.
>>>> + */
>>>> + if ((branch_type & PERF_SAMPLE_BRANCH_HV) && !is_kernel_in_hyp_mode())
>>>> + pr_warn_once("%s - hypervisor privilege\n", branch_filter_error_msg);
>>>> +
>>>> + if (branch_type & PERF_SAMPLE_BRANCH_ABORT_TX) {
>>>> + pr_warn_once("%s - aborted transaction\n", branch_filter_error_msg);
>>>> + return false;
>>>> + }
>>>> +
>>>> + if (branch_type & PERF_SAMPLE_BRANCH_NO_TX) {
>>>> + pr_warn_once("%s - no transaction\n", branch_filter_error_msg);
>>>> + return false;
>>>> + }
>>>> +
>>>> + if (branch_type & PERF_SAMPLE_BRANCH_IN_TX) {
>>>> + pr_warn_once("%s - in transaction\n", branch_filter_error_msg);
>>>> + return false;
>>>> + }
>>>> + return true;
>>>> +}
>>>
>>> Is this called when validating user input? If so, NAK to printing anything to a
>>> higher leval than debug. If there are constraints the user needs to be aware of
>>
>> You mean pr_debug() based prints ?
>
> Yes.
>
>>> It would be better to whitelist what we do support rather than blacklisting
>>> what we don't.
>>
>> But with a negative list, user would know what is not supported via these pr_debug()
>> based output when enabled ? But I dont have a strong opinion either way.
>
> With a negative list, when new options are added the driver will erroneously
> and silently accept them, which is worse.
Okay, will rather convert this into a positive list instead.
>
>>
>>>
>>>> +
>>>> +static void branch_records_alloc(struct arm_pmu *armpmu)
>>>> +{
>>>> + struct pmu_hw_events *events;
>>>> + int cpu;
>>>> +
>>>> + for_each_possible_cpu(cpu) {
>>>> + events = per_cpu_ptr(armpmu->hw_events, cpu);
>>>> +
>>>> + events->branches = kzalloc(sizeof(struct branch_records), GFP_KERNEL);
>>>> + WARN_ON(!events->branches);
>>>> + }
>>>> +}
>>>
>>> It would be simpler for this to be a percpu allocation.
>>
>> Could you please be more specific ? alloc_percpu_gfp() cannot be used here
>> because 'events->branches' is not a __percpu variable unlike its parent
>> 'events' which is derived from armpmu.
>
> You can allocate it per-cpu, then grab each of the cpu's pointers using
> per_cpu() and place those into events->branches.
>
> That way you only make one allocation which can fail, which makes the error
> path much simpler.
I guess you are suggesting something like this.
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index f0689c84530b..3f0a9d1df5e8 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -1190,14 +1190,19 @@ static void __armv8pmu_probe_pmu(void *info)
static int branch_records_alloc(struct arm_pmu *armpmu)
{
+ struct branch_records __percpu *tmp;
struct pmu_hw_events *events;
+ struct branch_records *records;
int cpu;
+ tmp = alloc_percpu_gfp(struct branch_records, GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
for_each_possible_cpu(cpu) {
events = per_cpu_ptr(armpmu->hw_events, cpu);
- events->branches = kzalloc(sizeof(struct branch_records), GFP_KERNEL);
- if (!events->branches)
- return -ENOMEM;
+ records = per_cpu_ptr(tmp, cpu);
+ events->branches = records;
}
return 0;
}
This should be all okay, as these branch records never really get freed up. But otherwise
this local 'tmp' will have to saved some where for later, to be used with free_percpu().
Else access to this percpu allocation handle is lost.
>
> [...]
>
>>>> +static int brbe_attributes_probe(struct arm_pmu *armpmu, u32 brbe)
>>>> +{
>>>> + struct brbe_hw_attr *brbe_attr = kzalloc(sizeof(struct brbe_hw_attr), GFP_KERNEL);
>>>
>>> Same comments as for the failure path in branch_records_alloc().
>>>
>>>> + u64 brbidr = read_sysreg_s(SYS_BRBIDR0_EL1);
>>>
>>> Which context is this run in? Unless this is affine to a relevant CPU we can't
>>> read the sysreg safely, and if we're in a cross-call we cannot allocate memory,
>>> so this doesn't look right to me.
>>
>> Called from smp_call_function_any() context via __armv8pmu_probe_pmu().
>
> Ok; so the read is safe, but the allocation is not.
This problem has already been solved in latest V8 series.
>
> [...]
>
>>>> + WARN_ON(!brbe_attr);
>>>> + armpmu->private = brbe_attr;
>>>> +
>>>> + brbe_attr->brbe_version = brbe;
>>>> + brbe_attr->brbe_format = brbe_fetch_format(brbidr);
>>>> + brbe_attr->brbe_cc = brbe_fetch_cc_bits(brbidr);
>>>> + brbe_attr->brbe_nr = brbe_fetch_numrec(brbidr);
>>>
>>> As a minor thing, could we please s/fetch/get/ ? To me, 'fetch' sounds like a
>>> memory operation, and elsewhere we use 'get' for this sort of getter function.
>>
>> Sure, but shall we change fetch as get for entire BRBE implementation (where ever
>> there is a determination of field from a register value) or just the above function ?
>> Default, will change all places.
>
> I had meant in all cases, so that's perfect, thanks.
>
>
> [...]
>
>>>> + WARN_ON_ONCE(!(brbcr & BRBCR_EL1_MPRED));
>>>
>>> Huh? Why does the value of BRBCR matter here?
>>
>> This is just a code hardening measure here. Before recording branch record
>> cycles or its flags, ensure BRBCR_EL1 was configured correctly to produce
>> these additional information along with the branch records.
>
> I don't think that's necessary. Where is brbcr written such that this could be
> misconfigured?
BRBCR cannot be mis-configured but this is just for code hardening purpose both
for BRBCR_EL1_MPRED and BRBCR_EL1_CC.
>
> At the least, this needs a comment as to why we need to check, and what we're
> checking for.
Sure, will probably add a comment explaining these checks.
>
> [...]
>
>>>> +/*
>>>> + * A branch record with BRBINF_EL1.LASTFAILED set, implies that all
>>>> + * preceding consecutive branch records, that were in a transaction
>>>> + * (i.e their BRBINF_EL1.TX set) have been aborted.
>>>> + *
>>>> + * Similarly BRBFCR_EL1.LASTFAILED set, indicate that all preceding
>>>> + * consecutive branch records upto the last record, which were in a
>>>> + * transaction (i.e their BRBINF_EL1.TX set) have been aborted.
>>>> + *
>>>> + * --------------------------------- -------------------
>>>> + * | 00 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX success]
>>>> + * --------------------------------- -------------------
>>>> + * | 01 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX success]
>>>> + * --------------------------------- -------------------
>>>> + * | 02 | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 0 |
>>>> + * --------------------------------- -------------------
>>>> + * | 03 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
>>>> + * --------------------------------- -------------------
>>>> + * | 04 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
>>>> + * --------------------------------- -------------------
>>>> + * | 05 | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 1 |
>>>> + * --------------------------------- -------------------
>>>> + * | .. | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 0 |
>>>> + * --------------------------------- -------------------
>>>> + * | 61 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
>>>> + * --------------------------------- -------------------
>>>> + * | 62 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
>>>> + * --------------------------------- -------------------
>>>> + * | 63 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
>>>> + * --------------------------------- -------------------
>>>
>>> Are we guaranteed to have a record between two transactions with TX = 0?
>>
>> With TX = 0 i.e no transaction was active, indicates normal sequence of branches
>> creating their own branch records. How can there be a transaction with TX = 0 ?
>> Could you please be more specific here ?
>
> Consider a sequence of a successful transaction followed by a cancelled
> transation, with not branches between the first transation being commited and
> the second transaction starting:
>
> TSTART // TX=1
> ... // TX=1
> TCOMMIT // TX=1
> TSTART // TX=1
> ... // TX=1
> <failure>
> // TX=0, LF=1
>
> AFAICT, we are not guaranteed to have a record with TX=0 between that
> successful TCOMMIT and the subsequent TSTART, and so the LASTFAILED field
> doesn't indicate that *all* preceding records with TX set are part of the
> failed transaction.
>
> Am I missing something? e.g. does the TCOMMIT get records with TX=0?
I would assume so. Otherwise there will be no marker between the branch records
generated from two different set of transactions, either successful or failed.
Although I will get this clarified from the HW folks.
>
>>> AFAICT you could have a sequence where a TCOMMIT is immediately followed by a
>>> TSTART, and IIUC in that case you could have back-to-back records for distinct
>>> transactions all with TX = 1, where the first transaction could be commited,
>>> and the second might fail/cancel.
>>>
>>> ... or do TCOMMIT/TCANCEL/TSTART get handled specially?
>>
>> I guess these are micro-architectural implementation details which unfortunately
>> BRBINF_EL1/BRBCR_EL1 specifications do not capture in detail. But all it says is
>> that upon encountering BRBINF_EL1.LASTFAILED or BRBFCR_EL1.LASTFAILED (just for
>> the last record) all previous in-transaction branch records (BRBINF_EL1.TX = 1)
>> should be considered aborted for branch record reporting purpose.
>
> Ok, so we're throwing away data?
No, we just mark branch record abort field (entry->abort) appropriately.
>
> If we're going to do that, it would be good to at least have a comment
> explaining why we're forced to do so. Ideally we'd get the architecture
> clarified/fixed, since AFAIK no-one has actually built TME yet, and it might be
> a simple fix (as above).
I will get this clarified from the HW folks.
>
> [...]
>
>>>> +void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
>>>> +{
>>>> + struct brbe_hw_attr *brbe_attr = (struct brbe_hw_attr *)cpuc->percpu_pmu->private;
>>>> + u64 brbinf, brbfcr, brbcr;
>>>> + int idx;
>>>> +
>>>> + brbcr = read_sysreg_s(SYS_BRBCR_EL1);
>>>> + brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
>>>> +
>>>> + /* Ensure pause on PMU interrupt is enabled */
>>>> + WARN_ON_ONCE(!(brbcr & BRBCR_EL1_FZP));
>>>
>>> As above, I think this needs commentary in the interrupt handler, since this
>>> presumably needs us to keep the IRQ asserted until we're done
>>> reading/manipulating records in the IRQ handler.
>>
>> The base IRQ handler armv8pmu_handle_irq() is in ARMV8 PMU code inside perf_event.c
>> which could/should not access BRBE specific details without adding an another new
>> abstraction function. But I guess adding a comment should be fine.
>
> I think it's fine to have a comment there saying that we *must not* do
> something that woukd break BRBE.
Will add a comment here, something like this.
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index 3f0a9d1df5e8..727c4806f18c 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -861,6 +861,10 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
if (!armpmu_event_set_period(event))
continue;
+ /*
+ * PMU IRQ should remain asserted until all branch records
+ * are captured and processed into struct perf_sample_data.
+ */
if (has_branch_stack(event)) {
WARN_ON(!cpuc->branches);
armv8pmu_branch_read(cpuc, event);
>
>>>> +
>>>> + /* Save and clear the privilege */
>>>> + write_sysreg_s(brbcr & ~(BRBCR_EL1_E0BRE | BRBCR_EL1_E1BRE), SYS_BRBCR_EL1);
>>>
>>> Why? Later on we restore this, and AFAICT we don't modify it.
>>>
>>> If it's paused, why do we care about the privilege?
>>
>> This disables BRBE completely (not only pause) providing confidence that no
>> branch record can come in while the existing records are being processed.
>
> I thought from earlier that it was automatically paused by HW upon raising the
> IRQ. Have I misunderstood, and we *must* stop it, or is this a belt-and-braces
> additional disable?
>
> Is that not the case, or do we not trust the pause for some reason?
Yes, this is a belt-and-braces additional disable i.e putting the BRBE in prohibited
region, which is more effective than a pause.
>
> Regardless, the comment should expalin *why* we're doing this (i.e. that this
> is about ensuring BRBE does not create new records while we're manipulating
> it).
Will update the comment, something like this.
diff --git a/arch/arm64/kernel/brbe.c b/arch/arm64/kernel/brbe.c
index a2de0b5a941c..35e11d0f41fa 100644
--- a/arch/arm64/kernel/brbe.c
+++ b/arch/arm64/kernel/brbe.c
@@ -504,7 +504,13 @@ void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
/* Ensure pause on PMU interrupt is enabled */
WARN_ON_ONCE(!(brbcr & BRBCR_EL1_FZP));
- /* Save and clear the privilege */
+ /*
+ * Save and clear the privilege
+ *
+ * Clearing both privilege levels put the BRBE in prohibited state
+ * thus preventing new branch records being created while existing
+ * ones get captured and processed.
+ */
write_sysreg_s(brbcr & ~(BRBCR_EL1_E0BRE | BRBCR_EL1_E1BRE), SYS_BRBCR_EL1);
/* Pause the buffer */
>
>>>> + /* Unpause the buffer */
>>>> + write_sysreg_s(brbfcr & ~BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
>>>> + isb();
>>>> + armv8pmu_branch_reset();
>>>> +}
>>>
>>> Why do we enable it before we reset it?
>>
>> This is the last opportunity for a clean slate start for BRBE buffer before it is
>> back recording the branches. Basically helps in ensuring a clean start.
>
> My point is why do we start if *before* resetting it, rather than restting it
> first? Why give it the opportunity to create records that we're going to
> discard immediately thereafter?
>
>>> Surely it would make sense to reset it first, and ammortize the cost of the ISB?
>>>
>>> That said, as above, do we actually need to pause/unpause it? Or is it already
>>> paused by virtue of the IRQ?
>>
>> Yes, it should be paused after an IRQ but it is also enforced before reading along
>> with privilege level disable.
>
> I'm very confused as to why we're not trusting the HW to remain paused. Why do
> we need to enforce what the hardware should already be doing?
As have learned from the HW folks, there might be situations where the BRBE buffer has
been actually paused, ready for extraction in principle, but without BRBFCR_EL1_PAUSED
being set. Setting the bit here explicitly creates consistency across scenarios before
capturing the branch records. But please do note, that putting the BRBE in prohibited
region via clearing BRBCR_EL1_E0BRE/E1BRE is the primary construct which ensures that
no new branch records will make into the buffer while it's being processed.
>
>> Regardless the buffer needs to be un-paused and also
>> enabled for required privilege levels before exiting from here.
>
> I agree this needs to be balanced, it just seems to me that we're doing
> redundant work here.
Extracting branch records for an user space only profile session might be faster as it
would require lesser context synchronization, might not even require prohibited region
change mechanism (will be already in there, upon a PMU interrupt) etc. I could try and
update IRQ branch records handling, based on whether current perf session was profiling
only the user space or not.
Hi Anshuman,
Following up on some of the bits below, I've tried to read the BRBE section in
the ARM ARM (ARM DDI 0487I.a), and explain my understanding below. Please let
me know if I've misunderstood or missed something.
On Mon, Feb 20, 2023 at 02:08:39PM +0530, Anshuman Khandual wrote:
> On 2/9/23 01:33, Mark Rutland wrote:
> > On Thu, Jan 19, 2023 at 08:18:47AM +0530, Anshuman Khandual wrote:
> >> On 1/12/23 22:21, Mark Rutland wrote:
> >>> On Thu, Jan 05, 2023 at 08:40:39AM +0530, Anshuman Khandual wrote:
> >>>> + /* Save and clear the privilege */
> >>>> + write_sysreg_s(brbcr & ~(BRBCR_EL1_E0BRE | BRBCR_EL1_E1BRE), SYS_BRBCR_EL1);
> >>>
> >>> Why? Later on we restore this, and AFAICT we don't modify it.
> >>>
> >>> If it's paused, why do we care about the privilege?
> >>
> >> This disables BRBE completely (not only pause) providing confidence that no
> >> branch record can come in while the existing records are being processed.
> >
> > I thought from earlier that it was automatically paused by HW upon raising the
> > IRQ. Have I misunderstood, and we *must* stop it, or is this a belt-and-braces
> > additional disable?
> >
> > Is that not the case, or do we not trust the pause for some reason?
>
> Yes, this is a belt-and-braces additional disable i.e putting the BRBE in prohibited
> region, which is more effective than a pause.
I'm afraid I don't understand what you mean by "more effective than a pause";
AFAICT the pause should be sufficient for what we're doing.
If there's a particular property that a prohibited region ensures but pausing
does not, could you please say what that property is specifically? e.g. as
below I note some differences w.r.t. the BRB_FILTRATE PMU event, but I'm not
sure if that's what you're referring to.
Per ARM DDI 0487I.a, section D15.3 on pages D15-5511 and D15-5512, we have:
R_PYBRZ:
Generation of Branch records is paused when BRBFCR_EL1.PAUSED is 1.
R_SRJND
If a direct read of BRBFCR_EL1.PAUSED returns 1, then no operations ordered
after the direct read will generate further Branch records until
BRBFCR_EL1.PAUSED is cleared by software.
Note: The subsequent operations can be ordered by a Context synchronization
event.
So if we read BRBFCR_EL1 and the PAUSED bit is set, then all we need is an ISB
to ensure that no further records will be generated.
Rules R_NXCWF, R_GXGWY, R_RPKTXQ mean that a freeze event is generated when
PMOVSCLR_EL0 bits become set (i.e. when an overflow occurs), and we have:
R_BHYTD
On a BRBE freeze event:
* BRBFCR_EL1.PAUSED is set to 1.
* The current timestamp is captured in BRBTS_EL1.
So any counter overflow will indirectly set BRBFCR_EL1.PAUSED, and stop the
generation of records. The note in R_SRJND tells us that remains the case until
we explicitly clear BRBFCR_EL1.PAUSED.
The only thing that I can see that potentially justifies placing the BRBE into
a prohibited region is the notes about BRB_FILTRATE, but I don't think that's
all that useful anyway since it's not manipulated atomically w.r.t. the actual
BRBE record management, and there are larger windows where BRBE will be paused
but counters running (e.g. between overflow occurring and poking the BRBE in
the overflow handler). So I think it'd be pointless to do that *just* for
BRB_FILTRATE.
Practically speaking, I expect that if we read PMOVSCLR and find any bits are
set, then issue an ISB, then after that ISB all of the following should be
true:
(a) BRBFCR_EL1.PAUSED will be set
(b) No further records will be generated
(c) We can safely manipulate the existing records
[...]
> >>> That said, as above, do we actually need to pause/unpause it? Or is it already
> >>> paused by virtue of the IRQ?
> >>
> >> Yes, it should be paused after an IRQ but it is also enforced before reading along
> >> with privilege level disable.
> >
> > I'm very confused as to why we're not trusting the HW to remain paused. Why do
> > we need to enforce what the hardware should already be doing?
>
> As have learned from the HW folks, there might be situations where the BRBE buffer has
> been actually paused, ready for extraction in principle, but without BRBFCR_EL1_PAUSED
> being set.
The ARM ARM is pretty clear that paused means BRBFCR_EL1.PAUSED==1, so I assume
you mean there's a different scenario where it won't generate records (e.g.
such as being in a prohibited region).
Can you please give an example here?
I'm happy to go talk with the HW folk with you for this.
> Setting the bit here explicitly creates consistency across scenarios before
> capturing the branch records. But please do note, that putting the BRBE in prohibited
> region via clearing BRBCR_EL1_E0BRE/E1BRE is the primary construct which ensures that
> no new branch records will make into the buffer while it's being processed.
I agree that's sufficient, but as above I don't believe it's necessary, and all
that we actually require is that no new records are generated.
> >> Regardless the buffer needs to be un-paused and also
> >> enabled for required privilege levels before exiting from here.
> >
> > I agree this needs to be balanced, it just seems to me that we're doing
> > redundant work here.
>
> Extracting branch records for an user space only profile session might be faster as it
> would require lesser context synchronization, might not even require prohibited region
> change mechanism (will be already in there, upon a PMU interrupt) etc. I could try and
> update IRQ branch records handling, based on whether current perf session was profiling
> only the user space or not.
For the IRQ handler I do not believe it matters which exception level(s) are
being monitored; if BRBFCR_EL1.PAUSED is set no new records will be generated
regardless. So I don't think that needs any special care.
For the context-switch between tasks I believe we'll need to transiently enter
a prohibited region, but that's a different path.
Thanks,
Mark.
@@ -1363,6 +1363,17 @@ config HW_PERF_EVENTS
def_bool y
depends on ARM_PMU
+config ARM64_BRBE
+ bool "Enable support for Branch Record Buffer Extension (BRBE)"
+ depends on PERF_EVENTS && ARM64 && ARM_PMU
+ default y
+ help
+ Enable perf support for Branch Record Buffer Extension (BRBE) which
+ records all branches taken in an execution path. This supports some
+ branch types and privilege based filtering. It captured additional
+ relevant information such as cycle count, misprediction and branch
+ type, branch privilege level etc.
+
# Supported by clang >= 7.0 or GCC >= 12.0.0
config CC_HAVE_SHADOW_CALL_STACK
def_bool $(cc-option, -fsanitize=shadow-call-stack -ffixed-x18)
@@ -277,6 +277,14 @@ struct pmu_hw_events;
struct arm_pmu;
struct perf_event;
+#ifdef CONFIG_ARM64_BRBE
+void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event);
+bool armv8pmu_branch_valid(struct perf_event *event);
+void armv8pmu_branch_enable(struct perf_event *event);
+void armv8pmu_branch_disable(struct perf_event *event);
+void armv8pmu_branch_probe(struct arm_pmu *arm_pmu);
+void armv8pmu_branch_reset(void);
+#else
static inline void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event) { }
static inline bool armv8pmu_branch_valid(struct perf_event *event) { return false; }
static inline void armv8pmu_branch_enable(struct perf_event *event) { }
@@ -284,3 +292,4 @@ static inline void armv8pmu_branch_disable(struct perf_event *event) { }
static inline void armv8pmu_branch_probe(struct arm_pmu *arm_pmu) { }
static inline void armv8pmu_branch_reset(void) { }
#endif
+#endif
@@ -46,6 +46,7 @@ obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_ARM64_MODULE_PLTS) += module-plts.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o
obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o
+obj-$(CONFIG_ARM64_BRBE) += brbe.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_CPU_PM) += sleep.o suspend.o
obj-$(CONFIG_CPU_IDLE) += cpuidle.o
new file mode 100644
@@ -0,0 +1,512 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Branch Record Buffer Extension Driver.
+ *
+ * Copyright (C) 2022 ARM Limited
+ *
+ * Author: Anshuman Khandual <anshuman.khandual@arm.com>
+ */
+#include "brbe.h"
+
+static bool valid_brbe_nr(int brbe_nr)
+{
+ return brbe_nr == BRBIDR0_EL1_NUMREC_8 ||
+ brbe_nr == BRBIDR0_EL1_NUMREC_16 ||
+ brbe_nr == BRBIDR0_EL1_NUMREC_32 ||
+ brbe_nr == BRBIDR0_EL1_NUMREC_64;
+}
+
+static bool valid_brbe_cc(int brbe_cc)
+{
+ return brbe_cc == BRBIDR0_EL1_CC_20_BIT;
+}
+
+static bool valid_brbe_format(int brbe_format)
+{
+ return brbe_format == BRBIDR0_EL1_FORMAT_0;
+}
+
+static bool valid_brbe_version(int brbe_version)
+{
+ return brbe_version == ID_AA64DFR0_EL1_BRBE_IMP ||
+ brbe_version == ID_AA64DFR0_EL1_BRBE_BRBE_V1P1;
+}
+
+static void select_brbe_bank(int bank)
+{
+ static int brbe_current_bank = BRBE_BANK_IDX_INVALID;
+ u64 brbfcr;
+
+ if (brbe_current_bank == bank)
+ return;
+
+ WARN_ON(bank > BRBE_BANK_IDX_1);
+ brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
+ brbfcr &= ~BRBFCR_EL1_BANK_MASK;
+ brbfcr |= ((bank << BRBFCR_EL1_BANK_SHIFT) & BRBFCR_EL1_BANK_MASK);
+ write_sysreg_s(brbfcr, SYS_BRBFCR_EL1);
+ isb();
+ brbe_current_bank = bank;
+}
+
+static void select_brbe_bank_index(int buffer_idx)
+{
+ switch (buffer_idx) {
+ case BRBE_BANK0_IDX_MIN ... BRBE_BANK0_IDX_MAX:
+ select_brbe_bank(BRBE_BANK_IDX_0);
+ break;
+ case BRBE_BANK1_IDX_MIN ... BRBE_BANK1_IDX_MAX:
+ select_brbe_bank(BRBE_BANK_IDX_1);
+ break;
+ default:
+ pr_warn("unsupported BRBE index\n");
+ }
+}
+
+static const char branch_filter_error_msg[] = "branch filter not supported";
+
+bool armv8pmu_branch_valid(struct perf_event *event)
+{
+ u64 branch_type = event->attr.branch_sample_type;
+
+ /*
+ * If the event does not have at least one of the privilege
+ * branch filters as in PERF_SAMPLE_BRANCH_PLM_ALL, the core
+ * perf will adjust its value based on perf event's existing
+ * privilege level via attr.exclude_[user|kernel|hv].
+ *
+ * As event->attr.branch_sample_type might have been changed
+ * when the event reaches here, it is not possible to figure
+ * out whether the event originally had HV privilege request
+ * or got added via the core perf. Just report this situation
+ * once and continue ignoring if there are other instances.
+ */
+ if ((branch_type & PERF_SAMPLE_BRANCH_HV) && !is_kernel_in_hyp_mode())
+ pr_warn_once("%s - hypervisor privilege\n", branch_filter_error_msg);
+
+ if (branch_type & PERF_SAMPLE_BRANCH_ABORT_TX) {
+ pr_warn_once("%s - aborted transaction\n", branch_filter_error_msg);
+ return false;
+ }
+
+ if (branch_type & PERF_SAMPLE_BRANCH_NO_TX) {
+ pr_warn_once("%s - no transaction\n", branch_filter_error_msg);
+ return false;
+ }
+
+ if (branch_type & PERF_SAMPLE_BRANCH_IN_TX) {
+ pr_warn_once("%s - in transaction\n", branch_filter_error_msg);
+ return false;
+ }
+ return true;
+}
+
+static void branch_records_alloc(struct arm_pmu *armpmu)
+{
+ struct pmu_hw_events *events;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ events = per_cpu_ptr(armpmu->hw_events, cpu);
+
+ events->branches = kzalloc(sizeof(struct branch_records), GFP_KERNEL);
+ WARN_ON(!events->branches);
+ }
+}
+
+static int brbe_attributes_probe(struct arm_pmu *armpmu, u32 brbe)
+{
+ struct brbe_hw_attr *brbe_attr = kzalloc(sizeof(struct brbe_hw_attr), GFP_KERNEL);
+ u64 brbidr = read_sysreg_s(SYS_BRBIDR0_EL1);
+
+ WARN_ON(!brbe_attr);
+ armpmu->private = brbe_attr;
+
+ brbe_attr->brbe_version = brbe;
+ brbe_attr->brbe_format = brbe_fetch_format(brbidr);
+ brbe_attr->brbe_cc = brbe_fetch_cc_bits(brbidr);
+ brbe_attr->brbe_nr = brbe_fetch_numrec(brbidr);
+
+ if (!valid_brbe_version(brbe_attr->brbe_version) ||
+ !valid_brbe_format(brbe_attr->brbe_format) ||
+ !valid_brbe_cc(brbe_attr->brbe_cc) ||
+ !valid_brbe_nr(brbe_attr->brbe_nr))
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
+void armv8pmu_branch_probe(struct arm_pmu *armpmu)
+{
+ u64 aa64dfr0 = read_sysreg_s(SYS_ID_AA64DFR0_EL1);
+ u32 brbe;
+
+ brbe = cpuid_feature_extract_unsigned_field(aa64dfr0, ID_AA64DFR0_EL1_BRBE_SHIFT);
+ if (!brbe)
+ return;
+
+ if (brbe_attributes_probe(armpmu, brbe))
+ return;
+
+ branch_records_alloc(armpmu);
+ armpmu->features |= ARM_PMU_BRANCH_STACK;
+}
+
+static u64 branch_type_to_brbfcr(int branch_type)
+{
+ u64 brbfcr = 0;
+
+ if (branch_type & PERF_SAMPLE_BRANCH_ANY) {
+ brbfcr |= BRBFCR_EL1_BRANCH_FILTERS;
+ return brbfcr;
+ }
+
+ if (branch_type & PERF_SAMPLE_BRANCH_ANY_CALL) {
+ brbfcr |= BRBFCR_EL1_INDCALL;
+ brbfcr |= BRBFCR_EL1_DIRCALL;
+ }
+
+ if (branch_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
+ brbfcr |= BRBFCR_EL1_RTN;
+
+ if (branch_type & PERF_SAMPLE_BRANCH_IND_CALL)
+ brbfcr |= BRBFCR_EL1_INDCALL;
+
+ if (branch_type & PERF_SAMPLE_BRANCH_COND)
+ brbfcr |= BRBFCR_EL1_CONDDIR;
+
+ if (branch_type & PERF_SAMPLE_BRANCH_IND_JUMP)
+ brbfcr |= BRBFCR_EL1_INDIRECT;
+
+ if (branch_type & PERF_SAMPLE_BRANCH_CALL)
+ brbfcr |= BRBFCR_EL1_DIRCALL;
+
+ return brbfcr;
+}
+
+static u64 branch_type_to_brbcr(int branch_type)
+{
+ u64 brbcr = (BRBCR_EL1_FZP | BRBCR_EL1_DEFAULT_TS);
+
+ if (branch_type & PERF_SAMPLE_BRANCH_USER)
+ brbcr |= BRBCR_EL1_E0BRE;
+
+ if (branch_type & PERF_SAMPLE_BRANCH_KERNEL)
+ brbcr |= BRBCR_EL1_E1BRE;
+
+ if (branch_type & PERF_SAMPLE_BRANCH_HV) {
+ if (is_kernel_in_hyp_mode())
+ brbcr |= BRBCR_EL1_E1BRE;
+ }
+
+ if (!(branch_type & PERF_SAMPLE_BRANCH_NO_CYCLES))
+ brbcr |= BRBCR_EL1_CC;
+
+ if (!(branch_type & PERF_SAMPLE_BRANCH_NO_FLAGS))
+ brbcr |= BRBCR_EL1_MPRED;
+
+ /*
+ * The exception and exception return branches could be
+ * captured, irrespective of the perf event's privilege.
+ * If the perf event does not have enough privilege for
+ * a given exception level, then addresses which falls
+ * under that exception level will be reported as zero
+ * for the captured branch record, creating source only
+ * or target only records.
+ */
+ if (branch_type & PERF_SAMPLE_BRANCH_ANY) {
+ brbcr |= BRBCR_EL1_EXCEPTION;
+ brbcr |= BRBCR_EL1_ERTN;
+ }
+
+ if (branch_type & PERF_SAMPLE_BRANCH_ANY_CALL)
+ brbcr |= BRBCR_EL1_EXCEPTION;
+
+ if (branch_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
+ brbcr |= BRBCR_EL1_ERTN;
+
+ return brbcr & BRBCR_EL1_DEFAULT_CONFIG;
+}
+
+void armv8pmu_branch_enable(struct perf_event *event)
+{
+ u64 branch_type = event->attr.branch_sample_type;
+ u64 brbfcr, brbcr;
+
+ brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
+ brbfcr &= ~BRBFCR_EL1_DEFAULT_CONFIG;
+ brbfcr |= branch_type_to_brbfcr(branch_type);
+ write_sysreg_s(brbfcr, SYS_BRBFCR_EL1);
+ isb();
+
+ brbcr = read_sysreg_s(SYS_BRBCR_EL1);
+ brbcr &= ~BRBCR_EL1_DEFAULT_CONFIG;
+ brbcr |= branch_type_to_brbcr(branch_type);
+ write_sysreg_s(brbcr, SYS_BRBCR_EL1);
+ isb();
+ armv8pmu_branch_reset();
+}
+
+void armv8pmu_branch_disable(struct perf_event *event)
+{
+ u64 brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
+ u64 brbcr = read_sysreg_s(SYS_BRBCR_EL1);
+
+ brbcr &= ~(BRBCR_EL1_E0BRE | BRBCR_EL1_E1BRE);
+ brbfcr |= BRBFCR_EL1_PAUSED;
+ write_sysreg_s(brbcr, SYS_BRBCR_EL1);
+ write_sysreg_s(brbfcr, SYS_BRBFCR_EL1);
+ isb();
+}
+
+static int brbe_fetch_perf_type(u64 brbinf, bool *new_branch_type)
+{
+ int brbe_type = brbe_fetch_type(brbinf);
+ *new_branch_type = false;
+
+ switch (brbe_type) {
+ case BRBINF_EL1_TYPE_UNCOND_DIR:
+ return PERF_BR_UNCOND;
+ case BRBINF_EL1_TYPE_INDIR:
+ return PERF_BR_IND;
+ case BRBINF_EL1_TYPE_DIR_LINK:
+ return PERF_BR_CALL;
+ case BRBINF_EL1_TYPE_INDIR_LINK:
+ return PERF_BR_IND_CALL;
+ case BRBINF_EL1_TYPE_RET_SUB:
+ return PERF_BR_RET;
+ case BRBINF_EL1_TYPE_COND_DIR:
+ return PERF_BR_COND;
+ case BRBINF_EL1_TYPE_CALL:
+ return PERF_BR_CALL;
+ case BRBINF_EL1_TYPE_TRAP:
+ return PERF_BR_SYSCALL;
+ case BRBINF_EL1_TYPE_RET_EXCPT:
+ return PERF_BR_ERET;
+ case BRBINF_EL1_TYPE_IRQ:
+ return PERF_BR_IRQ;
+ case BRBINF_EL1_TYPE_DEBUG_HALT:
+ *new_branch_type = true;
+ return PERF_BR_ARM64_DEBUG_HALT;
+ case BRBINF_EL1_TYPE_SERROR:
+ return PERF_BR_SERROR;
+ case BRBINF_EL1_TYPE_INST_DEBUG:
+ *new_branch_type = true;
+ return PERF_BR_ARM64_DEBUG_INST;
+ case BRBINF_EL1_TYPE_DATA_DEBUG:
+ *new_branch_type = true;
+ return PERF_BR_ARM64_DEBUG_DATA;
+ case BRBINF_EL1_TYPE_ALGN_FAULT:
+ *new_branch_type = true;
+ return PERF_BR_NEW_FAULT_ALGN;
+ case BRBINF_EL1_TYPE_INST_FAULT:
+ *new_branch_type = true;
+ return PERF_BR_NEW_FAULT_INST;
+ case BRBINF_EL1_TYPE_DATA_FAULT:
+ *new_branch_type = true;
+ return PERF_BR_NEW_FAULT_DATA;
+ case BRBINF_EL1_TYPE_FIQ:
+ *new_branch_type = true;
+ return PERF_BR_ARM64_FIQ;
+ case BRBINF_EL1_TYPE_DEBUG_EXIT:
+ *new_branch_type = true;
+ return PERF_BR_ARM64_DEBUG_EXIT;
+ default:
+ pr_warn("unknown branch type captured\n");
+ return PERF_BR_UNKNOWN;
+ }
+}
+
+static int brbe_fetch_perf_priv(u64 brbinf)
+{
+ int brbe_el = brbe_fetch_el(brbinf);
+
+ switch (brbe_el) {
+ case BRBINF_EL1_EL_EL0:
+ return PERF_BR_PRIV_USER;
+ case BRBINF_EL1_EL_EL1:
+ return PERF_BR_PRIV_KERNEL;
+ case BRBINF_EL1_EL_EL2:
+ if (is_kernel_in_hyp_mode())
+ return PERF_BR_PRIV_KERNEL;
+ return PERF_BR_PRIV_HV;
+ default:
+ pr_warn("unknown branch privilege captured\n");
+ return PERF_BR_PRIV_UNKNOWN;
+ }
+}
+
+static void capture_brbe_flags(struct pmu_hw_events *cpuc, struct perf_event *event,
+ u64 brbinf, u64 brbcr, int idx)
+{
+ struct perf_branch_entry *entry = &cpuc->branches->branch_entries[idx];
+ bool new_branch_type;
+ int branch_type;
+
+ if (branch_sample_type(event)) {
+ branch_type = brbe_fetch_perf_type(brbinf, &new_branch_type);
+ if (new_branch_type) {
+ entry->type = PERF_BR_EXTEND_ABI;
+ entry->new_type = branch_type;
+ } else {
+ entry->type = branch_type;
+ }
+ }
+
+ if (!branch_sample_no_cycles(event)) {
+ WARN_ON_ONCE(!(brbcr & BRBCR_EL1_CC));
+ entry->cycles = brbe_fetch_cycles(brbinf);
+ }
+
+ if (!branch_sample_no_flags(event)) {
+ /*
+ * BRBINF_LASTFAILED does not indicate whether last transaction
+ * got failed or aborted during the current branch record itself.
+ * Rather, this indicates that all the branch records which were
+ * in transaction until the curret branch record have failed. So
+ * the entire BRBE buffer needs to be processed later on to find
+ * all branch records which might have failed.
+ */
+ entry->abort = brbe_fetch_lastfailed(brbinf);
+
+ /*
+ * All these information (i.e transaction state and mispredicts)
+ * are not available for target only branch records.
+ */
+ if (!brbe_target(brbinf)) {
+ WARN_ON_ONCE(!(brbcr & BRBCR_EL1_MPRED));
+ entry->mispred = brbe_fetch_mispredict(brbinf);
+ entry->predicted = !entry->mispred;
+ entry->in_tx = brbe_fetch_in_tx(brbinf);
+ }
+ }
+
+ if (branch_sample_priv(event)) {
+ /*
+ * All these information (i.e branch privilege level) are not
+ * available for source only branch records.
+ */
+ if (!brbe_source(brbinf))
+ entry->priv = brbe_fetch_perf_priv(brbinf);
+ }
+}
+
+/*
+ * A branch record with BRBINF_EL1.LASTFAILED set, implies that all
+ * preceding consecutive branch records, that were in a transaction
+ * (i.e their BRBINF_EL1.TX set) have been aborted.
+ *
+ * Similarly BRBFCR_EL1.LASTFAILED set, indicate that all preceding
+ * consecutive branch records upto the last record, which were in a
+ * transaction (i.e their BRBINF_EL1.TX set) have been aborted.
+ *
+ * --------------------------------- -------------------
+ * | 00 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX success]
+ * --------------------------------- -------------------
+ * | 01 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX success]
+ * --------------------------------- -------------------
+ * | 02 | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 0 |
+ * --------------------------------- -------------------
+ * | 03 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
+ * --------------------------------- -------------------
+ * | 04 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
+ * --------------------------------- -------------------
+ * | 05 | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 1 |
+ * --------------------------------- -------------------
+ * | .. | BRBSRC | BRBTGT | BRBINF | | TX = 0 | LF = 0 |
+ * --------------------------------- -------------------
+ * | 61 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
+ * --------------------------------- -------------------
+ * | 62 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
+ * --------------------------------- -------------------
+ * | 63 | BRBSRC | BRBTGT | BRBINF | | TX = 1 | LF = 0 | [TX failed]
+ * --------------------------------- -------------------
+ *
+ * BRBFCR_EL1.LASTFAILED == 1
+ *
+ * Here BRBFCR_EL1.LASTFAILED failes all those consecutive and also
+ * in transaction branches near the end of the BRBE buffer.
+ */
+static void process_branch_aborts(struct pmu_hw_events *cpuc)
+{
+ struct brbe_hw_attr *brbe_attr = (struct brbe_hw_attr *)cpuc->percpu_pmu->private;
+ u64 brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
+ bool lastfailed = !!(brbfcr & BRBFCR_EL1_LASTFAILED);
+ int idx = brbe_attr->brbe_nr - 1;
+ struct perf_branch_entry *entry;
+
+ do {
+ entry = &cpuc->branches->branch_entries[idx];
+ if (entry->in_tx) {
+ entry->abort = lastfailed;
+ } else {
+ lastfailed = entry->abort;
+ entry->abort = false;
+ }
+ } while (idx--, idx >= 0);
+}
+
+void armv8pmu_branch_reset(void)
+{
+ asm volatile(BRB_IALL);
+ isb();
+}
+
+void armv8pmu_branch_read(struct pmu_hw_events *cpuc, struct perf_event *event)
+{
+ struct brbe_hw_attr *brbe_attr = (struct brbe_hw_attr *)cpuc->percpu_pmu->private;
+ u64 brbinf, brbfcr, brbcr;
+ int idx;
+
+ brbcr = read_sysreg_s(SYS_BRBCR_EL1);
+ brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
+
+ /* Ensure pause on PMU interrupt is enabled */
+ WARN_ON_ONCE(!(brbcr & BRBCR_EL1_FZP));
+
+ /* Save and clear the privilege */
+ write_sysreg_s(brbcr & ~(BRBCR_EL1_E0BRE | BRBCR_EL1_E1BRE), SYS_BRBCR_EL1);
+
+ /* Pause the buffer */
+ write_sysreg_s(brbfcr | BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
+ isb();
+
+ for (idx = 0; idx < brbe_attr->brbe_nr; idx++) {
+ struct perf_branch_entry *entry = &cpuc->branches->branch_entries[idx];
+
+ select_brbe_bank_index(idx);
+ brbinf = get_brbinf_reg(idx);
+ /*
+ * There are no valid entries anymore on the buffer.
+ * Abort the branch record processing to save some
+ * cycles and also reduce the capture/process load
+ * for the user space as well.
+ */
+ if (brbe_invalid(brbinf))
+ break;
+
+ perf_clear_branch_entry_bitfields(entry);
+ if (brbe_valid(brbinf)) {
+ entry->from = get_brbsrc_reg(idx);
+ entry->to = get_brbtgt_reg(idx);
+ } else if (brbe_source(brbinf)) {
+ entry->from = get_brbsrc_reg(idx);
+ entry->to = 0;
+ } else if (brbe_target(brbinf)) {
+ entry->from = 0;
+ entry->to = get_brbtgt_reg(idx);
+ }
+ capture_brbe_flags(cpuc, event, brbinf, brbcr, idx);
+ }
+ cpuc->branches->branch_stack.nr = idx;
+ cpuc->branches->branch_stack.hw_idx = -1ULL;
+ process_branch_aborts(cpuc);
+
+ /* Restore privilege, enable pause on PMU interrupt */
+ write_sysreg_s(brbcr | BRBCR_EL1_FZP, SYS_BRBCR_EL1);
+
+ /* Unpause the buffer */
+ write_sysreg_s(brbfcr & ~BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
+ isb();
+ armv8pmu_branch_reset();
+}
new file mode 100644
@@ -0,0 +1,257 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Branch Record Buffer Extension Helpers.
+ *
+ * Copyright (C) 2022 ARM Limited
+ *
+ * Author: Anshuman Khandual <anshuman.khandual@arm.com>
+ */
+#define pr_fmt(fmt) "brbe: " fmt
+
+#include <linux/perf/arm_pmu.h>
+
+#define BRBFCR_EL1_BRANCH_FILTERS (BRBFCR_EL1_DIRECT | \
+ BRBFCR_EL1_INDIRECT | \
+ BRBFCR_EL1_RTN | \
+ BRBFCR_EL1_INDCALL | \
+ BRBFCR_EL1_DIRCALL | \
+ BRBFCR_EL1_CONDDIR)
+
+#define BRBFCR_EL1_DEFAULT_CONFIG (BRBFCR_EL1_BANK_MASK | \
+ BRBFCR_EL1_PAUSED | \
+ BRBFCR_EL1_EnI | \
+ BRBFCR_EL1_BRANCH_FILTERS)
+
+/*
+ * BRBTS_EL1 is currently not used for branch stack implementation
+ * purpose but BRBCR_EL1.TS needs to have a valid value from all
+ * available options. BRBCR_EL1_TS_VIRTUAL is selected for this.
+ */
+#define BRBCR_EL1_DEFAULT_TS FIELD_PREP(BRBCR_EL1_TS_MASK, BRBCR_EL1_TS_VIRTUAL)
+
+#define BRBCR_EL1_DEFAULT_CONFIG (BRBCR_EL1_EXCEPTION | \
+ BRBCR_EL1_ERTN | \
+ BRBCR_EL1_CC | \
+ BRBCR_EL1_MPRED | \
+ BRBCR_EL1_E1BRE | \
+ BRBCR_EL1_E0BRE | \
+ BRBCR_EL1_FZP | \
+ BRBCR_EL1_DEFAULT_TS)
+/*
+ * BRBE Instructions
+ *
+ * BRB_IALL : Invalidate the entire buffer
+ * BRB_INJ : Inject latest branch record derived from [BRBSRCINJ, BRBTGTINJ, BRBINFINJ]
+ */
+#define BRB_IALL __emit_inst(0xD5000000 | sys_insn(1, 1, 7, 2, 4) | (0x1f))
+#define BRB_INJ __emit_inst(0xD5000000 | sys_insn(1, 1, 7, 2, 5) | (0x1f))
+
+/*
+ * BRBE Buffer Organization
+ *
+ * BRBE buffer is arranged as multiple banks of 32 branch record
+ * entries each. An individual branch record in a given bank could
+ * be accessed, after selecting the bank in BRBFCR_EL1.BANK and
+ * accessing the registers i.e [BRBSRC, BRBTGT, BRBINF] set with
+ * indices [0..31].
+ *
+ * Bank 0
+ *
+ * --------------------------------- ------
+ * | 00 | BRBSRC | BRBTGT | BRBINF | | 00 |
+ * --------------------------------- ------
+ * | 01 | BRBSRC | BRBTGT | BRBINF | | 01 |
+ * --------------------------------- ------
+ * | .. | BRBSRC | BRBTGT | BRBINF | | .. |
+ * --------------------------------- ------
+ * | 31 | BRBSRC | BRBTGT | BRBINF | | 31 |
+ * --------------------------------- ------
+ *
+ * Bank 1
+ *
+ * --------------------------------- ------
+ * | 32 | BRBSRC | BRBTGT | BRBINF | | 00 |
+ * --------------------------------- ------
+ * | 33 | BRBSRC | BRBTGT | BRBINF | | 01 |
+ * --------------------------------- ------
+ * | .. | BRBSRC | BRBTGT | BRBINF | | .. |
+ * --------------------------------- ------
+ * | 63 | BRBSRC | BRBTGT | BRBINF | | 31 |
+ * --------------------------------- ------
+ */
+#define BRBE_BANK_MAX_ENTRIES 32
+
+#define BRBE_BANK0_IDX_MIN 0
+#define BRBE_BANK0_IDX_MAX 31
+#define BRBE_BANK1_IDX_MIN 32
+#define BRBE_BANK1_IDX_MAX 63
+
+struct brbe_hw_attr {
+ bool brbe_version;
+ int brbe_cc;
+ int brbe_nr;
+ int brbe_format;
+};
+
+enum brbe_bank_idx {
+ BRBE_BANK_IDX_INVALID = -1,
+ BRBE_BANK_IDX_0,
+ BRBE_BANK_IDX_1,
+ BRBE_BANK_IDX_MAX
+};
+
+#define RETURN_READ_BRBSRCN(n) \
+ read_sysreg_s(SYS_BRBSRC##n##_EL1)
+
+#define RETURN_READ_BRBTGTN(n) \
+ read_sysreg_s(SYS_BRBTGT##n##_EL1)
+
+#define RETURN_READ_BRBINFN(n) \
+ read_sysreg_s(SYS_BRBINF##n##_EL1)
+
+#define BRBE_REGN_CASE(n, case_macro) \
+ case n: return case_macro(n); break
+
+#define BRBE_REGN_SWITCH(x, case_macro) \
+ do { \
+ switch (x) { \
+ BRBE_REGN_CASE(0, case_macro); \
+ BRBE_REGN_CASE(1, case_macro); \
+ BRBE_REGN_CASE(2, case_macro); \
+ BRBE_REGN_CASE(3, case_macro); \
+ BRBE_REGN_CASE(4, case_macro); \
+ BRBE_REGN_CASE(5, case_macro); \
+ BRBE_REGN_CASE(6, case_macro); \
+ BRBE_REGN_CASE(7, case_macro); \
+ BRBE_REGN_CASE(8, case_macro); \
+ BRBE_REGN_CASE(9, case_macro); \
+ BRBE_REGN_CASE(10, case_macro); \
+ BRBE_REGN_CASE(11, case_macro); \
+ BRBE_REGN_CASE(12, case_macro); \
+ BRBE_REGN_CASE(13, case_macro); \
+ BRBE_REGN_CASE(14, case_macro); \
+ BRBE_REGN_CASE(15, case_macro); \
+ BRBE_REGN_CASE(16, case_macro); \
+ BRBE_REGN_CASE(17, case_macro); \
+ BRBE_REGN_CASE(18, case_macro); \
+ BRBE_REGN_CASE(19, case_macro); \
+ BRBE_REGN_CASE(20, case_macro); \
+ BRBE_REGN_CASE(21, case_macro); \
+ BRBE_REGN_CASE(22, case_macro); \
+ BRBE_REGN_CASE(23, case_macro); \
+ BRBE_REGN_CASE(24, case_macro); \
+ BRBE_REGN_CASE(25, case_macro); \
+ BRBE_REGN_CASE(26, case_macro); \
+ BRBE_REGN_CASE(27, case_macro); \
+ BRBE_REGN_CASE(28, case_macro); \
+ BRBE_REGN_CASE(29, case_macro); \
+ BRBE_REGN_CASE(30, case_macro); \
+ BRBE_REGN_CASE(31, case_macro); \
+ default: \
+ pr_warn("unknown register index\n"); \
+ return -1; \
+ } \
+ } while (0)
+
+static inline int buffer_to_brbe_idx(int buffer_idx)
+{
+ return buffer_idx % BRBE_BANK_MAX_ENTRIES;
+}
+
+static inline u64 get_brbsrc_reg(int buffer_idx)
+{
+ int brbe_idx = buffer_to_brbe_idx(buffer_idx);
+
+ BRBE_REGN_SWITCH(brbe_idx, RETURN_READ_BRBSRCN);
+}
+
+static inline u64 get_brbtgt_reg(int buffer_idx)
+{
+ int brbe_idx = buffer_to_brbe_idx(buffer_idx);
+
+ BRBE_REGN_SWITCH(brbe_idx, RETURN_READ_BRBTGTN);
+}
+
+static inline u64 get_brbinf_reg(int buffer_idx)
+{
+ int brbe_idx = buffer_to_brbe_idx(buffer_idx);
+
+ BRBE_REGN_SWITCH(brbe_idx, RETURN_READ_BRBINFN);
+}
+
+static inline u64 brbe_record_valid(u64 brbinf)
+{
+ return FIELD_GET(BRBINF_EL1_VALID_MASK, brbinf);
+}
+
+static inline bool brbe_invalid(u64 brbinf)
+{
+ return brbe_record_valid(brbinf) == BRBINF_EL1_VALID_NONE;
+}
+
+static inline bool brbe_valid(u64 brbinf)
+{
+ return brbe_record_valid(brbinf) == BRBINF_EL1_VALID_FULL;
+}
+
+static inline bool brbe_source(u64 brbinf)
+{
+ return brbe_record_valid(brbinf) == BRBINF_EL1_VALID_SOURCE;
+}
+
+static inline bool brbe_target(u64 brbinf)
+{
+ return brbe_record_valid(brbinf) == BRBINF_EL1_VALID_TARGET;
+}
+
+static inline int brbe_fetch_in_tx(u64 brbinf)
+{
+ return FIELD_GET(BRBINF_EL1_T_MASK, brbinf);
+}
+
+static inline int brbe_fetch_mispredict(u64 brbinf)
+{
+ return FIELD_GET(BRBINF_EL1_MPRED_MASK, brbinf);
+}
+
+static inline int brbe_fetch_lastfailed(u64 brbinf)
+{
+ return FIELD_GET(BRBINF_EL1_LASTFAILED_MASK, brbinf);
+}
+
+static inline int brbe_fetch_cycles(u64 brbinf)
+{
+ /*
+ * Captured cycle count is unknown and hence
+ * should not be passed on to the user space.
+ */
+ if (brbinf & BRBINF_EL1_CCU)
+ return 0;
+
+ return FIELD_GET(BRBINF_EL1_CC_MASK, brbinf);
+}
+
+static inline int brbe_fetch_type(u64 brbinf)
+{
+ return FIELD_GET(BRBINF_EL1_TYPE_MASK, brbinf);
+}
+
+static inline int brbe_fetch_el(u64 brbinf)
+{
+ return FIELD_GET(BRBINF_EL1_EL_MASK, brbinf);
+}
+
+static inline int brbe_fetch_numrec(u64 brbidr)
+{
+ return FIELD_GET(BRBIDR0_EL1_NUMREC_MASK, brbidr);
+}
+
+static inline int brbe_fetch_format(u64 brbidr)
+{
+ return FIELD_GET(BRBIDR0_EL1_FORMAT_MASK, brbidr);
+}
+
+static inline int brbe_fetch_cc_bits(u64 brbidr)
+{
+ return FIELD_GET(BRBIDR0_EL1_CC_MASK, brbidr);
+}