[1/3] perf/arm-cmn: Decouple wp_config registers from filter group number

Message ID 20240126221215.1537377-2-ilkka@os.amperecomputing.com
State New
Headers
Series perf/arm-cmn: Add support for tertiary match group |

Commit Message

Ilkka Koskinen Jan. 26, 2024, 10:12 p.m. UTC
  Previously, wp_config0/2 registers were used for primary match group and
wp_config1/3 registers for secondary match group. In order to support
tertiary match group, this patch decouples the registers and the groups.

Allocation is changed to dynamic but it's still per mesh instance rather
than per node.

Signed-off-by: Ilkka Koskinen <ilkka@os.amperecomputing.com>
---
 drivers/perf/arm-cmn.c | 52 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 43 insertions(+), 9 deletions(-)
  

Comments

Robin Murphy Jan. 29, 2024, 5:06 p.m. UTC | #1
Hi Ilkka,

On 2024-01-26 10:12 pm, Ilkka Koskinen wrote:
> Previously, wp_config0/2 registers were used for primary match group and
> wp_config1/3 registers for secondary match group. In order to support
> tertiary match group, this patch decouples the registers and the groups.

Happy to see you having a stab at this, however I fear I you're in for a 
fair dose of "if it were this simple I might have already done it" :)

> Allocation is changed to dynamic but it's still per mesh instance rather
> than per node.
> 
> Signed-off-by: Ilkka Koskinen <ilkka@os.amperecomputing.com>
> ---
>   drivers/perf/arm-cmn.c | 52 ++++++++++++++++++++++++++++++++++--------
>   1 file changed, 43 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
> index c584165b13ba..93eb47ea7e25 100644
> --- a/drivers/perf/arm-cmn.c
> +++ b/drivers/perf/arm-cmn.c
> @@ -591,6 +591,7 @@ struct arm_cmn_hw_event {
>   	u8 dtm_offset;
>   	bool wide_sel;
>   	enum cmn_filter_select filter_sel;
> +	int wp_idx;
>   };
>   
>   #define for_each_hw_dn(hw, dn, i) \
> @@ -1337,7 +1338,35 @@ static const struct attribute_group *arm_cmn_attr_groups[] = {
>   
>   static int arm_cmn_wp_idx(struct perf_event *event)
>   {
> -	return CMN_EVENT_EVENTID(event) + CMN_EVENT_WP_GRP(event);
> +	struct arm_cmn_hw_event *hw = to_cmn_hw(event);
> +
> +	return hw->wp_idx;

Sorry, this breaks group validation.

> +}
> +
> +static int arm_cmn_wp_idx_unused(struct perf_event *event, struct arm_cmn_dtm *dtm,
> +	struct arm_cmn_dtc *dtc)
> +{
> +	struct arm_cmn_hw_event *hw = to_cmn_hw(event);
> +	int idx, tmp, direction = CMN_EVENT_EVENTID(event);
> +
> +	/*
> +	 * Examine wp 0 & 1 for the up direction,
> +	 * examine wp 2 & 3 for the down direction
> +	 */
> +	for (idx = direction; idx < direction + 2; idx++)
> +		if (dtm->wp_event[idx] < 0)
> +			break;
> +
> +	if (idx == direction + 2)
> +		return -ENOSPC;
> +
> +	tmp = dtm->wp_event[idx ^ 1];
> +	if (tmp >= 0 && CMN_EVENT_WP_COMBINE(event) !=
> +	    CMN_EVENT_WP_COMBINE(dtc->counters[tmp]))
> +		return -ENOSPC;
> +
> +	hw->wp_idx = idx;

I don't really get this logic either - we can allocate a potentially 
different index for every DTM, but only store the most recent one?

> +	return hw->wp_idx;
>   }
>   
>   static u32 arm_cmn_wp_config(struct perf_event *event)
> @@ -1785,6 +1814,8 @@ static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event,
>   
>   	for_each_hw_dtc_idx(hw, j, idx)
>   		cmn->dtc[j].counters[idx] = NULL;
> +
> +	hw->wp_idx = -1;
>   }
>   
>   static int arm_cmn_event_add(struct perf_event *event, int flags)
> @@ -1794,6 +1825,7 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
>   	struct arm_cmn_node *dn;
>   	enum cmn_node_type type = CMN_EVENT_TYPE(event);
>   	unsigned int input_sel, i = 0;
> +	int wp_idx;
>   
>   	if (type == CMN_TYPE_DTC) {
>   		while (cmn->dtc[i].cycles)
> @@ -1822,6 +1854,7 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
>   	}
>   
>   	/* ...then the local counters to feed them */
> +	wp_idx = -1;

Oh, I guess this trying to avoid some of that issue, but I still don't 
think it works - say we add an event targeted to XP B, which sees WP0 is 
free on DTM B so allocates index 0; then we add another event 
aggregating across XPs A and B, which sees WP0 is free on DTM A, 
allocates index 0, then goes on to stomp WP0 on DTM B as well - oops.

I don't think it's going to be feasible to do this without tracking the 
full allocation state with a wp_idx bitmap in the hw_event - at least it 
only needs to be half the size of dtm_idx, so I think there's still room.

Thanks,
Robin.

>   	for_each_hw_dn(hw, dn, i) {
>   		struct arm_cmn_dtm *dtm = &cmn->dtms[dn->dtm] + hw->dtm_offset;
>   		unsigned int dtm_idx, shift, d = max_t(int, dn->dtc, 0);
> @@ -1835,16 +1868,17 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
>   		if (type == CMN_TYPE_XP) {
>   			input_sel = CMN__PMEVCNT0_INPUT_SEL_XP + dtm_idx;
>   		} else if (type == CMN_TYPE_WP) {
> -			int tmp, wp_idx = arm_cmn_wp_idx(event);
>   			u32 cfg = arm_cmn_wp_config(event);
>   
> -			if (dtm->wp_event[wp_idx] >= 0)
> -				goto free_dtms;
> -
> -			tmp = dtm->wp_event[wp_idx ^ 1];
> -			if (tmp >= 0 && CMN_EVENT_WP_COMBINE(event) !=
> -					CMN_EVENT_WP_COMBINE(cmn->dtc[d].counters[tmp]))
> -				goto free_dtms;
> +			/*
> +			 * wp_config register index is currently allocated per
> +			 * mesh instance rather than per node.
> +			 */
> +			if (wp_idx < 0) {
> +				wp_idx = arm_cmn_wp_idx_unused(event, dtm, &cmn->dtc[d]);
> +				if (wp_idx < 0)
> +					goto free_dtms;
> +			}
>   
>   			input_sel = CMN__PMEVCNT0_INPUT_SEL_WP + wp_idx;
>   			dtm->wp_event[wp_idx] = hw->dtc_idx[d];
  
Ilkka Koskinen Jan. 30, 2024, 5:28 a.m. UTC | #2
Hi Robin,

Thanks for the review.

On Mon, 29 Jan 2024, Robin Murphy wrote:
> Hi Ilkka,
>
> On 2024-01-26 10:12 pm, Ilkka Koskinen wrote:
>> Previously, wp_config0/2 registers were used for primary match group and
>> wp_config1/3 registers for secondary match group. In order to support
>> tertiary match group, this patch decouples the registers and the groups.
>
> Happy to see you having a stab at this, however I fear I you're in for a fair 
> dose of "if it were this simple I might have already done it" :)

Uh, for a little while I thought it seemed too easy but decided to 
continue nevertheless

>
>> Allocation is changed to dynamic but it's still per mesh instance rather
>> than per node.
>> 
>> Signed-off-by: Ilkka Koskinen <ilkka@os.amperecomputing.com>
>> ---
>>   drivers/perf/arm-cmn.c | 52 ++++++++++++++++++++++++++++++++++--------
>>   1 file changed, 43 insertions(+), 9 deletions(-)
>> 
>> diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
>> index c584165b13ba..93eb47ea7e25 100644
>> --- a/drivers/perf/arm-cmn.c
>> +++ b/drivers/perf/arm-cmn.c
>> @@ -591,6 +591,7 @@ struct arm_cmn_hw_event {
>>   	u8 dtm_offset;
>>   	bool wide_sel;
>>   	enum cmn_filter_select filter_sel;
>> +	int wp_idx;
>>   };
>>     #define for_each_hw_dn(hw, dn, i) \
>> @@ -1337,7 +1338,35 @@ static const struct attribute_group 
>> *arm_cmn_attr_groups[] = {
>>     static int arm_cmn_wp_idx(struct perf_event *event)
>>   {
>> -	return CMN_EVENT_EVENTID(event) + CMN_EVENT_WP_GRP(event);
>> +	struct arm_cmn_hw_event *hw = to_cmn_hw(event);
>> +
>> +	return hw->wp_idx;
>
> Sorry, this breaks group validation.

Clearly, watchpoint group validation was missing from my tests :(

>
>> +}
>> +
>> +static int arm_cmn_wp_idx_unused(struct perf_event *event, struct 
>> arm_cmn_dtm *dtm,
>> +	struct arm_cmn_dtc *dtc)
>> +{
>> +	struct arm_cmn_hw_event *hw = to_cmn_hw(event);
>> +	int idx, tmp, direction = CMN_EVENT_EVENTID(event);
>> +
>> +	/*
>> +	 * Examine wp 0 & 1 for the up direction,
>> +	 * examine wp 2 & 3 for the down direction
>> +	 */
>> +	for (idx = direction; idx < direction + 2; idx++)
>> +		if (dtm->wp_event[idx] < 0)
>> +			break;
>> +
>> +	if (idx == direction + 2)
>> +		return -ENOSPC;
>> +
>> +	tmp = dtm->wp_event[idx ^ 1];
>> +	if (tmp >= 0 && CMN_EVENT_WP_COMBINE(event) !=
>> +	    CMN_EVENT_WP_COMBINE(dtc->counters[tmp]))
>> +		return -ENOSPC;
>> +
>> +	hw->wp_idx = idx;
>
> I don't really get this logic either - we can allocate a potentially 
> different index for every DTM, but only store the most recent one?
>
>> +	return hw->wp_idx;
>>   }
>>     static u32 arm_cmn_wp_config(struct perf_event *event)
>> @@ -1785,6 +1814,8 @@ static void arm_cmn_event_clear(struct arm_cmn *cmn, 
>> struct perf_event *event,
>>     	for_each_hw_dtc_idx(hw, j, idx)
>>   		cmn->dtc[j].counters[idx] = NULL;
>> +
>> +	hw->wp_idx = -1;
>>   }
>>     static int arm_cmn_event_add(struct perf_event *event, int flags)
>> @@ -1794,6 +1825,7 @@ static int arm_cmn_event_add(struct perf_event 
>> *event, int flags)
>>   	struct arm_cmn_node *dn;
>>   	enum cmn_node_type type = CMN_EVENT_TYPE(event);
>>   	unsigned int input_sel, i = 0;
>> +	int wp_idx;
>>     	if (type == CMN_TYPE_DTC) {
>>   		while (cmn->dtc[i].cycles)
>> @@ -1822,6 +1854,7 @@ static int arm_cmn_event_add(struct perf_event 
>> *event, int flags)
>>   	}
>>     	/* ...then the local counters to feed them */
>> +	wp_idx = -1;
>
> Oh, I guess this trying to avoid some of that issue, but I still don't think 
> it works - say we add an event targeted to XP B, which sees WP0 is free on 
> DTM B so allocates index 0; then we add another event aggregating across XPs 
> A and B, which sees WP0 is free on DTM A, allocates index 0, then goes on to 
> stomp WP0 on DTM B as well - oops.
>
> I don't think it's going to be feasible to do this without tracking the full 
> allocation state with a wp_idx bitmap in the hw_event - at least it only 
> needs to be half the size of dtm_idx, so I think there's still room.

Yeah, it was supposed to be simple and stupid version until I'd have time 
to make the allocation per node. But the more I think about this, the more
I start to believe that the bitmap solution would be the better option
right away.

I'll take a look at better solution although it might take a while as I'll 
probably get other more urgent tasks soon. If you find yourself with too 
much free time, feel free to take this task ;)

Cheers, Ilkka

>
> Thanks,
> Robin.
>
>>   	for_each_hw_dn(hw, dn, i) {
>>   		struct arm_cmn_dtm *dtm = &cmn->dtms[dn->dtm] + 
>> hw->dtm_offset;
>>   		unsigned int dtm_idx, shift, d = max_t(int, dn->dtc, 0);
>> @@ -1835,16 +1868,17 @@ static int arm_cmn_event_add(struct perf_event 
>> *event, int flags)
>>   		if (type == CMN_TYPE_XP) {
>>   			input_sel = CMN__PMEVCNT0_INPUT_SEL_XP + dtm_idx;
>>   		} else if (type == CMN_TYPE_WP) {
>> -			int tmp, wp_idx = arm_cmn_wp_idx(event);
>>   			u32 cfg = arm_cmn_wp_config(event);
>>   -			if (dtm->wp_event[wp_idx] >= 0)
>> -				goto free_dtms;
>> -
>> -			tmp = dtm->wp_event[wp_idx ^ 1];
>> -			if (tmp >= 0 && CMN_EVENT_WP_COMBINE(event) !=
>> - 
>> CMN_EVENT_WP_COMBINE(cmn->dtc[d].counters[tmp]))
>> -				goto free_dtms;
>> +			/*
>> +			 * wp_config register index is currently allocated 
>> per
>> +			 * mesh instance rather than per node.
>> +			 */
>> +			if (wp_idx < 0) {
>> +				wp_idx = arm_cmn_wp_idx_unused(event, dtm, 
>> &cmn->dtc[d]);
>> +				if (wp_idx < 0)
>> +					goto free_dtms;
>> +			}
>>     			input_sel = CMN__PMEVCNT0_INPUT_SEL_WP + wp_idx;
>>   			dtm->wp_event[wp_idx] = hw->dtc_idx[d];
>
  

Patch

diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index c584165b13ba..93eb47ea7e25 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -591,6 +591,7 @@  struct arm_cmn_hw_event {
 	u8 dtm_offset;
 	bool wide_sel;
 	enum cmn_filter_select filter_sel;
+	int wp_idx;
 };
 
 #define for_each_hw_dn(hw, dn, i) \
@@ -1337,7 +1338,35 @@  static const struct attribute_group *arm_cmn_attr_groups[] = {
 
 static int arm_cmn_wp_idx(struct perf_event *event)
 {
-	return CMN_EVENT_EVENTID(event) + CMN_EVENT_WP_GRP(event);
+	struct arm_cmn_hw_event *hw = to_cmn_hw(event);
+
+	return hw->wp_idx;
+}
+
+static int arm_cmn_wp_idx_unused(struct perf_event *event, struct arm_cmn_dtm *dtm,
+	struct arm_cmn_dtc *dtc)
+{
+	struct arm_cmn_hw_event *hw = to_cmn_hw(event);
+	int idx, tmp, direction = CMN_EVENT_EVENTID(event);
+
+	/*
+	 * Examine wp 0 & 1 for the up direction,
+	 * examine wp 2 & 3 for the down direction
+	 */
+	for (idx = direction; idx < direction + 2; idx++)
+		if (dtm->wp_event[idx] < 0)
+			break;
+
+	if (idx == direction + 2)
+		return -ENOSPC;
+
+	tmp = dtm->wp_event[idx ^ 1];
+	if (tmp >= 0 && CMN_EVENT_WP_COMBINE(event) !=
+	    CMN_EVENT_WP_COMBINE(dtc->counters[tmp]))
+		return -ENOSPC;
+
+	hw->wp_idx = idx;
+	return hw->wp_idx;
 }
 
 static u32 arm_cmn_wp_config(struct perf_event *event)
@@ -1785,6 +1814,8 @@  static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event,
 
 	for_each_hw_dtc_idx(hw, j, idx)
 		cmn->dtc[j].counters[idx] = NULL;
+
+	hw->wp_idx = -1;
 }
 
 static int arm_cmn_event_add(struct perf_event *event, int flags)
@@ -1794,6 +1825,7 @@  static int arm_cmn_event_add(struct perf_event *event, int flags)
 	struct arm_cmn_node *dn;
 	enum cmn_node_type type = CMN_EVENT_TYPE(event);
 	unsigned int input_sel, i = 0;
+	int wp_idx;
 
 	if (type == CMN_TYPE_DTC) {
 		while (cmn->dtc[i].cycles)
@@ -1822,6 +1854,7 @@  static int arm_cmn_event_add(struct perf_event *event, int flags)
 	}
 
 	/* ...then the local counters to feed them */
+	wp_idx = -1;
 	for_each_hw_dn(hw, dn, i) {
 		struct arm_cmn_dtm *dtm = &cmn->dtms[dn->dtm] + hw->dtm_offset;
 		unsigned int dtm_idx, shift, d = max_t(int, dn->dtc, 0);
@@ -1835,16 +1868,17 @@  static int arm_cmn_event_add(struct perf_event *event, int flags)
 		if (type == CMN_TYPE_XP) {
 			input_sel = CMN__PMEVCNT0_INPUT_SEL_XP + dtm_idx;
 		} else if (type == CMN_TYPE_WP) {
-			int tmp, wp_idx = arm_cmn_wp_idx(event);
 			u32 cfg = arm_cmn_wp_config(event);
 
-			if (dtm->wp_event[wp_idx] >= 0)
-				goto free_dtms;
-
-			tmp = dtm->wp_event[wp_idx ^ 1];
-			if (tmp >= 0 && CMN_EVENT_WP_COMBINE(event) !=
-					CMN_EVENT_WP_COMBINE(cmn->dtc[d].counters[tmp]))
-				goto free_dtms;
+			/*
+			 * wp_config register index is currently allocated per
+			 * mesh instance rather than per node.
+			 */
+			if (wp_idx < 0) {
+				wp_idx = arm_cmn_wp_idx_unused(event, dtm, &cmn->dtc[d]);
+				if (wp_idx < 0)
+					goto free_dtms;
+			}
 
 			input_sel = CMN__PMEVCNT0_INPUT_SEL_WP + wp_idx;
 			dtm->wp_event[wp_idx] = hw->dtc_idx[d];