[16/24] selftests/resctrl: Rewrite Cache Allocation Technology (CAT) test

Message ID 20231024092634.7122-17-ilpo.jarvinen@linux.intel.com
State New
Series selftests/resctrl: CAT test improvements & generalized test framework

Commit Message

Ilpo Järvinen Oct. 24, 2023, 9:26 a.m. UTC
  CAT test spawns two processes into two different control groups with
exclusive schemata. Both the processes alloc a buffer from memory
matching their allocated LLC block size and flush the entire buffer out
of caches. Since the processes are reading through the buffer only once
during the measurement and initially all the buffer was flushed, the
test isn't testing CAT.

Rewrite the CAT test to allocate a buffer sized to half of LLC. Then
perform a sequence of tests with different LLC alloc sizes starting
from half of the CBM bits down to 1-bit CBM. Flush the buffer before
each test and read the buffer twice. Observe the LLC misses on the
second read through the buffer. As the allocated LLC block gets smaller
and smaller, the LLC misses will become larger and larger giving a
strong signal on CAT working properly.

The new CAT test is using only a single process because it relies on
measured effect against another run of itself rather than another
process adding noise. The rest of the system is allocated the CBM bits
not used by the CAT test to keep the test isolated.

Replace count_bits() with count_contiguous_bits() to get the first bit
position in order to be able to calculate masks based on it.

This change has been tested with a number of systems from different
generations.

Suggested-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 tools/testing/selftests/resctrl/cat_test.c  | 286 +++++++++-----------
 tools/testing/selftests/resctrl/fill_buf.c  |   6 +-
 tools/testing/selftests/resctrl/resctrl.h   |   5 +-
 tools/testing/selftests/resctrl/resctrlfs.c |  44 +--
 4 files changed, 137 insertions(+), 204 deletions(-)
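
In outline, the measurement loop this patch adds to cat_test() (full diff under
"Patch" below) does the following; mask_str() here is only shorthand for the
snprintf("%lx", ...) calls in the actual code, and the perf steps are condensed
into comments:

	for (mask = start_mask; mask; mask = next_mask(mask)) {
		/* Give the rest of the system the CBM bits not under test */
		write_schemata("", mask_str(param->mask & ~mask), param->cpu_no, param->resctrl_val);
		/* Give the test's control group the bits under test */
		write_schemata(param->ctrlgrp, mask_str(mask), param->cpu_no, param->resctrl_val);

		for (i = 0; i < NUM_OF_RUNS; i++) {
			mem_flush(buf, span);			/* flush the buffer out of caches */
			fill_cache_read(buf, span, true);	/* first read warms the allocation */
			/* reset + enable the LLC miss counter */
			fill_cache_read(buf, span, true);	/* second, measured read */
			/* read and record the LLC miss count */
		}
	}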
  

Comments

Maciej Wieczor-Retman Oct. 27, 2023, 12:05 p.m. UTC | #1
On 2023-10-24 at 12:26:26 +0300, Ilpo Järvinen wrote:
>CAT test spawns two processes into two different control groups with
>exclusive schemata. Both the processes alloc a buffer from memory
>matching their allocated LLC block size and flush the entire buffer out
>of caches. Since the processes are reading through the buffer only once
>during the measurement and initially all the buffer was flushed, the
>test isn't testing CAT.
>
>Rewrite the CAT test to allocate a buffer sized to half of LLC. Then
>perform a sequence of tests with different LLC alloc sizes starting
>from half of the CBM bits down to 1-bit CBM. Flush the buffer before
>each test and read the buffer twice. Observe the LLC misses on the
>second read through the buffer. As the allocated LLC block gets smaller
>and smaller, the LLC misses will become larger and larger giving a
>strong signal on CAT working properly.
>
>The new CAT test is using only a single process because it relies on
>measured effect against another run of itself rather than another
>process adding noise. The rest of the system is allocated the CBM bits
>not used by the CAT test to keep the test isolated.
>
>Replace count_bits() with count_contiguous_bits() to get the first bit
>position in order to be able to calculate masks based on it.
>
>This change has been tested with a number of systems from different
>generations.
>
>Suggested-by: Reinette Chatre <reinette.chatre@intel.com>
>Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
>---
> tools/testing/selftests/resctrl/cat_test.c  | 286 +++++++++-----------
> tools/testing/selftests/resctrl/fill_buf.c  |   6 +-
> tools/testing/selftests/resctrl/resctrl.h   |   5 +-
> tools/testing/selftests/resctrl/resctrlfs.c |  44 +--
> 4 files changed, 137 insertions(+), 204 deletions(-)
>
>diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c
>index e71690a9bbb3..7518c520c5cc 100644
>--- a/tools/testing/selftests/resctrl/cat_test.c
>+++ b/tools/testing/selftests/resctrl/cat_test.c
>@@ -11,65 +11,68 @@
> #include "resctrl.h"
> #include <unistd.h>
> 
>-#define RESULT_FILE_NAME1	"result_cat1"
>-#define RESULT_FILE_NAME2	"result_cat2"
>+#define RESULT_FILE_NAME	"result_cat"
> #define NUM_OF_RUNS		5
>-#define MAX_DIFF_PERCENT	4
>-#define MAX_DIFF		1000000
> 
> /*
>- * Change schemata. Write schemata to specified
>- * con_mon grp, mon_grp in resctrl FS.
>- * Run 5 times in order to get average values.
>+ * Minimum difference in LLC misses between a test with n+1 bits CBM mask to
>+ * the test with n bits. With e.g. 5 vs 4 bits in the CBM mask, the minimum
>+ * difference must be at least MIN_DIFF_PERCENT_PER_BIT * (4 - 1) = 3 percent.
>+ *
>+ * The relationship between number of used CBM bits and difference in LLC
>+ * misses is not expected to be linear. With a small number of bits, the
>+ * margin is smaller than with larger number of bits. For selftest purposes,
>+ * however, linear approach is enough because ultimately only pass/fail
>+ * decision has to be made and distinction between strong and stronger
>+ * signal is irrelevant.
>  */
>-static int cat_setup(struct resctrl_val_param *p)
>-{
>-	char schemata[64];
>-	int ret = 0;
>-
>-	/* Run NUM_OF_RUNS times */
>-	if (p->num_of_runs >= NUM_OF_RUNS)
>-		return END_OF_TESTS;
>-
>-	if (p->num_of_runs == 0) {
>-		sprintf(schemata, "%lx", p->mask);
>-		ret = write_schemata(p->ctrlgrp, schemata, p->cpu_no,
>-				     p->resctrl_val);
>-	}
>-	p->num_of_runs++;
>-
>-	return ret;
>-}
>+#define MIN_DIFF_PERCENT_PER_BIT	1
> 
> static int show_results_info(__u64 sum_llc_val, int no_of_bits,
>-			     unsigned long cache_span, unsigned long max_diff,
>-			     unsigned long max_diff_percent, unsigned long num_of_runs,
>-			     bool platform)
>+			     unsigned long cache_span, long min_diff_percent,
>+			     unsigned long num_of_runs, bool platform,
>+			     __s64 *prev_avg_llc_val)
> {
> 	__u64 avg_llc_val = 0;
>-	float diff_percent;
>-	int ret;
>+	float avg_diff;
>+	int ret = 0;
> 
> 	avg_llc_val = sum_llc_val / num_of_runs;
>-	diff_percent = ((float)cache_span - avg_llc_val) / cache_span * 100;
>+	if (*prev_avg_llc_val) {
>+		float delta = (__s64)(avg_llc_val - *prev_avg_llc_val);
> 
>-	ret = platform && abs((int)diff_percent) > max_diff_percent;
>+		avg_diff = delta / *prev_avg_llc_val;
>+		ret = platform && (avg_diff * 100) < (float)min_diff_percent;
> 
>-	ksft_print_msg("%s Check cache miss rate within %lu%%\n",
>-		       ret ? "Fail:" : "Pass:", max_diff_percent);
>+		ksft_print_msg("%s Check cache miss rate changed more than %.1f%%\n",
>+			       ret ? "Fail:" : "Pass:", (float)min_diff_percent);

Shouldn't "Fail" and "Pass" be flipped in the ternary operator? Or the condition
sign above "<" should be ">"?

Now it looks like if (avg_diff * 100) is smaller than the min_diff_percent the
test is supposed to fail but the text suggests it's the other way around.

I also ran this selftest and that's the output:

# Pass: Check cache miss rate changed more than 3.0%
# Percent diff=45.8
# Number of bits: 4
# Average LLC val: 322489
# Cache span (lines): 294912
# Pass: Check cache miss rate changed more than 2.0%
# Percent diff=38.0
# Number of bits: 3
# Average LLC val: 445005
# Cache span (lines): 221184
# Pass: Check cache miss rate changed more than 1.0%
# Percent diff=27.2
# Number of bits: 2
# Average LLC val: 566145
# Cache span (lines): 147456
# Pass: Check cache miss rate changed more than 0.0%
# Percent diff=18.3
# Number of bits: 1
# Average LLC val: 669657
# Cache span (lines): 73728
ok 1 CAT: test

The diff percentages are much larger than the thresholds they're supposed to
stay within, yet the test passes.

>-	ksft_print_msg("Percent diff=%d\n", abs((int)diff_percent));
>+		ksft_print_msg("Percent diff=%.1f\n", avg_diff * 100);
>+	}
>+	*prev_avg_llc_val = avg_llc_val;
> 
> 	show_cache_info(no_of_bits, avg_llc_val, cache_span, true);
> 
> 	return ret;
> }
> 
>@@ -143,54 +168,64 @@ static int cat_test(struct resctrl_val_param *param, size_t span)
> 	if (ret)
> 		return ret;
> 
>+	buf = alloc_buffer(span, 1);
>+	if (buf == NULL)

Similar to patch 01/24, wouldn't this:
	if (!buf)
be better?

>+		return -1;
>+
  
Ilpo Järvinen Oct. 27, 2023, 12:32 p.m. UTC | #2
On Fri, 27 Oct 2023, Maciej Wieczór-Retman wrote:

> On 2023-10-24 at 12:26:26 +0300, Ilpo Järvinen wrote:
> >CAT test spawns two processes into two different control groups with
> >exclusive schemata. Both the processes alloc a buffer from memory
> >matching their allocated LLC block size and flush the entire buffer out
> >of caches. Since the processes are reading through the buffer only once
> >during the measurement and initially all the buffer was flushed, the
> >test isn't testing CAT.
> >
> >Rewrite the CAT test to allocate a buffer sized to half of LLC. Then
> >perform a sequence of tests with different LLC alloc sizes starting
> >from half of the CBM bits down to 1-bit CBM. Flush the buffer before
> >each test and read the buffer twice. Observe the LLC misses on the
> >second read through the buffer. As the allocated LLC block gets smaller
> >and smaller, the LLC misses will become larger and larger giving a
> >strong signal on CAT working properly.
> >
> >The new CAT test is using only a single process because it relies on
> >measured effect against another run of itself rather than another
> >process adding noise. The rest of the system is allocated the CBM bits
> >not used by the CAT test to keep the test isolated.
> >
> >Replace count_bits() with count_contiguous_bits() to get the first bit
> >position in order to be able to calculate masks based on it.
> >
> >This change has been tested with a number of systems from different
> >generations.
> >
> >Suggested-by: Reinette Chatre <reinette.chatre@intel.com>
> >Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
> >---
> > tools/testing/selftests/resctrl/cat_test.c  | 286 +++++++++-----------
> > tools/testing/selftests/resctrl/fill_buf.c  |   6 +-
> > tools/testing/selftests/resctrl/resctrl.h   |   5 +-
> > tools/testing/selftests/resctrl/resctrlfs.c |  44 +--
> > 4 files changed, 137 insertions(+), 204 deletions(-)
> >
> >diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c
> >index e71690a9bbb3..7518c520c5cc 100644
> >--- a/tools/testing/selftests/resctrl/cat_test.c
> >+++ b/tools/testing/selftests/resctrl/cat_test.c
> >@@ -11,65 +11,68 @@
> > #include "resctrl.h"
> > #include <unistd.h>
> > 
> >-#define RESULT_FILE_NAME1	"result_cat1"
> >-#define RESULT_FILE_NAME2	"result_cat2"
> >+#define RESULT_FILE_NAME	"result_cat"
> > #define NUM_OF_RUNS		5
> >-#define MAX_DIFF_PERCENT	4
> >-#define MAX_DIFF		1000000
> > 
> > /*
> >- * Change schemata. Write schemata to specified
> >- * con_mon grp, mon_grp in resctrl FS.
> >- * Run 5 times in order to get average values.
> >+ * Minimum difference in LLC misses between a test with n+1 bits CBM mask to
> >+ * the test with n bits. With e.g. 5 vs 4 bits in the CBM mask, the minimum
> >+ * difference must be at least MIN_DIFF_PERCENT_PER_BIT * (4 - 1) = 3 percent.
> >+ *
> >+ * The relationship between number of used CBM bits and difference in LLC
> >+ * misses is not expected to be linear. With a small number of bits, the
> >+ * margin is smaller than with larger number of bits. For selftest purposes,
> >+ * however, linear approach is enough because ultimately only pass/fail
> >+ * decision has to be made and distinction between strong and stronger
> >+ * signal is irrelevant.
> >  */
> >-static int cat_setup(struct resctrl_val_param *p)
> >-{
> >-	char schemata[64];
> >-	int ret = 0;
> >-
> >-	/* Run NUM_OF_RUNS times */
> >-	if (p->num_of_runs >= NUM_OF_RUNS)
> >-		return END_OF_TESTS;
> >-
> >-	if (p->num_of_runs == 0) {
> >-		sprintf(schemata, "%lx", p->mask);
> >-		ret = write_schemata(p->ctrlgrp, schemata, p->cpu_no,
> >-				     p->resctrl_val);
> >-	}
> >-	p->num_of_runs++;
> >-
> >-	return ret;
> >-}
> >+#define MIN_DIFF_PERCENT_PER_BIT	1
> > 
> > static int show_results_info(__u64 sum_llc_val, int no_of_bits,
> >-			     unsigned long cache_span, unsigned long max_diff,
> >-			     unsigned long max_diff_percent, unsigned long num_of_runs,
> >-			     bool platform)
> >+			     unsigned long cache_span, long min_diff_percent,
> >+			     unsigned long num_of_runs, bool platform,
> >+			     __s64 *prev_avg_llc_val)
> > {
> > 	__u64 avg_llc_val = 0;
> >-	float diff_percent;
> >-	int ret;
> >+	float avg_diff;
> >+	int ret = 0;
> > 
> > 	avg_llc_val = sum_llc_val / num_of_runs;
> >-	diff_percent = ((float)cache_span - avg_llc_val) / cache_span * 100;
> >+	if (*prev_avg_llc_val) {
> >+		float delta = (__s64)(avg_llc_val - *prev_avg_llc_val);
> > 
> >-	ret = platform && abs((int)diff_percent) > max_diff_percent;
> >+		avg_diff = delta / *prev_avg_llc_val;
> >+		ret = platform && (avg_diff * 100) < (float)min_diff_percent;
> > 
> >-	ksft_print_msg("%s Check cache miss rate within %lu%%\n",
> >-		       ret ? "Fail:" : "Pass:", max_diff_percent);
> >+		ksft_print_msg("%s Check cache miss rate changed more than %.1f%%\n",
> >+			       ret ? "Fail:" : "Pass:", (float)min_diff_percent);
> 
> Shouldn't "Fail" and "Pass" be flipped in the ternary operator? Or the condition
> sign above "<" should be ">"?

I must not touch the ret ? "Fail:" : "Pass:" logic; it's the correct way
around. If I changed it, it would break what the calling code assumes about
the return value.

(More explanation below).

> Now it looks like if (avg_diff * 100) is smaller than the min_diff_percent the
> test is supposed to fail but the text suggests it's the other way around.
>
> I also ran this selftest and that's the output:
> 
> # Pass: Check cache miss rate changed more than 3.0%
> # Percent diff=45.8
> # Number of bits: 4
> # Average LLC val: 322489
> # Cache span (lines): 294912
> # Pass: Check cache miss rate changed more than 2.0%
> # Percent diff=38.0
> # Number of bits: 3
> # Average LLC val: 445005
> # Cache span (lines): 221184
> # Pass: Check cache miss rate changed more than 1.0%
> # Percent diff=27.2
> # Number of bits: 2
> # Average LLC val: 566145
> # Cache span (lines): 147456
> # Pass: Check cache miss rate changed more than 0.0%
> # Percent diff=18.3
> # Number of bits: 1
> # Average LLC val: 669657
> # Cache span (lines): 73728
> ok 1 CAT: test
> 
> The diff percentages are much larger than the thresholds they're supposed to
> stay within, yet the test passes.

No, the whole test logic is changed dramatically by this patch and the
failure logic is reversed now because of it. Note how I also altered these
things:

- MAX_DIFF_PERCENT -> MIN_DIFF_PERCENT_PER_BIT
- max_diff_percent -> min_diff_percent
- "cache miss rate within" -> "cache miss rate changed more than"

The new CAT test measures the # of cache misses (or in case of L2 CAT 
test, LLC accesses which is used as a proxy for L2 misses). Then it takes 
one bit away from the allocation mask and repeats the measurement.

If the # of LLC misses changes more than min_diff_percent when the
number of bits in the allocation is changed, it is a strong indicator CAT
is working like it should. Based on your numbers above, I'm extremely
confident CAT works as expected!

I know for a fact that when the selftest is bound to a wrong resource id
(which actually occurs on Broadwells with CoD enabled without one of the
later patches in this series), this test is guaranteed to fail 100%;
there's no noticeable difference measured in LLC misses in that case.
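
The bit-removal step described here corresponds to next_mask() in the patch;
for a contiguous CBM it drops the topmost bit of the run on each call:

	/* From the patch: remove one bit from the consecutive CBM mask */
	static unsigned long next_mask(unsigned long current_mask)
	{
		return current_mask & (current_mask >> 1);
	}

	/* e.g. starting from a 4-bit mask: 0xf0 -> 0x70 -> 0x30 -> 0x10 -> 0,
	 * at which point the while (current_mask) loop in cat_test() stops.
	 */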

> >@@ -143,54 +168,64 @@ static int cat_test(struct resctrl_val_param *param, size_t span)
> > 	if (ret)
> > 		return ret;
> > 
> >+	buf = alloc_buffer(span, 1);
> >+	if (buf == NULL)
> 
> Similar to patch 01/24, wouldn't this:
> 	if (!buf)
> be better?

I've already changed this based on the comment you made against 1/24 :-).
  
Maciej Wieczor-Retman Oct. 31, 2023, 7:24 a.m. UTC | #3
On 2023-10-27 at 15:32:58 +0300, Ilpo Järvinen wrote:
>On Fri, 27 Oct 2023, Maciej Wieczór-Retman wrote:
>> On 2023-10-24 at 12:26:26 +0300, Ilpo Järvinen wrote:
>> >-	ksft_print_msg("%s Check cache miss rate within %lu%%\n",
>> >-		       ret ? "Fail:" : "Pass:", max_diff_percent);
>> >+		ksft_print_msg("%s Check cache miss rate changed more than %.1f%%\n",
>> >+			       ret ? "Fail:" : "Pass:", (float)min_diff_percent);
>> 
>> Shouldn't "Fail" and "Pass" be flipped in the ternary operator? Or the condition
>> sign above "<" should be ">"?
>
>I must not touch the ret ? "Fail:" : "Pass:" logic; it's the correct way
>around. If I changed it, it would break what the calling code assumes about
>the return value.
>
>(More explanation below).
>
>> Now it looks like if (avg_diff * 100) is smaller than the min_diff_percent the
>> test is supposed to fail but the text suggests it's the other way around.
>>
>> I also ran this selftest and that's the output:
>> 
>> # Pass: Check cache miss rate changed more than 3.0%
>> # Percent diff=45.8
>> # Number of bits: 4
>> # Average LLC val: 322489
>> # Cache span (lines): 294912
>> # Pass: Check cache miss rate changed more than 2.0%
>> # Percent diff=38.0
>> # Number of bits: 3
>> # Average LLC val: 445005
>> # Cache span (lines): 221184
>> # Pass: Check cache miss rate changed more than 1.0%
>> # Percent diff=27.2
>> # Number of bits: 2
>> # Average LLC val: 566145
>> # Cache span (lines): 147456
>> # Pass: Check cache miss rate changed more than 0.0%
>> # Percent diff=18.3
>> # Number of bits: 1
>> # Average LLC val: 669657
>> # Cache span (lines): 73728
>> ok 1 CAT: test
>> 
>> The diff percentages are much larger than the thresholds they're supposed to
>> stay within, yet the test passes.
>
>No, the whole test logic is changed dramatically by this patch and the
>failure logic is reversed now because of it. Note how I also altered these
>things:
>
>- MAX_DIFF_PERCENT -> MIN_DIFF_PERCENT_PER_BIT
>- max_diff_percent -> min_diff_percent
>- "cache miss rate within" -> "cache miss rate changed more than"
>
>The new CAT test measures the # of cache misses (or in case of L2 CAT 
>test, LLC accesses which is used as a proxy for L2 misses). Then it takes 
>one bit away from the allocation mask and repeats the measurement.
>
>If the # of LLC misses changes more than min_diff_percent when the
>number of bits in the allocation is changed, it is a strong indicator CAT
>is working like it should. Based on your numbers above, I'm extremely
>confident CAT works as expected!
>
>I know for a fact that when the selftest is bound to a wrong resource id
>(which actually occurs on Broadwells with CoD enabled without one of the
>later patches in this series), this test is guaranteed to fail 100%;
>there's no noticeable difference measured in LLC misses in that case.

Thanks for explaining. Looking at it again the patch makes sense and seems very
coherent.
  
Reinette Chatre Nov. 2, 2023, 5:51 p.m. UTC | #4
Hi Ilpo,

On 10/24/2023 2:26 AM, Ilpo Järvinen wrote:
> CAT test spawns two processes into two different control groups with
> exclusive schemata. Both the processes alloc a buffer from memory
> matching their allocated LLC block size and flush the entire buffer out
> of caches. Since the processes are reading through the buffer only once
> during the measurement and initially all the buffer was flushed, the
> test isn't testing CAT.
> 
> Rewrite the CAT test to allocate a buffer sized to half of LLC. Then
> perform a sequence of tests with different LLC alloc sizes starting
> from half of the CBM bits down to 1-bit CBM. Flush the buffer before
> each test and read the buffer twice. Observe the LLC misses on the
> second read through the buffer. As the allocated LLC block gets smaller
> and smaller, the LLC misses will become larger and larger giving a
> strong signal on CAT working properly.
> 
> The new CAT test is using only a single process because it relies on
> measured effect against another run of itself rather than another
> process adding noise. The rest of the system is allocated the CBM bits
> not used by the CAT test to keep the test isolated.
> 
> Replace count_bits() with count_contiguous_bits() to get the first bit
> position in order to be able to calculate masks based on it.
> 
> This change has been tested with a number of systems from different
> generations.

Thank you very much for doing this.

> 
> Suggested-by: Reinette Chatre <reinette.chatre@intel.com>
> Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
> ---
>  tools/testing/selftests/resctrl/cat_test.c  | 286 +++++++++-----------
>  tools/testing/selftests/resctrl/fill_buf.c  |   6 +-
>  tools/testing/selftests/resctrl/resctrl.h   |   5 +-
>  tools/testing/selftests/resctrl/resctrlfs.c |  44 +--
>  4 files changed, 137 insertions(+), 204 deletions(-)
> 
> diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c
> index e71690a9bbb3..7518c520c5cc 100644
> --- a/tools/testing/selftests/resctrl/cat_test.c
> +++ b/tools/testing/selftests/resctrl/cat_test.c
> @@ -11,65 +11,68 @@
>  #include "resctrl.h"
>  #include <unistd.h>
>  
> -#define RESULT_FILE_NAME1	"result_cat1"
> -#define RESULT_FILE_NAME2	"result_cat2"
> +#define RESULT_FILE_NAME	"result_cat"
>  #define NUM_OF_RUNS		5
> -#define MAX_DIFF_PERCENT	4
> -#define MAX_DIFF		1000000
>  
>  /*
> - * Change schemata. Write schemata to specified
> - * con_mon grp, mon_grp in resctrl FS.
> - * Run 5 times in order to get average values.
> + * Minimum difference in LLC misses between a test with n+1 bits CBM mask to
> + * the test with n bits. With e.g. 5 vs 4 bits in the CBM mask, the minimum
> + * difference must be at least MIN_DIFF_PERCENT_PER_BIT * (4 - 1) = 3 percent.

This formula is not clear to me. In the code the formula is always:
MIN_DIFF_PERCENT_PER_BIT * (bits - 1) ... is the "-1" because it always
decrements the number of bits tested by one? So, for example, if testing
5 then 3 bits it would be  MIN_DIFF_PERCENT_PER_BIT * (3 - 2)?
Would above example thus be:
MIN_DIFF_PERCENT_PER_BIT * (4 - (5 - 4)) = 3 ?



> + *
> + * The relationship between number of used CBM bits and difference in LLC
> + * misses is not expected to be linear. With a small number of bits, the
> + * margin is smaller than with larger number of bits. For selftest purposes,
> + * however, linear approach is enough because ultimately only pass/fail
> + * decision has to be made and distinction between strong and stronger
> + * signal is irrelevant.
>   */

...

>  /*
>   * cat_test:	execute CAT benchmark and measure LLC cache misses
>   * @param:	parameters passed to cat_test()
>   * @span:	buffer size for the benchmark
> + * @current_mask	start mask for the first iteration
> + *
> + * Run CAT test, bits are removed one-by-one from the current_mask for each
> + * subsequent test.
>   *

Could this please be expanded to provide more details about how test works,
measurements taken, and how pass/fail is determined?

> - * Return:	0 on success. non-zero on failure.
> + * Return:		0 on success. non-zero on failure.

Is non-zero specific enough? Does that mean that <0 and >0 are failure?

>   */
> -static int cat_test(struct resctrl_val_param *param, size_t span)
> +static int cat_test(struct resctrl_val_param *param, size_t span, unsigned long current_mask)
>  {
> -	int memflush = 1, operation = 0, ret = 0;
>  	char *resctrl_val = param->resctrl_val;
>  	static struct perf_event_read pe_read;
>  	struct perf_event_attr pea;
> +	unsigned char *buf;
> +	char schemata[64];
> +	int ret, i, pe_fd;
>  	pid_t bm_pid;
> -	int pe_fd;
>  
>  	if (strcmp(param->filename, "") == 0)
>  		sprintf(param->filename, "stdio");
> @@ -143,54 +168,64 @@ static int cat_test(struct resctrl_val_param *param, size_t span)
>  	if (ret)
>  		return ret;
>  
> +	buf = alloc_buffer(span, 1);
> +	if (buf == NULL)
> +		return -1;
> +
>  	perf_event_attr_initialize(&pea, PERF_COUNT_HW_CACHE_MISSES);
>  	perf_event_initialize_read_format(&pe_read);
>  
> -	/* Test runs until the callback setup() tells the test to stop. */
> -	while (1) {
> -		ret = param->setup(param);
> -		if (ret == END_OF_TESTS) {
> -			ret = 0;
> -			break;
> -		}
> -		if (ret < 0)
> -			break;
> -		pe_fd = perf_event_reset_enable(&pea, bm_pid, param->cpu_no);
> -		if (pe_fd < 0) {
> -			ret = -1;
> -			break;
> -		}
> +	while (current_mask) {
> +		snprintf(schemata, sizeof(schemata), "%lx", param->mask & ~current_mask);
> +		ret = write_schemata("", schemata, param->cpu_no, param->resctrl_val);
> +		if (ret)
> +			goto free_buf;
> +		snprintf(schemata, sizeof(schemata), "%lx", current_mask);
> +		ret = write_schemata(param->ctrlgrp, schemata, param->cpu_no, param->resctrl_val);
> +		if (ret)
> +			goto free_buf;
> +
> +		for (i = 0; i < NUM_OF_RUNS; i++) {
> +			mem_flush(buf, span);
> +			ret = fill_cache_read(buf, span, true);
> +			if (ret)
> +				goto free_buf;
> +
> +			pe_fd = perf_event_reset_enable(&pea, bm_pid, param->cpu_no);
> +			if (pe_fd < 0) {
> +				ret = -1;
> +				goto free_buf;
> +			}

It seems to me that the perf counters are reconfigured at every iteration.
Can it not just be configured once and then the counters just reset and
enabled at each iteration? I'd expect this additional work to impact
values measured.

>  
> -		if (run_fill_buf(span, memflush, operation, true)) {
> -			fprintf(stderr, "Error-running fill buffer\n");
> -			ret = -1;
> -			goto pe_close;
> -		}
> +			fill_cache_read(buf, span, true);
>  
> -		sleep(1);
> -		ret = perf_event_measure(pe_fd, &pe_read, param, bm_pid);
> -		if (ret)
> -			goto pe_close;
> +			ret = perf_event_measure(pe_fd, &pe_read, param, bm_pid);
> +			if (ret)
> +				goto pe_close;
>  
> -		close(pe_fd);
> +			close(pe_fd);
> +		}
> +		current_mask = next_mask(current_mask);
>  	}
>  
> +free_buf:
> +	free(buf);
> +
>  	return ret;
>  
>  pe_close:
>  	close(pe_fd);
> -	return ret;
> +	goto free_buf;
>  }
>  

Reinette
  
Ilpo Järvinen Nov. 3, 2023, 10:57 a.m. UTC | #5
On Thu, 2 Nov 2023, Reinette Chatre wrote:
> On 10/24/2023 2:26 AM, Ilpo Järvinen wrote:
> > CAT test spawns two processes into two different control groups with
> > exclusive schemata. Both the processes alloc a buffer from memory
> > matching their allocated LLC block size and flush the entire buffer out
> > of caches. Since the processes are reading through the buffer only once
> > during the measurement and initially all the buffer was flushed, the
> > test isn't testing CAT.
> > 
> > Rewrite the CAT test to allocate a buffer sized to half of LLC. Then
> > perform a sequence of tests with different LLC alloc sizes starting
> > from half of the CBM bits down to 1-bit CBM. Flush the buffer before
> > each test and read the buffer twice. Observe the LLC misses on the
> > second read through the buffer. As the allocated LLC block gets smaller
> > and smaller, the LLC misses will become larger and larger giving a
> > strong signal on CAT working properly.
> > 
> > The new CAT test is using only a single process because it relies on
> > measured effect against another run of itself rather than another
> > process adding noise. The rest of the system is allocated the CBM bits
> > not used by the CAT test to keep the test isolated.
> > 
> > Replace count_bits() with count_contiguous_bits() to get the first bit
> > position in order to be able to calculate masks based on it.
> > 
> > This change has been tested with a number of systems from different
> > generations.
> 
> Thank you very much for doing this.
> 
> > 
> > Suggested-by: Reinette Chatre <reinette.chatre@intel.com>
> > Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
> > ---
> >  tools/testing/selftests/resctrl/cat_test.c  | 286 +++++++++-----------
> >  tools/testing/selftests/resctrl/fill_buf.c  |   6 +-
> >  tools/testing/selftests/resctrl/resctrl.h   |   5 +-
> >  tools/testing/selftests/resctrl/resctrlfs.c |  44 +--
> >  4 files changed, 137 insertions(+), 204 deletions(-)
> > 
> > diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c
> > index e71690a9bbb3..7518c520c5cc 100644
> > --- a/tools/testing/selftests/resctrl/cat_test.c
> > +++ b/tools/testing/selftests/resctrl/cat_test.c
> > @@ -11,65 +11,68 @@
> >  #include "resctrl.h"
> >  #include <unistd.h>
> >  
> > -#define RESULT_FILE_NAME1	"result_cat1"
> > -#define RESULT_FILE_NAME2	"result_cat2"
> > +#define RESULT_FILE_NAME	"result_cat"
> >  #define NUM_OF_RUNS		5
> > -#define MAX_DIFF_PERCENT	4
> > -#define MAX_DIFF		1000000
> >  
> >  /*
> > - * Change schemata. Write schemata to specified
> > - * con_mon grp, mon_grp in resctrl FS.
> > - * Run 5 times in order to get average values.
> > + * Minimum difference in LLC misses between a test with n+1 bits CBM mask to
> > + * the test with n bits. With e.g. 5 vs 4 bits in the CBM mask, the minimum
> > + * difference must be at least MIN_DIFF_PERCENT_PER_BIT * (4 - 1) = 3 percent.
> 
> This formula is not clear to me. In the code the formula is always:
> MIN_DIFF_PERCENT_PER_BIT * (bits - 1) ... is the "-1" because it always
> decrements the number of bits tested by one?

No, the -1 is not related to decrementing the bits by one, but to setting a
boundary which works for 1-bit masks. In general, the smaller the number of
bits in the allocation mask is, the less change there will be between the
n+1 -> n bits results. When n is 1, the result on some platforms is close to
zero, so I had to make the minimum diff allow it. Thus, n-1 sets the failure
threshold at 0%.
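
Concretely, with MIN_DIFF_PERCENT_PER_BIT = 1 and the threshold computed as
MIN_DIFF_PERCENT_PER_BIT * (bits - 1) in check_results(), the per-step limits
work out as:

	bits = 4:  1 * (4 - 1) = 3%
	bits = 3:  1 * (3 - 1) = 2%
	bits = 2:  1 * (2 - 1) = 1%
	bits = 1:  1 * (1 - 1) = 0%

which matches the 3.0%, 2.0%, 1.0% and 0.0% limits in the test output quoted
earlier in the thread.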

> So, for example, if testing
> 5 then 3 bits it would be  MIN_DIFF_PERCENT_PER_BIT * (3 - 2)?
> Would above example thus be:
> MIN_DIFF_PERCENT_PER_BIT * (4 - (5 - 4)) = 3 ?

I suspect you're overthinking it here. The CAT selftest currently doesn't 
jump from 5 to 3 bits so I don't know what you're trying to calculate 
here.

> > - * Return:	0 on success. non-zero on failure.
> > + * Return:		0 on success. non-zero on failure.
> 
> Is non-zero specific enough? Does that mean that <0 and >0 are failure?

I suspect it is, after all the cleanups and fixes that have been done.
The wording is from the original though.

> >   */
> > -static int cat_test(struct resctrl_val_param *param, size_t span)
> > +static int cat_test(struct resctrl_val_param *param, size_t span, unsigned long current_mask)
> >  {
> > -	int memflush = 1, operation = 0, ret = 0;
> >  	char *resctrl_val = param->resctrl_val;
> >  	static struct perf_event_read pe_read;
> >  	struct perf_event_attr pea;
> > +	unsigned char *buf;
> > +	char schemata[64];
> > +	int ret, i, pe_fd;
> >  	pid_t bm_pid;
> > -	int pe_fd;
> >  
> >  	if (strcmp(param->filename, "") == 0)
> >  		sprintf(param->filename, "stdio");
> > @@ -143,54 +168,64 @@ static int cat_test(struct resctrl_val_param *param, size_t span)
> >  	if (ret)
> >  		return ret;
> >  
> > +	buf = alloc_buffer(span, 1);
> > +	if (buf == NULL)
> > +		return -1;
> > +
> >  	perf_event_attr_initialize(&pea, PERF_COUNT_HW_CACHE_MISSES);
> >  	perf_event_initialize_read_format(&pe_read);
> >  
> > -	/* Test runs until the callback setup() tells the test to stop. */
> > -	while (1) {
> > -		ret = param->setup(param);
> > -		if (ret == END_OF_TESTS) {
> > -			ret = 0;
> > -			break;
> > -		}
> > -		if (ret < 0)
> > -			break;
> > -		pe_fd = perf_event_reset_enable(&pea, bm_pid, param->cpu_no);
> > -		if (pe_fd < 0) {
> > -			ret = -1;
> > -			break;
> > -		}
> > +	while (current_mask) {
> > +		snprintf(schemata, sizeof(schemata), "%lx", param->mask & ~current_mask);
> > +		ret = write_schemata("", schemata, param->cpu_no, param->resctrl_val);
> > +		if (ret)
> > +			goto free_buf;
> > +		snprintf(schemata, sizeof(schemata), "%lx", current_mask);
> > +		ret = write_schemata(param->ctrlgrp, schemata, param->cpu_no, param->resctrl_val);
> > +		if (ret)
> > +			goto free_buf;
> > +
> > +		for (i = 0; i < NUM_OF_RUNS; i++) {
> > +			mem_flush(buf, span);
> > +			ret = fill_cache_read(buf, span, true);
> > +			if (ret)
> > +				goto free_buf;
> > +
> > +			pe_fd = perf_event_reset_enable(&pea, bm_pid, param->cpu_no);
> > +			if (pe_fd < 0) {
> > +				ret = -1;
> > +				goto free_buf;
> > +			}
> 
> It seems to me that the perf counters are reconfigured at every iteration.
> Can it not just be configured once and then the counters just reset and
> enabled at each iteration? I'd expect this additional work to impact
> values measured.

So you suggest I undo one of the changes made in 10/24 and just call the 
function which does only the ioctl() calls? I don't know why it has been 
done the way it has been, I can try to change it and see what happens.
  
Reinette Chatre Nov. 3, 2023, 10:50 p.m. UTC | #6
Hi Ilpo,

On 11/3/2023 3:57 AM, Ilpo Järvinen wrote:
> On Thu, 2 Nov 2023, Reinette Chatre wrote:
>> On 10/24/2023 2:26 AM, Ilpo Järvinen wrote:

...

>>>  /*
>>> - * Change schemata. Write schemata to specified
>>> - * con_mon grp, mon_grp in resctrl FS.
>>> - * Run 5 times in order to get average values.
>>> + * Minimum difference in LLC misses between a test with n+1 bits CBM mask to
>>> + * the test with n bits. With e.g. 5 vs 4 bits in the CBM mask, the minimum
>>> + * difference must be at least MIN_DIFF_PERCENT_PER_BIT * (4 - 1) = 3 percent.
>>
>> This formula is not clear to me. In the code the formula is always:
>> MIN_DIFF_PERCENT_PER_BIT * (bits - 1) ... is the "-1" because it always
>> decrements the number of bits tested by one?
> 
> No, the -1 is not related to decrementing the bits by one, but to setting a
> boundary which works for 1-bit masks. In general, the smaller the number of
> bits in the allocation mask is, the less change there will be between the
> n+1 -> n bits results. When n is 1, the result on some platforms is close to
> zero, so I had to make the minimum diff allow it. Thus, n-1 sets the failure
> threshold at 0%.
> 
>> So, for example, if testing
>> 5 then 3 bits it would be  MIN_DIFF_PERCENT_PER_BIT * (3 - 2)?
>> Would above example thus be:
>> MIN_DIFF_PERCENT_PER_BIT * (4 - (5 - 4)) = 3 ?
> 
> I suspect you're overthinking it here. The CAT selftest currently doesn't 
> jump from 5 to 3 bits so I don't know what you're trying to calculate 
> here.

I was trying to understand the formula. The example starts with "e.g. 5 vs 4 bits
in the CBM mask ..." and then jumps to "MIN_DIFF_PERCENT_PER_BIT * (4 - 1) = 3".
It is not obvious to me how the "5" and "4" from the example produce the
resulting formula.

Perhaps it would be simpler (to me) to say something like 

	Minimum difference in LLC misses between a test with n+1 bits CBM mask to
	the test with n bits is MIN_DIFF_PERCENT_PER_BIT * (n - 1). With e.g. 5 vs 4
	bits in the CBM mask, the minimum difference must be at least
	MIN_DIFF_PERCENT_PER_BIT * (4 - 1) = 3 percent.

> 
>>> - * Return:	0 on success. non-zero on failure.
>>> + * Return:		0 on success. non-zero on failure.
>>
>> Is non-zero specific enough? Does that mean that <0 and >0 are failure?
> 
> I suspect it is, after all the cleanups and fixes that have been done.
> The wording is from the original though.
> 
>>>   */
>>> -static int cat_test(struct resctrl_val_param *param, size_t span)
>>> +static int cat_test(struct resctrl_val_param *param, size_t span, unsigned long current_mask)
>>>  {
>>> -	int memflush = 1, operation = 0, ret = 0;
>>>  	char *resctrl_val = param->resctrl_val;
>>>  	static struct perf_event_read pe_read;
>>>  	struct perf_event_attr pea;
>>> +	unsigned char *buf;
>>> +	char schemata[64];
>>> +	int ret, i, pe_fd;
>>>  	pid_t bm_pid;
>>> -	int pe_fd;
>>>  
>>>  	if (strcmp(param->filename, "") == 0)
>>>  		sprintf(param->filename, "stdio");
>>> @@ -143,54 +168,64 @@ static int cat_test(struct resctrl_val_param *param, size_t span)
>>>  	if (ret)
>>>  		return ret;
>>>  
>>> +	buf = alloc_buffer(span, 1);
>>> +	if (buf == NULL)
>>> +		return -1;
>>> +
>>>  	perf_event_attr_initialize(&pea, PERF_COUNT_HW_CACHE_MISSES);
>>>  	perf_event_initialize_read_format(&pe_read);
>>>  
>>> -	/* Test runs until the callback setup() tells the test to stop. */
>>> -	while (1) {
>>> -		ret = param->setup(param);
>>> -		if (ret == END_OF_TESTS) {
>>> -			ret = 0;
>>> -			break;
>>> -		}
>>> -		if (ret < 0)
>>> -			break;
>>> -		pe_fd = perf_event_reset_enable(&pea, bm_pid, param->cpu_no);
>>> -		if (pe_fd < 0) {
>>> -			ret = -1;
>>> -			break;
>>> -		}
>>> +	while (current_mask) {
>>> +		snprintf(schemata, sizeof(schemata), "%lx", param->mask & ~current_mask);
>>> +		ret = write_schemata("", schemata, param->cpu_no, param->resctrl_val);
>>> +		if (ret)
>>> +			goto free_buf;
>>> +		snprintf(schemata, sizeof(schemata), "%lx", current_mask);
>>> +		ret = write_schemata(param->ctrlgrp, schemata, param->cpu_no, param->resctrl_val);
>>> +		if (ret)
>>> +			goto free_buf;
>>> +
>>> +		for (i = 0; i < NUM_OF_RUNS; i++) {
>>> +			mem_flush(buf, span);
>>> +			ret = fill_cache_read(buf, span, true);
>>> +			if (ret)
>>> +				goto free_buf;
>>> +
>>> +			pe_fd = perf_event_reset_enable(&pea, bm_pid, param->cpu_no);
>>> +			if (pe_fd < 0) {
>>> +				ret = -1;
>>> +				goto free_buf;
>>> +			}
>>
>> It seems to me that the perf counters are reconfigured at every iteration.
>> Can it not just be configured once and then the counters just reset and
>> enabled at each iteration? I'd expect this additional work to impact
>> values measured.
> 
> So you suggest I undo one of the changes made in 10/24 and just call the 
> function which does only the ioctl() calls? I don't know why it has been 
> done the way it has been, I can try to change it and see what happens.

The relationships between the functions are not as obvious to me but as
far as execution is concerned I am indeed suggesting that only the ioctl()s
to reset and enable counters are repeated in the loop. From what I understand
the counters need only be configured once.
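
A minimal sketch of the configure-once pattern suggested here, assuming the
event is opened with the selftest's perf_event_open() wrapper before the loop
(the exact helper split is an assumption, not something prescribed by this
series):

	/* Open and configure the LLC miss event once, before the measurement loop */
	pe_fd = perf_event_open(&pea, bm_pid, param->cpu_no, -1, 0);
	if (pe_fd < 0)
		return -1;

	while (current_mask) {
		...
		for (i = 0; i < NUM_OF_RUNS; i++) {
			mem_flush(buf, span);
			fill_cache_read(buf, span, true);

			/* Per iteration, only reset and re-enable the existing counter */
			ioctl(pe_fd, PERF_EVENT_IOC_RESET, 0);
			ioctl(pe_fd, PERF_EVENT_IOC_ENABLE, 0);

			fill_cache_read(buf, span, true);

			ioctl(pe_fd, PERF_EVENT_IOC_DISABLE, 0);
			/* read() the counter value from pe_fd */
		}
		current_mask = next_mask(current_mask);
	}
	close(pe_fd);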

Reinette
  

Patch

diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c
index e71690a9bbb3..7518c520c5cc 100644
--- a/tools/testing/selftests/resctrl/cat_test.c
+++ b/tools/testing/selftests/resctrl/cat_test.c
@@ -11,65 +11,68 @@ 
 #include "resctrl.h"
 #include <unistd.h>
 
-#define RESULT_FILE_NAME1	"result_cat1"
-#define RESULT_FILE_NAME2	"result_cat2"
+#define RESULT_FILE_NAME	"result_cat"
 #define NUM_OF_RUNS		5
-#define MAX_DIFF_PERCENT	4
-#define MAX_DIFF		1000000
 
 /*
- * Change schemata. Write schemata to specified
- * con_mon grp, mon_grp in resctrl FS.
- * Run 5 times in order to get average values.
+ * Minimum difference in LLC misses between a test with n+1 bits CBM mask to
+ * the test with n bits. With e.g. 5 vs 4 bits in the CBM mask, the minimum
+ * difference must be at least MIN_DIFF_PERCENT_PER_BIT * (4 - 1) = 3 percent.
+ *
+ * The relationship between number of used CBM bits and difference in LLC
+ * misses is not expected to be linear. With a small number of bits, the
+ * margin is smaller than with larger number of bits. For selftest purposes,
+ * however, linear approach is enough because ultimately only pass/fail
+ * decision has to be made and distinction between strong and stronger
+ * signal is irrelevant.
  */
-static int cat_setup(struct resctrl_val_param *p)
-{
-	char schemata[64];
-	int ret = 0;
-
-	/* Run NUM_OF_RUNS times */
-	if (p->num_of_runs >= NUM_OF_RUNS)
-		return END_OF_TESTS;
-
-	if (p->num_of_runs == 0) {
-		sprintf(schemata, "%lx", p->mask);
-		ret = write_schemata(p->ctrlgrp, schemata, p->cpu_no,
-				     p->resctrl_val);
-	}
-	p->num_of_runs++;
-
-	return ret;
-}
+#define MIN_DIFF_PERCENT_PER_BIT	1
 
 static int show_results_info(__u64 sum_llc_val, int no_of_bits,
-			     unsigned long cache_span, unsigned long max_diff,
-			     unsigned long max_diff_percent, unsigned long num_of_runs,
-			     bool platform)
+			     unsigned long cache_span, long min_diff_percent,
+			     unsigned long num_of_runs, bool platform,
+			     __s64 *prev_avg_llc_val)
 {
 	__u64 avg_llc_val = 0;
-	float diff_percent;
-	int ret;
+	float avg_diff;
+	int ret = 0;
 
 	avg_llc_val = sum_llc_val / num_of_runs;
-	diff_percent = ((float)cache_span - avg_llc_val) / cache_span * 100;
+	if (*prev_avg_llc_val) {
+		float delta = (__s64)(avg_llc_val - *prev_avg_llc_val);
 
-	ret = platform && abs((int)diff_percent) > max_diff_percent;
+		avg_diff = delta / *prev_avg_llc_val;
+		ret = platform && (avg_diff * 100) < (float)min_diff_percent;
 
-	ksft_print_msg("%s Check cache miss rate within %lu%%\n",
-		       ret ? "Fail:" : "Pass:", max_diff_percent);
+		ksft_print_msg("%s Check cache miss rate changed more than %.1f%%\n",
+			       ret ? "Fail:" : "Pass:", (float)min_diff_percent);
 
-	ksft_print_msg("Percent diff=%d\n", abs((int)diff_percent));
+		ksft_print_msg("Percent diff=%.1f\n", avg_diff * 100);
+	}
+	*prev_avg_llc_val = avg_llc_val;
 
 	show_cache_info(no_of_bits, avg_llc_val, cache_span, true);
 
 	return ret;
 }
 
-static int check_results(struct resctrl_val_param *param, size_t span)
+/* Remove one bit from the consecutive cbm mask */
+static unsigned long next_mask(unsigned long current_mask)
+{
+	return current_mask & (current_mask >> 1);
+}
+
+static int check_results(struct resctrl_val_param *param, const char *cache_type,
+			 unsigned long cache_total_size, unsigned long full_cache_mask,
+			 unsigned long current_mask)
 {
 	char *token_array[8], temp[512];
 	__u64 sum_llc_perf_miss = 0;
-	int runs = 0, no_of_bits = 0;
+	unsigned long alloc_size;
+	__s64 prev_avg_llc_val = 0;
+	int runs = 0;
+	int fail = 0;
+	int ret;
 	FILE *fp;
 
 	ksft_print_msg("Checking for pass/fail\n");
@@ -83,49 +86,71 @@  static int check_results(struct resctrl_val_param *param, size_t span)
 	while (fgets(temp, sizeof(temp), fp)) {
 		char *token = strtok(temp, ":\t");
 		int fields = 0;
+		int bits;
 
 		while (token) {
 			token_array[fields++] = token;
 			token = strtok(NULL, ":\t");
 		}
-		/*
-		 * Discard the first value which is inaccurate due to monitoring
-		 * setup transition phase.
-		 */
-		if (runs > 0)
-			sum_llc_perf_miss += strtoull(token_array[3], NULL, 0);
+
+		sum_llc_perf_miss += strtoull(token_array[3], NULL, 0);
 		runs++;
+
+		if (runs < NUM_OF_RUNS)
+			continue;
+
+		if (!current_mask) {
+			ksft_print_msg("Unexpected empty cache mask\n");
+			break;
+		}
+
+		alloc_size = cache_size(cache_total_size, current_mask, full_cache_mask);
+
+		bits = count_bits(current_mask);
+
+		ret = show_results_info(sum_llc_perf_miss, bits,
+					alloc_size / 64,
+					MIN_DIFF_PERCENT_PER_BIT * (bits - 1), runs,
+					get_vendor() == ARCH_INTEL,
+					&prev_avg_llc_val);
+		if (ret)
+			fail = 1;
+
+		runs = 0;
+		sum_llc_perf_miss = 0;
+		current_mask = next_mask(current_mask);
 	}
 
 	fclose(fp);
-	no_of_bits = count_bits(param->mask);
 
-	return show_results_info(sum_llc_perf_miss, no_of_bits, span / 64,
-				 MAX_DIFF, MAX_DIFF_PERCENT, runs - 1,
-				 get_vendor() == ARCH_INTEL);
+	return fail;
 }
 
 void cat_test_cleanup(void)
 {
-	remove(RESULT_FILE_NAME1);
-	remove(RESULT_FILE_NAME2);
+	remove(RESULT_FILE_NAME);
 }
 
 /*
  * cat_test:	execute CAT benchmark and measure LLC cache misses
  * @param:	parameters passed to cat_test()
  * @span:	buffer size for the benchmark
+ * @current_mask	start mask for the first iteration
+ *
+ * Run CAT test, bits are removed one-by-one from the current_mask for each
+ * subsequent test.
  *
- * Return:	0 on success. non-zero on failure.
+ * Return:		0 on success. non-zero on failure.
  */
-static int cat_test(struct resctrl_val_param *param, size_t span)
+static int cat_test(struct resctrl_val_param *param, size_t span, unsigned long current_mask)
 {
-	int memflush = 1, operation = 0, ret = 0;
 	char *resctrl_val = param->resctrl_val;
 	static struct perf_event_read pe_read;
 	struct perf_event_attr pea;
+	unsigned char *buf;
+	char schemata[64];
+	int ret, i, pe_fd;
 	pid_t bm_pid;
-	int pe_fd;
 
 	if (strcmp(param->filename, "") == 0)
 		sprintf(param->filename, "stdio");
@@ -143,54 +168,64 @@  static int cat_test(struct resctrl_val_param *param, size_t span)
 	if (ret)
 		return ret;
 
+	buf = alloc_buffer(span, 1);
+	if (buf == NULL)
+		return -1;
+
 	perf_event_attr_initialize(&pea, PERF_COUNT_HW_CACHE_MISSES);
 	perf_event_initialize_read_format(&pe_read);
 
-	/* Test runs until the callback setup() tells the test to stop. */
-	while (1) {
-		ret = param->setup(param);
-		if (ret == END_OF_TESTS) {
-			ret = 0;
-			break;
-		}
-		if (ret < 0)
-			break;
-		pe_fd = perf_event_reset_enable(&pea, bm_pid, param->cpu_no);
-		if (pe_fd < 0) {
-			ret = -1;
-			break;
-		}
+	while (current_mask) {
+		snprintf(schemata, sizeof(schemata), "%lx", param->mask & ~current_mask);
+		ret = write_schemata("", schemata, param->cpu_no, param->resctrl_val);
+		if (ret)
+			goto free_buf;
+		snprintf(schemata, sizeof(schemata), "%lx", current_mask);
+		ret = write_schemata(param->ctrlgrp, schemata, param->cpu_no, param->resctrl_val);
+		if (ret)
+			goto free_buf;
+
+		for (i = 0; i < NUM_OF_RUNS; i++) {
+			mem_flush(buf, span);
+			ret = fill_cache_read(buf, span, true);
+			if (ret)
+				goto free_buf;
+
+			pe_fd = perf_event_reset_enable(&pea, bm_pid, param->cpu_no);
+			if (pe_fd < 0) {
+				ret = -1;
+				goto free_buf;
+			}
 
-		if (run_fill_buf(span, memflush, operation, true)) {
-			fprintf(stderr, "Error-running fill buffer\n");
-			ret = -1;
-			goto pe_close;
-		}
+			fill_cache_read(buf, span, true);
 
-		sleep(1);
-		ret = perf_event_measure(pe_fd, &pe_read, param, bm_pid);
-		if (ret)
-			goto pe_close;
+			ret = perf_event_measure(pe_fd, &pe_read, param, bm_pid);
+			if (ret)
+				goto pe_close;
 
-		close(pe_fd);
+			close(pe_fd);
+		}
+		current_mask = next_mask(current_mask);
 	}
 
+free_buf:
+	free(buf);
+
 	return ret;
 
 pe_close:
 	close(pe_fd);
-	return ret;
+	goto free_buf;
 }
 
 int cat_perf_miss_val(int cpu_no, int n, char *cache_type)
 {
-	unsigned long l_mask, l_mask_1;
-	int ret, pipefd[2], sibling_cpu_no;
+	unsigned long long_mask, start_mask, full_cache_mask;
 	unsigned long cache_total_size = 0;
-	unsigned long full_cache_mask, long_mask;
+	unsigned int start;
 	int count_of_bits;
-	char pipe_message;
 	size_t span;
+	int ret;
 
 	/* Get default cbm mask for L3/L2 cache */
 	ret = get_cbm_mask(cache_type, &full_cache_mask);
@@ -207,8 +242,7 @@  int cat_perf_miss_val(int cpu_no, int n, char *cache_type)
 		return ret;
 	ksft_print_msg("Cache size :%lu\n", cache_total_size);
 
-	/* Get max number of bits from default-cabm mask */
-	count_of_bits = count_bits(long_mask);
+	count_of_bits = count_contiguous_bits(long_mask, &start);
 
 	if (!n)
 		n = count_of_bits / 2;
@@ -219,88 +253,26 @@  int cat_perf_miss_val(int cpu_no, int n, char *cache_type)
 			       count_of_bits - 1);
 		return -1;
 	}
-
-	/* Get core id from same socket for running another thread */
-	sibling_cpu_no = get_core_sibling(cpu_no);
-	if (sibling_cpu_no < 0)
-		return -1;
+	start_mask = create_bit_mask(start, n);
 
 	struct resctrl_val_param param = {
 		.resctrl_val	= CAT_STR,
 		.cpu_no		= cpu_no,
-		.setup		= cat_setup,
+		.ctrlgrp	= "c1",
+		.filename	= RESULT_FILE_NAME,
+		.num_of_runs	= 0,
 	};
-
-	l_mask = long_mask >> n;
-	l_mask_1 = ~l_mask & long_mask;
-
-	/* Set param values for parent thread which will be allocated bitmask
-	 * with (max_bits - n) bits
-	 */
-	span = cache_size(cache_total_size, l_mask, full_cache_mask);
-	strcpy(param.ctrlgrp, "c2");
-	strcpy(param.mongrp, "m2");
-	strcpy(param.filename, RESULT_FILE_NAME2);
-	param.mask = l_mask;
-	param.num_of_runs = 0;
-
-	if (pipe(pipefd)) {
-		perror("# Unable to create pipe");
-		return errno;
-	}
-
-	fflush(stdout);
-	bm_pid = fork();
-
-	/* Set param values for child thread which will be allocated bitmask
-	 * with n bits
-	 */
-	if (bm_pid == 0) {
-		param.mask = l_mask_1;
-		strcpy(param.ctrlgrp, "c1");
-		strcpy(param.mongrp, "m1");
-		span = cache_size(cache_total_size, l_mask_1, full_cache_mask);
-		strcpy(param.filename, RESULT_FILE_NAME1);
-		param.num_of_runs = 0;
-		param.cpu_no = sibling_cpu_no;
-	}
+	param.mask = long_mask;
+	span = cache_size(cache_total_size, start_mask, full_cache_mask);
 
 	remove(param.filename);
 
-	ret = cat_test(&param, span);
-	if (ret == 0)
-		ret = check_results(&param, span);
-
-	if (bm_pid == 0) {
-		/* Tell parent that child is ready */
-		close(pipefd[0]);
-		pipe_message = 1;
-		if (write(pipefd[1], &pipe_message, sizeof(pipe_message)) <
-		    sizeof(pipe_message))
-			/*
-			 * Just print the error message.
-			 * Let while(1) run and wait for itself to be killed.
-			 */
-			perror("# failed signaling parent process");
-
-		close(pipefd[1]);
-		while (1)
-			;
-	} else {
-		/* Parent waits for child to be ready. */
-		close(pipefd[1]);
-		pipe_message = 0;
-		while (pipe_message != 1) {
-			if (read(pipefd[0], &pipe_message,
-				 sizeof(pipe_message)) < sizeof(pipe_message)) {
-				perror("# failed reading from child process");
-				break;
-			}
-		}
-		close(pipefd[0]);
-		kill(bm_pid, SIGKILL);
-	}
+	ret = cat_test(&param, span, start_mask);
+	if (ret)
+		goto out;
 
+	ret = check_results(&param, cache_type, cache_total_size, full_cache_mask, start_mask);
+out:
 	cat_test_cleanup();
 
 	return ret;
diff --git a/tools/testing/selftests/resctrl/fill_buf.c b/tools/testing/selftests/resctrl/fill_buf.c
index 326d530425d0..3dbb71371715 100644
--- a/tools/testing/selftests/resctrl/fill_buf.c
+++ b/tools/testing/selftests/resctrl/fill_buf.c
@@ -38,7 +38,7 @@  static void cl_flush(void *p)
 #endif
 }
 
-static void mem_flush(unsigned char *buf, size_t buf_size)
+void mem_flush(unsigned char *buf, size_t buf_size)
 {
 	unsigned char *cp = buf;
 	size_t i = 0;
@@ -100,7 +100,7 @@  static void fill_one_span_write(unsigned char *buf, size_t buf_size)
 	}
 }
 
-static int fill_cache_read(unsigned char *buf, size_t buf_size, bool once)
+int fill_cache_read(unsigned char *buf, size_t buf_size, bool once)
 {
 	int ret = 0;
 	FILE *fp;
@@ -134,7 +134,7 @@  static int fill_cache_write(unsigned char *buf, size_t buf_size, bool once)
 	return 0;
 }
 
-static unsigned char *alloc_buffer(size_t buf_size, int memflush)
+unsigned char *alloc_buffer(size_t buf_size, int memflush)
 {
 	void *p = NULL;
 	uint64_t *p64;
diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
index ee3cee74a69c..927f696e0ab7 100644
--- a/tools/testing/selftests/resctrl/resctrl.h
+++ b/tools/testing/selftests/resctrl/resctrl.h
@@ -99,6 +99,9 @@  int write_bm_pid_to_resctrl(pid_t bm_pid, char *ctrlgrp, char *mongrp,
 			    char *resctrl_val);
 int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu,
 		    int group_fd, unsigned long flags);
+unsigned char *alloc_buffer(size_t buf_size, int memflush);
+void mem_flush(unsigned char *buf, size_t buf_size);
+int fill_cache_read(unsigned char *buf, size_t buf_size, bool once);
 int run_fill_buf(size_t buf_size, int memflush, int op, bool once);
 int resctrl_val(const char * const *benchmark_cmd, struct resctrl_val_param *param);
 int mbm_bw_change(int cpu_no, const char * const *benchmark_cmd);
@@ -107,6 +110,7 @@  void mbm_test_cleanup(void);
 int mba_schemata_change(int cpu_no, const char * const *benchmark_cmd);
 void mba_test_cleanup(void);
 unsigned long create_bit_mask(unsigned int start, unsigned int len);
+unsigned int count_contiguous_bits(unsigned long val, unsigned int *start);
 int get_cbm_mask(const char *cache_type, unsigned long *mask);
 int get_shareable_mask(const char *cache_type, unsigned long *shareable_mask);
 int get_mask_no_shareable(const char *cache_type, unsigned long *mask);
@@ -119,7 +123,6 @@  int cat_perf_miss_val(int cpu_no, int no_of_bits, char *cache_type);
 int cmt_resctrl_val(int cpu_no, int n, const char * const *benchmark_cmd);
 unsigned int count_bits(unsigned long n);
 void cmt_test_cleanup(void);
-int get_core_sibling(int cpu_no);
 
 void perf_event_attr_initialize(struct perf_event_attr *pea, __u64 config);
 void perf_event_initialize_read_format(struct perf_event_read *pe_read);
diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c
index 02b04878121f..c8fbbd96311d 100644
--- a/tools/testing/selftests/resctrl/resctrlfs.c
+++ b/tools/testing/selftests/resctrl/resctrlfs.c
@@ -245,7 +245,7 @@  unsigned long create_bit_mask(unsigned int start, unsigned int len)
  *
  * Return:	The length of the contiguous bits in the longest train of bits
  */
-static unsigned int count_contiguous_bits(unsigned long val, unsigned int *start)
+unsigned int count_contiguous_bits(unsigned long val, unsigned int *start)
 {
 	unsigned long last_val;
 	int count = 0;
@@ -337,48 +337,6 @@  int get_mask_no_shareable(const char *cache_type, unsigned long *mask)
 	return 0;
 }
 
-/*
- * get_core_sibling - Get sibling core id from the same socket for given CPU
- * @cpu_no:	CPU number
- *
- * Return:	> 0 on success, < 0 on failure.
- */
-int get_core_sibling(int cpu_no)
-{
-	char core_siblings_path[1024], cpu_list_str[64];
-	int sibling_cpu_no = -1;
-	FILE *fp;
-
-	sprintf(core_siblings_path, "%s%d/topology/core_siblings_list",
-		CORE_SIBLINGS_PATH, cpu_no);
-
-	fp = fopen(core_siblings_path, "r");
-	if (!fp) {
-		perror("Failed to open core siblings path");
-
-		return -1;
-	}
-	if (fscanf(fp, "%s", cpu_list_str) <= 0) {
-		perror("Could not get core_siblings list");
-		fclose(fp);
-
-		return -1;
-	}
-	fclose(fp);
-
-	char *token = strtok(cpu_list_str, "-,");
-
-	while (token) {
-		sibling_cpu_no = atoi(token);
-		/* Skipping core 0 as we don't want to run test on core 0 */
-		if (sibling_cpu_no != 0 && sibling_cpu_no != cpu_no)
-			break;
-		token = strtok(NULL, "-,");
-	}
-
-	return sibling_cpu_no;
-}
-
 /*
  * taskset_benchmark - Taskset PID (i.e. benchmark) to a specified cpu
  * @bm_pid:	PID that should be binded