[tip: sched/core] sched/fair: Multi-LLC select_idle_sibling()

Message ID 168553468754.404.2298362895524875073.tip-bot2@tip-bot2
State New
Series [tip: sched/core] sched/fair: Multi-LLC select_idle_sibling()

Commit Message

tip-bot2 for Peter Zijlstra May 31, 2023, 12:04 p.m. UTC
  The following commit has been merged into the sched/core branch of tip:

Commit-ID:     c7dfd6b9122d29d0e9a4587ab470c0564d7f92ab
Gitweb:        https://git.kernel.org/tip/c7dfd6b9122d29d0e9a4587ab470c0564d7f92ab
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Tue, 30 May 2023 13:20:46 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 30 May 2023 22:46:27 +02:00

sched/fair: Multi-LLC select_idle_sibling()

Tejun reported that when he targets workqueues towards a specific LLC
on his Zen2 machine with 3 cores / LLC and 4 LLCs in total, he gets
significant idle time.

This is, of course, because of how select_idle_sibling() will not
consider anything outside of the local LLC, and since all these tasks
are short running the periodic idle load balancer is ineffective.

And while it is good to keep work cache local, it is better to not
have significant idle time. Therefore, have select_idle_sibling() try
other LLCs inside the same node when the local one comes up empty.

Reported-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/fair.c     | 38 ++++++++++++++++++++++++++++++++++++++
 kernel/sched/features.h |  1 +
 2 files changed, 39 insertions(+)
  

Comments

Abel Wu June 1, 2023, 3:41 a.m. UTC | #1
On 5/31/23 8:04 PM, tip-bot2 for Peter Zijlstra wrote:
> The following commit has been merged into the sched/core branch of tip:
> 
> Commit-ID:     c7dfd6b9122d29d0e9a4587ab470c0564d7f92ab
> Gitweb:        https://git.kernel.org/tip/c7dfd6b9122d29d0e9a4587ab470c0564d7f92ab
> Author:        Peter Zijlstra <peterz@infradead.org>
> AuthorDate:    Tue, 30 May 2023 13:20:46 +02:00
> Committer:     Peter Zijlstra <peterz@infradead.org>
> CommitterDate: Tue, 30 May 2023 22:46:27 +02:00
> 
> sched/fair: Multi-LLC select_idle_sibling()
> 
> Tejun reported that when he targets workqueues towards a specific LLC
> on his Zen2 machine with 3 cores / LLC and 4 LLCs in total, he gets
> significant idle time.
> 
> This is, of course, because of how select_idle_sibling() will not
> consider anything outside of the local LLC, and since all these tasks
> are short running the periodic idle load balancer is ineffective.
> 
> And while it is good to keep work cache local, it is better to not
> have significant idle time. Therefore, have select_idle_sibling() try
> other LLCs inside the same node when the local one comes up empty.
> 
> Reported-by: Tejun Heo <tj@kernel.org>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>   kernel/sched/fair.c     | 38 ++++++++++++++++++++++++++++++++++++++
>   kernel/sched/features.h |  1 +
>   2 files changed, 39 insertions(+)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 48b6f0c..0172458 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7028,6 +7028,38 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>   }
>   
>   /*
> + * For the multiple-LLC per node case, make sure to try the other LLC's if the
> + * local LLC comes up empty.
> + */
> +static int
> +select_idle_node(struct task_struct *p, struct sched_domain *sd, int target)
> +{
> +	struct sched_domain *parent = sd->parent;
> +	struct sched_group *sg;
> +
> +	/* Make sure to not cross nodes. */
> +	if (!parent || parent->flags & SD_NUMA)
> +		return -1;
> +
> +	sg = parent->groups;
> +	do {
> +		int cpu = cpumask_first(sched_group_span(sg));
> +		struct sched_domain *sd_child;
> +
> +		sd_child = per_cpu(sd_llc, cpu);
> +		if (sd_child != sd) {

Since sd_llc is cpu private, I think it should be:

		if (!cpus_share_cache(cpu, target))
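
A rough, untested sketch of the loop body with that change (keeping
everything else as in the patch above):

	sg = parent->groups;
	do {
		int cpu = cpumask_first(sched_group_span(sg));

		/* Skip the LLC @target already belongs to. */
		if (!cpus_share_cache(cpu, target)) {
			struct sched_domain *sd_child = per_cpu(sd_llc, cpu);
			int i = select_idle_cpu(p, sd_child, test_idle_cores(cpu), cpu);

			if ((unsigned)i < nr_cpumask_bits)
				return i;
		}

		sg = sg->next;
	} while (sg != parent->groups);

Since each CPU has its own sched_domain instance at the LLC level,
comparing @sd (the waker's sd_llc) against another CPU's sd_llc pointer
does not reliably tell whether the two CPUs share a cache.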

> +			int i = select_idle_cpu(p, sd_child, test_idle_cores(cpu), cpu);
> +			if ((unsigned)i < nr_cpumask_bits)
> +				return i;
> +		}
> +
> +		sg = sg->next;
> +	} while (sg != parent->groups);
> +
> +	return -1;
> +}
> +
> +/*
>    * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
>    * the task fits. If no CPU is big enough, but there are idle ones, try to
>    * maximize capacity.
> @@ -7199,6 +7231,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>   	if ((unsigned)i < nr_cpumask_bits)
>   		return i;
>   
> +	if (sched_feat(SIS_NODE)) {
> +		i = select_idle_node(p, sd, target);
> +		if ((unsigned)i < nr_cpumask_bits)
> +			return i;
> +	}
> +
>   	return target;
>   }
>   
> diff --git a/kernel/sched/features.h b/kernel/sched/features.h
> index ee7f23c..9e390eb 100644
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -62,6 +62,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
>    */
>   SCHED_FEAT(SIS_PROP, false)
>   SCHED_FEAT(SIS_UTIL, true)
> +SCHED_FEAT(SIS_NODE, true)
>   
>   /*
>    * Issue a WARN when we do multiple update_rq_clock() calls
  
Peter Zijlstra June 1, 2023, 8:09 a.m. UTC | #2
On Thu, Jun 01, 2023 at 11:41:14AM +0800, Abel Wu wrote:
> On 5/31/23 8:04 PM, tip-bot2 for Peter Zijlstra wrote:
> > The following commit has been merged into the sched/core branch of tip:
> > 
> > Commit-ID:     c7dfd6b9122d29d0e9a4587ab470c0564d7f92ab
> > Gitweb:        https://git.kernel.org/tip/c7dfd6b9122d29d0e9a4587ab470c0564d7f92ab
> > Author:        Peter Zijlstra <peterz@infradead.org>
> > AuthorDate:    Tue, 30 May 2023 13:20:46 +02:00
> > Committer:     Peter Zijlstra <peterz@infradead.org>
> > CommitterDate: Tue, 30 May 2023 22:46:27 +02:00
> > 
> > sched/fair: Multi-LLC select_idle_sibling()
> > 
> > Tejun reported that when he targets workqueues towards a specific LLC
> > on his Zen2 machine with 3 cores / LLC and 4 LLCs in total, he gets
> > significant idle time.
> > 
> > This is, of course, because of how select_idle_sibling() will not
> > consider anything outside of the local LLC, and since all these tasks
> > are short running the periodic idle load balancer is ineffective.
> > 
> > And while it is good to keep work cache local, it is better to not
> > have significant idle time. Therefore, have select_idle_sibling() try
> > other LLCs inside the same node when the local one comes up empty.
> > 
> > Reported-by: Tejun Heo <tj@kernel.org>
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > ---
> >   kernel/sched/fair.c     | 38 ++++++++++++++++++++++++++++++++++++++
> >   kernel/sched/features.h |  1 +
> >   2 files changed, 39 insertions(+)
> > 
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 48b6f0c..0172458 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -7028,6 +7028,38 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
> >   }
> >   /*
> > + * For the multiple-LLC per node case, make sure to try the other LLC's if the
> > + * local LLC comes up empty.
> > + */
> > +static int
> > +select_idle_node(struct task_struct *p, struct sched_domain *sd, int target)
> > +{
> > +	struct sched_domain *parent = sd->parent;
> > +	struct sched_group *sg;
> > +
> > +	/* Make sure to not cross nodes. */
> > +	if (!parent || parent->flags & SD_NUMA)
> > +		return -1;
> > +
> > +	sg = parent->groups;
> > +	do {
> > +		int cpu = cpumask_first(sched_group_span(sg));
> > +		struct sched_domain *sd_child;
> > +
> > +		sd_child = per_cpu(sd_llc, cpu);
> > +		if (sd_child != sd) {
> 
> Since sd_llc is cpu private, I think it should be:
> 
> 		if (!cpus_share_cache(cpu, target))

Hmm, yes.. either that or ensure sd is from the first cpu in its mask.
Let me go fix that.

Thanks!
  
K Prateek Nayak June 1, 2023, 9:33 a.m. UTC | #3
Hello Peter, 

Sharing some initial benchmark results with the patch below.

tl;dr

- Hackbench starts off well but performance drops as the number of groups
  increases.

- schbench (old), tbench, netperf see improvement but there is a band of
  outlier results when the system is fully loaded or slightly overloaded.

- Stream and ycsb-mongodb don't mind the extra search.

- SPECjbb (with default scheduler tunables) and DeathStarBench are not
  very happy.

On 5/31/2023 5:34 PM, tip-bot2 for Peter Zijlstra wrote:
> The following commit has been merged into the sched/core branch of tip:
> 
> Commit-ID:     c7dfd6b9122d29d0e9a4587ab470c0564d7f92ab
> Gitweb:        https://git.kernel.org/tip/c7dfd6b9122d29d0e9a4587ab470c0564d7f92ab
> Author:        Peter Zijlstra <peterz@infradead.org>
> AuthorDate:    Tue, 30 May 2023 13:20:46 +02:00
> Committer:     Peter Zijlstra <peterz@infradead.org>
> CommitterDate: Tue, 30 May 2023 22:46:27 +02:00
> 
> sched/fair: Multi-LLC select_idle_sibling()
> 
> Tejun reported that when he targets workqueues towards a specific LLC
> on his Zen2 machine with 3 cores / LLC and 4 LLCs in total, he gets
> significant idle time.
> 
> This is, of course, because of how select_idle_sibling() will not
> consider anything outside of the local LLC, and since all these tasks
> are short running the periodic idle load balancer is ineffective.
> 
> And while it is good to keep work cache local, it is better to not
> have significant idle time. Therefore, have select_idle_sibling() try
> other LLCs inside the same node when the local one comes up empty.

Tests were run on a dual socket 3rd Generation EPYC server (2 x 64C/128T)
running in NPS1 mode. Following is the simplified machine topology:

    NPS1: Each socket is a NUMA node.
    Total 2 NUMA nodes in the dual socket machine.

    DIE0: 0-63,   128-191
        MC0: 0-7, 128-135
           SMT0: 0,128
           SMT1: 1,129
           ...
           SMT7: 7,135
	MC1: 8-15, 136-143
           SMT8: 8,136
           SMT9: 9,137
           ...
           SMT15: 15,143
	...
	MC7: 56-63, 184-191
           SMT56: 56,184
           SMT57: 57,185
           ...
           SMT63: 63,191

    DIE1: 64-127, 192-255
        MC8: 64-71, 192-199
           SMT64: 64,192
           SMT65: 65,193
           ...
           SMT71: 71,199
	MC9: 72-79, 200-207
           SMT72: 72,200
           SMT73: 73,201
           ...
           SMT79: 79,207
	...
	MC15: 120-127, 248-255
           SMT120: 120,248
           SMT121: 121,249
           ...
           SMT127: 127,255

Since the patch extends the idle CPU search to one domain above MC in
case of an unsuccessful search, for the above topology, the DIE
domain becomes the wake domain with potentially 128 CPUs to be searched.
Following are the benchmark results:

o Kernel Versions

- tip              - tip:sched/core at commit e2a1f85bf9f5 ("sched/psi:
                     Avoid resetting the min update period when it is
                     unnecessary")

- peter-next-level - tip:sched/core + this patch

o Benchmark Results

Note: Benchmarks were run with boost enabled and C2 disabled to minimize
other external factors.

~~~~~~~~~~~~~
~ hackbench ~
~~~~~~~~~~~~~

o NPS1

Test:			tip		   peter-next-level
 1-groups:	   3.92 (0.00 pct)	   4.05 (-3.31 pct)
 2-groups:	   4.58 (0.00 pct)	   3.84 (16.15 pct)
 4-groups:	   4.99 (0.00 pct)	   3.98 (20.24 pct)
 8-groups:	   5.67 (0.00 pct)	   6.05 (-6.70 pct)	* Overloaded
16-groups:	   7.88 (0.00 pct)	  10.56 (-34.01 pct)	* Overloaded

~~~~~~~~~~~~~~~~~~
~ schbench (Old) ~
~~~~~~~~~~~~~~~~~~

o NPS1

#workers:	tip		  peter-next-level
  1:	  26.00 (0.00 pct)	  24.00 (7.69 pct)
  2:	  27.00 (0.00 pct)	  24.00 (11.11 pct)
  4:	  31.00 (0.00 pct)	  28.00 (9.67 pct)
  8:	  36.00 (0.00 pct)	  33.00 (8.33 pct)
 16:	  49.00 (0.00 pct)	  47.00 (4.08 pct)
 32:	  80.00 (0.00 pct)	  81.00 (-1.25 pct)
 64:	 169.00 (0.00 pct)	 169.00 (0.00 pct)
128:	 343.00 (0.00 pct)	 365.00 (-6.41 pct)	* Fully Loaded
256:	 42048.00 (0.00 pct)	 35392.00 (15.82 pct)
512:	 95104.00 (0.00 pct)	 88704.00 (6.72 pct)

~~~~~~~~~~
~ tbench ~
~~~~~~~~~~

o NPS1

Clients:	tip		 peter-next-level
    1	 452.49 (0.00 pct)	 457.94 (1.20 pct)
    2	 862.44 (0.00 pct)	 879.99 (2.03 pct)
    4	 1604.27 (0.00 pct)	 1618.87 (0.91 pct)
    8	 2966.77 (0.00 pct)	 3040.90 (2.49 pct)
   16	 5176.70 (0.00 pct)	 5292.29 (2.23 pct)
   32	 8205.24 (0.00 pct)	 8949.12 (9.06 pct)
   64	 13956.71 (0.00 pct)	 14461.42 (3.61 pct)
  128	 24005.50 (0.00 pct)	 26052.75 (8.52 pct)
  256	 32457.61 (0.00 pct)	 21999.41 (-32.22 pct)	* Overloaded
  512	 34345.24 (0.00 pct)	 41166.39 (19.86 pct)
 1024	 33432.92 (0.00 pct)	 40900.84 (22.33 pct)

~~~~~~~~~~
~ stream ~
~~~~~~~~~~

o NPS1

- 10 Runs:

Test:			tip	   peter-next-level
 Copy:	 271317.35 (0.00 pct)	 292440.22 (7.78 pct)
Scale:	 205533.77 (0.00 pct)	 203362.60 (-1.05 pct)
  Add:	 221624.62 (0.00 pct)	 225850.83 (1.90 pct)
Triad:	 228500.68 (0.00 pct)	 225885.25 (-1.14 pct)

- 100 Runs:

Test:			tip	   peter-next-level
 Copy:	 317381.65 (0.00 pct)	 318827.08 (0.45 pct)
Scale:	 214145.00 (0.00 pct)	 206213.69 (-3.70 pct)
  Add:	 239243.29 (0.00 pct)	 229791.67 (-3.95 pct)
Triad:	 249477.76 (0.00 pct)	 236843.06 (-5.06 pct)

~~~~~~~~~~~~~~~~~~~~
~ netperf - TCP_RR ~
~~~~~~~~~~~~~~~~~~~~

o NPS1

Test:			tip		   peter-next-level
 1-clients:	 102839.97 (0.00 pct)	 103540.33 (0.68 pct)
 2-clients:	 98428.08 (0.00 pct)	 100431.67 (2.03 pct)
 4-clients:	 92298.45 (0.00 pct)	 94800.51 (2.71 pct)
 8-clients:	 85618.41 (0.00 pct)	 89130.14 (4.10 pct)
16-clients:	 78722.18 (0.00 pct)	 79715.38 (1.26 pct)
32-clients:	 73610.75 (0.00 pct)	 72801.41 (-1.09 pct)
64-clients:	 55285.07 (0.00 pct)	 56184.38 (1.62 pct)
128-clients:	 31176.92 (0.00 pct)	 32830.06 (5.30 pct)
256-clients:	 20011.44 (0.00 pct)	 15135.39 (-24.36 pct)	* Overloaded

~~~~~~~~~~~~~
~ unixbench ~
~~~~~~~~~~~~~

o NPS1

						  tip			peter-next-level
Hmean     unixbench-dhry2reg-1   	  41322625.19 (   0.00%)   41224388.33 (  -0.24%)
Hmean     unixbench-dhry2reg-512	6252491108.60 (   0.00%)  6240160851.68 (  -0.20%)
Amean     unixbench-syscall-1    	   2501398.27 (   0.00%)    2577323.43 *  -3.04%*
Amean     unixbench-syscall-512  	   8120524.00 (   0.00%)    7512955.87 *   7.48%*
Hmean     unixbench-pipe-1    		   2359346.02 (   0.00%)    2392308.62 *   1.40%*
Hmean     unixbench-pipe-512		 338790322.61 (   0.00%)  337711432.92 (  -0.32%)
Hmean     unixbench-spawn-1       	      4261.52 (   0.00%)       4164.90 (  -2.27%)
Hmean     unixbench-spawn-512    	     64328.93 (   0.00%)      62257.64 *  -3.22%*
Hmean     unixbench-execl-1       	      3677.73 (   0.00%)       3652.08 (  -0.70%)
Hmean     unixbench-execl-512    	     11984.83 (   0.00%)      13585.65 *  13.36%*

~~~~~~~~~~~~~~~~
~ ycsb-mongodb ~
~~~~~~~~~~~~~~~~

o NPS1

tip:			131070.33 (var: 2.84%)
peter-next-level:	131070.33 (var: 2.84%) (0.00%)

~~~~~~~~~~~~~~~~~~~~~~~
~ SPECjbb - Multi-JVM ~
~~~~~~~~~~~~~~~~~~~~~~~

o NPS1

- Default Scheduler Tunables

kernel			max-jOPS		critical-jOPS
tip			100.00%			100.00%
peter-next-level	 94.45% (-5.55%)	 98.25% (-1.75%)

- Modified Scheduler Tunables

kernel			max-jOPS		critical-jOPS
tip			100.00%			100.00%
peter-next-level	100.00% (0.00%)		102.41% (2.41%)

~~~~~~~~~~~~~~~~~~
~ DeathStarBench ~
~~~~~~~~~~~~~~~~~~

Pinning   Scaling	tip		peter-next-level
1 CCD     1             100.00%      	100.30% (%diff:  0.30%)
2 CCD     2             100.00%      	100.17% (%diff:  0.17%)
4 CCD     4             100.00%      	 99.60% (%diff: -0.40%)
8 CCD     8             100.00%      	 92.05% (%diff: -7.95%)	*

---

Based on the above data, the results seem to be mostly positive for
the microbenchmarks but not so much for SPECjbb and DeathStarBench,
which have high utilization. There is also a band of outliers when the
system is fully loaded or overloaded (~2 tasks per rq) for some of
the microbenchmarks.

I wonder if extending SIS_UTIL for SIS_NODE would help some of these
cases but I've not tried tinkering with it yet. I'll continue
testing on other NPS modes which would decrease the search scope.
I'll also try running the same bunch of workloads on an even larger
4th Generation EPYC server to see if the behavior there is similar.

Let me know if you need any data from my test system for any
specific workload. I'll be more than happy to get them for you :)

> 
> Reported-by: Tejun Heo <tj@kernel.org>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  kernel/sched/fair.c     | 38 ++++++++++++++++++++++++++++++++++++++
>  kernel/sched/features.h |  1 +
>  2 files changed, 39 insertions(+)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 48b6f0c..0172458 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7028,6 +7028,38 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool 
>  }
>  
>  /*
> + * For the multiple-LLC per node case, make sure to try the other LLC's if the
> + * local LLC comes up empty.
> + */
> +static int
> +select_idle_node(struct task_struct *p, struct sched_domain *sd, int target)
> +{
> +	struct sched_domain *parent = sd->parent;
> +	struct sched_group *sg;
> +
> +	/* Make sure to not cross nodes. */
> +	if (!parent || parent->flags & SD_NUMA)
> +		return -1;
> +
> +	sg = parent->groups;
> +	do {
> +		int cpu = cpumask_first(sched_group_span(sg));
> +		struct sched_domain *sd_child;
> +
> +		sd_child = per_cpu(sd_llc, cpu);
> +		if (sd_child != sd) {
> +			int i = select_idle_cpu(p, sd_child, test_idle_cores(cpu), cpu);
> +			if ((unsigned)i < nr_cpumask_bits)
> +				return i;
> +		}
> +
> +		sg = sg->next;
> +	} while (sg != parent->groups);
> +
> +	return -1;
> +}
> +
> +/*
>   * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
>   * the task fits. If no CPU is big enough, but there are idle ones, try to
>   * maximize capacity.
> @@ -7199,6 +7231,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>  	if ((unsigned)i < nr_cpumask_bits)
>  		return i;
>  
> +	if (sched_feat(SIS_NODE)) {
> +		i = select_idle_node(p, sd, target);
> +		if ((unsigned)i < nr_cpumask_bits)
> +			return i;
> +	}
> +
>  	return target;
>  }
>  
> diff --git a/kernel/sched/features.h b/kernel/sched/features.h
> index ee7f23c..9e390eb 100644
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -62,6 +62,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
>   */
>  SCHED_FEAT(SIS_PROP, false)
>  SCHED_FEAT(SIS_UTIL, true)
> +SCHED_FEAT(SIS_NODE, true)
>  
>  /*
>   * Issue a WARN when we do multiple update_rq_clock() calls

--
Thanks and Regards,
Prateek
  
Peter Zijlstra June 1, 2023, 11:13 a.m. UTC | #4
On Thu, Jun 01, 2023 at 03:03:39PM +0530, K Prateek Nayak wrote:
> Hello Peter, 
> 
> Sharing some initial benchmark results with the patch below.
> 
> tl;dr
> 
> - Hackbench starts off well but performance drops as the number of groups
>   increases.
> 
> - schbench (old), tbench, netperf see improvement but there is a band of
>   outlier results when system is fully loaded or slightly overloaded.
> 
> - Stream and ycsb-mongodb are don't mind the extra search.
> 
> - SPECjbb (with default scheduler tunables) and DeathStarBench are not
>   very happy.

Figures :/ Every time something like this is changed someone gets to be
sad..

> Tests were run on a dual socket 3rd Generation EPYC server(2 x64C/128T)
> running in NPS1 mode. Following it the simplified machine topology:

Right, Zen3 8 cores / LLC, 64 cores total give 8 LLC per node.

> ~~~~~~~~~~~~~~~~~~~~~~~
> ~ SPECjbb - Multi-JVM ~
> ~~~~~~~~~~~~~~~~~~~~~~~
> 
> o NPS1
> 
> - Default Scheduler Tunables
> 
> kernel			max-jOPS		critical-jOPS
> tip			100.00%			100.00%
> peter-next-level	 94.45% (-5.55%)	 98.25% (-1.75%)
> 
> - Modified Scheduler Tunables
> 
> kernel			max-jOPS		critical-jOPS
> tip			100.00%			100.00%
> peter-next-level	100.00% (0.00%)		102.41% (2.41%)

I'm slightly confused, either the default or the tuned is better. Given
it's counting ops, I'm thinking higher is more better, so isn't this an
improvement in the tuned case?

> ~~~~~~~~~~~~~~~~~~
> ~ DeathStarBench ~
> ~~~~~~~~~~~~~~~~~~
> 
> Pinning   Scaling	tip		peter-next-level
> 1 CCD     1             100.00%      	100.30% (%diff:  0.30%)
> 2 CCD     2             100.00%      	100.17% (%diff:  0.17%)
> 4 CCD     4             100.00%      	 99.60% (%diff: -0.40%)
> 8 CCD     8             100.00%      	 92.05% (%diff: -7.95%)	*

Right, so that's a definite loss.

> I wonder if extending SIS_UTIL for SIS_NODE would help some of these
> cases but I've not tried tinkering with it yet. I'll continue
> testing on other NPS modes which would decrease the search scope.
> I'll also try running the same bunch of workloads on an even larger
> 4th Generation EPYC server to see if the behavior there is similar.

> >  /*
> > + * For the multiple-LLC per node case, make sure to try the other LLC's if the
> > + * local LLC comes up empty.
> > + */
> > +static int
> > +select_idle_node(struct task_struct *p, struct sched_domain *sd, int target)
> > +{
> > +	struct sched_domain *parent = sd->parent;
> > +	struct sched_group *sg;
> > +
> > +	/* Make sure to not cross nodes. */
> > +	if (!parent || parent->flags & SD_NUMA)
> > +		return -1;
> > +
> > +	sg = parent->groups;
> > +	do {
> > +		int cpu = cpumask_first(sched_group_span(sg));
> > +		struct sched_domain *sd_child;
> > +
> > +		sd_child = per_cpu(sd_llc, cpu);
> > +		if (sd_child != sd) {
> > +			int i = select_idle_cpu(p, sd_child, test_idle_cores(cpu), cpu);

Given how SIS_UTIL is inside select_idle_cpu() it should already be
effective here, no?

> > +			if ((unsigned)i < nr_cpumask_bits)
> > +				return i;
> > +		}
> > +
> > +		sg = sg->next;
> > +	} while (sg != parent->groups);
> > +
> > +	return -1;
> > +}

This DeathStarBench thing seems to suggest that scanning up to 4 CCDs
isn't too much of a bother; so perhaps something like so?

(on top of tip/sched/core from just a few hours ago, as I had to 'fix'
this patch and force pushed the thing)

And yeah, random hacks and heuristics here :/ Does there happen to be
additional topology that could aid us here? Does the CCD fabric itself
have a distance metric we can use?

---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 22e0a249e0a8..f1d6ed973410 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7036,6 +7036,7 @@ select_idle_node(struct task_struct *p, struct sched_domain *sd, int target)
 {
 	struct sched_domain *parent = sd->parent;
 	struct sched_group *sg;
+	int nr = 4;
 
 	/* Make sure to not cross nodes. */
 	if (!parent || parent->flags & SD_NUMA)
@@ -7050,6 +7051,9 @@ select_idle_node(struct task_struct *p, struct sched_domain *sd, int target)
 						test_idle_cores(cpu), cpu);
 			if ((unsigned)i < nr_cpumask_bits)
 				return i;
+
+			if (!--nr)
+				return -1;
 		}
 
 		sg = sg->next;
  
Peter Zijlstra June 1, 2023, 11:56 a.m. UTC | #5
On Thu, Jun 01, 2023 at 01:13:26PM +0200, Peter Zijlstra wrote:
> 
> This DeathStarBench thing seems to suggest that scanning up to 4 CCDs
> isn't too much of a bother; so perhaps something like so?
> 
> (on top of tip/sched/core from just a few hours ago, as I had to 'fix'
> this patch and force pushed the thing)
> 
> And yeah, random hacks and heuristics here :/ Does there happen to be
> additional topology that could aid us here? Does the CCD fabric itself
> have a distance metric we can use?

  https://www.anandtech.com/show/16529/amd-epyc-milan-review/4

Specifically:

  https://images.anandtech.com/doci/16529/Bounce-7763.png

That seems to suggest there are some very minor distance effects in the
CCD fabric. I didn't read the article too closely, but you'll note that
the first 4 CCDs have inter-CCD latency < 100 while the rest has > 100.

Could you also test on a Zen2 Epyc, does that require nr=8 instead of 4?
Should we perhaps write it like: 32 / llc_size ?

The Zen2 picture:

  https://images.anandtech.com/doci/16315/Bounce-7742.png

Shows a more pronounced CCD fabric topology, you can really see the 2
CCX inside the CCD but also there are two lighter green squares around the
CCDs themselves.
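
(Illustration only: reading "llc_size" as cores per LLC, 32 / llc_size
gives 8 for a 4-core/LLC Zen2 part and 4 for an 8-core/LLC Zen3 part,
i.e. something like:

	int nr = max_t(int, 1, 32 / llc_size);	/* llc_size: cores per LLC */

where llc_size would still have to be derived, e.g. from the LLC
domain's span_weight divided by the number of SMT siblings, since
sd_llc_size counts threads.)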
  
Peter Zijlstra June 1, 2023, noon UTC | #6
On Thu, Jun 01, 2023 at 01:56:43PM +0200, Peter Zijlstra wrote:
> On Thu, Jun 01, 2023 at 01:13:26PM +0200, Peter Zijlstra wrote:
> > 
> > This DeathStarBench thing seems to suggest that scanning up to 4 CCDs
> > isn't too much of a bother; so perhaps something like so?
> > 
> > (on top of tip/sched/core from just a few hours ago, as I had to 'fix'
> > this patch and force pushed the thing)
> > 
> > And yeah, random hacks and heuristics here :/ Does there happen to be
> > additional topology that could aid us here? Does the CCD fabric itself
> > have a distance metric we can use?
> 
>   https://www.anandtech.com/show/16529/amd-epyc-milan-review/4
> 
> Specifically:
> 
>   https://images.anandtech.com/doci/16529/Bounce-7763.png
> 
> That seems to suggest there are some very minor distance effects in the
> CCD fabric. I didn't read the article too closely, but you'll note that
> the first 4 CCDs have inter-CCD latency < 100 while the rest has > 100.
> 
> Could you also test on a Zen2 Epyc, does that require nr=8 instead of 4?
> Should we perhaps write it like: 32 / llc_size ?
> 
> The Zen2 picture:
> 
>   https://images.anandtech.com/doci/16315/Bounce-7742.png
> 
> Shows a more pronounced CCD fabric topology, you can really see the 2
> CCX inside the CCD but also there's two ligher green squares around the
> CCDs themselves.

I can't seem to find pretty pictures for Zen4 Epyc; what does that want?
That's even bigger at 96/8=12 LLCs afaict.
  
Peter Zijlstra June 1, 2023, 2:47 p.m. UTC | #7
On Thu, Jun 01, 2023 at 02:00:01PM +0200, Peter Zijlstra wrote:
> On Thu, Jun 01, 2023 at 01:56:43PM +0200, Peter Zijlstra wrote:
> > On Thu, Jun 01, 2023 at 01:13:26PM +0200, Peter Zijlstra wrote:
> > > 
> > > This DeathStarBench thing seems to suggest that scanning up to 4 CCDs
> > > isn't too much of a bother; so perhaps something like so?
> > > 
> > > (on top of tip/sched/core from just a few hours ago, as I had to 'fix'
> > > this patch and force pushed the thing)
> > > 
> > > And yeah, random hacks and heuristics here :/ Does there happen to be
> > > additional topology that could aid us here? Does the CCD fabric itself
> > > have a distance metric we can use?
> > 
> >   https://www.anandtech.com/show/16529/amd-epyc-milan-review/4
> > 
> > Specifically:
> > 
> >   https://images.anandtech.com/doci/16529/Bounce-7763.png
> > 
> > That seems to suggest there are some very minor distance effects in the
> > CCD fabric. I didn't read the article too closely, but you'll note that
> > the first 4 CCDs have inter-CCD latency < 100 while the rest has > 100.
> > 
> > Could you also test on a Zen2 Epyc, does that require nr=8 instead of 4?
> > Should we perhaps write it like: 32 / llc_size ?
> > 
> > The Zen2 picture:
> > 
> >   https://images.anandtech.com/doci/16315/Bounce-7742.png
> > 
> > Shows a more pronounced CCD fabric topology, you can really see the 2
> > CCX inside the CCD but also there's two ligher green squares around the
> > CCDs themselves.
> 
> I can't seem to find pretty pictures for Zen4 Epyc; what does that want?
> That's even bigger at 96/8=12 LLCs afaict.

One way to fix all this would be by having arch/x86/kernel/smpboot.c set
an AMD specific set_sched_topology() that has a CCD domain above the MC
and below the DIE domain that groups 'near' CCDs together based on some
AMD specific topology information.

Then for small systems that will probably be just a single CCD domain
and the degenerate code will make it go away, but for these large
systems it will do what is right for their respective configuration.

Then, since this new multi-llc code uses MC->parent it will end up on
the fancy new CCD domain and not scan the *entire* socket.

Hmm?
  
Peter Zijlstra June 1, 2023, 2:51 p.m. UTC | #8
On Thu, Jun 01, 2023 at 02:00:01PM +0200, Peter Zijlstra wrote:
> On Thu, Jun 01, 2023 at 01:56:43PM +0200, Peter Zijlstra wrote:
> > On Thu, Jun 01, 2023 at 01:13:26PM +0200, Peter Zijlstra wrote:
> > > 
> > > This DeathStarBench thing seems to suggest that scanning up to 4 CCDs
> > > isn't too much of a bother; so perhaps something like so?
> > > 
> > > (on top of tip/sched/core from just a few hours ago, as I had to 'fix'
> > > this patch and force pushed the thing)
> > > 
> > > And yeah, random hacks and heuristics here :/ Does there happen to be
> > > additional topology that could aid us here? Does the CCD fabric itself
> > > have a distance metric we can use?
> > 
> >   https://www.anandtech.com/show/16529/amd-epyc-milan-review/4
> > 
> > Specifically:
> > 
> >   https://images.anandtech.com/doci/16529/Bounce-7763.png
> > 
> > That seems to suggest there are some very minor distance effects in the
> > CCD fabric. I didn't read the article too closely, but you'll note that
> > the first 4 CCDs have inter-CCD latency < 100 while the rest has > 100.
> > 
> > Could you also test on a Zen2 Epyc, does that require nr=8 instead of 4?
> > Should we perhaps write it like: 32 / llc_size ?
> > 
> > The Zen2 picture:
> > 
> >   https://images.anandtech.com/doci/16315/Bounce-7742.png
> > 
> > Shows a more pronounced CCD fabric topology, you can really see the 2
> > CCX inside the CCD but also there's two ligher green squares around the
> > CCDs themselves.
> 
> I can't seem to find pretty pictures for Zen4 Epyc; what does that want?
> That's even bigger at 96/8=12 LLCs afaict.

Going by random pictures on the interweb again, it looks like this Zen4
thing wants either 2 groups of 6 each, or 4 groups of 3.

But you have the hardware, so I'll let you figure it out.
  
Peter Zijlstra June 1, 2023, 3:35 p.m. UTC | #9
On Thu, Jun 01, 2023 at 04:47:06PM +0200, Peter Zijlstra wrote:

> One way to fix all this would be by having arch/x86/kernel/smpboot.c set
> an AMD specific set_sched_topology() that has a CCD domain above the MC
> and below the DIE domain that groups 'near' CCDs together based on some
> AMD specific topology information.
> 
> Then for small systems that will probably be just a single CCD domain
> and the degenerate code will make it go away, but for these large
> systems it will do what is right for their respective configuration.
> 
> Then, since this new multi-llc code uses MC->parent it will end up on
> the fancy new CCD domain and not scan the *entire* socket.
> 
> Hmm?

Something like the (untested) below might be a nice base to go from.

Then all you have to do is add something like:

	if (x86_has_ccd_topology) {
		x86_topology[i++] = (struct sched_domain_topology_level){
			cpu_ccd_mask, SD_INIT_NAME(CCD)
		};
	}

(and construct cpu_ccd_mask obviously...)

---
 arch/x86/kernel/smpboot.c | 94 ++++++++++++++++++++++-------------------------
 1 file changed, 43 insertions(+), 51 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 34066f6735dd..0a22d719b6b6 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -563,50 +563,57 @@ static int x86_cluster_flags(void)
 #endif
 #endif
 
-static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-	{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-#ifdef CONFIG_SCHED_CLUSTER
-	{ cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
-#endif
-#ifdef CONFIG_SCHED_MC
-	{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
-#endif
-	{ NULL, },
-};
+/*
+ * Set if a package/die has multiple NUMA nodes inside.
+ * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
+ * Sub-NUMA Clustering have this.
+ */
+static bool x86_has_numa_in_package;
 
-static struct sched_domain_topology_level x86_hybrid_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-	{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-#ifdef CONFIG_SCHED_MC
-	{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
-#endif
-	{ cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(DIE) },
-	{ NULL, },
-};
+static struct sched_domain_topology_level x86_topology[6];
+
+static void __init build_sched_topology(void)
+{
+	int i = 0;
 
-static struct sched_domain_topology_level x86_topology[] = {
 #ifdef CONFIG_SCHED_SMT
-	{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
+	x86_topology[i++] = (struct sched_domain_topology_level){
+		cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT)
+	};
 #endif
 #ifdef CONFIG_SCHED_CLUSTER
-	{ cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
+	/*
+	 * For now, skip the cluster domain on Hybrid.
+	 */
+	if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) {
+		x86_topology[i++] = (struct sched_domain_topology_level){
+			cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS)
+		};
+	}
 #endif
 #ifdef CONFIG_SCHED_MC
-	{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
+	x86_topology[i++] = (struct sched_domain_topology_level){
+		cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC)
+	};
 #endif
-	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
-	{ NULL, },
-};
+	/*
+	 * When there is NUMA topology inside the package skip the DIE domain
+	 * since the NUMA domains will auto-magically create the right spanning
+	 * domains based on the SLIT.
+	 */
+	if (!x86_has_numa_in_package) {
+		x86_topology[i++] = (struct sched_domain_topology_level){
+			cpu_cpu_mask, SD_INIT_NAME(DIE)
+		};
+	}
 
-/*
- * Set if a package/die has multiple NUMA nodes inside.
- * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
- * Sub-NUMA Clustering have this.
- */
-static bool x86_has_numa_in_package;
+	/*
+	 * There must be one trailing NULL entry left.
+	 */
+	BUG_ON(i >= ARRAY_SIZE(x86_topology)-1);
+
+	set_sched_topology(x86_topology);
+}
 
 void set_cpu_sibling_map(int cpu)
 {
@@ -1390,15 +1397,6 @@ void __init smp_prepare_cpus_common(void)
 		zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL);
 	}
 
-	/*
-	 * Set 'default' x86 topology, this matches default_topology() in that
-	 * it has NUMA nodes as a topology level. See also
-	 * native_smp_cpus_done().
-	 *
-	 * Must be done before set_cpus_sibling_map() is ran.
-	 */
-	set_sched_topology(x86_topology);
-
 	set_cpu_sibling_map(0);
 }
 
@@ -1490,13 +1488,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
 	pr_debug("Boot done\n");
 
 	calculate_max_logical_packages();
-
-	/* XXX for now assume numa-in-package and hybrid don't overlap */
-	if (x86_has_numa_in_package)
-		set_sched_topology(x86_numa_in_package_topology);
-	if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
-		set_sched_topology(x86_hybrid_topology);
-
+	build_sched_topology();
 	nmi_selftest();
 	impress_friends();
 	cache_aps_init();
  
Chen Yu June 1, 2023, 4:44 p.m. UTC | #10
On Thu, Jun 1, 2023 at 8:11 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Jun 01, 2023 at 03:03:39PM +0530, K Prateek Nayak wrote:
[...]
> > I wonder if extending SIS_UTIL for SIS_NODE would help some of these
> > cases but I've not tried tinkering with it yet. I'll continue
> > testing on other NPS modes which would decrease the search scope.
> > I'll also try running the same bunch of workloads on an even larger
> > 4th Generation EPYC server to see if the behavior there is similar.
>
> > >  /*
> > > + * For the multiple-LLC per node case, make sure to try the other LLC's if the
> > > + * local LLC comes up empty.
> > > + */
> > > +static int
> > > +select_idle_node(struct task_struct *p, struct sched_domain *sd, int target)
> > > +{
> > > +   struct sched_domain *parent = sd->parent;
> > > +   struct sched_group *sg;
> > > +
> > > +   /* Make sure to not cross nodes. */
> > > +   if (!parent || parent->flags & SD_NUMA)
> > > +           return -1;
> > > +
> > > +   sg = parent->groups;
> > > +   do {
> > > +           int cpu = cpumask_first(sched_group_span(sg));
> > > +           struct sched_domain *sd_child;
> > > +
> > > +           sd_child = per_cpu(sd_llc, cpu);
> > > +           if (sd_child != sd) {
> > > +                   int i = select_idle_cpu(p, sd_child, test_idle_cores(cpu), cpu);
>
> Given how SIS_UTIL is inside select_idle_cpu() it should already be
> effective here, no?
>
I'm thinking of this scenario: when the system is overloaded and
SIS_NODE is disabled, SIS_UTIL could scan, say, 4 CPUs and then
terminate, so the task wakes up on the local LLC. When SIS_NODE is
enabled, it could scan 4 * number_of_llc_domains CPUs. The more CPUs
it scans, the more likely it is to find an idle CPU.
This seems to be a question of what type of wakee is preferred to be
put on a non-idle CPU in the local LLC versus an idle CPU in a remote
LLC. It seems to depend on the working set and task duration.


thanks,
Chenyu
  
K Prateek Nayak June 2, 2023, 3:12 a.m. UTC | #11
Hello Peter,

Thank you for taking a look at the report.

On 6/1/2023 4:43 PM, Peter Zijlstra wrote:
> On Thu, Jun 01, 2023 at 03:03:39PM +0530, K Prateek Nayak wrote:
>> Hello Peter, 
>>
>> Sharing some initial benchmark results with the patch below.
>>
>> tl;dr
>>
>> - Hackbench starts off well but performance drops as the number of groups
>>   increases.
>>
>> - schbench (old), tbench, netperf see improvement but there is a band of
>>   outlier results when system is fully loaded or slightly overloaded.
>>
>> - Stream and ycsb-mongodb are don't mind the extra search.
>>
>> - SPECjbb (with default scheduler tunables) and DeathStarBench are not
>>   very happy.
> 
> Figures :/ Every time something like this is changed someone gets to be
> sad..
> 
>> Tests were run on a dual socket 3rd Generation EPYC server(2 x64C/128T)
>> running in NPS1 mode. Following it the simplified machine topology:
> 
> Right, Zen3 8 cores / LLC, 64 cores total give 8 LLC per node.

Yes, correct!

> 
>> ~~~~~~~~~~~~~~~~~~~~~~~
>> ~ SPECjbb - Multi-JVM ~
>> ~~~~~~~~~~~~~~~~~~~~~~~
>>
>> o NPS1
>>
>> - Default Scheduler Tunables
>>
>> kernel			max-jOPS		critical-jOPS
>> tip			100.00%			100.00%
>> peter-next-level	 94.45% (-5.55%)	 98.25% (-1.75%)
>>
>> - Modified Scheduler Tunables
>>
>> kernel			max-jOPS		critical-jOPS
>> tip			100.00%			100.00%
>> peter-next-level	100.00% (0.00%)		102.41% (2.41%)
> 
> I'm slightly confused, either the default or the tuned is better. Given
> it's counting ops, I'm thinking higher is more better, so isn't this an
> improvement in the tuned case?

Default is bad. I believe migrating across the LLC boundary is not that
great from a cache efficiency perspective here. Setting the tunables
makes tasks run for longer, and in that case, being able to find an
idle CPU seems to be more beneficial.

> 
>> ~~~~~~~~~~~~~~~~~~
>> ~ DeathStarBench ~
>> ~~~~~~~~~~~~~~~~~~
>>
>> Pinning   Scaling	tip		peter-next-level
>> 1 CCD     1             100.00%      	100.30% (%diff:  0.30%)
>> 2 CCD     2             100.00%      	100.17% (%diff:  0.17%)
>> 4 CCD     4             100.00%      	 99.60% (%diff: -0.40%)
>> 8 CCD     8             100.00%      	 92.05% (%diff: -7.95%)	*
> 
> Right, so that's a definite loss.
> 
>> I wonder if extending SIS_UTIL for SIS_NODE would help some of these
>> cases but I've not tried tinkering with it yet. I'll continue
>> testing on other NPS modes which would decrease the search scope.
>> I'll also try running the same bunch of workloads on an even larger
>> 4th Generation EPYC server to see if the behavior there is similar.
> 
>>>  /*
>>> + * For the multiple-LLC per node case, make sure to try the other LLC's if the
>>> + * local LLC comes up empty.
>>> + */
>>> +static int
>>> +select_idle_node(struct task_struct *p, struct sched_domain *sd, int target)
>>> +{
>>> +	struct sched_domain *parent = sd->parent;
>>> +	struct sched_group *sg;
>>> +
>>> +	/* Make sure to not cross nodes. */
>>> +	if (!parent || parent->flags & SD_NUMA)
>>> +		return -1;
>>> +
>>> +	sg = parent->groups;
>>> +	do {
>>> +		int cpu = cpumask_first(sched_group_span(sg));
>>> +		struct sched_domain *sd_child;
>>> +
>>> +		sd_child = per_cpu(sd_llc, cpu);
>>> +		if (sd_child != sd) {
>>> +			int i = select_idle_cpu(p, sd_child, test_idle_cores(cpu), cpu);
> 
> Given how SIS_UTIL is inside select_idle_cpu() it should already be
> effective here, no?

True, but if the entire higher domain is busy, iterating over the groups
itself adds to the scheduling latency only to fall back to the target.
Wondering if keeping track of the largest "sd_llc_shared->nr_idle_scan"
and the corresponding group, and starting from there makes sense.
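
Something like the below (rough, untested sketch, reusing the
nr_idle_scan bookkeeping SIS_UTIL already maintains; "best"/"best_nr"
are just illustrative names):

	struct sched_group *sg = parent->groups, *best = NULL;
	int best_nr = 0;

	/* Find the group whose LLC last reported the most idle capacity. */
	do {
		int cpu = cpumask_first(sched_group_span(sg));
		struct sched_domain_shared *sds =
			rcu_dereference(per_cpu(sd_llc_shared, cpu));

		if (sds) {
			int nr = READ_ONCE(sds->nr_idle_scan);

			if (nr > best_nr) {
				best_nr = nr;
				best = sg;
			}
		}

		sg = sg->next;
	} while (sg != parent->groups);

and then start the select_idle_cpu() walk from "best" instead of from
parent->groups, so the group most likely to have idle CPUs is tried
first.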

> 
>>> +			if ((unsigned)i < nr_cpumask_bits)
>>> +				return i;
>>> +		}
>>> +
>>> +		sg = sg->next;
>>> +	} while (sg != parent->groups);
>>> +
>>> +	return -1;
>>> +}
> 
> This DeathStarBench thing seems to suggest that scanning up to 4 CCDs
> isn't too much of a bother; so perhaps something like so?
> 
> (on top of tip/sched/core from just a few hours ago, as I had to 'fix'
> this patch and force pushed the thing)
> 
> And yeah, random hacks and heuristics here :/ Does there happen to be
> additional topology that could aid us here? Does the CCD fabric itself
> have a distance metric we can use?
>

We do not have a CCD to CCD distance metric unfortunately :(
However NPS modes should mitigate some of the problems of the larger
search space (but add additional NUMA complexity), which I still need to
test.

> ---
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 22e0a249e0a8..f1d6ed973410 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7036,6 +7036,7 @@ select_idle_node(struct task_struct *p, struct sched_domain *sd, int target)
>  {
>  	struct sched_domain *parent = sd->parent;
>  	struct sched_group *sg;
> +	int nr = 4;
>  
>  	/* Make sure to not cross nodes. */
>  	if (!parent || parent->flags & SD_NUMA)
> @@ -7050,6 +7051,9 @@ select_idle_node(struct task_struct *p, struct sched_domain *sd, int target)
>  						test_idle_cores(cpu), cpu);
>  			if ((unsigned)i < nr_cpumask_bits)
>  				return i;
> +
> +			if (!--nr)
> +				return -1;
>  		}
>  
>  		sg = sg->next;

--
Thanks and Regards,
Prateek
  
K Prateek Nayak June 2, 2023, 5:13 a.m. UTC | #12
Hello Peter,

On 6/1/2023 8:17 PM, Peter Zijlstra wrote:
> On Thu, Jun 01, 2023 at 02:00:01PM +0200, Peter Zijlstra wrote:
>> On Thu, Jun 01, 2023 at 01:56:43PM +0200, Peter Zijlstra wrote:
>>> On Thu, Jun 01, 2023 at 01:13:26PM +0200, Peter Zijlstra wrote:
>>>>
>>>> This DeathStarBench thing seems to suggest that scanning up to 4 CCDs
>>>> isn't too much of a bother; so perhaps something like so?
>>>>
>>>> (on top of tip/sched/core from just a few hours ago, as I had to 'fix'
>>>> this patch and force pushed the thing)
>>>>
>>>> And yeah, random hacks and heuristics here :/ Does there happen to be
>>>> additional topology that could aid us here? Does the CCD fabric itself
>>>> have a distance metric we can use?
>>>
>>>   https://www.anandtech.com/show/16529/amd-epyc-milan-review/4
>>>
>>> Specifically:
>>>
>>>   https://images.anandtech.com/doci/16529/Bounce-7763.png
>>>
>>> That seems to suggest there are some very minor distance effects in the
>>> CCD fabric. I didn't read the article too closely, but you'll note that
>>> the first 4 CCDs have inter-CCD latency < 100 while the rest has > 100.

There seems to be a visible difference in latencies when you cross:

- CCX boundary (From 20s to high 80s, low 90s)
- NPS4 Boundary (From low 90s to high 90s)
- NPS2 Boundary (From high 90s to 100-110s)
- Socket Boundary (From 100-110s to 180s)

>>>
>>> Could you also test on a Zen2 Epyc, does that require nr=8 instead of 4?
>>> Should we perhaps write it like: 32 / llc_size ?
>>>
>>> The Zen2 picture:
>>>
>>>   https://images.anandtech.com/doci/16315/Bounce-7742.png
>>>
>>> Shows a more pronounced CCD fabric topology, you can really see the 2
>>> CCX inside the CCD but also there's two ligher green squares around the
>>> CCDs themselves.

Yup, same as above except there is a drop-off going beyond the CCX and
the low 100s remain until the NPS4 boundary. I'm not able to spot any
noticeable difference at the CCD boundary.

>>
>> I can't seem to find pretty pictures for Zen4 Epyc; what does that want?
>> That's even bigger at 96/8=12 LLCs afaict.
> 
> One way to fix all this would be by having arch/x86/kernel/smpboot.c set
> an AMD specific set_sched_topology() that has a CCD domain above the MC
> and below the DIE domain that groups 'near' CCDs together based on some
> AMD specific topology information.

This makes sense but I wonder if the new domain will add more load
balancing jitter. Also this will require a larger evaluation with some
more diverse workloads. Let me go check if we can find the NPS2/NPS4
boundary when in NPS1 mode.

> 
> Then for small systems that will probably be just a single CCD domain
> and the degenerate code will make it go away, but for these large
> systems it will do what is right for their respective configuration.
> 
> Then, since this new multi-llc code uses MC->parent it will end up on
> the fancy new CCD domain and not scan the *entire* socket.
> 
> Hmm?

Grouping near-CCX for the offerings that do not have 2CCX per CCD will
prevent degeneration and limit the search scope, yes. Here is what I'll
do, let me check if limiting search scope helps first, and then start
fiddling with the topology. How does that sound?

--
Thanks and Regards,
Prateek
  
K Prateek Nayak June 2, 2023, 5:17 a.m. UTC | #13
Hello Peter,

On 6/1/2023 8:21 PM, Peter Zijlstra wrote:
> On Thu, Jun 01, 2023 at 02:00:01PM +0200, Peter Zijlstra wrote:
>> On Thu, Jun 01, 2023 at 01:56:43PM +0200, Peter Zijlstra wrote:
>>> On Thu, Jun 01, 2023 at 01:13:26PM +0200, Peter Zijlstra wrote:
>>>>
>>>> This DeathStarBench thing seems to suggest that scanning up to 4 CCDs
>>>> isn't too much of a bother; so perhaps something like so?
>>>>
>>>> (on top of tip/sched/core from just a few hours ago, as I had to 'fix'
>>>> this patch and force pushed the thing)
>>>>
>>>> And yeah, random hacks and heuristics here :/ Does there happen to be
>>>> additional topology that could aid us here? Does the CCD fabric itself
>>>> have a distance metric we can use?
>>>
>>>   https://www.anandtech.com/show/16529/amd-epyc-milan-review/4
>>>
>>> Specifically:
>>>
>>>   https://images.anandtech.com/doci/16529/Bounce-7763.png
>>>
>>> That seems to suggest there are some very minor distance effects in the
>>> CCD fabric. I didn't read the article too closely, but you'll note that
>>> the first 4 CCDs have inter-CCD latency < 100 while the rest has > 100.
>>>
>>> Could you also test on a Zen2 Epyc, does that require nr=8 instead of 4?
>>> Should we perhaps write it like: 32 / llc_size ?
>>>
>>> The Zen2 picture:
>>>
>>>   https://images.anandtech.com/doci/16315/Bounce-7742.png
>>>
>>> Shows a more pronounced CCD fabric topology, you can really see the 2
>>> CCX inside the CCD but also there's two ligher green squares around the
>>> CCDs themselves.
>>
>> I can't seem to find pretty pictures for Zen4 Epyc; what does that want?
>> That's even bigger at 96/8=12 LLCs afaict.
> 
> Going by random pictures on the interweb again, it looks like this Zen4
> thing wants either 2 groups of 6 each, or 4 groups of 3.

I would think it is the latter since NPS4 does that but let me go verify.
Thank you for digging into this and providing the patch to extend the x86
topology in the thread :)

> 
> But you have the hardware, so I'll let you figure it out.

--
Thanks and Regards,
Prateek
  
Peter Zijlstra June 2, 2023, 6:54 a.m. UTC | #14
On Fri, Jun 02, 2023 at 10:43:37AM +0530, K Prateek Nayak wrote:
> Grouping near-CCX for the offerings that do not have 2CCX per CCD will
> prevent degenration and limit the search scope yes. Here is what I'll
> do, let me check if limiting search scope helps first, and then start
> fiddling with the topology. How does that sound?

So my preference would be the topology based solution, since the search
limit is random magic numbers that happen to work for 'your' machine but
who knows what it'll do for some other poor architecture that happens to
trip this.

That said; verifying the limit helps at all is of course a good start,
because if it doesn't then the topology thing will likely also not help
much.
  
Peter Zijlstra June 2, 2023, 7 a.m. UTC | #15
On Fri, Jun 02, 2023 at 10:43:37AM +0530, K Prateek Nayak wrote:

> This makes sense but I wonder if new domain will add more load balancing
> jitters. Also this will require larger evaluation with some more diverse
> workloads.

Always the case, isn't it :-)

> Let me go check if we can get find the NPS2/NPS4 boundary when
> in NPS1 mode.

Yeah, that would be nice; if not you can see if you can reverse engineer
them from FMS and the topology bits we do have and file a request with
your hardware people to pretty please expose this going forward.
  
Gautham R. Shenoy June 2, 2023, 9:06 a.m. UTC | #16
Hello Peter,

On Fri, Jun 02, 2023 at 10:47:07AM +0530, K Prateek Nayak wrote:
> Hello Peter,
> 
> On 6/1/2023 8:21 PM, Peter Zijlstra wrote:
> > On Thu, Jun 01, 2023 at 02:00:01PM +0200, Peter Zijlstra wrote:
> >> On Thu, Jun 01, 2023 at 01:56:43PM +0200, Peter Zijlstra wrote:
> >>> On Thu, Jun 01, 2023 at 01:13:26PM +0200, Peter Zijlstra wrote:
> >>>>
> >>>> This DeathStarBench thing seems to suggest that scanning up to 4 CCDs
> >>>> isn't too much of a bother; so perhaps something like so?
> >>>>
> >>>> (on top of tip/sched/core from just a few hours ago, as I had to 'fix'
> >>>> this patch and force pushed the thing)
> >>>>
> >>>> And yeah, random hacks and heuristics here :/ Does there happen to be
> >>>> additional topology that could aid us here? Does the CCD fabric itself
> >>>> have a distance metric we can use?
> >>>
> >>>   https://www.anandtech.com/show/16529/amd-epyc-milan-review/4
> >>>
> >>> Specifically:
> >>>
> >>>   https://images.anandtech.com/doci/16529/Bounce-7763.png
> >>>
> >>> That seems to suggest there are some very minor distance effects in the
> >>> CCD fabric. I didn't read the article too closely, but you'll note that
> >>> the first 4 CCDs have inter-CCD latency < 100 while the rest has > 100.
> >>>
> >>> Could you also test on a Zen2 Epyc, does that require nr=8 instead of 4?
> >>> Should we perhaps write it like: 32 / llc_size ?
> >>>
> >>> The Zen2 picture:
> >>>
> >>>   https://images.anandtech.com/doci/16315/Bounce-7742.png
> >>>
> >>> Shows a more pronounced CCD fabric topology, you can really see the 2
> >>> CCX inside the CCD but also there's two ligher green squares around the
> >>> CCDs themselves.
> >>
> >> I can't seem to find pretty pictures for Zen4 Epyc; what does that want?
> >> That's even bigger at 96/8=12 LLCs afaict.
> > 
> > Going by random pictures on the interweb again, it looks like this Zen4
> > thing wants either 2 groups of 6 each, or 4 groups of 3.
>

Yes, this is what the topology looks like

|---------------------------------------------------------------------------------| 
|                                                                                 |
|   ----------- ----------- -----------     ----------- ----------- -----------   |
|   |(0-7)    | |(8-15)   | |(16-23)  |     |(48-55)  | |(56-63)  | |(64-71)  |   |
|   | LLC0    | | LLC1    | | LLC2    |     | LLC6    | | LLC7    | | LLC8    |   |
|   |(96-103) | |(104-111)| |(112-119)|     |(144-151)| |(152-159)| |(160-167)|   |
|   ----------- ----------- -----------     ----------- ----------- -----------   |
|                                                                                 |
|                                                                                 |
|   ----------- ----------- -----------     ----------- ----------- -----------   |
|   |(24-31)  | |(32-39)  | |(40-47)  |     |(72-79)  | |(80-87)  | |(88-95)  |   |
|   | LLC3    | | LLC4    | | LLC5    |     | LLC9    | | LLC10   | | LLC11   |   |
|   |(120-127)| |(128-135)| |(136-143)|     |(168-175)| |(176-183)| |(184-191)|   |
|   ----------- ----------- -----------     ----------- ----------- -----------   |
|                                                                                 |
|---------------------------------------------------------------------------------|


> I would think it is the latter since NPS4 does that but let me go verify.

2 groups of 6 each is the vertical split which is NPS2.

4 groups of 3 each is the vertical and horizontal split, which is
NPS4.

In both these cases, currently the domain hierarchy is

SMT --> MC --> NODE --> NUMA

where the NODE will be the parent of MC and be the 2nd level wakeup domain.

If we define CLS to be the group with 3 LLCs, which becomes the parent
of the MC domain, then, the hierarchy would be

NPS1 : SMT --> MC --> CLS --> DIE
NPS2 : SMT --> MC --> CLS --> NODE --> NUMA
NPS4 : SMT --> MC --> CLS --> NUMA

NPS2 will have 5 domains within a single socket. Oh well!

--
Thanks and Regards
gautham.
  
K Prateek Nayak June 2, 2023, 9:19 a.m. UTC | #17
Hello Peter,

On 6/2/2023 12:24 PM, Peter Zijlstra wrote:
> On Fri, Jun 02, 2023 at 10:43:37AM +0530, K Prateek Nayak wrote:
>> Grouping near-CCX for the offerings that do not have 2CCX per CCD will
>> prevent degenration and limit the search scope yes. Here is what I'll
>> do, let me check if limiting search scope helps first, and then start
>> fiddling with the topology. How does that sound?
> 
> So my preference would be the topology based solution,

I agree that is much cleaner but workloads rarely like clean solutions
nowadays :)

> since the search
> limit is random magic numbers that happen to work for 'your' machine but
> who knows what it'll do for some other poor architecture that happens to
> trip this.
> 
> That said; verifying the limit helps at all is of course a good start,
> because if it doesn't then the topology thing will likely also not help
> much.

Queued some tests on NPS2/4, and also with the "nr = 4" heuristic.
I'll share the results once they finish.
--
Thanks and Regards,
Prateek
  
Peter Zijlstra June 2, 2023, 11:23 a.m. UTC | #18
On Fri, Jun 02, 2023 at 02:36:37PM +0530, Gautham R. Shenoy wrote:

> Yes, this is what the topology looks like
> 
> |---------------------------------------------------------------------------------| 
> |                                                                                 |
> |   ----------- ----------- -----------     ----------- ----------- -----------   |
> |   |(0-7)    | |(8-15)   | |(16-23)  |     |(48-55)  | |(56-63)  | |(64-71)  |   |
> |   | LLC0    | | LLC1    | | LLC2    |     | LLC6    | | LLC7    | | LLC8    |   |
> |   |(96-103) | |(104-111)| |(112-119)|     |(144-151)| |(152-159)| |(160-167)|   |
> |   ----------- ----------- -----------     ----------- ----------- -----------   |
> |                                                                                 |
> |                                                                                 |
> |   ----------- ----------- -----------     ----------- ----------- -----------   |
> |   |(24-31)  | |(32-39)  | |(40-47)  |     |(72-79)  | |(80-87)  | |(88-95)  |   |
> |   | LLC3    | | LLC4    | | LLC5    |     | LLC9    | | LLC10   | | LLC11   |   |
> |   |(120-127)| |(128-135)| |(136-143)|     |(168-175)| |(176-183)| |(184-191)|   |
> |   ----------- ----------- -----------     ----------- ----------- -----------   |
> |                                                                                 |
> |---------------------------------------------------------------------------------|

Yup, those are the pictures I found online.

> > I would think it is the latter since NPS4 does that but let me go verify.
> 
> 2 groups of 6 each is the vertical split which is NPS2.
> 
> 4 groups of 3 each is the vertical and horizontal split, which is
> NPS4.
> 
> In both these cases, currently the domain hierarchy
> 
> SMT --> MC --> NODE --> NUMA
> 
> where the NODE will be the parent of MC and be the 2nd level wakeup domain.
> 
> If we define CLS to be the group with 3 LLCs, which becomes the parent
> of the MC domain, then, the hierarchy would be
> 
> NPS1 : SMT --> MC --> CLS --> DIE
> NPS2 : SMT --> MC --> CLS --> NODE --> NUMA
> NPS4 : SMT --> MC --> CLS --> NUMA
> 
> NPS2 will have 5 domains within a single socket. Oh well!

I think cluster/CLS is taken for L2, we should not re-purpose that for
groups of L3.

Anyway, yes, 5 levels. Shouldn't be a problem though, right?
  
Marek Szyprowski June 5, 2023, 3:25 p.m. UTC | #19
On 31.05.2023 14:04, tip-bot2 for Peter Zijlstra wrote:
> The following commit has been merged into the sched/core branch of tip:
>
> Commit-ID:     c7dfd6b9122d29d0e9a4587ab470c0564d7f92ab
> Gitweb:        https://git.kernel.org/tip/c7dfd6b9122d29d0e9a4587ab470c0564d7f92ab
> Author:        Peter Zijlstra <peterz@infradead.org>
> AuthorDate:    Tue, 30 May 2023 13:20:46 +02:00
> Committer:     Peter Zijlstra <peterz@infradead.org>
> CommitterDate: Tue, 30 May 2023 22:46:27 +02:00
>
> sched/fair: Multi-LLC select_idle_sibling()
>
> Tejun reported that when he targets workqueues towards a specific LLC
> on his Zen2 machine with 3 cores / LLC and 4 LLCs in total, he gets
> significant idle time.
>
> This is, of course, because of how select_idle_sibling() will not
> consider anything outside of the local LLC, and since all these tasks
> are short running the periodic idle load balancer is ineffective.
>
> And while it is good to keep work cache local, it is better to not
> have significant idle time. Therefore, have select_idle_sibling() try
> other LLCs inside the same node when the local one comes up empty.
>
> Reported-by: Tejun Heo <tj@kernel.org>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

This patch landed in today's linux next-20230605 as commit c5214e13ad60 
("sched/fair: Multi-LLC select_idle_sibling()"). Unfortunately it causes a 
regression on my 64-bit ARM Exynos5433-based TM2e test board during the 
CPU hotplug tests. From time to time I get a NULL pointer dereference. 
Reverting $subject on top of linux-next fixes the issue. Let me know if 
I can somehow help debug this issue. Here is a complete log (I've 
intentionally kept all the stack dumps, although they don't look very 
relevant...):

# for i in /sys/devices/system/cpu/cpu[1-9]; do echo 0 >$i/online; done
Unable to handle kernel NULL pointer dereference at virtual address 
0000000000000090

======================================================
WARNING: possible circular locking dependency detected
6.4.0-rc1+ #13640 Not tainted
------------------------------------------------------
cpuhp/6/43 is trying to acquire lock:
ffff80000ab65598 (console_owner){..-.}-{0:0}, at: 
console_flush_all+0x1ac/0x4fc

but task is already holding lock:
ffff00002836ed48 (&p->pi_lock){-.-.}-{2:2}, at: try_to_wake_up+0x58/0x46c

which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

-> #3 (&p->pi_lock){-.-.}-{2:2}:
        _raw_spin_lock_irqsave+0x60/0x88
        try_to_wake_up+0x58/0x46c
        default_wake_function+0x14/0x20
        autoremove_wake_function+0x18/0x44
        __wake_up_common+0x94/0x170
        __wake_up_common_lock+0x7c/0xcc
        __wake_up+0x18/0x24
        tty_wakeup+0x34/0x70
        tty_port_default_wakeup+0x20/0x38
        tty_port_tty_wakeup+0x18/0x24
        uart_write_wakeup+0x18/0x28
        s3c24xx_serial_tx_chars+0x20c/0x218
        s3c64xx_serial_handle_irq+0x9c/0xe0
        __handle_irq_event_percpu+0xb0/0x2d4
        handle_irq_event+0x4c/0xb8
        handle_fasteoi_irq+0xa4/0x198
        generic_handle_domain_irq+0x2c/0x44
        gic_handle_irq+0x44/0xc4
        call_on_irq_stack+0x24/0x4c
        do_interrupt_handler+0x80/0x84
        el1_interrupt+0x34/0x64
        el1h_64_irq_handler+0x18/0x24
        el1h_64_irq+0x64/0x68
        default_idle_call+0x9c/0x150
        do_idle+0x230/0x294
        cpu_startup_entry+0x28/0x2c
        rest_init+0x100/0x190
        arch_post_acpi_subsys_init+0x0/0x8
        start_kernel+0x594/0x684
        __primary_switched+0xbc/0xc4

-> #2 (&tty->write_wait){-.-.}-{2:2}:
        _raw_spin_lock_irqsave+0x60/0x88
        __wake_up_common_lock+0x5c/0xcc
        __wake_up+0x18/0x24
        tty_wakeup+0x34/0x70
        tty_port_default_wakeup+0x20/0x38
        tty_port_tty_wakeup+0x18/0x24
        uart_write_wakeup+0x18/0x28
        s3c24xx_serial_tx_chars+0x20c/0x218
        s3c64xx_serial_handle_irq+0x9c/0xe0
        __handle_irq_event_percpu+0xb0/0x2d4
        handle_irq_event+0x4c/0xb8
        handle_fasteoi_irq+0xa4/0x198
        generic_handle_domain_irq+0x2c/0x44
        gic_handle_irq+0x44/0xc4
        call_on_irq_stack+0x24/0x4c
        do_interrupt_handler+0x80/0x84
        el1_interrupt+0x34/0x64
        el1h_64_irq_handler+0x18/0x24
        el1h_64_irq+0x64/0x68
        default_idle_call+0x9c/0x150
        do_idle+0x230/0x294
        cpu_startup_entry+0x28/0x2c
        rest_init+0x100/0x190
        arch_post_acpi_subsys_init+0x0/0x8
        start_kernel+0x594/0x684
        __primary_switched+0xbc/0xc4

-> #1 (&port_lock_key){-.-.}-{2:2}:
        _raw_spin_lock_irqsave+0x60/0x88
        s3c24xx_serial_console_write+0xfc/0x124
        console_flush_all+0x208/0x4fc
        console_unlock+0x5c/0x14c
        vprintk_emit+0x15c/0x3b0
        vprintk_default+0x38/0x44
        vprintk+0xc0/0xe4
        _printk+0x5c/0x84
        register_console+0x1f4/0x420
        uart_add_one_port+0x50c/0x53c
        s3c24xx_serial_probe+0x34c/0x72c
        platform_probe+0x68/0xd8
        really_probe+0x148/0x2b4
        __driver_probe_device+0x78/0x12c
        driver_probe_device+0xd8/0x160
        __driver_attach+0x9c/0x1ac
        bus_for_each_dev+0x74/0xd4
        driver_attach+0x24/0x30
        bus_add_driver+0xe4/0x1e8
        driver_register+0x60/0x128
        __platform_driver_register+0x28/0x34
        samsung_serial_init+0x30/0x8c
        do_one_initcall+0x74/0x2f0
        kernel_init_freeable+0x288/0x4d8
        kernel_init+0x24/0x1dc
        ret_from_fork+0x10/0x20

-> #0 (console_owner){..-.}-{0:0}:
        __lock_acquire+0x13d0/0x217c
        lock_acquire+0x1e8/0x310
        console_flush_all+0x1f4/0x4fc
        console_unlock+0x5c/0x14c
        vprintk_emit+0x15c/0x3b0
        vprintk_default+0x38/0x44
        vprintk+0xc0/0xe4
        _printk+0x5c/0x84
        die_kernel_fault+0x48/0x37c
        __do_kernel_fault+0xd8/0x19c
        do_page_fault+0xac/0x6d8
        do_translation_fault+0xac/0xb8
        do_mem_abort+0x44/0x94
        el1_abort+0x44/0x70
        el1h_64_sync_handler+0xd8/0xe4
        el1h_64_sync+0x64/0x68
        __bitmap_and+0x4c/0x78
        select_task_rq_fair+0x724/0x1a30
        try_to_wake_up+0x17c/0x46c
        wake_up_process+0x18/0x24
        complete+0x58/0x8c
        __kthread_parkme+0x74/0xc8
        kthread_parkme+0x20/0x44
        smpboot_thread_fn+0x118/0x2a0
        kthread+0x124/0x128
        ret_from_fork+0x10/0x20

other info that might help us debug this:

Chain exists of:
   console_owner --> &tty->write_wait --> &p->pi_lock

  Possible unsafe locking scenario:

        CPU0                    CPU1
        ----                    ----
   lock(&p->pi_lock);
                                lock(&tty->write_wait);
                                lock(&p->pi_lock);
   lock(console_owner);

  *** DEADLOCK ***

5 locks held by cpuhp/6/43:
  #0: ffff000023e68440 (&x->wait){....}-{2:2}, at: complete+0x24/0x8c
  #1: ffff00002836ed48 (&p->pi_lock){-.-.}-{2:2}, at: 
try_to_wake_up+0x58/0x46c
  #2: ffff80000abd6ac0 (rcu_read_lock){....}-{1:2}, at: 
select_task_rq_fair+0x114/0x1a30
  #3: ffff80000ab65390 (console_lock){+.+.}-{0:0}, at: 
vprintk_default+0x38/0x44
  #4: ffff80000ab65440 (console_srcu){....}-{0:0}, at: 
console_flush_all+0x7c/0x4fc

stack backtrace:
CPU: 6 PID: 43 Comm: cpuhp/6 Not tainted 6.4.0-rc1+ #13640
Hardware name: Samsung TM2E board (DT)
Call trace:
  dump_backtrace+0x98/0xf0
  show_stack+0x18/0x24
  dump_stack_lvl+0x60/0xac
  dump_stack+0x18/0x24
  print_circular_bug+0x26c/0x348
  check_noncircular+0x134/0x148
  __lock_acquire+0x13d0/0x217c
  lock_acquire+0x1e8/0x310
  console_flush_all+0x1f4/0x4fc
  console_unlock+0x5c/0x14c
  vprintk_emit+0x15c/0x3b0
  vprintk_default+0x38/0x44
  vprintk+0xc0/0xe4
  _printk+0x5c/0x84
  die_kernel_fault+0x48/0x37c
  __do_kernel_fault+0xd8/0x19c
  do_page_fault+0xac/0x6d8
  do_translation_fault+0xac/0xb8
  do_mem_abort+0x44/0x94
  el1_abort+0x44/0x70
  el1h_64_sync_handler+0xd8/0xe4
  el1h_64_sync+0x64/0x68
  __bitmap_and+0x4c/0x78
  select_task_rq_fair+0x724/0x1a30
  try_to_wake_up+0x17c/0x46c
  wake_up_process+0x18/0x24
  complete+0x58/0x8c
  __kthread_parkme+0x74/0xc8
  kthread_parkme+0x20/0x44
  smpboot_thread_fn+0x118/0x2a0
  kthread+0x124/0x128
  ret_from_fork+0x10/0x20
Mem abort info:
   ESR = 0x0000000096000006
   EC = 0x25: DABT (current EL), IL = 32 bits
   SET = 0, FnV = 0
   EA = 0, S1PTW = 0
   FSC = 0x06: level 2 translation fault
Data abort info:
   ISV = 0, ISS = 0x00000006
   CM = 0, WnR = 0
user pgtable: 4k pages, 48-bit VAs, pgdp=000000002783e000
[0000000000000090] pgd=080000002738f003, p4d=080000002738f003, 
pud=0800000027a24003, pmd=0000000000000000
Internal error: Oops: 0000000096000006 [#1] PREEMPT SMP
Modules linked in: brcmfmac_wcc cpufreq_powersave cpufreq_conservative 
brcmfmac brcmutil cfg80211 crct10dif_ce hci_uart btqca btbcm bluetooth 
s5p_jpeg exynos_gsc s3fwrn5_i2c s3fwrn5 s5p_mfc nci v4l2_mem2mem 
ecdh_generic nfc ecc videobuf2_dma_contig videobuf2_memops 
videobuf2_v4l2 videodev rfkill panfrost videobuf2_common 
drm_shmem_helper gpu_sched mc ip_tables x_tables ipv6

CPU: 6 PID: 43 Comm: cpuhp/6 Not tainted 6.4.0-rc1+ #13640
Hardware name: Samsung TM2E board (DT)
pstate: 000000c5 (nzcv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
pc : __bitmap_and+0x4c/0x78
lr : select_idle_cpu+0x64/0x450
sp : ffff80000bd83b50
x29: ffff80000bd83b50 x28: ffff80000a152ad8 x27: ffff00002836e500
x26: ffff00002814f600 x25: ffff80000ab43e78 x24: 0000000000000000
x23: ffff80000ab3f000 x22: 0000000000000000 x21: ffff80000ab43e78
x20: 0000000000000000 x19: 0000000000000000 x18: ffff8000099ac098
x17: 0000000000000000 x16: 0000000000000067 x15: 0000000000000001
x14: 0000000000000000 x13: 00000000000000d8 x12: 0000000000000000
x11: 0000000000000001 x10: ffff80000b7c6e90 x9 : 0000000000000000
x8 : 0000000000000000 x7 : 0000000000000000 x6 : 0000000000000000
x5 : 0000000000000020 x4 : 00000000000000ff x3 : 00000000fffffff8
x2 : ffff00002836e7e0 x1 : 0000000000000090 x0 : ffff0000d5fc2ad8
Call trace:
  __bitmap_and+0x4c/0x78
  select_task_rq_fair+0x724/0x1a30
  try_to_wake_up+0x17c/0x46c
  wake_up_process+0x18/0x24
  complete+0x58/0x8c
  __kthread_parkme+0x74/0xc8
  kthread_parkme+0x20/0x44
  smpboot_thread_fn+0x118/0x2a0
  kthread+0x124/0x128
  ret_from_fork+0x10/0x20
Code: 2a0803e8 4b0303e3 92800004 9ac32484 (f8687823)
---[ end trace 0000000000000000 ]---
Kernel panic - not syncing: Oops: Fatal exception
SMP: stopping secondary CPUs
Kernel Offset: disabled
CPU features: 0x8c0004,1c780800,0000421b
Memory Limit: none
---[ end Kernel panic - not syncing: Oops: Fatal exception ]---


> ---
>   kernel/sched/fair.c     | 38 ++++++++++++++++++++++++++++++++++++++
>   kernel/sched/features.h |  1 +
>   2 files changed, 39 insertions(+)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 48b6f0c..0172458 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7028,6 +7028,38 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>   }
>   
>   /*
> + * For the multiple-LLC per node case, make sure to try the other LLC's if the
> + * local LLC comes up empty.
> + */
> +static int
> +select_idle_node(struct task_struct *p, struct sched_domain *sd, int target)
> +{
> +	struct sched_domain *parent = sd->parent;
> +	struct sched_group *sg;
> +
> +	/* Make sure to not cross nodes. */
> +	if (!parent || parent->flags & SD_NUMA)
> +		return -1;
> +
> +	sg = parent->groups;
> +	do {
> +		int cpu = cpumask_first(sched_group_span(sg));
> +		struct sched_domain *sd_child;
> +
> +		sd_child = per_cpu(sd_llc, cpu);
> +		if (sd_child != sd) {
> +			int i = select_idle_cpu(p, sd_child, test_idle_cores(cpu), cpu);
> +			if ((unsigned)i < nr_cpumask_bits)
> +				return i;
> +		}
> +
> +		sg = sg->next;
> +	} while (sg != parent->groups);
> +
> +	return -1;
> +}
> +
> +/*
>    * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
>    * the task fits. If no CPU is big enough, but there are idle ones, try to
>    * maximize capacity.
> @@ -7199,6 +7231,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>   	if ((unsigned)i < nr_cpumask_bits)
>   		return i;
>   
> +	if (sched_feat(SIS_NODE)) {
> +		i = select_idle_node(p, sd, target);
> +		if ((unsigned)i < nr_cpumask_bits)
> +			return i;
> +	}
> +
>   	return target;
>   }
>   
> diff --git a/kernel/sched/features.h b/kernel/sched/features.h
> index ee7f23c..9e390eb 100644
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -62,6 +62,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
>    */
>   SCHED_FEAT(SIS_PROP, false)
>   SCHED_FEAT(SIS_UTIL, true)
> +SCHED_FEAT(SIS_NODE, true)
>   
>   /*
>    * Issue a WARN when we do multiple update_rq_clock() calls

Best regards
  
Peter Zijlstra June 5, 2023, 5:56 p.m. UTC | #20
On Mon, Jun 05, 2023 at 05:25:30PM +0200, Marek Szyprowski wrote:
> On 31.05.2023 14:04, tip-bot2 for Peter Zijlstra wrote:
> > The following commit has been merged into the sched/core branch of tip:
> >
> > Commit-ID:     c7dfd6b9122d29d0e9a4587ab470c0564d7f92ab
> > Gitweb:        https://git.kernel.org/tip/c7dfd6b9122d29d0e9a4587ab470c0564d7f92ab
> > Author:        Peter Zijlstra <peterz@infradead.org>
> > AuthorDate:    Tue, 30 May 2023 13:20:46 +02:00
> > Committer:     Peter Zijlstra <peterz@infradead.org>
> > CommitterDate: Tue, 30 May 2023 22:46:27 +02:00
> >
> > sched/fair: Multi-LLC select_idle_sibling()
> >
> > Tejun reported that when he targets workqueues towards a specific LLC
> > on his Zen2 machine with 3 cores / LLC and 4 LLCs in total, he gets
> > significant idle time.
> >
> > This is, of course, because of how select_idle_sibling() will not
> > consider anything outside of the local LLC, and since all these tasks
> > are short running the periodic idle load balancer is ineffective.
> >
> > And while it is good to keep work cache local, it is better to not
> > have significant idle time. Therefore, have select_idle_sibling() try
> > other LLCs inside the same node when the local one comes up empty.
> >
> > Reported-by: Tejun Heo <tj@kernel.org>
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> 
> This patch landed in today's linux next-20230605 as commit c5214e13ad60 
> ("sched/fair: Multi-LLC select_idle_sibling()"). Unfortunately it causes 
> regression on my ARM 64bit Exynos5433-based TM2e test board during the 
> CPU hotplug tests. From time to time I get the NULL pointer dereference. 
> Reverting $subject on top of linux-next fixes the issue. Let me know if 
> I can help somehow debugging this issue. Here is a complete log (I've 
> intentionally kept all the stack dumps, although they don't look very 
> relevant...):

Moo... OK, since our friends from AMD need some tuning on this anyway,
i'm going to pull the patch entirely. And we'll try again once they've
sorted out the best way to do this.
  
Peter Zijlstra June 5, 2023, 7:07 p.m. UTC | #21
On Mon, Jun 05, 2023 at 05:25:30PM +0200, Marek Szyprowski wrote:

> Unfortunately it causes 
> regression on my ARM 64bit Exynos5433-based TM2e test board during the 
> CPU hotplug tests. 

Can you elucidate an ARM illiterate on the actual topology of that
machine?


> CPU: 6 PID: 43 Comm: cpuhp/6 Not tainted 6.4.0-rc1+ #13640
> Hardware name: Samsung TM2E board (DT)
> pstate: 000000c5 (nzcv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> pc : __bitmap_and+0x4c/0x78
> lr : select_idle_cpu+0x64/0x450

Btw, where is lr at? Is that perhaps per_cpu(sd_llc) being NULL  or
something?


> > ---
> >   kernel/sched/fair.c     | 38 ++++++++++++++++++++++++++++++++++++++
> >   kernel/sched/features.h |  1 +
> >   2 files changed, 39 insertions(+)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 48b6f0c..0172458 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -7028,6 +7028,38 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
> >   }
> >   
> >   /*
> > + * For the multiple-LLC per node case, make sure to try the other LLC's if the
> > + * local LLC comes up empty.
> > + */
> > +static int
> > +select_idle_node(struct task_struct *p, struct sched_domain *sd, int target)
> > +{
> > +	struct sched_domain *parent = sd->parent;
> > +	struct sched_group *sg;
> > +
> > +	/* Make sure to not cross nodes. */
> > +	if (!parent || parent->flags & SD_NUMA)
> > +		return -1;
> > +
> > +	sg = parent->groups;
> > +	do {
> > +		int cpu = cpumask_first(sched_group_span(sg));
> > +		struct sched_domain *sd_child;
> > +
> > +		sd_child = per_cpu(sd_llc, cpu);

IOW, sd_child end up NULL?

> > +		if (sd_child != sd) {
> > +			int i = select_idle_cpu(p, sd_child, test_idle_cores(cpu), cpu);
> > +			if ((unsigned)i < nr_cpumask_bits)
> > +				return i;
> > +		}
> > +
> > +		sg = sg->next;
> > +	} while (sg != parent->groups);
> > +
> > +	return -1;
> > +}
  
Marek Szyprowski June 5, 2023, 10:20 p.m. UTC | #22
On 05.06.2023 21:07, Peter Zijlstra wrote:
> On Mon, Jun 05, 2023 at 05:25:30PM +0200, Marek Szyprowski wrote:
>
>> Unfortunately it causes
>> regression on my ARM 64bit Exynos5433-based TM2e test board during the
>> CPU hotplug tests.
> Can you elucidate an ARM illiterate on the actual topology of that
> machine?

Please check arch/arm64/boot/dts/exynos/exynos5433.dtsi. This is a typical 
ARM big.LITTLE machine with 4 'big' cores (Cortex-A57 in this case) in 
one cluster and another 4 'LITTLE' cores (Cortex-A53) in the other.


>> CPU: 6 PID: 43 Comm: cpuhp/6 Not tainted 6.4.0-rc1+ #13640
>> Hardware name: Samsung TM2E board (DT)
>> pstate: 000000c5 (nzcv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
>> pc : __bitmap_and+0x4c/0x78
>> lr : select_idle_cpu+0x64/0x450
> Btw, where is lr at? Is that perhaps per_cpu(sd_llc) being NULL  or
> something?

If I get it right:

# aarch64-linux-gnu-objdump -Sld --start-address=0xffff8000080e7064 vmlinux

ffff8000080e7064 <select_idle_cpu>:
...
select_idle_cpu():
kernel/sched/fair.c:6987
                 sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
ffff8000080e70c8:       f8747b21        ldr     x1, [x25, x20, lsl #3]
ffff8000080e70cc:       f0010340        adrp    x0, ffff80000a152000 
<kvm_hyp_ctxt+0x7a0>
ffff8000080e70d0:       91302000        add     x0, x0, #0xc08
ffff8000080e70d4:       f90047e0        str     x0, [sp, #136]
ffff8000080e70d8:       f8616814        ldr     x20, [x0, x1]
ffff8000080e70dc:       9442c570        bl      ffff80000919869c 
<debug_lockdep_rcu_enabled>
ffff8000080e70e0:       350017a0        cbnz    w0, ffff8000080e73d4 
<select_idle_cpu+0x370>

This kvm_hyp_ctxt smells a little bad here, because this board boots 
directly to EL1, so no hyp/kvm is used. Here is the relevant dmesg part:

--->8---
smp: Brought up 1 node, 8 CPUs
SMP: Total of 8 processors activated.
CPU features: detected: 32-bit EL0 Support
CPU features: detected: 32-bit EL1 Support
CPU features: detected: CRC32 instructions
CPU: All CPU(s) started at EL1

--->8---


>>> ---
>>>    kernel/sched/fair.c     | 38 ++++++++++++++++++++++++++++++++++++++
>>>    kernel/sched/features.h |  1 +
>>>    2 files changed, 39 insertions(+)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 48b6f0c..0172458 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -7028,6 +7028,38 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>>>    }
>>>    
>>>    /*
>>> + * For the multiple-LLC per node case, make sure to try the other LLC's if the
>>> + * local LLC comes up empty.
>>> + */
>>> +static int
>>> +select_idle_node(struct task_struct *p, struct sched_domain *sd, int target)
>>> +{
>>> +	struct sched_domain *parent = sd->parent;
>>> +	struct sched_group *sg;
>>> +
>>> +	/* Make sure to not cross nodes. */
>>> +	if (!parent || parent->flags & SD_NUMA)
>>> +		return -1;
>>> +
>>> +	sg = parent->groups;
>>> +	do {
>>> +		int cpu = cpumask_first(sched_group_span(sg));
>>> +		struct sched_domain *sd_child;
>>> +
>>> +		sd_child = per_cpu(sd_llc, cpu);
> IOW, sd_child end up NULL?
>
>>> +		if (sd_child != sd) {
>>> +			int i = select_idle_cpu(p, sd_child, test_idle_cores(cpu), cpu);
>>> +			if ((unsigned)i < nr_cpumask_bits)
>>> +				return i;
>>> +		}
>>> +
>>> +		sg = sg->next;
>>> +	} while (sg != parent->groups);
>>> +
>>> +	return -1;
>>> +}

Best regards
  
Chen Yu June 6, 2023, 7:58 a.m. UTC | #23
On 2023-06-05 at 21:07:46 +0200, Peter Zijlstra wrote:
> On Mon, Jun 05, 2023 at 05:25:30PM +0200, Marek Szyprowski wrote:
> 
> > Unfortunately it causes 
> > regression on my ARM 64bit Exynos5433-based TM2e test board during the 
> > CPU hotplug tests. 
> 
> Can you elucidate an ARM illiterate on the actual topology of that
> machine?
> 
> 
> > CPU: 6 PID: 43 Comm: cpuhp/6 Not tainted 6.4.0-rc1+ #13640
> > Hardware name: Samsung TM2E board (DT)
> > pstate: 000000c5 (nzcv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> > pc : __bitmap_and+0x4c/0x78
> > lr : select_idle_cpu+0x64/0x450
> 
> Btw, where is lr at? Is that perhaps per_cpu(sd_llc) being NULL  or
> something?
> 
> 
> > > ---
> > >   kernel/sched/fair.c     | 38 ++++++++++++++++++++++++++++++++++++++
> > >   kernel/sched/features.h |  1 +
> > >   2 files changed, 39 insertions(+)
> > >
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index 48b6f0c..0172458 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -7028,6 +7028,38 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
> > >   }
> > >   
> > >   /*
> > > + * For the multiple-LLC per node case, make sure to try the other LLC's if the
> > > + * local LLC comes up empty.
> > > + */
> > > +static int
> > > +select_idle_node(struct task_struct *p, struct sched_domain *sd, int target)
> > > +{
> > > +	struct sched_domain *parent = sd->parent;
> > > +	struct sched_group *sg;
> > > +
> > > +	/* Make sure to not cross nodes. */
> > > +	if (!parent || parent->flags & SD_NUMA)
> > > +		return -1;
> > > +
> > > +	sg = parent->groups;
> > > +	do {
> > > +		int cpu = cpumask_first(sched_group_span(sg));
> > > +		struct sched_domain *sd_child;
> > > +
> > > +		sd_child = per_cpu(sd_llc, cpu);
> 
> IOW, sd_child end up NULL?
>
Just wondering if we should use rcu_dereference(per_cpu(sd_llc, cpu)) here,
since CPU offline/online can modify the sd_llc pointer. (We use
rcu_dereference() for sd_llc in other places.)
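
Something like the below, perhaps (completely untested sketch, just to
illustrate the rcu_dereference() idea; whether skipping a NULL sd_llc is
the right fix here is an assumption on my part):

static int
select_idle_node(struct task_struct *p, struct sched_domain *sd, int target)
{
	struct sched_domain *parent = sd->parent;
	struct sched_group *sg;

	/* Make sure to not cross nodes. */
	if (!parent || parent->flags & SD_NUMA)
		return -1;

	sg = parent->groups;
	do {
		int cpu = cpumask_first(sched_group_span(sg));
		struct sched_domain *sd_child;

		/*
		 * sd_llc is updated under RCU when the domains are rebuilt
		 * on CPU hotplug; dereference it properly and skip groups
		 * whose LLC domain is not (or no longer) set up.
		 */
		sd_child = rcu_dereference(per_cpu(sd_llc, cpu));
		if (sd_child && sd_child != sd) {
			int i = select_idle_cpu(p, sd_child, test_idle_cores(cpu), cpu);
			if ((unsigned)i < nr_cpumask_bits)
				return i;
		}

		sg = sg->next;
	} while (sg != parent->groups);

	return -1;
}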

thanks,
Chenyu
  
K Prateek Nayak June 7, 2023, 6:32 p.m. UTC | #24
Hello Peter,

Below are the benchmark results on different NPS modes for SIS_NODE
and SIS_NODE + additional suggested changes. None of them give a
total win. Limit helps but there are cases where it still leads to
regression. I'll leave full details below.

On 6/2/2023 12:24 PM, Peter Zijlstra wrote:
> On Fri, Jun 02, 2023 at 10:43:37AM +0530, K Prateek Nayak wrote:
>> Grouping near-CCX for the offerings that do not have 2CCX per CCD will
>> prevent degeneration and limit the search scope, yes. Here is what I'll
>> do, let me check if limiting search scope helps first, and then start
>> fiddling with the topology. How does that sound?
> 
> So my preference would be the topology based solution, since the search
> limit is random magic numbers that happen to work for 'your' machine but
> who knows what it'll do for some other poor architecture that happens to
> trip this.
> 
> That said; verifying the limit helps at all is of course a good start,
> because if it doesn't then the topology thing will likely also not help
> much.

o NPS Modes

NPS Modes are used to logically divide a single socket into
multiple NUMA regions.
Following is the NUMA configuration for each NPS mode on the system:

NPS1: Each socket is a NUMA node.
    Total 2 NUMA nodes in the dual socket machine.

    Node 0: 0-63,   128-191
    Node 1: 64-127, 192-255

    - 8CCX per node

NPS2: Each socket is further logically divided into 2 NUMA regions.
    Total 4 NUMA nodes exist over 2 sockets.
   
    Node 0: 0-31,   128-159
    Node 1: 32-63,  160-191
    Node 2: 64-95,  192-223
    Node 3: 96-127, 224-255

    - 4 CCX per node

NPS4: Each socket is logically divided into 4 NUMA regions.
    Total 8 NUMA nodes exist over 2 sockets.
   
    Node 0: 0-15,    128-143
    Node 1: 16-31,   144-159
    Node 2: 32-47,   160-175
    Node 3: 48-63,   176-191
    Node 4: 64-79,   192-207
    Node 5: 80-95,   208-223
    Node 6: 96-111,  224-239
    Node 7: 112-127, 240-255

    - 2 CCX per node

  Note:

  - Ideally in NPS2 and NPS4 modes SIS_NODE and SIS_NODE_LIMIT should
    behave similarly.

  - Ideally in NPS4 mode SIS_NODE and SIS_NODE_TOPOEXT should behave
    similarly.

o Kernel Versions

- tip              - tip:sched/core at commit e2a1f85bf9f5 ("sched/psi:
                     Avoid resetting the min update period when it is
                     unnecessary")

- SIS_NODE         - tip:sched/core + this patch

- SIS_NODE_LIMIT   - tip:sched/core + this patch + nr=4 limit for SIS_NODE
		     (https://lore.kernel.org/all/20230601111326.GV4253@hirez.programming.kicks-ass.net/)
		     (a rough sketch of the limit follows this list)

- SIS_NODE_TOPOEXT - tip:sched/core + this patch
                     + new sched domain (Multi-Multi-Core or MMC)
		     (https://lore.kernel.org/all/20230601153522.GB559993@hirez.programming.kicks-ass.net/)
		     MMC domain groups 2 nearby CCX.
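
For reference, the "nr = 4" limit used for SIS_NODE_LIMIT caps how many
sibling LLCs select_idle_node() probes before giving up. In rough terms the
scan loop becomes something like the below (illustrative sketch only; the
actual patch is the one behind the lore link above):

	int nr = 4;	/* declared at the top of select_idle_node() */

	sg = parent->groups;
	do {
		int cpu = cpumask_first(sched_group_span(sg));
		struct sched_domain *sd_child;

		sd_child = per_cpu(sd_llc, cpu);
		if (sd_child != sd) {
			int i = select_idle_cpu(p, sd_child, test_idle_cores(cpu), cpu);
			if ((unsigned)i < nr_cpumask_bits)
				return i;

			/* Bail out after probing a handful of other LLCs. */
			if (!--nr)
				return -1;
		}

		sg = sg->next;
	} while (sg != parent->groups);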

o Benchmark Results

Note: All benchmarks were run with boost enabled and C2 disabled.

~~~~~~~~~~~~~
~ hackbench ~
~~~~~~~~~~~~~

o NPS1

Test:                   tip                     SIS_NODE           SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
 1-groups:         3.92 (0.00 pct)         4.05 (-3.31 pct)        3.78 (3.57 pct)         3.77 (3.82 pct)
 2-groups:         4.58 (0.00 pct)         3.84 (16.15 pct)        4.50 (1.74 pct)         4.34 (5.24 pct)
 4-groups:         4.99 (0.00 pct)         3.98 (20.24 pct)        4.93 (1.20 pct)         5.01 (-0.40 pct)
 8-groups:         5.67 (0.00 pct)         6.05 (-6.70 pct)        5.73 (-1.05 pct)        5.95 (-4.93 pct)
16-groups:         7.88 (0.00 pct)        10.56 (-34.01 pct)       7.83 (0.63 pct)         8.04 (-2.03 pct)

o NPS2

Test:                   tip                     SIS_NODE           SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
 1-groups:         3.82 (0.00 pct)         3.68 (3.66 pct)         3.87 (-1.30 pct)        3.74 (2.09 pct)
 2-groups:         4.40 (0.00 pct)         3.61 (17.95 pct)        4.45 (-1.13 pct)        4.30 (2.27 pct)
 4-groups:         4.84 (0.00 pct)         3.62 (25.20 pct)        4.84 (0.00 pct)         4.97 (-2.68 pct)
 8-groups:         5.45 (0.00 pct)         6.14 (-12.66 pct)       5.40 (0.91 pct)         5.68 (-4.22 pct)
16-groups:         6.94 (0.00 pct)         8.77 (-26.36 pct)       6.57 (5.33 pct)         7.87 (-13.40 pct)

o NPS4

Test:                   tip                     SIS_NODE           SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
 1-groups:         3.82 (0.00 pct)         3.84 (-0.52 pct)        3.83 (-0.26 pct)        3.85 (-0.78 pct)
 2-groups:         4.44 (0.00 pct)         4.15 (6.53 pct)         4.43 (0.22 pct)         4.18 (5.85 pct)
 4-groups:         4.86 (0.00 pct)         4.95 (-1.85 pct)        4.88 (-0.41 pct)        4.79 (1.44 pct)
 8-groups:         5.42 (0.00 pct)         5.80 (-7.01 pct)        5.41 (0.18 pct)         5.75 (-6.08 pct)
16-groups:         6.68 (0.00 pct)         9.07 (-35.77 pct)       6.72 (-0.59 pct)        8.66 (-29.64 pct)


~~~~~~~~~~~~~~~~~~
~ schbench (Old) ~
~~~~~~~~~~~~~~~~~~

o NPS1

#workers:   tip                     SIS_NODE             SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
1:      26.00 (0.00 pct)        24.00 (7.69 pct)        26.00 (0.00 pct)        20.00 (23.07 pct)
2:      27.00 (0.00 pct)        24.00 (11.11 pct)       25.00 (7.40 pct)        26.00 (3.70 pct)
4:      31.00 (0.00 pct)        28.00 (9.67 pct)        30.00 (3.22 pct)        28.00 (9.67 pct)
8:      36.00 (0.00 pct)        33.00 (8.33 pct)        34.00 (5.55 pct)        34.00 (5.55 pct)
16:      49.00 (0.00 pct)        47.00 (4.08 pct)        50.00 (-2.04 pct)       50.00 (-2.04 pct)
32:      80.00 (0.00 pct)        81.00 (-1.25 pct)       80.00 (0.00 pct)        81.00 (-1.25 pct)
64:     169.00 (0.00 pct)       169.00 (0.00 pct)       177.00 (-4.73 pct)      177.00 (-4.73 pct)
128:     343.00 (0.00 pct)       365.00 (-6.41 pct)      341.00 (0.58 pct)       336.00 (2.04 pct)
256:     42048.00 (0.00 pct)     35392.00 (15.82 pct)    45888.00 (-9.13 pct)    48576.00 (-15.52 pct)
512:     95104.00 (0.00 pct)     88704.00 (6.72 pct)     92032.00 (3.23 pct)     89984.00 (5.38 pct)

o NPS2

#workers:   tip                     SIS_NODE             SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
1:      23.00 (0.00 pct)        24.00 (-4.34 pct)       25.00 (-8.69 pct)       21.00 (8.69 pct)
2:      24.00 (0.00 pct)        24.00 (0.00 pct)        27.00 (-12.50 pct)      28.00 (-16.66 pct)
4:      31.00 (0.00 pct)        26.00 (16.12 pct)       26.00 (16.12 pct)       29.00 (6.45 pct)
8:      41.00 (0.00 pct)        38.00 (7.31 pct)        40.00 (2.43 pct)        38.00 (7.31 pct)
16:      48.00 (0.00 pct)        49.00 (-2.08 pct)       51.00 (-6.25 pct)       53.00 (-10.41 pct)
32:      81.00 (0.00 pct)        84.00 (-3.70 pct)       81.00 (0.00 pct)        86.00 (-6.17 pct)
64:     157.00 (0.00 pct)       169.00 (-7.64 pct)      171.00 (-8.91 pct)      172.00 (-9.55 pct)
128:     386.00 (0.00 pct)       400.00 (-3.62 pct)      436.00 (-12.95 pct)     384.00 (0.51 pct)
256:     48832.00 (0.00 pct)     44480.00 (8.91 pct)     48704.00 (0.26 pct)     48576.00 (0.52 pct)
512:     92032.00 (0.00 pct)     89472.00 (2.78 pct)     91776.00 (0.27 pct)     91008.00 (1.11 pct)

o NPS4

#workers:   tip                     SIS_NODE             SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
1:      21.00 (0.00 pct)        24.00 (-14.28 pct)      17.00 (19.04 pct)       22.00 (-4.76 pct)
2:      28.00 (0.00 pct)        24.00 (14.28 pct)       28.00 (0.00 pct)        28.00 (0.00 pct)
4:      32.00 (0.00 pct)        29.00 (9.37 pct)        28.00 (12.50 pct)       30.00 (6.25 pct)
8:      46.00 (0.00 pct)        43.00 (6.52 pct)        43.00 (6.52 pct)        42.00 (8.69 pct)
16:     51.00 (0.00 pct)        53.00 (-3.92 pct)       53.00 (-3.92 pct)       56.00 (-9.80 pct)
32:     82.00 (0.00 pct)        81.00 (1.21 pct)        83.00 (-1.21 pct)       83.00 (-1.21 pct)
64:     173.00 (0.00 pct)       172.00 (0.57 pct)       177.00 (-2.31 pct)      155.00 (10.40 pct)
128:    396.00 (0.00 pct)       384.00 (3.03 pct)       360.00 (9.09 pct)       386.00 (2.52 pct)
256:    48832.00 (0.00 pct)     46656.00 (4.45 pct)     49728.00 (-1.83 pct)    49472.00 (-1.31 pct)
512:    95104.00 (0.00 pct)     90752.00 (4.57 pct)     92544.00 (2.69 pct)     90496.00 (4.84 pct)


~~~~~~~~~~
~ tbench ~
~~~~~~~~~~

o NPS1

Clients:      tip                     SIS_NODE             SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
    1    452.49 (0.00 pct)       457.94 (1.20 pct)       458.13 (1.24 pct)       447.69 (-1.06 pct)
    2    862.44 (0.00 pct)       879.99 (2.03 pct)       881.19 (2.17 pct)       855.91 (-0.75 pct)
    4    1604.27 (0.00 pct)      1618.87 (0.91 pct)      1628.00 (1.47 pct)      1627.14 (1.42 pct)
    8    2966.77 (0.00 pct)      3040.90 (2.49 pct)      3037.70 (2.39 pct)      2957.91 (-0.29 pct)
   16    5176.70 (0.00 pct)      5292.29 (2.23 pct)      5445.15 (5.18 pct)      5241.61 (1.25 pct)
   32    8205.24 (0.00 pct)      8949.12 (9.06 pct)      8716.02 (6.22 pct)      8494.17 (3.52 pct)
   64    13956.71 (0.00 pct)     14461.42 (3.61 pct)     13620.04 (-2.41 pct)    15045.43 (7.80 pct)
  128    24005.50 (0.00 pct)     26052.75 (8.52 pct)     24975.03 (4.03 pct)     24008.73 (0.01 pct)
  256    32457.61 (0.00 pct)     21999.41 (-32.22 pct)   30810.93 (-5.07 pct)    31060.12 (-4.30 pct)
  512    34345.24 (0.00 pct)     41166.39 (19.86 pct)    30982.94 (-9.78 pct)    31864.14 (-7.22 pct)
 1024    33432.92 (0.00 pct)     40900.84 (22.33 pct)    30953.61 (-7.41 pct)    32006.81 (-4.26 pct)

o NPS2

Clients:      tip                     SIS_NODE             SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
    1    453.73 (0.00 pct)       451.63 (-0.46 pct)      455.97 (0.49 pct)       453.79 (0.01 pct)
    2    861.71 (0.00 pct)       857.85 (-0.44 pct)      868.30 (0.76 pct)       850.14 (-1.34 pct)
    4    1599.14 (0.00 pct)      1609.30 (0.63 pct)      1656.08 (3.56 pct)      1619.10 (1.24 pct)
    8    2951.03 (0.00 pct)      2944.71 (-0.21 pct)     3034.38 (2.82 pct)      2973.52 (0.76 pct)
   16    5080.32 (0.00 pct)      5160.39 (1.57 pct)      5173.32 (1.83 pct)      5150.99 (1.39 pct)
   32    7900.41 (0.00 pct)      8039.13 (1.75 pct)      8105.69 (2.59 pct)      7956.45 (0.70 pct)
   64    14629.65 (0.00 pct)     15391.08 (5.20 pct)     14546.09 (-0.57 pct)    15410.41 (5.33 pct)
  128    23155.88 (0.00 pct)     24015.45 (3.71 pct)     24263.82 (4.78 pct)     23351.35 (0.84 pct)
  256    33449.57 (0.00 pct)     33571.08 (0.36 pct)     32048.20 (-4.18 pct)    32869.85 (-1.73 pct)
  512    33757.47 (0.00 pct)     39872.69 (18.11 pct)    32945.66 (-2.40 pct)    34526.17 (2.27 pct)
 1024    34823.14 (0.00 pct)     41090.15 (17.99 pct)    32404.40 (-6.94 pct)    34522.97 (-0.86 pct)

o NPS4

Clients:      tip                     SIS_NODE             SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
    1    450.14 (0.00 pct)       454.46 (0.95 pct)       454.53 (0.97 pct)       451.43 (0.28 pct)
    2    863.26 (0.00 pct)       868.94 (0.65 pct)       891.89 (3.31 pct)       866.74 (0.40 pct)
    4    1618.71 (0.00 pct)      1599.13 (-1.20 pct)     1630.29 (0.71 pct)      1610.08 (-0.53 pct)
    8    2929.35 (0.00 pct)      3065.12 (4.63 pct)      3064.15 (4.60 pct)      3004.74 (2.57 pct)
   16    5114.04 (0.00 pct)      5261.40 (2.88 pct)      5238.04 (2.42 pct)      5108.53 (-0.10 pct)
   32    7912.18 (0.00 pct)      8926.77 (12.82 pct)     8382.51 (5.94 pct)      8214.73 (3.82 pct)
   64    14424.72 (0.00 pct)     14853.61 (2.97 pct)     14273.54 (-1.04 pct)    14430.17 (0.03 pct)
  128    23614.97 (0.00 pct)     24506.73 (3.77 pct)     24517.76 (3.82 pct)     23296.38 (-1.34 pct)
  256    34365.13 (0.00 pct)     35538.42 (3.41 pct)     31909.66 (-7.14 pct)    31009.12 (-9.76 pct)
  512    34215.50 (0.00 pct)     36017.49 (5.26 pct)     32696.70 (-4.43 pct)    33262.55 (-2.78 pct)
 1024    35421.90 (0.00 pct)     35193.81 (-0.64 pct)    32611.10 (-7.93 pct)    32795.86 (-7.41 pct)


~~~~~~~~~~
~ stream ~
~~~~~~~~~~

- 10 Runs

o NPS1

Test:         tip                     SIS_NODE             SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
 Copy:   271317.35 (0.00 pct)    292440.22 (7.78 pct)    302540.26 (11.50 pct)   287277.25 (5.88 pct)
Scale:   205533.77 (0.00 pct)    203362.60 (-1.05 pct)   207750.30 (1.07 pct)    205206.26 (-0.15 pct)
  Add:   221624.62 (0.00 pct)    225850.83 (1.90 pct)    233782.14 (5.48 pct)    229774.48 (3.67 pct)
Triad:   228500.68 (0.00 pct)    225885.25 (-1.14 pct)   238331.69 (4.30 pct)    240041.53 (5.05 pct)

o NPS2

Test:         tip                     SIS_NODE             SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
 Copy:   277761.29 (0.00 pct)    301816.34 (8.66 pct)    293563.58 (5.68 pct)    308218.80 (10.96 pct)
Scale:   215193.83 (0.00 pct)    212522.72 (-1.24 pct)   215758.66 (0.26 pct)    205678.94 (-4.42 pct)
  Add:   242725.75 (0.00 pct)    242695.13 (-0.01 pct)   246472.20 (1.54 pct)    238089.46 (-1.91 pct)
Triad:   237253.44 (0.00 pct)    250618.57 (5.63 pct)    239405.55 (0.90 pct)    249652.73 (5.22 pct)

o NPS4

Test:         tip                     SIS_NODE             SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
 Copy:   273307.14 (0.00 pct)    255091.78 (-6.66 pct)   301926.68 (10.47 pct)   262007.26 (-4.13 pct)
Scale:   235715.23 (0.00 pct)    222018.36 (-5.81 pct)   224881.52 (-4.59 pct)   222282.64 (-5.69 pct)
  Add:   244500.40 (0.00 pct)    230468.21 (-5.73 pct)   242625.18 (-0.76 pct)   227146.80 (-7.09 pct)
Triad:   250600.04 (0.00 pct)    236229.50 (-5.73 pct)   258064.49 (2.97 pct)    231772.02 (-7.51 pct)

- 100 Runs

o NPS1

Test:         tip                     SIS_NODE             SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
 Copy:   317381.65 (0.00 pct)    318827.08 (0.45 pct)    320898.32 (1.10 pct)    318922.96 (0.48 pct)
Scale:   214145.00 (0.00 pct)    206213.69 (-3.70 pct)   211019.12 (-1.45 pct)   210384.47 (-1.75 pct)
  Add:   239243.29 (0.00 pct)    229791.67 (-3.95 pct)   233827.11 (-2.26 pct)   236659.48 (-1.07 pct)
Triad:   249477.76 (0.00 pct)    236843.06 (-5.06 pct)   244688.91 (-1.91 pct)   235990.67 (-5.40 pct)

o NPS2

Test:         tip                     SIS_NODE             SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
 Copy:   318082.10 (0.00 pct)    322844.91 (1.49 pct)    310350.21 (-2.43 pct)   322495.84 (1.38 pct)
Scale:   219338.56 (0.00 pct)    218139.90 (-0.54 pct)   212288.47 (-3.21 pct)   221040.27 (0.77 pct)
  Add:   248118.20 (0.00 pct)    249826.98 (0.68 pct)    239682.55 (-3.39 pct)   253006.79 (1.97 pct)
Triad:   247088.55 (0.00 pct)    260488.38 (5.42 pct)    247892.42 (0.32 pct)    249081.33 (0.80 pct)

o NPS4

Test:         tip                     SIS_NODE             SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
 Copy:   345396.19 (0.00 pct)    343675.74 (-0.49 pct)   346990.96 (0.46 pct)    334677.55 (-3.10 pct)
Scale:   241521.63 (0.00 pct)    231494.70 (-4.15 pct)   236233.18 (-2.18 pct)   229159.01 (-5.11 pct)
  Add:   261157.86 (0.00 pct)    249663.86 (-4.40 pct)   253402.85 (-2.96 pct)   242257.98 (-7.23 pct)
Triad:   267804.99 (0.00 pct)    263071.00 (-1.76 pct)   264208.15 (-1.34 pct)   256978.50 (-4.04 pct)

~~~~~~~~~~~
~ netperf ~
~~~~~~~~~~~

o NPS1

                        tip                  SIS_NODE              SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
1-clients:       102839.97 (0.00 pct)    103540.33 (0.68 pct)    103769.74 (0.90 pct)    103271.77 (0.41 pct)
2-clients:       98428.08 (0.00 pct)     100431.67 (2.03 pct)    100555.62 (2.16 pct)    100417.11 (2.02 pct)
4-clients:       92298.45 (0.00 pct)     94800.51 (2.71 pct)     93706.09 (1.52 pct)     94981.10 (2.90 pct)
8-clients:       85618.41 (0.00 pct)     89130.14 (4.10 pct)     87677.84 (2.40 pct)     88284.61 (3.11 pct)
16-clients:      78722.18 (0.00 pct)     79715.38 (1.26 pct)     80488.76 (2.24 pct)     78980.88 (0.32 pct)
32-clients:      73610.75 (0.00 pct)     72801.41 (-1.09 pct)    72167.43 (-1.96 pct)    75077.55 (1.99 pct)
64-clients:      55285.07 (0.00 pct)     56184.38 (1.62 pct)     56443.79 (2.09 pct)     60689.05 (9.77 pct)
128-clients:     31176.92 (0.00 pct)     32830.06 (5.30 pct)     35511.93 (13.90 pct)    35638.50 (14.31 pct)
256-clients:     20011.44 (0.00 pct)     15135.39 (-24.36 pct)   17599.21 (-12.05 pct)   18219.29 (-8.95 pct)

o NPS2

                        tip                  SIS_NODE              SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
1-clients:       103105.55 (0.00 pct)    101582.75 (-1.47 pct)   103077.22 (-0.02 pct)   102233.63 (-0.84 pct)
2-clients:       98720.29 (0.00 pct)     98537.46 (-0.18 pct)    100761.54 (2.06 pct)    99211.39 (0.49 pct)
4-clients:       92289.39 (0.00 pct)     94332.45 (2.21 pct)     93622.46 (1.44 pct)     93321.77 (1.11 pct)
8-clients:       84998.63 (0.00 pct)     87180.90 (2.56 pct)     86970.84 (2.32 pct)     86076.75 (1.26 pct)
16-clients:      76395.81 (0.00 pct)     80017.06 (4.74 pct)     77937.29 (2.01 pct)     75090.85 (-1.70 pct)
32-clients:      71110.89 (0.00 pct)     69445.86 (-2.34 pct)    69273.81 (-2.58 pct)    66885.99 (-5.94 pct)
64-clients:      49526.21 (0.00 pct)     50004.13 (0.96 pct)     51649.09 (4.28 pct)     51100.52 (3.17 pct)
128-clients:     27917.51 (0.00 pct)     30581.70 (9.54 pct)     31587.40 (13.14 pct)    33477.65 (19.91 pct)
256-clients:     20067.17 (0.00 pct)     26002.42 (29.57 pct)    18681.28 (-6.90 pct)    18144.96 (-9.57 pct)

o NPS4

                        tip                  SIS_NODE              SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
1-clients:       102139.49 (0.00 pct)    103578.02 (1.40 pct)    103633.90 (1.46 pct)    101656.07 (-0.47 pct)
2-clients:       98259.53 (0.00 pct)     99336.70 (1.09 pct)     99720.37 (1.48 pct)     98812.86 (0.56 pct)
4-clients:       91576.79 (0.00 pct)     95278.30 (4.04 pct)     93688.37 (2.30 pct)     93848.94 (2.48 pct)
8-clients:       84742.30 (0.00 pct)     89005.65 (5.03 pct)     87703.04 (3.49 pct)     86709.29 (2.32 pct)
16-clients:      79540.75 (0.00 pct)     85478.97 (7.46 pct)     83195.92 (4.59 pct)     81016.24 (1.85 pct)
32-clients:      71166.14 (0.00 pct)     74254.01 (4.33 pct)     72422.76 (1.76 pct)     71391.62 (0.31 pct)
64-clients:      51763.24 (0.00 pct)     52565.56 (1.54 pct)     55159.65 (6.56 pct)     52472.91 (1.37 pct)
128-clients:     27829.29 (0.00 pct)     35774.61 (28.55 pct)    33738.97 (21.23 pct)    34564.10 (24.20 pct)
256-clients:     24185.37 (0.00 pct)     27215.35 (12.52 pct)    17675.87 (-26.91 pct)   24937.66 (3.11 pct)


~~~~~~~~~~~~~~~~
~ ycsb-mongodb ~
~~~~~~~~~~~~~~~~

o NPS1

tip:			131070.33 (var: 2.84%)
SIS_NODE:		131070.33 (var: 2.84%) (0.00%)
SIS_NODE_LIMIT:		137227.00 (var: 4.97%) (4.69%)
SIS_NODE_TOPOEXT:	133529.67 (var: 0.98%) (1.87%)

o NPS2

tip:			133693.67 (var: 1.69%)
SIS_NODE:		134173.00 (var: 4.07%) (0.35%)
SIS_NODE_LIMIT:		134124.67 (var: 2.20%) (0.32%)
SIS_NODE_TOPOEXT:	133747.33 (var: 2.49%) (0.04%)

o NPS4

tip:			132913.67 (var: 1.97%)
SIS_NODE:		133697.33 (var: 1.69%) (0.58%)
SIS_NODE_LIMIT:		133307.33 (var: 1.03%) (0.29%)
SIS_NODE_TOPOEXT:	133426.67 (var: 3.60%) (0.38%)

~~~~~~~~~~~~~
~ unixbench ~
~~~~~~~~~~~~~

o NPS1

kernel                        			tip                  SIS_NODE               SIS_NODE_LIMIT            SIS_NODE_TOPOEXT
Hmean     unixbench-dhry2reg-1   	  41322625.19 (   0.00%)    41224388.33 (  -0.24%)    41142898.66 (  -0.43%)    41222168.97 (  -0.24%)
Hmean     unixbench-dhry2reg-512	6252491108.60 (   0.00%)  6240160851.68 (  -0.20%)  6262714194.10 (   0.16%)  6259553403.67 (   0.11%)
Amean     unixbench-syscall-1    	   2501398.27 (   0.00%)    2577323.43 *  -3.04%*      2498697.20 (   0.11%)     2541279.77 *  -1.59%*
Amean     unixbench-syscall-512  	   8120524.00 (   0.00%)    7512955.87 *   7.48%*      7447849.67 *   8.28%*     7477129.17 *   7.92%*
Hmean     unixbench-pipe-1    		   2359346.02 (   0.00%)    2392308.62 *   1.40%*      2407625.04 *   2.05%*     2334146.94 *  -1.07%*
Hmean     unixbench-pipe-512		 338790322.61 (   0.00%)  337711432.92 (  -0.32%)    340399941.24 (   0.48%)   339008490.26 (   0.06%)
Hmean     unixbench-spawn-1       	      4261.52 (   0.00%)       4164.90 (  -2.27%)         4929.26 *  15.67%*        5111.16 *  19.94%*
Hmean     unixbench-spawn-512    	     64328.93 (   0.00%)      62257.64 *  -3.22%*        63740.04 *  -0.92%*       63291.18 *  -1.61%*
Hmean     unixbench-execl-1       	      3677.73 (   0.00%)       3652.08 (  -0.70%)         3642.56 *  -0.96%*        3671.98 (  -0.16%)
Hmean     unixbench-execl-512    	     11984.83 (   0.00%)      13585.65 *  13.36%*        12496.80 (   4.27%)       12306.01 (   2.68%)

o NPS2

kernel                        			tip                  SIS_NODE               SIS_NODE_LIMIT            SIS_NODE_TOPOEXT
Hmean     unixbench-dhry2reg-1   	  41311787.29 (   0.00%)    41412946.27 (   0.24%)    41035150.98 (  -0.67%)    41371003.93 (   0.14%)
Hmean     unixbench-dhry2reg-512	6243873272.76 (   0.00%)  6256893083.32 (   0.21%)  6236751880.89 (  -0.11%)  6235047089.83 (  -0.14%)
Amean     unixbench-syscall-1    	   2503190.70 (   0.00%)     2576854.30 *  -2.94%*     2496464.80 *   0.27%*     2540298.77 *  -1.48%*
Amean     unixbench-syscall-512  	   8012388.13 (   0.00%)     7503196.87 *   6.36%*     7493284.60 *   6.48%*     7495117.73 *   6.46%*
Hmean     unixbench-pipe-1    		   2340486.25 (   0.00%)     2388946.63 (   2.07%)     2412344.33 *   3.07%*     2360277.30 (   0.85%)
Hmean     unixbench-pipe-512		 338965319.79 (   0.00%)   337225630.07 (  -0.51%)   339053027.04 (   0.03%)   336939353.18 *  -0.60%*
Hmean     unixbench-spawn-1      	      5241.83 (   0.00%)        5246.00 (   0.08%)        4718.45 *  -9.98%*        4967.96 *  -5.22%*
Hmean     unixbench-spawn-512    	     65799.86 (   0.00%)       64817.15 *  -1.49%*       66418.37 (   0.94%)       66820.63 *   1.55%*
Hmean     unixbench-execl-1       	      3670.65 (   0.00%)        3622.36 *  -1.32%*        3661.04 (  -0.26%)        3660.08 (  -0.29%)
Hmean     unixbench-execl-512    	     13682.00 (   0.00%)       13699.90 (   0.13%)       14103.91 (   3.08%)       12960.11 (  -5.28%)

o NPS4

kernel                        			tip                  SIS_NODE               SIS_NODE_LIMIT            SIS_NODE_TOPOEXT
Hmean     unixbench-dhry2reg-1   	  41025577.99 (   0.00%)    40879469.78 (  -0.36%)    41082700.61 (   0.14%)    41260407.54 (   0.57%)
Hmean     unixbench-dhry2reg-512	6255568261.91 (   0.00%)  6258326086.80 (   0.04%)  6252223940.32 (  -0.05%)  6259088809.43 (   0.06%)
Amean     unixbench-syscall-1    	   2507165.37 (   0.00%)    2579108.77 *  -2.87%*      2488617.40 *   0.74%*     2517574.40 (  -0.42%)
Amean     unixbench-syscall-512  	   7458476.50 (   0.00%)    7502528.67 *  -0.59%*      7978379.53 *  -6.97%*     7580369.27 *  -1.63%*
Hmean     unixbench-pipe-1    		   2369301.21 (   0.00%)    2392905.29 *   1.00%*      2410432.93 *   1.74%*     2347814.20 (  -0.91%)
Hmean     unixbench-pipe-512		 340299405.72 (   0.00%)  339139980.01 *  -0.34%*    340403992.95 (   0.03%)   338708678.82 *  -0.47%*
Hmean     unixbench-spawn-1      	      5571.78 (   0.00%)       5423.03 (  -2.67%)         5462.82 (  -1.96%)        5543.08 (  -0.52%)
Hmean     unixbench-spawn-512   	     63999.96 (   0.00%)      63485.41 (  -0.80%)        64730.98 *   1.14%*       67486.34 *   5.45%*
Hmean     unixbench-execl-1       	      3587.15 (   0.00%)       3624.44 *   1.04%*         3638.74 *   1.44%*        3639.57 *   1.46%*
Hmean     unixbench-execl-512    	     14184.17 (   0.00%)      13784.17 (  -2.82%)        13104.71 *  -7.61%*       13598.22 (  -4.13%)

~~~~~~~~~~~~~~~~~~
~ DeathStarBench ~
~~~~~~~~~~~~~~~~~~

o NPS1
CCD	Scaling	       tip    SIS_NODE 		SIS_NODE_LIMIT 		SIS_NODE_TOPOEXT
1	1		0%      0.30%  		 	0.83%  		 	0.79%
1	1		0%      0.17%  		 	2.53%  		 	0.91%
1	1		0%      -0.40% 		 	2.90%  		 	1.61%
1	1		0%      -7.95% 		 	1.19%  		 	-1.56%

o NPS2

CCD	Scaling	       tip    SIS_NODE 		SIS_NODE_LIMIT 		SIS_NODE_TOPOEXT
1	1		0%      0.34%  		 	-0.73% 		 	-0.62%
1	1		0%      -0.02% 		 	0.14%  		 	-1.15%
1	1		0%      -12.34%		 	-9.64% 		 	-7.80%
1	1		0%      -12.41%		 	-1.03% 		 	-9.85%

Note: In NPS2, 8 CCD case shows 10% run to run variation.

o NPS4

CCD	Scaling	       tip    SIS_NODE 		SIS_NODE_LIMIT 		SIS_NODE_TOPOEXT
1	1		0%      -1.32% 		 	-0.71% 		 	-1.09%
1	1		0%      -1.53% 		 	-1.11% 		 	-1.73%
1	1		0%      7.19%  		 	-3.47% 		 	5.75%
1	1		0%      -4.66% 		 	-1.91% 		 	-7.52%

--
If you would like me to collect any more information during any of the
above benchmark runs, please let me know.

--
Thanks and Regards,
Prateek
  
Peter Zijlstra June 13, 2023, 8:25 a.m. UTC | #25
On Thu, Jun 08, 2023 at 12:02:15AM +0530, K Prateek Nayak wrote:
> Hello Peter,
> 
> Below are the benchmark results on different NPS modes for SIS_NODE
> and SIS_NODE + additional suggested changes. None of them give a
> total win. Limit helps but there are cases where it still leads to
> regression. I'll leave full details below.
> 
> On 6/2/2023 12:24 PM, Peter Zijlstra wrote:
> > On Fri, Jun 02, 2023 at 10:43:37AM +0530, K Prateek Nayak wrote:
> >> Grouping near-CCX for the offerings that do not have 2CCX per CCD will
> >> prevent degeneration and limit the search scope, yes. Here is what I'll
> >> do, let me check if limiting search scope helps first, and then start
> >> fiddling with the topology. How does that sound?
> > 
> > So my preference would be the topology based solution, since the search
> > limit is random magic numbers that happen to work for 'your' machine but
> > who knows what it'll do for some other poor architecture that happens to
> > trip this.
> > 
> > That said; verifying the limit helps at all is of course a good start,
> > because if it doesn't then the topology thing will likely also not help
> > much.
> 
> o NPS Modes
> 
> NPS Modes are used to logically divide single socket into
> multiple NUMA region.
> Following is the NUMA configuration for each NPS mode on the system:
> 
> NPS1: Each socket is a NUMA node.
>     Total 2 NUMA nodes in the dual socket machine.
> 
>     Node 0: 0-63,   128-191
>     Node 1: 64-127, 192-255
> 
>     - 8CCX per node

Ok, so this is a dual-socket Zen3 with 64 cores per socket, right?


> o Kernel Versions
> 
> - tip              - tip:sched/core at commit e2a1f85bf9f5 "sched/psi:
>                      Avoid resetting the min update period when it is
>                      unnecessary")
> 
> - SIS_NODE         - tip:sched/core + this patch
> 
> - SIS_NODE_LIMIT   - tip:sched/core + this patch + nr=4 limit for SIS_NODE
> 		     (https://lore.kernel.org/all/20230601111326.GV4253@hirez.programming.kicks-ass.net/)
> 
> - SIS_NODE_TOPOEXT - tip:sched/core + this patch
>                      + new sched domain (Multi-Multi-Core or MMC)
> 		     (https://lore.kernel.org/all/20230601153522.GB559993@hirez.programming.kicks-ass.net/)
> 		     MMC domain groups 2 nearby CCX.

OK, so you managed to get the NPS4 topology in NPS1 mode?

> o Benchmark Results
> 
> Note: All benchmarks were run with boost enabled and C2 disabled.
> 
> ~~~~~~~~~~~~~
> ~ hackbench ~
> ~~~~~~~~~~~~~
> 
> o NPS1
> 
> Test:                   tip                     SIS_NODE           SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
>  1-groups:         3.92 (0.00 pct)         4.05 (-3.31 pct)        3.78 (3.57 pct)         3.77 (3.82 pct)
>  2-groups:         4.58 (0.00 pct)         3.84 (16.15 pct)        4.50 (1.74 pct)         4.34 (5.24 pct)
>  4-groups:         4.99 (0.00 pct)         3.98 (20.24 pct)        4.93 (1.20 pct)         5.01 (-0.40 pct)
>  8-groups:         5.67 (0.00 pct)         6.05 (-6.70 pct)        5.73 (-1.05 pct)        5.95 (-4.93 pct)
> 16-groups:         7.88 (0.00 pct)        10.56 (-34.01 pct)       7.83 (0.63 pct)         8.04 (-2.03 pct)
> 
> o NPS2
> 
> Test:                   tip                     SIS_NODE           SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
>  1-groups:         3.82 (0.00 pct)         3.68 (3.66 pct)         3.87 (-1.30 pct)        3.74 (2.09 pct)
>  2-groups:         4.40 (0.00 pct)         3.61 (17.95 pct)        4.45 (-1.13 pct)        4.30 (2.27 pct)
>  4-groups:         4.84 (0.00 pct)         3.62 (25.20 pct)        4.84 (0.00 pct)         4.97 (-2.68 pct)
>  8-groups:         5.45 (0.00 pct)         6.14 (-12.66 pct)       5.40 (0.91 pct)         5.68 (-4.22 pct)
> 16-groups:         6.94 (0.00 pct)         8.77 (-26.36 pct)       6.57 (5.33 pct)         7.87 (-13.40 pct)
> 
> o NPS4
> 
> Test:                   tip                     SIS_NODE           SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
>  1-groups:         3.82 (0.00 pct)         3.84 (-0.52 pct)        3.83 (-0.26 pct)        3.85 (-0.78 pct)
>  2-groups:         4.44 (0.00 pct)         4.15 (6.53 pct)         4.43 (0.22 pct)         4.18 (5.85 pct)
>  4-groups:         4.86 (0.00 pct)         4.95 (-1.85 pct)        4.88 (-0.41 pct)        4.79 (1.44 pct)
>  8-groups:         5.42 (0.00 pct)         5.80 (-7.01 pct)        5.41 (0.18 pct)         5.75 (-6.08 pct)
> 16-groups:         6.68 (0.00 pct)         9.07 (-35.77 pct)       6.72 (-0.59 pct)        8.66 (-29.64 pct)

Win for NODE_LIMIT for having the least regressions, but also no real
gains.

Given NODE_TOPO does NPS4, which should be roughly similar to limit=2, it
should do 'better', but it doesn't; it's markedly worse... weird.

In fact, none of the NPS4 numbers make any sense: if you've already
split the whole thing into 4, you remain with 2 CCXs per node, and
NODE should be NODE_LIMIT should be NODE_TOPO.

All the NODE variants should end up scanning both CCXs and performance
should really be the same.

Something's wrong there.


> ~~~~~~~~~~
> ~ tbench ~
> ~~~~~~~~~~
> 
> o NPS1
> 
> Clients:      tip                     SIS_NODE             SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
>     1    452.49 (0.00 pct)       457.94 (1.20 pct)       458.13 (1.24 pct)       447.69 (-1.06 pct)
>     2    862.44 (0.00 pct)       879.99 (2.03 pct)       881.19 (2.17 pct)       855.91 (-0.75 pct)
>     4    1604.27 (0.00 pct)      1618.87 (0.91 pct)      1628.00 (1.47 pct)      1627.14 (1.42 pct)
>     8    2966.77 (0.00 pct)      3040.90 (2.49 pct)      3037.70 (2.39 pct)      2957.91 (-0.29 pct)
>    16    5176.70 (0.00 pct)      5292.29 (2.23 pct)      5445.15 (5.18 pct)      5241.61 (1.25 pct)
>    32    8205.24 (0.00 pct)      8949.12 (9.06 pct)      8716.02 (6.22 pct)      8494.17 (3.52 pct)
>    64    13956.71 (0.00 pct)     14461.42 (3.61 pct)     13620.04 (-2.41 pct)    15045.43 (7.80 pct)
>   128    24005.50 (0.00 pct)     26052.75 (8.52 pct)     24975.03 (4.03 pct)     24008.73 (0.01 pct)
>   256    32457.61 (0.00 pct)     21999.41 (-32.22 pct)   30810.93 (-5.07 pct)    31060.12 (-4.30 pct)
>   512    34345.24 (0.00 pct)     41166.39 (19.86 pct)    30982.94 (-9.78 pct)    31864.14 (-7.22 pct)
>  1024    33432.92 (0.00 pct)     40900.84 (22.33 pct)    30953.61 (-7.41 pct)    32006.81 (-4.26 pct)
> 
> o NPS2
> 
> Clients:      tip                     SIS_NODE             SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
>     1    453.73 (0.00 pct)       451.63 (-0.46 pct)      455.97 (0.49 pct)       453.79 (0.01 pct)
>     2    861.71 (0.00 pct)       857.85 (-0.44 pct)      868.30 (0.76 pct)       850.14 (-1.34 pct)
>     4    1599.14 (0.00 pct)      1609.30 (0.63 pct)      1656.08 (3.56 pct)      1619.10 (1.24 pct)
>     8    2951.03 (0.00 pct)      2944.71 (-0.21 pct)     3034.38 (2.82 pct)      2973.52 (0.76 pct)
>    16    5080.32 (0.00 pct)      5160.39 (1.57 pct)      5173.32 (1.83 pct)      5150.99 (1.39 pct)
>    32    7900.41 (0.00 pct)      8039.13 (1.75 pct)      8105.69 (2.59 pct)      7956.45 (0.70 pct)
>    64    14629.65 (0.00 pct)     15391.08 (5.20 pct)     14546.09 (-0.57 pct)    15410.41 (5.33 pct)
>   128    23155.88 (0.00 pct)     24015.45 (3.71 pct)     24263.82 (4.78 pct)     23351.35 (0.84 pct)
>   256    33449.57 (0.00 pct)     33571.08 (0.36 pct)     32048.20 (-4.18 pct)    32869.85 (-1.73 pct)
>   512    33757.47 (0.00 pct)     39872.69 (18.11 pct)    32945.66 (-2.40 pct)    34526.17 (2.27 pct)
>  1024    34823.14 (0.00 pct)     41090.15 (17.99 pct)    32404.40 (-6.94 pct)    34522.97 (-0.86 pct)
> 
> o NPS4
> 
> Clients:      tip                     SIS_NODE             SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
>     1    450.14 (0.00 pct)       454.46 (0.95 pct)       454.53 (0.97 pct)       451.43 (0.28 pct)
>     2    863.26 (0.00 pct)       868.94 (0.65 pct)       891.89 (3.31 pct)       866.74 (0.40 pct)
>     4    1618.71 (0.00 pct)      1599.13 (-1.20 pct)     1630.29 (0.71 pct)      1610.08 (-0.53 pct)
>     8    2929.35 (0.00 pct)      3065.12 (4.63 pct)      3064.15 (4.60 pct)      3004.74 (2.57 pct)
>    16    5114.04 (0.00 pct)      5261.40 (2.88 pct)      5238.04 (2.42 pct)      5108.53 (-0.10 pct)
>    32    7912.18 (0.00 pct)      8926.77 (12.82 pct)     8382.51 (5.94 pct)      8214.73 (3.82 pct)
>    64    14424.72 (0.00 pct)     14853.61 (2.97 pct)     14273.54 (-1.04 pct)    14430.17 (0.03 pct)
>   128    23614.97 (0.00 pct)     24506.73 (3.77 pct)     24517.76 (3.82 pct)     23296.38 (-1.34 pct)
>   256    34365.13 (0.00 pct)     35538.42 (3.41 pct)     31909.66 (-7.14 pct)    31009.12 (-9.76 pct)
>   512    34215.50 (0.00 pct)     36017.49 (5.26 pct)     32696.70 (-4.43 pct)    33262.55 (-2.78 pct)
>  1024    35421.90 (0.00 pct)     35193.81 (-0.64 pct)    32611.10 (-7.93 pct)    32795.86 (-7.41 pct)

tbench likes NODE

> ~~~~~~~~~~
> ~ stream ~
> ~~~~~~~~~~
> 
> - 10 Runs
> 
> o NPS1
> 
> Test:         tip                     SIS_NODE             SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
>  Copy:   271317.35 (0.00 pct)    292440.22 (7.78 pct)    302540.26 (11.50 pct)   287277.25 (5.88 pct)
> Scale:   205533.77 (0.00 pct)    203362.60 (-1.05 pct)   207750.30 (1.07 pct)    205206.26 (-0.15 pct)
>   Add:   221624.62 (0.00 pct)    225850.83 (1.90 pct)    233782.14 (5.48 pct)    229774.48 (3.67 pct)
> Triad:   228500.68 (0.00 pct)    225885.25 (-1.14 pct)   238331.69 (4.30 pct)    240041.53 (5.05 pct)
> 
> o NPS2
> 
> Test:         tip                     SIS_NODE             SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
>  Copy:   277761.29 (0.00 pct)    301816.34 (8.66 pct)    293563.58 (5.68 pct)    308218.80 (10.96 pct)
> Scale:   215193.83 (0.00 pct)    212522.72 (-1.24 pct)   215758.66 (0.26 pct)    205678.94 (-4.42 pct)
>   Add:   242725.75 (0.00 pct)    242695.13 (-0.01 pct)   246472.20 (1.54 pct)    238089.46 (-1.91 pct)
> Triad:   237253.44 (0.00 pct)    250618.57 (5.63 pct)    239405.55 (0.90 pct)    249652.73 (5.22 pct)
> 
> o NPS4
> 
> Test:         tip                     SIS_NODE             SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
>  Copy:   273307.14 (0.00 pct)    255091.78 (-6.66 pct)   301926.68 (10.47 pct)   262007.26 (-4.13 pct)
> Scale:   235715.23 (0.00 pct)    222018.36 (-5.81 pct)   224881.52 (-4.59 pct)   222282.64 (-5.69 pct)
>   Add:   244500.40 (0.00 pct)    230468.21 (-5.73 pct)   242625.18 (-0.76 pct)   227146.80 (-7.09 pct)
> Triad:   250600.04 (0.00 pct)    236229.50 (-5.73 pct)   258064.49 (2.97 pct)    231772.02 (-7.51 pct)
> 
> - 100 Runs
> 
> o NPS1
> 
> Test:         tip                     SIS_NODE             SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
>  Copy:   317381.65 (0.00 pct)    318827.08 (0.45 pct)    320898.32 (1.10 pct)    318922.96 (0.48 pct)
> Scale:   214145.00 (0.00 pct)    206213.69 (-3.70 pct)   211019.12 (-1.45 pct)   210384.47 (-1.75 pct)
>   Add:   239243.29 (0.00 pct)    229791.67 (-3.95 pct)   233827.11 (-2.26 pct)   236659.48 (-1.07 pct)
> Triad:   249477.76 (0.00 pct)    236843.06 (-5.06 pct)   244688.91 (-1.91 pct)   235990.67 (-5.40 pct)
> 
> o NPS2
> 
> Test:         tip                     SIS_NODE             SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
>  Copy:   318082.10 (0.00 pct)    322844.91 (1.49 pct)    310350.21 (-2.43 pct)   322495.84 (1.38 pct)
> Scale:   219338.56 (0.00 pct)    218139.90 (-0.54 pct)   212288.47 (-3.21 pct)   221040.27 (0.77 pct)
>   Add:   248118.20 (0.00 pct)    249826.98 (0.68 pct)    239682.55 (-3.39 pct)   253006.79 (1.97 pct)
> Triad:   247088.55 (0.00 pct)    260488.38 (5.42 pct)    247892.42 (0.32 pct)    249081.33 (0.80 pct)
> 
> o NPS4
> 
> Test:         tip                     SIS_NODE             SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
>  Copy:   345396.19 (0.00 pct)    343675.74 (-0.49 pct)   346990.96 (0.46 pct)    334677.55 (-3.10 pct)
> Scale:   241521.63 (0.00 pct)    231494.70 (-4.15 pct)   236233.18 (-2.18 pct)   229159.01 (-5.11 pct)
>   Add:   261157.86 (0.00 pct)    249663.86 (-4.40 pct)   253402.85 (-2.96 pct)   242257.98 (-7.23 pct)
> Triad:   267804.99 (0.00 pct)    263071.00 (-1.76 pct)   264208.15 (-1.34 pct)   256978.50 (-4.04 pct)

Again, the NPS4 results are weird.

> ~~~~~~~~~~~
> ~ netperf ~
> ~~~~~~~~~~~
> 
> o NPS1
> 
>                         tip                  SIS_NODE              SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
> 1-clients:       102839.97 (0.00 pct)    103540.33 (0.68 pct)    103769.74 (0.90 pct)    103271.77 (0.41 pct)
> 2-clients:       98428.08 (0.00 pct)     100431.67 (2.03 pct)    100555.62 (2.16 pct)    100417.11 (2.02 pct)
> 4-clients:       92298.45 (0.00 pct)     94800.51 (2.71 pct)     93706.09 (1.52 pct)     94981.10 (2.90 pct)
> 8-clients:       85618.41 (0.00 pct)     89130.14 (4.10 pct)     87677.84 (2.40 pct)     88284.61 (3.11 pct)
> 16-clients:      78722.18 (0.00 pct)     79715.38 (1.26 pct)     80488.76 (2.24 pct)     78980.88 (0.32 pct)
> 32-clients:      73610.75 (0.00 pct)     72801.41 (-1.09 pct)    72167.43 (-1.96 pct)    75077.55 (1.99 pct)
> 64-clients:      55285.07 (0.00 pct)     56184.38 (1.62 pct)     56443.79 (2.09 pct)     60689.05 (9.77 pct)
> 128-clients:     31176.92 (0.00 pct)     32830.06 (5.30 pct)     35511.93 (13.90 pct)    35638.50 (14.31 pct)
> 256-clients:     20011.44 (0.00 pct)     15135.39 (-24.36 pct)   17599.21 (-12.05 pct)   18219.29 (-8.95 pct)
> 
> o NPS2
> 
>                         tip                  SIS_NODE              SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
> 1-clients:       103105.55 (0.00 pct)    101582.75 (-1.47 pct)   103077.22 (-0.02 pct)   102233.63 (-0.84 pct)
> 2-clients:       98720.29 (0.00 pct)     98537.46 (-0.18 pct)    100761.54 (2.06 pct)    99211.39 (0.49 pct)
> 4-clients:       92289.39 (0.00 pct)     94332.45 (2.21 pct)     93622.46 (1.44 pct)     93321.77 (1.11 pct)
> 8-clients:       84998.63 (0.00 pct)     87180.90 (2.56 pct)     86970.84 (2.32 pct)     86076.75 (1.26 pct)
> 16-clients:      76395.81 (0.00 pct)     80017.06 (4.74 pct)     77937.29 (2.01 pct)     75090.85 (-1.70 pct)
> 32-clients:      71110.89 (0.00 pct)     69445.86 (-2.34 pct)    69273.81 (-2.58 pct)    66885.99 (-5.94 pct)
> 64-clients:      49526.21 (0.00 pct)     50004.13 (0.96 pct)     51649.09 (4.28 pct)     51100.52 (3.17 pct)
> 128-clients:     27917.51 (0.00 pct)     30581.70 (9.54 pct)     31587.40 (13.14 pct)    33477.65 (19.91 pct)
> 256-clients:     20067.17 (0.00 pct)     26002.42 (29.57 pct)    18681.28 (-6.90 pct)    18144.96 (-9.57 pct)
> 
> o NPS4
> 
>                         tip                  SIS_NODE              SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
> 1-clients:       102139.49 (0.00 pct)    103578.02 (1.40 pct)    103633.90 (1.46 pct)    101656.07 (-0.47 pct)
> 2-clients:       98259.53 (0.00 pct)     99336.70 (1.09 pct)     99720.37 (1.48 pct)     98812.86 (0.56 pct)
> 4-clients:       91576.79 (0.00 pct)     95278.30 (4.04 pct)     93688.37 (2.30 pct)     93848.94 (2.48 pct)
> 8-clients:       84742.30 (0.00 pct)     89005.65 (5.03 pct)     87703.04 (3.49 pct)     86709.29 (2.32 pct)
> 16-clients:      79540.75 (0.00 pct)     85478.97 (7.46 pct)     83195.92 (4.59 pct)     81016.24 (1.85 pct)
> 32-clients:      71166.14 (0.00 pct)     74254.01 (4.33 pct)     72422.76 (1.76 pct)     71391.62 (0.31 pct)
> 64-clients:      51763.24 (0.00 pct)     52565.56 (1.54 pct)     55159.65 (6.56 pct)     52472.91 (1.37 pct)
> 128-clients:     27829.29 (0.00 pct)     35774.61 (28.55 pct)    33738.97 (21.23 pct)    34564.10 (24.20 pct)
> 256-clients:     24185.37 (0.00 pct)     27215.35 (12.52 pct)    17675.87 (-26.91 pct)   24937.66 (3.11 pct)

NPS4 is weird again, but mostly wins.

Based on the NPS1 results I'd say this one goes to TOPO

> ~~~~~~~~~~~~~~~~
> ~ ycsb-mongodb ~
> ~~~~~~~~~~~~~~~~
> 
> o NPS1
> 
> tip:			131070.33 (var: 2.84%)
> SIS_NODE:		131070.33 (var: 2.84%) (0.00%)
> SIS_NODE_LIMIT:		137227.00 (var: 4.97%) (4.69%)
> SIS_NODE_TOPOEXT:	133529.67 (var: 0.98%) (1.87%)
> 
> o NPS2
> 
> tip:			133693.67 (var: 1.69%)
> SIS_NODE:		134173.00 (var: 4.07%) (0.35%)
> SIS_NODE_LIMIT:		134124.67 (var: 2.20%) (0.32%)
> SIS_NODE_TOPOEXT:	133747.33 (var: 2.49%) (0.04%)
> 
> o NPS4
> 
> tip:			132913.67 (var: 1.97%)
> SIS_NODE:		133697.33 (var: 1.69%) (0.58%)
> SIS_NODE_LIMIT:		133307.33 (var: 1.03%) (0.29%)
> SIS_NODE_TOPOEXT:	133426.67 (var: 3.60%) (0.38%)
> 
> ~~~~~~~~~~~~~
> ~ unixbench ~
> ~~~~~~~~~~~~~
> 
> o NPS1
> 
> kernel                        			tip                  SIS_NODE               SIS_NODE_LIMIT            SIS_NODE_TOPOEXT
> Hmean     unixbench-dhry2reg-1   	  41322625.19 (   0.00%)    41224388.33 (  -0.24%)    41142898.66 (  -0.43%)    41222168.97 (  -0.24%)
> Hmean     unixbench-dhry2reg-512	6252491108.60 (   0.00%)  6240160851.68 (  -0.20%)  6262714194.10 (   0.16%)  6259553403.67 (   0.11%)
> Amean     unixbench-syscall-1    	   2501398.27 (   0.00%)    2577323.43 *  -3.04%*      2498697.20 (   0.11%)     2541279.77 *  -1.59%*
> Amean     unixbench-syscall-512  	   8120524.00 (   0.00%)    7512955.87 *   7.48%*      7447849.67 *   8.28%*     7477129.17 *   7.92%*
> Hmean     unixbench-pipe-1    		   2359346.02 (   0.00%)    2392308.62 *   1.40%*      2407625.04 *   2.05%*     2334146.94 *  -1.07%*
> Hmean     unixbench-pipe-512		 338790322.61 (   0.00%)  337711432.92 (  -0.32%)    340399941.24 (   0.48%)   339008490.26 (   0.06%)
> Hmean     unixbench-spawn-1       	      4261.52 (   0.00%)       4164.90 (  -2.27%)         4929.26 *  15.67%*        5111.16 *  19.94%*
> Hmean     unixbench-spawn-512    	     64328.93 (   0.00%)      62257.64 *  -3.22%*        63740.04 *  -0.92%*       63291.18 *  -1.61%*
> Hmean     unixbench-execl-1       	      3677.73 (   0.00%)       3652.08 (  -0.70%)         3642.56 *  -0.96%*        3671.98 (  -0.16%)
> Hmean     unixbench-execl-512    	     11984.83 (   0.00%)      13585.65 *  13.36%*        12496.80 (   4.27%)       12306.01 (   2.68%)
> 
> o NPS2
> 
> kernel                        			tip                  SIS_NODE               SIS_NODE_LIMIT            SIS_NODE_TOPOEXT
> Hmean     unixbench-dhry2reg-1   	  41311787.29 (   0.00%)    41412946.27 (   0.24%)    41035150.98 (  -0.67%)    41371003.93 (   0.14%)
> Hmean     unixbench-dhry2reg-512	6243873272.76 (   0.00%)  6256893083.32 (   0.21%)  6236751880.89 (  -0.11%)  6235047089.83 (  -0.14%)
> Amean     unixbench-syscall-1    	   2503190.70 (   0.00%)     2576854.30 *  -2.94%*     2496464.80 *   0.27%*     2540298.77 *  -1.48%*
> Amean     unixbench-syscall-512  	   8012388.13 (   0.00%)     7503196.87 *   6.36%*     7493284.60 *   6.48%*     7495117.73 *   6.46%*
> Hmean     unixbench-pipe-1    		   2340486.25 (   0.00%)     2388946.63 (   2.07%)     2412344.33 *   3.07%*     2360277.30 (   0.85%)
> Hmean     unixbench-pipe-512		 338965319.79 (   0.00%)   337225630.07 (  -0.51%)   339053027.04 (   0.03%)   336939353.18 *  -0.60%*
> Hmean     unixbench-spawn-1      	      5241.83 (   0.00%)        5246.00 (   0.08%)        4718.45 *  -9.98%*        4967.96 *  -5.22%*
> Hmean     unixbench-spawn-512    	     65799.86 (   0.00%)       64817.15 *  -1.49%*       66418.37 (   0.94%)       66820.63 *   1.55%*
> Hmean     unixbench-execl-1       	      3670.65 (   0.00%)        3622.36 *  -1.32%*        3661.04 (  -0.26%)        3660.08 (  -0.29%)
> Hmean     unixbench-execl-512    	     13682.00 (   0.00%)       13699.90 (   0.13%)       14103.91 (   3.08%)       12960.11 (  -5.28%)
> 
> o NPS4
> 
> kernel                        			tip                  SIS_NODE               SIS_NODE_LIMIT            SIS_NODE_TOPOEXT
> Hmean     unixbench-dhry2reg-1   	  41025577.99 (   0.00%)    40879469.78 (  -0.36%)    41082700.61 (   0.14%)    41260407.54 (   0.57%)
> Hmean     unixbench-dhry2reg-512	6255568261.91 (   0.00%)  6258326086.80 (   0.04%)  6252223940.32 (  -0.05%)  6259088809.43 (   0.06%)
> Amean     unixbench-syscall-1    	   2507165.37 (   0.00%)    2579108.77 *  -2.87%*      2488617.40 *   0.74%*     2517574.40 (  -0.42%)
> Amean     unixbench-syscall-512  	   7458476.50 (   0.00%)    7502528.67 *  -0.59%*      7978379.53 *  -6.97%*     7580369.27 *  -1.63%*
> Hmean     unixbench-pipe-1    		   2369301.21 (   0.00%)    2392905.29 *   1.00%*      2410432.93 *   1.74%*     2347814.20 (  -0.91%)
> Hmean     unixbench-pipe-512		 340299405.72 (   0.00%)  339139980.01 *  -0.34%*    340403992.95 (   0.03%)   338708678.82 *  -0.47%*
> Hmean     unixbench-spawn-1      	      5571.78 (   0.00%)       5423.03 (  -2.67%)         5462.82 (  -1.96%)        5543.08 (  -0.52%)
> Hmean     unixbench-spawn-512   	     63999.96 (   0.00%)      63485.41 (  -0.80%)        64730.98 *   1.14%*       67486.34 *   5.45%*
> Hmean     unixbench-execl-1       	      3587.15 (   0.00%)       3624.44 *   1.04%*         3638.74 *   1.44%*        3639.57 *   1.46%*
> Hmean     unixbench-execl-512    	     14184.17 (   0.00%)      13784.17 (  -2.82%)        13104.71 *  -7.61%*       13598.22 (  -4.13%)
> 
> ~~~~~~~~~~~~~~~~~~
> ~ DeathStarBench ~
> ~~~~~~~~~~~~~~~~~~
> 
> o NPS1
> CCD	Scaling	       tip    SIS_NODE 		SIS_NODE_LIMIT 		SIS_NODE_TOPOEXT
> 1	1		0%      0.30%  		 	0.83%  		 	0.79%
> 1	1		0%      0.17%  		 	2.53%  		 	0.91%
> 1	1		0%      -0.40% 		 	2.90%  		 	1.61%
> 1	1		0%      -7.95% 		 	1.19%  		 	-1.56%
> 
> o NPS2
> 
> CCD	Scaling	       tip    SIS_NODE 		SIS_NODE_LIMIT 		SIS_NODE_TOPOEXT
> 1	1		0%      0.34%  		 	-0.73% 		 	-0.62%
> 1	1		0%      -0.02% 		 	0.14%  		 	-1.15%
> 1	1		0%      -12.34%		 	-9.64% 		 	-7.80%
> 1	1		0%      -12.41%		 	-1.03% 		 	-9.85%
> 
> Note: In NPS2, 8 CCD case shows 10% run to run variation.
> 
> o NPS4
> 
> CCD	Scaling	       tip    SIS_NODE 		SIS_NODE_LIMIT 		SIS_NODE_TOPOEXT
> 1	1		0%      -1.32% 		 	-0.71% 		 	-1.09%
> 1	1		0%      -1.53% 		 	-1.11% 		 	-1.73%
> 1	1		0%      7.19%  		 	-3.47% 		 	5.75%
> 1	1		0%      -4.66% 		 	-1.91% 		 	-7.52%

LIMIT seems to do well for the NPS1 case, but how come it falls apart
for NPS2?!? That doesn't really make sense, does it?

And again NPS4 is all over the place :/

> 
> --
> If you would like me to collect any more information during any of the
> above benchmark runs, please let me know.

dizzy with numbers ....

Perhaps see if you can figure out why NPS4 is so weird, there's only 2
CCXs to go around per node on that thing, the various results should not
be all over the map.

Perhaps pick hackbench since it shows the problem and is easy and quick
to run?

Also, can you share the TOPOEXT code?
  
K Prateek Nayak June 13, 2023, 10:30 a.m. UTC | #26
Hello Peter,

On 6/13/2023 1:55 PM, Peter Zijlstra wrote:
> On Thu, Jun 08, 2023 at 12:02:15AM +0530, K Prateek Nayak wrote:
>> Hello Peter,
>>
>> Below are the benchmark results on different NPS modes for SIS_NODE
>> and SIS_NODE + additional suggested changes. None of them give a
>> total win. Limit helps but there are cases where it still leads to
>> regression. I'll leave full details below.
>>
>> On 6/2/2023 12:24 PM, Peter Zijlstra wrote:
>>> On Fri, Jun 02, 2023 at 10:43:37AM +0530, K Prateek Nayak wrote:
>>>> Grouping near-CCX for the offerings that do not have 2CCX per CCD will
>>>> prevent degeneration and limit the search scope, yes. Here is what I'll
>>>> do, let me check if limiting search scope helps first, and then start
>>>> fiddling with the topology. How does that sound?
>>>
>>> So my preference would be the topology based solution, since the search
>>> limit is random magic numbers that happen to work for 'your' machine but
>>> who knows what it'll do for some other poor architecture that happens to
>>> trip this.
>>>
>>> That said; verifying the limit helps at all is of course a good start,
>>> because if it doesn't then the topology thing will likely also not help
>>> much.
>>
>> o NPS Modes
>>
>> NPS Modes are used to logically divide single socket into
>> multiple NUMA region.
>> Following is the NUMA configuration for each NPS mode on the system:
>>
>> NPS1: Each socket is a NUMA node.
>>     Total 2 NUMA nodes in the dual socket machine.
>>
>>     Node 0: 0-63,   128-191
>>     Node 1: 64-127, 192-255
>>
>>     - 8CCX per node
> 
> Ok, so this is a dual-socket Zen3 with 64 cores per socket, right?

Yup!

> 
> 
>> o Kernel Versions
>>
>> - tip              - tip:sched/core at commit e2a1f85bf9f5 "sched/psi:
>>                      Avoid resetting the min update period when it is
>>                      unnecessary")
>>
>> - SIS_NODE         - tip:sched/core + this patch
>>
>> - SIS_NODE_LIMIT   - tip:sched/core + this patch + nr=4 limit for SIS_NODE
>> 		     (https://lore.kernel.org/all/20230601111326.GV4253@hirez.programming.kicks-ass.net/)
>>
>> - SIS_NODE_TOPOEXT - tip:sched/core + this patch
>>                      + new sched domain (Multi-Multi-Core or MMC)
>> 		     (https://lore.kernel.org/all/20230601153522.GB559993@hirez.programming.kicks-ass.net/)
>> 		     MMC domain groups 2 nearby CCX.
> 
> OK, so you managed to get the NPS4 topology in NPS1 mode?

Yup! But it is a hack. I'll leave the patch at the end.

> 
>> o Benchmark Results
>>
>> Note: All benchmarks were run with boost enabled and C2 disabled.
>>
>> ~~~~~~~~~~~~~
>> ~ hackbench ~
>> ~~~~~~~~~~~~~
>>
>> o NPS1
>>
>> Test:                   tip                     SIS_NODE           SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
>>  1-groups:         3.92 (0.00 pct)         4.05 (-3.31 pct)        3.78 (3.57 pct)         3.77 (3.82 pct)
>>  2-groups:         4.58 (0.00 pct)         3.84 (16.15 pct)        4.50 (1.74 pct)         4.34 (5.24 pct)
>>  4-groups:         4.99 (0.00 pct)         3.98 (20.24 pct)        4.93 (1.20 pct)         5.01 (-0.40 pct)
>>  8-groups:         5.67 (0.00 pct)         6.05 (-6.70 pct)        5.73 (-1.05 pct)        5.95 (-4.93 pct)
>> 16-groups:         7.88 (0.00 pct)        10.56 (-34.01 pct)       7.83 (0.63 pct)         8.04 (-2.03 pct)
>>
>> o NPS2
>>
>> Test:                   tip                     SIS_NODE           SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
>>  1-groups:         3.82 (0.00 pct)         3.68 (3.66 pct)         3.87 (-1.30 pct)        3.74 (2.09 pct)
>>  2-groups:         4.40 (0.00 pct)         3.61 (17.95 pct)        4.45 (-1.13 pct)        4.30 (2.27 pct)
>>  4-groups:         4.84 (0.00 pct)         3.62 (25.20 pct)        4.84 (0.00 pct)         4.97 (-2.68 pct)
>>  8-groups:         5.45 (0.00 pct)         6.14 (-12.66 pct)       5.40 (0.91 pct)         5.68 (-4.22 pct)
>> 16-groups:         6.94 (0.00 pct)         8.77 (-26.36 pct)       6.57 (5.33 pct)         7.87 (-13.40 pct)
>>
>> o NPS4
>>
>> Test:                   tip                     SIS_NODE           SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
>>  1-groups:         3.82 (0.00 pct)         3.84 (-0.52 pct)        3.83 (-0.26 pct)        3.85 (-0.78 pct)
>>  2-groups:         4.44 (0.00 pct)         4.15 (6.53 pct)         4.43 (0.22 pct)         4.18 (5.85 pct)
>>  4-groups:         4.86 (0.00 pct)         4.95 (-1.85 pct)        4.88 (-0.41 pct)        4.79 (1.44 pct)
>>  8-groups:         5.42 (0.00 pct)         5.80 (-7.01 pct)        5.41 (0.18 pct)         5.75 (-6.08 pct)
>> 16-groups:         6.68 (0.00 pct)         9.07 (-35.77 pct)       6.72 (-0.59 pct)        8.66 (-29.64 pct)
> 
> Win for NODE_LIMIT for having the least regressions, but also no real
> gains.
> 
> Given NODE_TOPO does NPS4 that should be roughly similar to limit=2 it
> should do 'better' but it doesn't, it's markedly worse... weird.
> 
> In fact, none of the NPS4 numbers make any sense, if you've already
> split the whole thing into 4, you remain with 2 CCXs per node and
> NODE should be NODE_LIMIT should be NODE_TOPO.
> 
> All the NODE variants should end up scanning both CCXs and performance
> should really be the same.
> 
> Something's wrong there.

Yup! I'm rerunning SIS_NODE_LIMIT because the numbers are completely off.
Possibly an error on my part when applying the patch.

> 
> 
>> ~~~~~~~~~~
>> ~ tbench ~
>> ~~~~~~~~~~
>>
>> o NPS1
>>
>> Clients:      tip                     SIS_NODE             SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
>>     1    452.49 (0.00 pct)       457.94 (1.20 pct)       458.13 (1.24 pct)       447.69 (-1.06 pct)
>>     2    862.44 (0.00 pct)       879.99 (2.03 pct)       881.19 (2.17 pct)       855.91 (-0.75 pct)
>>     4    1604.27 (0.00 pct)      1618.87 (0.91 pct)      1628.00 (1.47 pct)      1627.14 (1.42 pct)
>>     8    2966.77 (0.00 pct)      3040.90 (2.49 pct)      3037.70 (2.39 pct)      2957.91 (-0.29 pct)
>>    16    5176.70 (0.00 pct)      5292.29 (2.23 pct)      5445.15 (5.18 pct)      5241.61 (1.25 pct)
>>    32    8205.24 (0.00 pct)      8949.12 (9.06 pct)      8716.02 (6.22 pct)      8494.17 (3.52 pct)
>>    64    13956.71 (0.00 pct)     14461.42 (3.61 pct)     13620.04 (-2.41 pct)    15045.43 (7.80 pct)
>>   128    24005.50 (0.00 pct)     26052.75 (8.52 pct)     24975.03 (4.03 pct)     24008.73 (0.01 pct)
>>   256    32457.61 (0.00 pct)     21999.41 (-32.22 pct)   30810.93 (-5.07 pct)    31060.12 (-4.30 pct)
>>   512    34345.24 (0.00 pct)     41166.39 (19.86 pct)    30982.94 (-9.78 pct)    31864.14 (-7.22 pct)
>>  1024    33432.92 (0.00 pct)     40900.84 (22.33 pct)    30953.61 (-7.41 pct)    32006.81 (-4.26 pct)
>>
>> o NPS2
>>
>> Clients:      tip                     SIS_NODE             SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
>>     1    453.73 (0.00 pct)       451.63 (-0.46 pct)      455.97 (0.49 pct)       453.79 (0.01 pct)
>>     2    861.71 (0.00 pct)       857.85 (-0.44 pct)      868.30 (0.76 pct)       850.14 (-1.34 pct)
>>     4    1599.14 (0.00 pct)      1609.30 (0.63 pct)      1656.08 (3.56 pct)      1619.10 (1.24 pct)
>>     8    2951.03 (0.00 pct)      2944.71 (-0.21 pct)     3034.38 (2.82 pct)      2973.52 (0.76 pct)
>>    16    5080.32 (0.00 pct)      5160.39 (1.57 pct)      5173.32 (1.83 pct)      5150.99 (1.39 pct)
>>    32    7900.41 (0.00 pct)      8039.13 (1.75 pct)      8105.69 (2.59 pct)      7956.45 (0.70 pct)
>>    64    14629.65 (0.00 pct)     15391.08 (5.20 pct)     14546.09 (-0.57 pct)    15410.41 (5.33 pct)
>>   128    23155.88 (0.00 pct)     24015.45 (3.71 pct)     24263.82 (4.78 pct)     23351.35 (0.84 pct)
>>   256    33449.57 (0.00 pct)     33571.08 (0.36 pct)     32048.20 (-4.18 pct)    32869.85 (-1.73 pct)
>>   512    33757.47 (0.00 pct)     39872.69 (18.11 pct)    32945.66 (-2.40 pct)    34526.17 (2.27 pct)
>>  1024    34823.14 (0.00 pct)     41090.15 (17.99 pct)    32404.40 (-6.94 pct)    34522.97 (-0.86 pct)
>>
>> o NPS4
>>
>> Clients:      tip                     SIS_NODE             SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
>>     1    450.14 (0.00 pct)       454.46 (0.95 pct)       454.53 (0.97 pct)       451.43 (0.28 pct)
>>     2    863.26 (0.00 pct)       868.94 (0.65 pct)       891.89 (3.31 pct)       866.74 (0.40 pct)
>>     4    1618.71 (0.00 pct)      1599.13 (-1.20 pct)     1630.29 (0.71 pct)      1610.08 (-0.53 pct)
>>     8    2929.35 (0.00 pct)      3065.12 (4.63 pct)      3064.15 (4.60 pct)      3004.74 (2.57 pct)
>>    16    5114.04 (0.00 pct)      5261.40 (2.88 pct)      5238.04 (2.42 pct)      5108.53 (-0.10 pct)
>>    32    7912.18 (0.00 pct)      8926.77 (12.82 pct)     8382.51 (5.94 pct)      8214.73 (3.82 pct)
>>    64    14424.72 (0.00 pct)     14853.61 (2.97 pct)     14273.54 (-1.04 pct)    14430.17 (0.03 pct)
>>   128    23614.97 (0.00 pct)     24506.73 (3.77 pct)     24517.76 (3.82 pct)     23296.38 (-1.34 pct)
>>   256    34365.13 (0.00 pct)     35538.42 (3.41 pct)     31909.66 (-7.14 pct)    31009.12 (-9.76 pct)
>>   512    34215.50 (0.00 pct)     36017.49 (5.26 pct)     32696.70 (-4.43 pct)    33262.55 (-2.78 pct)
>>  1024    35421.90 (0.00 pct)     35193.81 (-0.64 pct)    32611.10 (-7.93 pct)    32795.86 (-7.41 pct)
> 
> tbench likes NODE
> 
>> ~~~~~~~~~~
>> ~ stream ~
>> ~~~~~~~~~~
>>
>> - 10 Runs
>>
>> o NPS1
>>
>> Test:         tip                     SIS_NODE             SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
>>  Copy:   271317.35 (0.00 pct)    292440.22 (7.78 pct)    302540.26 (11.50 pct)   287277.25 (5.88 pct)
>> Scale:   205533.77 (0.00 pct)    203362.60 (-1.05 pct)   207750.30 (1.07 pct)    205206.26 (-0.15 pct)
>>   Add:   221624.62 (0.00 pct)    225850.83 (1.90 pct)    233782.14 (5.48 pct)    229774.48 (3.67 pct)
>> Triad:   228500.68 (0.00 pct)    225885.25 (-1.14 pct)   238331.69 (4.30 pct)    240041.53 (5.05 pct)
>>
>> o NPS2
>>
>> Test:         tip                     SIS_NODE             SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
>>  Copy:   277761.29 (0.00 pct)    301816.34 (8.66 pct)    293563.58 (5.68 pct)    308218.80 (10.96 pct)
>> Scale:   215193.83 (0.00 pct)    212522.72 (-1.24 pct)   215758.66 (0.26 pct)    205678.94 (-4.42 pct)
>>   Add:   242725.75 (0.00 pct)    242695.13 (-0.01 pct)   246472.20 (1.54 pct)    238089.46 (-1.91 pct)
>> Triad:   237253.44 (0.00 pct)    250618.57 (5.63 pct)    239405.55 (0.90 pct)    249652.73 (5.22 pct)
>>
>> o NPS4
>>
>> Test:         tip                     SIS_NODE             SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
>>  Copy:   273307.14 (0.00 pct)    255091.78 (-6.66 pct)   301926.68 (10.47 pct)   262007.26 (-4.13 pct)
>> Scale:   235715.23 (0.00 pct)    222018.36 (-5.81 pct)   224881.52 (-4.59 pct)   222282.64 (-5.69 pct)
>>   Add:   244500.40 (0.00 pct)    230468.21 (-5.73 pct)   242625.18 (-0.76 pct)   227146.80 (-7.09 pct)
>> Triad:   250600.04 (0.00 pct)    236229.50 (-5.73 pct)   258064.49 (2.97 pct)    231772.02 (-7.51 pct)
>>
>> - 100 Runs
>>
>> o NPS1
>>
>> Test:         tip                     SIS_NODE             SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
>>  Copy:   317381.65 (0.00 pct)    318827.08 (0.45 pct)    320898.32 (1.10 pct)    318922.96 (0.48 pct)
>> Scale:   214145.00 (0.00 pct)    206213.69 (-3.70 pct)   211019.12 (-1.45 pct)   210384.47 (-1.75 pct)
>>   Add:   239243.29 (0.00 pct)    229791.67 (-3.95 pct)   233827.11 (-2.26 pct)   236659.48 (-1.07 pct)
>> Triad:   249477.76 (0.00 pct)    236843.06 (-5.06 pct)   244688.91 (-1.91 pct)   235990.67 (-5.40 pct)
>>
>> o NPS2
>>
>> Test:         tip                     SIS_NODE             SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
>>  Copy:   318082.10 (0.00 pct)    322844.91 (1.49 pct)    310350.21 (-2.43 pct)   322495.84 (1.38 pct)
>> Scale:   219338.56 (0.00 pct)    218139.90 (-0.54 pct)   212288.47 (-3.21 pct)   221040.27 (0.77 pct)
>>   Add:   248118.20 (0.00 pct)    249826.98 (0.68 pct)    239682.55 (-3.39 pct)   253006.79 (1.97 pct)
>> Triad:   247088.55 (0.00 pct)    260488.38 (5.42 pct)    247892.42 (0.32 pct)    249081.33 (0.80 pct)
>>
>> o NPS4
>>
>> Test:         tip                     SIS_NODE             SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
>>  Copy:   345396.19 (0.00 pct)    343675.74 (-0.49 pct)   346990.96 (0.46 pct)    334677.55 (-3.10 pct)
>> Scale:   241521.63 (0.00 pct)    231494.70 (-4.15 pct)   236233.18 (-2.18 pct)   229159.01 (-5.11 pct)
>>   Add:   261157.86 (0.00 pct)    249663.86 (-4.40 pct)   253402.85 (-2.96 pct)   242257.98 (-7.23 pct)
>> Triad:   267804.99 (0.00 pct)    263071.00 (-1.76 pct)   264208.15 (-1.34 pct)   256978.50 (-4.04 pct)
> 
> Again, the NPS4 results are weird.

The NPS4 numbers are more or less within the run-to-run variation window.
I think the couple of drops for SIS_NODE_TOPOEXT are just from a bad run.
Will rerun.

> 
>> ~~~~~~~~~~~
>> ~ netperf ~
>> ~~~~~~~~~~~
>>
>> o NPS1
>>
>>                         tip                  SIS_NODE              SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
>> 1-clients:       102839.97 (0.00 pct)    103540.33 (0.68 pct)    103769.74 (0.90 pct)    103271.77 (0.41 pct)
>> 2-clients:       98428.08 (0.00 pct)     100431.67 (2.03 pct)    100555.62 (2.16 pct)    100417.11 (2.02 pct)
>> 4-clients:       92298.45 (0.00 pct)     94800.51 (2.71 pct)     93706.09 (1.52 pct)     94981.10 (2.90 pct)
>> 8-clients:       85618.41 (0.00 pct)     89130.14 (4.10 pct)     87677.84 (2.40 pct)     88284.61 (3.11 pct)
>> 16-clients:      78722.18 (0.00 pct)     79715.38 (1.26 pct)     80488.76 (2.24 pct)     78980.88 (0.32 pct)
>> 32-clients:      73610.75 (0.00 pct)     72801.41 (-1.09 pct)    72167.43 (-1.96 pct)    75077.55 (1.99 pct)
>> 64-clients:      55285.07 (0.00 pct)     56184.38 (1.62 pct)     56443.79 (2.09 pct)     60689.05 (9.77 pct)
>> 128-clients:     31176.92 (0.00 pct)     32830.06 (5.30 pct)     35511.93 (13.90 pct)    35638.50 (14.31 pct)
>> 256-clients:     20011.44 (0.00 pct)     15135.39 (-24.36 pct)   17599.21 (-12.05 pct)   18219.29 (-8.95 pct)
>>
>> o NPS2
>>
>>                         tip                  SIS_NODE              SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
>> 1-clients:       103105.55 (0.00 pct)    101582.75 (-1.47 pct)   103077.22 (-0.02 pct)   102233.63 (-0.84 pct)
>> 2-clients:       98720.29 (0.00 pct)     98537.46 (-0.18 pct)    100761.54 (2.06 pct)    99211.39 (0.49 pct)
>> 4-clients:       92289.39 (0.00 pct)     94332.45 (2.21 pct)     93622.46 (1.44 pct)     93321.77 (1.11 pct)
>> 8-clients:       84998.63 (0.00 pct)     87180.90 (2.56 pct)     86970.84 (2.32 pct)     86076.75 (1.26 pct)
>> 16-clients:      76395.81 (0.00 pct)     80017.06 (4.74 pct)     77937.29 (2.01 pct)     75090.85 (-1.70 pct)
>> 32-clients:      71110.89 (0.00 pct)     69445.86 (-2.34 pct)    69273.81 (-2.58 pct)    66885.99 (-5.94 pct)
>> 64-clients:      49526.21 (0.00 pct)     50004.13 (0.96 pct)     51649.09 (4.28 pct)     51100.52 (3.17 pct)
>> 128-clients:     27917.51 (0.00 pct)     30581.70 (9.54 pct)     31587.40 (13.14 pct)    33477.65 (19.91 pct)
>> 256-clients:     20067.17 (0.00 pct)     26002.42 (29.57 pct)    18681.28 (-6.90 pct)    18144.96 (-9.57 pct)
>>
>> o NPS4
>>
>>                         tip                  SIS_NODE              SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
>> 1-clients:       102139.49 (0.00 pct)    103578.02 (1.40 pct)    103633.90 (1.46 pct)    101656.07 (-0.47 pct)
>> 2-clients:       98259.53 (0.00 pct)     99336.70 (1.09 pct)     99720.37 (1.48 pct)     98812.86 (0.56 pct)
>> 4-clients:       91576.79 (0.00 pct)     95278.30 (4.04 pct)     93688.37 (2.30 pct)     93848.94 (2.48 pct)
>> 8-clients:       84742.30 (0.00 pct)     89005.65 (5.03 pct)     87703.04 (3.49 pct)     86709.29 (2.32 pct)
>> 16-clients:      79540.75 (0.00 pct)     85478.97 (7.46 pct)     83195.92 (4.59 pct)     81016.24 (1.85 pct)
>> 32-clients:      71166.14 (0.00 pct)     74254.01 (4.33 pct)     72422.76 (1.76 pct)     71391.62 (0.31 pct)
>> 64-clients:      51763.24 (0.00 pct)     52565.56 (1.54 pct)     55159.65 (6.56 pct)     52472.91 (1.37 pct)
>> 128-clients:     27829.29 (0.00 pct)     35774.61 (28.55 pct)    33738.97 (21.23 pct)    34564.10 (24.20 pct)
>> 256-clients:     24185.37 (0.00 pct)     27215.35 (12.52 pct)    17675.87 (-26.91 pct)   24937.66 (3.11 pct)
> 
> NPS4 is weird again, but mostly wins.
> 
> Based on the NPS1 results I'd say this one goes to TOPO
> 
>> ~~~~~~~~~~~~~~~~
>> ~ ycsb-mongodb ~
>> ~~~~~~~~~~~~~~~~
>>
>> o NPS1
>>
>> tip:			131070.33 (var: 2.84%)
>> SIS_NODE:		131070.33 (var: 2.84%) (0.00%)
>> SIS_NODE_LIMIT:		137227.00 (var: 4.97%) (4.69%)
>> SIS_NODE_TOPOEXT:	133529.67 (var: 0.98%) (1.87%)
>>
>> o NPS2
>>
>> tip:			133693.67 (var: 1.69%)
>> SIS_NODE:		134173.00 (var: 4.07%) (0.35%)
>> SIS_NODE_LIMIT:		134124.67 (var: 2.20%) (0.32%)
>> SIS_NODE_TOPOEXT:	133747.33 (var: 2.49%) (0.04%)
>>
>> o NPS4
>>
>> tip:			132913.67 (var: 1.97%)
>> SIS_NODE:		133697.33 (var: 1.69%) (0.58%)
>> SIS_NODE_LIMIT:		133307.33 (var: 1.03%) (0.29%)
>> SIS_NODE_TOPOEXT:	133426.67 (var: 3.60%) (0.38%)
>>
>> ~~~~~~~~~~~~~
>> ~ unixbench ~
>> ~~~~~~~~~~~~~
>>
>> o NPS1
>>
>> kernel                        			tip                  SIS_NODE               SIS_NODE_LIMIT            SIS_NODE_TOPOEXT
>> Hmean     unixbench-dhry2reg-1   	  41322625.19 (   0.00%)    41224388.33 (  -0.24%)    41142898.66 (  -0.43%)    41222168.97 (  -0.24%)
>> Hmean     unixbench-dhry2reg-512	6252491108.60 (   0.00%)  6240160851.68 (  -0.20%)  6262714194.10 (   0.16%)  6259553403.67 (   0.11%)
>> Amean     unixbench-syscall-1    	   2501398.27 (   0.00%)    2577323.43 *  -3.04%*      2498697.20 (   0.11%)     2541279.77 *  -1.59%*
>> Amean     unixbench-syscall-512  	   8120524.00 (   0.00%)    7512955.87 *   7.48%*      7447849.67 *   8.28%*     7477129.17 *   7.92%*
>> Hmean     unixbench-pipe-1    		   2359346.02 (   0.00%)    2392308.62 *   1.40%*      2407625.04 *   2.05%*     2334146.94 *  -1.07%*
>> Hmean     unixbench-pipe-512		 338790322.61 (   0.00%)  337711432.92 (  -0.32%)    340399941.24 (   0.48%)   339008490.26 (   0.06%)
>> Hmean     unixbench-spawn-1       	      4261.52 (   0.00%)       4164.90 (  -2.27%)         4929.26 *  15.67%*        5111.16 *  19.94%*
>> Hmean     unixbench-spawn-512    	     64328.93 (   0.00%)      62257.64 *  -3.22%*        63740.04 *  -0.92%*       63291.18 *  -1.61%*
>> Hmean     unixbench-execl-1       	      3677.73 (   0.00%)       3652.08 (  -0.70%)         3642.56 *  -0.96%*        3671.98 (  -0.16%)
>> Hmean     unixbench-execl-512    	     11984.83 (   0.00%)      13585.65 *  13.36%*        12496.80 (   4.27%)       12306.01 (   2.68%)
>>
>> o NPS2
>>
>> kernel                        			tip                  SIS_NODE               SIS_NODE_LIMIT            SIS_NODE_TOPOEXT
>> Hmean     unixbench-dhry2reg-1   	  41311787.29 (   0.00%)    41412946.27 (   0.24%)    41035150.98 (  -0.67%)    41371003.93 (   0.14%)
>> Hmean     unixbench-dhry2reg-512	6243873272.76 (   0.00%)  6256893083.32 (   0.21%)  6236751880.89 (  -0.11%)  6235047089.83 (  -0.14%)
>> Amean     unixbench-syscall-1    	   2503190.70 (   0.00%)     2576854.30 *  -2.94%*     2496464.80 *   0.27%*     2540298.77 *  -1.48%*
>> Amean     unixbench-syscall-512  	   8012388.13 (   0.00%)     7503196.87 *   6.36%*     7493284.60 *   6.48%*     7495117.73 *   6.46%*
>> Hmean     unixbench-pipe-1    		   2340486.25 (   0.00%)     2388946.63 (   2.07%)     2412344.33 *   3.07%*     2360277.30 (   0.85%)
>> Hmean     unixbench-pipe-512		 338965319.79 (   0.00%)   337225630.07 (  -0.51%)   339053027.04 (   0.03%)   336939353.18 *  -0.60%*
>> Hmean     unixbench-spawn-1      	      5241.83 (   0.00%)        5246.00 (   0.08%)        4718.45 *  -9.98%*        4967.96 *  -5.22%*
>> Hmean     unixbench-spawn-512    	     65799.86 (   0.00%)       64817.15 *  -1.49%*       66418.37 (   0.94%)       66820.63 *   1.55%*
>> Hmean     unixbench-execl-1       	      3670.65 (   0.00%)        3622.36 *  -1.32%*        3661.04 (  -0.26%)        3660.08 (  -0.29%)
>> Hmean     unixbench-execl-512    	     13682.00 (   0.00%)       13699.90 (   0.13%)       14103.91 (   3.08%)       12960.11 (  -5.28%)
>>
>> o NPS4
>>
>> kernel                        			tip                  SIS_NODE               SIS_NODE_LIMIT            SIS_NODE_TOPOEXT
>> Hmean     unixbench-dhry2reg-1   	  41025577.99 (   0.00%)    40879469.78 (  -0.36%)    41082700.61 (   0.14%)    41260407.54 (   0.57%)
>> Hmean     unixbench-dhry2reg-512	6255568261.91 (   0.00%)  6258326086.80 (   0.04%)  6252223940.32 (  -0.05%)  6259088809.43 (   0.06%)
>> Amean     unixbench-syscall-1    	   2507165.37 (   0.00%)    2579108.77 *  -2.87%*      2488617.40 *   0.74%*     2517574.40 (  -0.42%)
>> Amean     unixbench-syscall-512  	   7458476.50 (   0.00%)    7502528.67 *  -0.59%*      7978379.53 *  -6.97%*     7580369.27 *  -1.63%*
>> Hmean     unixbench-pipe-1    		   2369301.21 (   0.00%)    2392905.29 *   1.00%*      2410432.93 *   1.74%*     2347814.20 (  -0.91%)
>> Hmean     unixbench-pipe-512		 340299405.72 (   0.00%)  339139980.01 *  -0.34%*    340403992.95 (   0.03%)   338708678.82 *  -0.47%*
>> Hmean     unixbench-spawn-1      	      5571.78 (   0.00%)       5423.03 (  -2.67%)         5462.82 (  -1.96%)        5543.08 (  -0.52%)
>> Hmean     unixbench-spawn-512   	     63999.96 (   0.00%)      63485.41 (  -0.80%)        64730.98 *   1.14%*       67486.34 *   5.45%*
>> Hmean     unixbench-execl-1       	      3587.15 (   0.00%)       3624.44 *   1.04%*         3638.74 *   1.44%*        3639.57 *   1.46%*
>> Hmean     unixbench-execl-512    	     14184.17 (   0.00%)      13784.17 (  -2.82%)        13104.71 *  -7.61%*       13598.22 (  -4.13%)
>>
>> ~~~~~~~~~~~~~~~~~~
>> ~ DeathStarBench ~
>> ~~~~~~~~~~~~~~~~~~
>>
>> o NPS1
>> CCD	Scaling	       tip    SIS_NODE 		SIS_NODE_LIMIT 		SIS_NODE_TOPOEXT
>> 1	1		0%      0.30%  		 	0.83%  		 	0.79%
>> 1	1		0%      0.17%  		 	2.53%  		 	0.91%
>> 1	1		0%      -0.40% 		 	2.90%  		 	1.61%
>> 1	1		0%      -7.95% 		 	1.19%  		 	-1.56%
>>
>> o NPS2
>>
>> CCD	Scaling	       tip    SIS_NODE 		SIS_NODE_LIMIT 		SIS_NODE_TOPOEXT
>> 1	1		0%      0.34%  		 	-0.73% 		 	-0.62%
>> 1	1		0%      -0.02% 		 	0.14%  		 	-1.15%
>> 1	1		0%      -12.34%		 	-9.64% 		 	-7.80%
>> 1	1		0%      -12.41%		 	-1.03% 		 	-9.85%
>>
>> Note: In NPS2, 8 CCD case shows 10% run to run variation.
>>
>> o NPS4
>>
>> CCD	Scaling	       tip    SIS_NODE 		SIS_NODE_LIMIT 		SIS_NODE_TOPOEXT
>> 1	1		0%      -1.32% 		 	-0.71% 		 	-1.09%
>> 1	1		0%      -1.53% 		 	-1.11% 		 	-1.73%
>> 1	1		0%      7.19%  		 	-3.47% 		 	5.75%
>> 1	1		0%      -4.66% 		 	-1.91% 		 	-7.52%
> 
> LIMIT seems to do well for the NPS1 case, but how come it falls apart
> for NPS2?!? That doesn't really make sense, does it?
> 
> And again NPS4 is all over the place :/

Yup! I'm doing a full rerun for SIS_NODE_LIMIT. 

> 
>>
>> --
>> If you would like me to collect any more information during any of the
>> above benchmark runs, please let me know.
> 
> dizzy with numbers ....
> 
> Perhaps see if you can figure out why NPS4 is so weird, there's only 2
> CCXs to go around per node on that thing, the various results should not
> be all over the map.
> 
> Perhaps pick hackbench since it shows the problem and is easy and quick
> to run?

I've run hackbench on NPS2 and the results from SIS_NODE_LIMIT_RE are
similar to SIS_NODE.

Test:                   tip                    SIS_NODE             SIS_NODE_LIMIT        SIS_NODE_LIMIT_RE       SIS_NODE_TOPOEXT
 1-groups:         3.82 (0.00 pct)         3.68 (3.66 pct)         3.87 (-1.30 pct)        3.80 (0.52 pct)         3.74 (2.09 pct)
 2-groups:         4.40 (0.00 pct)         3.61 (17.95 pct)        4.45 (-1.13 pct)        3.90 (11.36 pct)        4.30 (2.27 pct)
 4-groups:         4.84 (0.00 pct)         3.62 (25.20 pct)        4.84 (0.00 pct)         4.11 (15.08 pct)        4.97 (-2.68 pct)
 8-groups:         5.45 (0.00 pct)         6.14 (-12.66 pct)       5.40 (0.91 pct)         6.15 (-12.84 pct)       5.68 (-4.22 pct)
16-groups:         6.94 (0.00 pct)         8.77 (-26.36 pct)       6.57 (5.33 pct)         9.51 (-37.03 pct)       7.87 (-13.40 pct)

Since I've messed something up with SIS_NODE_LIMIT, I'll give it a full
rerun.

> 
> Also, can you share the TOPOEXT code?

Here you go. It is not pretty, and assigning the mmc_id is a hack.
The diff below should apply cleanly on top of commit e2a1f85bf9f5
("sched/psi: Avoid resetting the min update period when it is
unnecessary") with the SIS_NODE patch.

---
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4e91054c84be..cca5d147d8e1 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -16,8 +16,10 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
 DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
 /* cpus sharing the last level cache: */
 DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
+DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_mc_shared_map);
 DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
 DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
+DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_mc_id);
 DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id);
 
 DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
@@ -166,6 +168,11 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)
 	return per_cpu(cpu_llc_shared_map, cpu);
 }
 
+static inline struct cpumask *cpu_mc_shared_mask(int cpu)
+{
+	return per_cpu(cpu_mc_shared_map, cpu);
+}
+
 static inline struct cpumask *cpu_l2c_shared_mask(int cpu)
 {
 	return per_cpu(cpu_l2c_shared_map, cpu);
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 458c891a8273..b3519d2d0b56 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -102,6 +102,7 @@ static inline void setup_node_to_cpumask_map(void) { }
 
 #include <asm-generic/topology.h>
 
+extern const struct cpumask *cpu_mcgroup_mask(int cpu);
 extern const struct cpumask *cpu_coregroup_mask(int cpu);
 extern const struct cpumask *cpu_clustergroup_mask(int cpu);
 
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index 4063e8991211..f6e3be6f2512 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -35,6 +35,7 @@
 
 /* Shared last level cache maps */
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_mc_shared_map);
 
 /* Shared L2 cache maps */
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
@@ -677,6 +678,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu)
 		 * Core complex ID is ApicId[3] for these processors.
 		 */
 		per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+		per_cpu(cpu_mc_id, cpu) = c->apicid >> 4;
 	} else {
 		/*
 		 * LLC ID is calculated from the number of threads sharing the
@@ -693,6 +695,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu)
 			int bits = get_count_order(num_sharing_cache);
 
 			per_cpu(cpu_llc_id, cpu) = c->apicid >> bits;
+			per_cpu(cpu_mc_id, cpu) = (c->apicid >> bits) >> 1;
 		}
 	}
 }
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 80710a68ef7d..a320516bf767 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -81,6 +81,7 @@ EXPORT_SYMBOL(smp_num_siblings);
 
 /* Last level cache ID of each logical CPU */
 DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
+DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_mc_id) = BAD_APICID;
 
 u16 get_llc_id(unsigned int cpu)
 {
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 28fcd292f5fd..dedf86b9e8cb 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -536,6 +536,23 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 	return topology_sane(c, o, "llc");
 }
 
+static bool match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+	/*
+	 * Do not match if we do not have a valid APICID for cpu.
+	 * TODO: For non AMD processors, return topology_same_node(c, o)?
+	 */
+	if (per_cpu(cpu_mc_id, cpu1) == BAD_APICID)
+		return false;
+
+	/* Do not match if the MC id does not match: */
+	if (per_cpu(cpu_mc_id, cpu1) != per_cpu(cpu_mc_id, cpu2))
+		return false;
+
+	return topology_sane(c, o, "mmc");
+}
 
 #if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_CLUSTER) || defined(CONFIG_SCHED_MC)
 static inline int x86_sched_itmt_flags(void)
@@ -570,7 +587,7 @@ static int x86_cluster_flags(void)
  */
 static bool x86_has_numa_in_package;
 
-static struct sched_domain_topology_level x86_topology[6];
+static struct sched_domain_topology_level x86_topology[7];
 
 static void __init build_sched_topology(void)
 {
@@ -596,6 +613,16 @@ static void __init build_sched_topology(void)
 		cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC)
 	};
 #endif
+
+	/*
+	 * Multi-Multi-Core Domain Experimentation
+	 */
+	if (static_cpu_has(X86_FEATURE_ZEN)) {
+		x86_topology[i++] = (struct sched_domain_topology_level){
+			cpu_mcgroup_mask, SD_INIT_NAME(MMC)
+		};
+	}
+
 	/*
 	 * When there is NUMA topology inside the package skip the DIE domain
 	 * since the NUMA domains will auto-magically create the right spanning
@@ -628,6 +655,7 @@ void set_cpu_sibling_map(int cpu)
 	if (!has_mp) {
 		cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
 		cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+		cpumask_set_cpu(cpu, cpu_mc_shared_mask(cpu));
 		cpumask_set_cpu(cpu, cpu_l2c_shared_mask(cpu));
 		cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
 		cpumask_set_cpu(cpu, topology_die_cpumask(cpu));
@@ -647,6 +675,9 @@ void set_cpu_sibling_map(int cpu)
 		if ((i == cpu) || (has_mp && match_llc(c, o)))
 			link_mask(cpu_llc_shared_mask, cpu, i);
 
+		if ((i == cpu) || (has_mp && match_mc(c, o)))
+			link_mask(cpu_mc_shared_mask, cpu, i);
+
 		if ((i == cpu) || (has_mp && match_l2c(c, o)))
 			link_mask(cpu_l2c_shared_mask, cpu, i);
 
@@ -700,6 +731,12 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
 	return cpu_llc_shared_mask(cpu);
 }
 
+/* maps the cpu to the sched domain representing multi-multi-core */
+const struct cpumask *cpu_mcgroup_mask(int cpu)
+{
+	return cpu_mc_shared_mask(cpu);
+}
+
 const struct cpumask *cpu_clustergroup_mask(int cpu)
 {
 	return cpu_l2c_shared_mask(cpu);
@@ -1393,6 +1430,7 @@ void __init smp_prepare_cpus_common(void)
 		zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
 		zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
 		zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
+		zalloc_cpumask_var(&per_cpu(cpu_mc_shared_map, i), GFP_KERNEL);
 		zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
 		zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL);
 	}
@@ -1626,9 +1664,12 @@ static void remove_siblinginfo(int cpu)
 
 	for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
 		cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
+	for_each_cpu(sibling, cpu_mc_shared_mask(cpu))
+		cpumask_clear_cpu(cpu, cpu_mc_shared_mask(sibling));
 	for_each_cpu(sibling, cpu_l2c_shared_mask(cpu))
 		cpumask_clear_cpu(cpu, cpu_l2c_shared_mask(sibling));
 	cpumask_clear(cpu_llc_shared_mask(cpu));
+	cpumask_clear(cpu_mc_shared_mask(cpu));
 	cpumask_clear(cpu_l2c_shared_mask(cpu));
 	cpumask_clear(topology_sibling_cpumask(cpu));
 	cpumask_clear(topology_core_cpumask(cpu));
--
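
To make the mmc_id derivation in the hack above easier to follow, here is a
tiny user-space sketch (purely illustrative, not part of the patch). The
llc_bits value of 4 is an assumption for a part where 16 threads share an
LLC (get_count_order(16) == 4):

#include <stdio.h>

int main(void)
{
	unsigned int llc_bits = 4;	/* assumed: 16 threads sharing one LLC */
	unsigned int apicid;

	for (apicid = 0; apicid < 64; apicid += 16) {
		/* same shifts as in cacheinfo_amd_init_llc_id() above */
		unsigned int llc_id = apicid >> llc_bits;
		unsigned int mc_id  = llc_id >> 1;

		printf("apicid %2u -> llc_id %u, mc_id %u\n",
		       apicid, llc_id, mc_id);
	}
	return 0;
}

Two adjacent CCXs (llc_id 2k and 2k+1) end up with the same mc_id k, which
is what lets the MMC domain span exactly two neighbouring CCXs.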

I'll share the data from the reruns of SIS_NODE_LIMIT soon. In the
meantime, if there is anything you would like more data on, please do let
me know.

--
Thanks and Regards,
Prateek
  
Peter Zijlstra June 14, 2023, 8:17 a.m. UTC | #27
On Tue, Jun 13, 2023 at 04:00:39PM +0530, K Prateek Nayak wrote:

> >> - SIS_NODE_TOPOEXT - tip:sched/core + this patch
> >>                      + new sched domain (Multi-Multi-Core or MMC)
> >> 		     (https://lore.kernel.org/all/20230601153522.GB559993@hirez.programming.kicks-ass.net/)
> >> 		     MMC domain groups 2 nearby CCX.
> > 
> > OK, so you managed to get the NPS4 topology in NPS1 mode?
> 
> Yup! But it is a hack. I'll leave the patch at the end.

Chen Yu, could we do the reverse? Instead of building a bigger LLC
domain, can we split our LLC based on SNC (sub-numa-cluster) topologies?

Because as you know, Intel chips are having the reverse problem of the
LLC being entirely too large, so perhaps we can break it up along the
SNC lines.

Could you see if that works?

> Here you go. It is not pretty and assigning the mmc_id is a hack.
> Below diff should apply cleanly on top of commit e2a1f85bf9f5
> ("sched/psi: Avoid resetting the min update period when it is
> unnecessary") with the SIS_NODE patch.
> 
> ---
> diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
> index 4e91054c84be..cca5d147d8e1 100644
> --- a/arch/x86/include/asm/smp.h
> +++ b/arch/x86/include/asm/smp.h
> @@ -16,8 +16,10 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
>  DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
>  /* cpus sharing the last level cache: */
>  DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
> +DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_mc_shared_map);
>  DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
>  DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
> +DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_mc_id);
>  DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id);
>  
>  DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
> @@ -166,6 +168,11 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)
>  	return per_cpu(cpu_llc_shared_map, cpu);
>  }
>  
> +static inline struct cpumask *cpu_mc_shared_mask(int cpu)
> +{
> +	return per_cpu(cpu_mc_shared_map, cpu);
> +}
> +
>  static inline struct cpumask *cpu_l2c_shared_mask(int cpu)
>  {
>  	return per_cpu(cpu_l2c_shared_map, cpu);
> diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
> index 458c891a8273..b3519d2d0b56 100644
> --- a/arch/x86/include/asm/topology.h
> +++ b/arch/x86/include/asm/topology.h
> @@ -102,6 +102,7 @@ static inline void setup_node_to_cpumask_map(void) { }
>  
>  #include <asm-generic/topology.h>
>  
> +extern const struct cpumask *cpu_mcgroup_mask(int cpu);
>  extern const struct cpumask *cpu_coregroup_mask(int cpu);
>  extern const struct cpumask *cpu_clustergroup_mask(int cpu);
>  
> diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
> index 4063e8991211..f6e3be6f2512 100644
> --- a/arch/x86/kernel/cpu/cacheinfo.c
> +++ b/arch/x86/kernel/cpu/cacheinfo.c
> @@ -35,6 +35,7 @@
>  
>  /* Shared last level cache maps */
>  DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
> +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_mc_shared_map);
>  
>  /* Shared L2 cache maps */
>  DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
> @@ -677,6 +678,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu)
>  		 * Core complex ID is ApicId[3] for these processors.
>  		 */
>  		per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
> +		per_cpu(cpu_mc_id, cpu) = c->apicid >> 4;
>  	} else {
>  		/*
>  		 * LLC ID is calculated from the number of threads sharing the
> @@ -693,6 +695,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu)
>  			int bits = get_count_order(num_sharing_cache);
>  
>  			per_cpu(cpu_llc_id, cpu) = c->apicid >> bits;
> +			per_cpu(cpu_mc_id, cpu) = (c->apicid >> bits) >> 1;
>  		}
>  	}
>  }
> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
> index 80710a68ef7d..a320516bf767 100644
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -81,6 +81,7 @@ EXPORT_SYMBOL(smp_num_siblings);
>  
>  /* Last level cache ID of each logical CPU */
>  DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
> +DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_mc_id) = BAD_APICID;
>  
>  u16 get_llc_id(unsigned int cpu)
>  {
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index 28fcd292f5fd..dedf86b9e8cb 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -536,6 +536,23 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
>  	return topology_sane(c, o, "llc");
>  }
>  
> +static bool match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
> +{
> +	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
> +
> +	/*
> +	 * Do not match if we do not have a valid APICID for cpu.
> +	 * TODO: For non AMD processors, return topology_same_node(c, o)?
> +	 */
> +	if (per_cpu(cpu_mc_id, cpu1) == BAD_APICID)
> +		return false;
> +
> +	/* Do not match if the MC id does not match: */
> +	if (per_cpu(cpu_mc_id, cpu1) != per_cpu(cpu_mc_id, cpu2))
> +		return false;
> +
> +	return topology_sane(c, o, "mmc");
> +}
>  
>  #if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_CLUSTER) || defined(CONFIG_SCHED_MC)
>  static inline int x86_sched_itmt_flags(void)
> @@ -570,7 +587,7 @@ static int x86_cluster_flags(void)
>   */
>  static bool x86_has_numa_in_package;
>  
> -static struct sched_domain_topology_level x86_topology[6];
> +static struct sched_domain_topology_level x86_topology[7];
>  
>  static void __init build_sched_topology(void)
>  {
> @@ -596,6 +613,16 @@ static void __init build_sched_topology(void)
>  		cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC)
>  	};
>  #endif
> +
> +	/*
> +	 * Multi-Multi-Core Domain Experimentation
> +	 */
> +	if (static_cpu_has(X86_FEATURE_ZEN)) {
> +		x86_topology[i++] = (struct sched_domain_topology_level){
> +			cpu_mcgroup_mask, SD_INIT_NAME(MMC)
> +		};
> +	}
> +
>  	/*
>  	 * When there is NUMA topology inside the package skip the DIE domain
>  	 * since the NUMA domains will auto-magically create the right spanning
> @@ -628,6 +655,7 @@ void set_cpu_sibling_map(int cpu)
>  	if (!has_mp) {
>  		cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
>  		cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
> +		cpumask_set_cpu(cpu, cpu_mc_shared_mask(cpu));
>  		cpumask_set_cpu(cpu, cpu_l2c_shared_mask(cpu));
>  		cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
>  		cpumask_set_cpu(cpu, topology_die_cpumask(cpu));
> @@ -647,6 +675,9 @@ void set_cpu_sibling_map(int cpu)
>  		if ((i == cpu) || (has_mp && match_llc(c, o)))
>  			link_mask(cpu_llc_shared_mask, cpu, i);
>  
> +		if ((i == cpu) || (has_mp && match_mc(c, o)))
> +			link_mask(cpu_mc_shared_mask, cpu, i);
> +
>  		if ((i == cpu) || (has_mp && match_l2c(c, o)))
>  			link_mask(cpu_l2c_shared_mask, cpu, i);
>  
> @@ -700,6 +731,12 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
>  	return cpu_llc_shared_mask(cpu);
>  }
>  
> +/* maps the cpu to the sched domain representing multi-multi-core */
> +const struct cpumask *cpu_mcgroup_mask(int cpu)
> +{
> +	return cpu_mc_shared_mask(cpu);
> +}
> +
>  const struct cpumask *cpu_clustergroup_mask(int cpu)
>  {
>  	return cpu_l2c_shared_mask(cpu);
> @@ -1393,6 +1430,7 @@ void __init smp_prepare_cpus_common(void)
>  		zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
>  		zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
>  		zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
> +		zalloc_cpumask_var(&per_cpu(cpu_mc_shared_map, i), GFP_KERNEL);
>  		zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
>  		zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL);
>  	}
> @@ -1626,9 +1664,12 @@ static void remove_siblinginfo(int cpu)
>  
>  	for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
>  		cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
> +	for_each_cpu(sibling, cpu_mc_shared_mask(cpu))
> +		cpumask_clear_cpu(cpu, cpu_mc_shared_mask(sibling));
>  	for_each_cpu(sibling, cpu_l2c_shared_mask(cpu))
>  		cpumask_clear_cpu(cpu, cpu_l2c_shared_mask(sibling));
>  	cpumask_clear(cpu_llc_shared_mask(cpu));
> +	cpumask_clear(cpu_mc_shared_mask(cpu));
>  	cpumask_clear(cpu_l2c_shared_mask(cpu));
>  	cpumask_clear(topology_sibling_cpumask(cpu));
>  	cpumask_clear(topology_core_cpumask(cpu));
> --
> 
> I'll share the data from the reruns of SIS_NODE_LIMIT soon. In the
> meantime, if there is anything you would like more data on, please do let
> me know.
> 
> --
> Thanks and Regards,
> Prateek
  
Chen Yu June 14, 2023, 2:58 p.m. UTC | #28
On 2023-06-14 at 10:17:57 +0200, Peter Zijlstra wrote:
> On Tue, Jun 13, 2023 at 04:00:39PM +0530, K Prateek Nayak wrote:
> 
> > >> - SIS_NODE_TOPOEXT - tip:sched/core + this patch
> > >>                      + new sched domain (Multi-Multi-Core or MMC)
> > >> 		     (https://lore.kernel.org/all/20230601153522.GB559993@hirez.programming.kicks-ass.net/)
> > >> 		     MMC domain groups 2 nearby CCX.
> > > 
> > > OK, so you managed to get the NPS4 topology in NPS1 mode?
> > 
> > Yup! But it is a hack. I'll leave the patch at the end.
> 
> Chen Yu, could we do the reverse? Instead of building a bigger LLC
> domain, can we split our LLC based on SNC (sub-numa-cluster) topologies?
>
Hi Peter,
Do you mean whether the LLC domain gets smaller with SNC enabled?
According to the test, the answer seems to be yes.

SNC enabled:
 grep . /sys/kernel/debug/sched/domains/cpu0/domain*/{name,flags}
/sys/kernel/debug/sched/domains/cpu0/domain0/name:SMT
/sys/kernel/debug/sched/domains/cpu0/domain1/name:MC
/sys/kernel/debug/sched/domains/cpu0/domain2/name:NUMA
/sys/kernel/debug/sched/domains/cpu0/domain3/name:NUMA
/sys/kernel/debug/sched/domains/cpu0/domain0/flags:SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK SD_WAKE_AFFINE SD_SHARE_CPUCAPACITY SD_SHARE_PKG_RESOURCES SD_PREFER_SIBLING
/sys/kernel/debug/sched/domains/cpu0/domain1/flags:SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK SD_WAKE_AFFINE SD_SHARE_PKG_RESOURCES SD_PREFER_SIBLING
/sys/kernel/debug/sched/domains/cpu0/domain2/flags:SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK SD_WAKE_AFFINE SD_SERIALIZE SD_OVERLAP SD_NUMA
/sys/kernel/debug/sched/domains/cpu0/domain3/flags:SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK SD_WAKE_AFFINE SD_SERIALIZE SD_OVERLAP SD_NUMA

The MC domain1 has the SD_SHARE_PKG_RESOURCES flag, while
the sub-NUMA domain2 does not have it.

cat /proc/schedstat | grep cpu0 -A 4
cpu0 0 0 0 0 0 0 737153151491 189570367069 38103461
domain0 00000000,00000000,00000000,00010000,00000000,00000000,00000001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 00000000,00000000,00000fff,ffff0000,00000000,00000000,0fffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 00000000,000000ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain3 ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

SNC disabled:
grep . /sys/kernel/debug/sched/domains/cpu0/domain*/{name,flags}
/sys/kernel/debug/sched/domains/cpu0/domain0/name:SMT
/sys/kernel/debug/sched/domains/cpu0/domain1/name:MC
/sys/kernel/debug/sched/domains/cpu0/domain2/name:NUMA
/sys/kernel/debug/sched/domains/cpu0/domain0/flags:SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK SD_WAKE_AFFINE SD_SHARE_CPUCAPACITY SD_SHARE_PKG_RESOURCES SD_PREFER_SIBLING
/sys/kernel/debug/sched/domains/cpu0/domain1/flags:SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK SD_WAKE_AFFINE SD_SHARE_PKG_RESOURCES SD_PREFER_SIBLING
/sys/kernel/debug/sched/domains/cpu0/domain2/flags:SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK SD_WAKE_AFFINE SD_SERIALIZE SD_OVERLAP SD_NUMA


cat /proc/schedstat | grep cpu0 -A 4
cpu0 0 0 0 0 0 0 1030156602546 31734021553 1590389
domain0 00000000,00000000,00000000,00010000,00000000,00000000,00000001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 00000000,000000ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
cpu1 0 0 0 0 0 0 747972895325 29319670483 1420637

> Because as you know, Intel chips are having the reverse problem of the
> LLC being entirely too large, so perhaps we can break it up along the
> SNC lines.
> 
> Could you see if that works?
I think splitting the LLC would help. Or do you mean that even with SNC
disabled, we should be able to detect the SNC border? Currently the NUMA
topology is detected from the ACPI table; I am not sure whether the OS can
detect the SNC boundary when SNC is disabled in the BIOS. I'll check.

thanks,
Chenyu
  
Peter Zijlstra June 14, 2023, 3:13 p.m. UTC | #29
On Wed, Jun 14, 2023 at 10:58:20PM +0800, Chen Yu wrote:
> On 2023-06-14 at 10:17:57 +0200, Peter Zijlstra wrote:
> > On Tue, Jun 13, 2023 at 04:00:39PM +0530, K Prateek Nayak wrote:
> > 
> > > >> - SIS_NODE_TOPOEXT - tip:sched/core + this patch
> > > >>                      + new sched domain (Multi-Multi-Core or MMC)
> > > >> 		     (https://lore.kernel.org/all/20230601153522.GB559993@hirez.programming.kicks-ass.net/)
> > > >> 		     MMC domain groups 2 nearby CCX.
> > > > 
> > > > OK, so you managed to get the NPS4 topology in NPS1 mode?
> > > 
> > > Yup! But it is a hack. I'll leave the patch at the end.
> > 
> > Chen Yu, could we do the reverse? Instead of building a bigger LLC
> > domain, can we split our LLC based on SNC (sub-numa-cluster) topologies?
> >
> Hi Peter,
> Do you mean with SNC enabled, if the LLC domain gets smaller? 
> According to the test, the answer seems to be yes.

No, I mean to build smaller LLC domains even with SNC disabled, as-if
SNC were active.
  
K Prateek Nayak June 16, 2023, 6:34 a.m. UTC | #30
Hello Peter,

On 6/13/2023 1:55 PM, Peter Zijlstra wrote:
> LIMIT seems to do well for the NPS1 case, but how come it falls apart
> for NPS2 ?!? that doesn't really make sense, does it?

Including the results again with a rerun of SIS_NODE_LIMIT which I
definitely messed up last time. I'll include a small tl;dr for each
benchmark to reduce the dizziness from all the numbers :)

o System Details

Dual Socket 3rd Generation EPYC System (2 x 64C/128T)

o NPS Modes

NPS modes are used to logically divide a single socket into
multiple NUMA regions.
The following is the NUMA configuration for each NPS mode on the system:

NPS1: Each socket is a NUMA node.
    Total 2 NUMA nodes in the dual socket machine.

    Node 0: 0-63,   128-191
    Node 1: 64-127, 192-255

    - 8CCX per node

NPS2: Each socket is further logically divided into 2 NUMA regions.
    Total 4 NUMA nodes exist over the 2 sockets.

    Node 0: 0-31,   128-159
    Node 1: 32-63,  160-191
    Node 2: 64-95,  192-223
    Node 3: 96-127, 224-255

    - 4 CCX per node

NPS4: Each socket is logically divided into 4 NUMA regions.
    Total 8 NUMA nodes exist over the 2 sockets.

    Node 0: 0-15,    128-143
    Node 1: 16-31,   144-159
    Node 2: 32-47,   160-175
    Node 3: 48-63,   176-191
    Node 4: 64-79,   192-207
    Node 5: 80-95,   208-223
    Node 6: 96-111,  224-239
    Node 7: 112-127, 240-255

    - 2 CCX per node

  Note:

  - Ideally in NPS2 and NPS4 modes SIS_NODE and SIS_NODE_LIMIT should
    behave similarly.

  - Ideally in NPS4 mode SIS_NODE and SIS_NODE_TOPOEXT should behave
    similarly.

o Kernel Versions

- tip              - tip:sched/core at commit e2a1f85bf9f5 "sched/psi:
                     Avoid resetting the min update period when it is
                     unnecessary")

- SIS_NODE         - tip:sched/core + this patch

- SIS_NODE_LIMIT   - tip:sched/core + this patch + nr=4 limit for SIS_NODE [1]

- SIS_NODE_TOPOEXT - tip:sched/core + this patch
                     + new sched domain (Multi-Multi-Core or MMC) [2]
		     (MMC domain groups 2 nearby CCX.)

[1] https://lore.kernel.org/all/20230601111326.GV4253@hirez.programming.kicks-ass.net/
[2] https://lore.kernel.org/all/3402dcc4-d52f-d99f-e6ce-b435478a5a59@amd.com/

o Benchmark Results

Note: All benchmarks were run with boost enabled and C2 disabled.

~~~~~~~~~~~~~
~ hackbench ~
~~~~~~~~~~~~~

tl;dr

- A similar trend is observed for SIS_NODE and SIS_NODE_LIMIT in NPS2 and
  NPS4 mode.
- A similar trend is observed for SIS_NODE and SIS_NODE_TOPOEXT in
  NPS4 mode.
- Limiting the search scope seems to reduce the regression seen from plain
  SIS_NODE with a couple of outliers, but the absolute numbers are not
  far off, except for 16-groups in NPS4 mode for SIS_NODE_LIMIT.

o NPS1

Test:                   tip                     SIS_NODE           SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
1-groups:         3.92 (0.00 pct)         4.05 (-3.31 pct)        3.92 (0.00 pct)         3.77 (3.82 pct)
2-groups:         4.58 (0.00 pct)         3.84 (16.15 pct)        3.81 (16.81 pct)        4.34 (5.24 pct)
4-groups:         4.99 (0.00 pct)         3.98 (20.24 pct)        4.02 (19.43 pct)        5.01 (-0.40 pct)
8-groups:         5.67 (0.00 pct)         6.05 (-6.70 pct)        6.35 (-11.99 pct)       5.95 (-4.93 pct)
16-groups:        7.88 (0.00 pct)        10.56 (-34.01 pct)      10.07 (-27.79 pct)       8.04 (-2.03 pct)

o NPS2

Test:                   tip                     SIS_NODE           SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
1-groups:         3.82 (0.00 pct)         3.68 (3.66 pct)         3.80 (0.52 pct)         3.74 (2.09 pct)
2-groups:         4.40 (0.00 pct)         3.61 (17.95 pct)        3.90 (11.36 pct)        4.30 (2.27 pct)
4-groups:         4.84 (0.00 pct)         3.62 (25.20 pct)        4.11 (15.08 pct)        4.97 (-2.68 pct)
8-groups:         5.45 (0.00 pct)         6.14 (-12.66 pct)       6.15 (-12.84 pct)       5.68 (-4.22 pct)
16-groups:        6.94 (0.00 pct)         8.77 (-26.36 pct)       9.51 (-37.03 pct)       7.87 (-13.40 pct)

o NPS4

Test:                   tip                     SIS_NODE           SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
1-groups:         3.82 (0.00 pct)         3.84 (-0.52 pct)        3.83 (-0.26 pct)        3.85 (-0.78 pct)
2-groups:         4.44 (0.00 pct)         4.15 (6.53 pct)         4.25 (4.27 pct)         4.18 (5.85 pct)
4-groups:         4.86 (0.00 pct)         4.95 (-1.85 pct)        5.07 (-4.32 pct)        4.79 (1.44 pct)
8-groups:         5.42 (0.00 pct)         5.80 (-7.01 pct)        6.19 (-14.20 pct)       5.75 (-6.08 pct)
16-groups:        6.68 (0.00 pct)         9.07 (-35.77 pct)      13.80 (-106.58 pct)      8.66 (-29.64 pct)


~~~~~~~~~~~~~~~~~~
~ schbench (Old) ~
~~~~~~~~~~~~~~~~~~

tl;dr

- Similar to what was observed with hackbench, with a few outliers here
  and there.

o NPS1

#workers:       tip                     SIS_NODE             SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
1:          26.00 (0.00 pct)        24.00 (7.69 pct)        26.00 (0.00 pct)        20.00 (23.07 pct)
2:          27.00 (0.00 pct)        24.00 (11.11 pct)       24.00 (11.11 pct)       26.00 (3.70 pct)
4:          31.00 (0.00 pct)        28.00 (9.67 pct)        29.00 (6.45 pct)        28.00 (9.67 pct)
8:          36.00 (0.00 pct)        33.00 (8.33 pct)        34.00 (5.55 pct)        34.00 (5.55 pct)
16:         49.00 (0.00 pct)        47.00 (4.08 pct)        51.00 (-4.08 pct)       50.00 (-2.04 pct)
32:         80.00 (0.00 pct)        81.00 (-1.25 pct)       82.00 (-2.50 pct)       81.00 (-1.25 pct)
64:        169.00 (0.00 pct)       169.00 (0.00 pct)       175.00 (-3.55 pct)      177.00 (-4.73 pct)
128:       343.00 (0.00 pct)       365.00 (-6.41 pct)      361.00 (-5.24 pct)      336.00 (2.04 pct)
256:     42048.00 (0.00 pct)     35392.00 (15.82 pct)    42048.00 (0.00 pct)     48576.00 (-15.52 pct)
512:     95104.00 (0.00 pct)     88704.00 (6.72 pct)     88192.00 (7.26 pct)     89984.00 (5.38 pct)

o NPS2

#workers:       tip                     SIS_NODE             SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
1:          23.00 (0.00 pct)        24.00 (-4.34 pct)       19.00 (17.39 pct)       21.00 (8.69 pct)
2:          24.00 (0.00 pct)        24.00 (0.00 pct)        26.00 (-8.33 pct)       28.00 (-16.66 pct)
4:          31.00 (0.00 pct)        26.00 (16.12 pct)       26.00 (16.12 pct)       29.00 (6.45 pct)
8:          41.00 (0.00 pct)        38.00 (7.31 pct)        38.00 (7.31 pct)        38.00 (7.31 pct)
16:         48.00 (0.00 pct)        49.00 (-2.08 pct)       50.00 (-4.16 pct)       53.00 (-10.41 pct)
32:         81.00 (0.00 pct)        84.00 (-3.70 pct)       80.00 (1.23 pct)        86.00 (-6.17 pct)
64:        157.00 (0.00 pct)       169.00 (-7.64 pct)      162.00 (-3.18 pct)      172.00 (-9.55 pct)
128:       386.00 (0.00 pct)       400.00 (-3.62 pct)      400.00 (-3.62 pct)      384.00 (0.51 pct)
256:     48832.00 (0.00 pct)     44480.00 (8.91 pct)     44480.00 (8.91 pct)     48576.00 (0.52 pct)
512:     92032.00 (0.00 pct)     89472.00 (2.78 pct)     88448.00 (3.89 pct)     91008.00 (1.11 pct)

o NPS4

#workers:       tip                     SIS_NODE             SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
1:          21.00 (0.00 pct)        24.00 (-14.28 pct)      23.00 (-9.52 pct)       22.00 (-4.76 pct)
2:          28.00 (0.00 pct)        24.00 (14.28 pct)       29.00 (-3.57 pct)       28.00 (0.00 pct)
4:          32.00 (0.00 pct)        29.00 (9.37 pct)        28.00 (12.50 pct)       30.00 (6.25 pct)
8:          46.00 (0.00 pct)        43.00 (6.52 pct)        44.00 (4.34 pct)        42.00 (8.69 pct)
16:         51.00 (0.00 pct)        53.00 (-3.92 pct)       57.00 (-11.76 pct)      56.00 (-9.80 pct)
32:         82.00 (0.00 pct)        81.00 (1.21 pct)        87.00 (-6.09 pct)       83.00 (-1.21 pct)
64:        173.00 (0.00 pct)       172.00 (0.57 pct)       176.00 (-1.73 pct)      155.00 (10.40 pct)
128:       396.00 (0.00 pct)       384.00 (3.03 pct)       366.00 (7.57 pct)       386.00 (2.52 pct)
256:     48832.00 (0.00 pct)     46656.00 (4.45 pct)     47296.00 (3.14 pct)     49472.00 (-1.31 pct)
512:     95104.00 (0.00 pct)     90752.00 (4.57 pct)     91264.00 (4.03 pct)     90496.00 (4.84 pct)


~~~~~~~~~~
~ tbench ~
~~~~~~~~~~

tl;dr

- I'll rerun the datapoints marked with (^) because they are
  surprisingly good for SIS_NODE and not SIS_NODE_LIMIT, when both are
  supposed to behave exactly the same.
- In NPS1 (marked with (*)) there is a sharp outlier at 256 clients
  where a larger search scope seems to be more successful as the machine
  becomes more overloaded (?!) I'll go check some idle stats here.

o NPS1

Clients:      tip                     SIS_NODE             SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
1        452.49 (0.00 pct)       457.94 (1.20 pct)       455.34 (0.62 pct)       447.69 (-1.06 pct)
2        862.44 (0.00 pct)       879.99 (2.03 pct)       871.34 (1.03 pct)       855.91 (-0.75 pct)
4       1604.27 (0.00 pct)      1618.87 (0.91 pct)      1621.18 (1.05 pct)      1627.14 (1.42 pct)
8       2966.77 (0.00 pct)      3040.90 (2.49 pct)      3002.73 (1.21 pct)      2957.91 (-0.29 pct)
16      5176.70 (0.00 pct)      5292.29 (2.23 pct)      5249.57 (1.40 pct)      5241.61 (1.25 pct)
32      8205.24 (0.00 pct)      8949.12 (9.06 pct)      9006.75 (9.76 pct)      8494.17 (3.52 pct)
64      13956.71 (0.00 pct)     14461.42 (3.61 pct)     15150.47 (8.55 pct)     15045.43 (7.80 pct)
128     24005.50 (0.00 pct)     26052.75 (8.52 pct)     26570.09 (10.68 pct)    24008.73 (0.01 pct)  *
256     32457.61 (0.00 pct)     21999.41 (-32.22 pct)   21635.06 (-33.34 pct)   31060.12 (-4.30 pct) *
512     34345.24 (0.00 pct)     41166.39 (19.86 pct)    35180.72 (2.43 pct)     31864.14 (-7.22 pct) *
1024    33432.92 (0.00 pct)     40900.84 (22.33 pct)    36075.90 (7.90 pct)     32006.81 (-4.26 pct)

o NPS2

Clients:      tip                     SIS_NODE             SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
1         453.73 (0.00 pct)       451.63 (-0.46 pct)      449.92 (-0.83 pct)      453.79 (0.01 pct)
2         861.71 (0.00 pct)       857.85 (-0.44 pct)      867.28 (0.64 pct)       850.14 (-1.34 pct)
4        1599.14 (0.00 pct)      1609.30 (0.63 pct)      1601.12 (0.12 pct)      1619.10 (1.24 pct)
8        2951.03 (0.00 pct)      2944.71 (-0.21 pct)     2987.26 (1.22 pct)      2973.52 (0.76 pct)
16       5080.32 (0.00 pct)      5160.39 (1.57 pct)      5213.69 (2.62 pct)      5150.99 (1.39 pct)
32       7900.41 (0.00 pct)      8039.13 (1.75 pct)      8238.31 (4.27 pct)      7956.45 (0.70 pct)
64      14629.65 (0.00 pct)     15391.08 (5.20 pct)     14246.13 (-2.62 pct)    15410.41 (5.33 pct)
128     23155.88 (0.00 pct)     24015.45 (3.71 pct)     25187.85 (8.77 pct)     23351.35 (0.84 pct)
256     33449.57 (0.00 pct)     33571.08 (0.36 pct)     29242.15 (-12.57 pct)   32869.85 (-1.73 pct)
512     33757.47 (0.00 pct)     39872.69 (18.11 pct) ^  33893.44 (0.40 pct)     34526.17 (2.27 pct)
1024    34823.14 (0.00 pct)     41090.15 (17.99 pct) ^  33173.55 (-4.73 pct)    34522.97 (-0.86 pct)

o NPS4

Clients:      tip                     SIS_NODE             SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
1        450.14 (0.00 pct)       454.46 (0.95 pct)       449.90 (-0.05 pct)      451.43 (0.28 pct)
2        863.26 (0.00 pct)       868.94 (0.65 pct)       856.32 (-0.80 pct)      866.74 (0.40 pct)
4        1618.71 (0.00 pct)      1599.13 (-1.20 pct)     1592.50 (-1.61 pct)     1610.08 (-0.53 pct)
8        2929.35 (0.00 pct)      3065.12 (4.63 pct)      3052.68 (4.21 pct)      3004.74 (2.57 pct)
16       5114.04 (0.00 pct)      5261.40 (2.88 pct)      5220.86 (2.08 pct)      5108.53 (-0.10 pct)
32       7912.18 (0.00 pct)      8926.77 (12.82 pct)     8944.70 (13.04 pct)     8214.73 (3.82 pct)
64      14424.72 (0.00 pct)     14853.61 (2.97 pct)     14531.64 (0.74 pct)     14430.17 (0.03 pct)
128     23614.97 (0.00 pct)     24506.73 (3.77 pct)     25404.57 (7.57 pct)     23296.38 (-1.34 pct)
256     34365.13 (0.00 pct)     35538.42 (3.41 pct)     33123.21 (-3.61 pct)    31009.12 (-9.76 pct)
512     34215.50 (0.00 pct)     36017.49 (5.26 pct)     34398.11 (0.53 pct)     33262.55 (-2.78 pct)
1024    35421.90 (0.00 pct)     35193.81 (-0.64 pct)    36448.71 (2.89 pct)     32795.86 (-7.41 pct) *


~~~~~~~~~~
~ stream ~
~~~~~~~~~~

tl;dr

- Ideally nothing should change here since we are only running 16
  STREAM threads on such a large system; however, in NPS4, STREAM does
  not seem to be happy. (I'll go check whether it is indeed SIS_NODE)

- 10 Runs

o NPS1

Test:         tip                     SIS_NODE             SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
Copy:    271317.35 (0.00 pct)    292440.22 (7.78 pct)    241027.63 (-11.16 pct)  287277.25 (5.88 pct)
Scale:   205533.77 (0.00 pct)    203362.60 (-1.05 pct)   202615.73 (-1.41 pct)   205206.26 (-0.15 pct)
  Add:   221624.62 (0.00 pct)    225850.83 (1.90 pct)    212399.58 (-4.16 pct)   229774.48 (3.67 pct)
Triad:   228500.68 (0.00 pct)    225885.25 (-1.14 pct)   213445.64 (-6.58 pct)   240041.53 (5.05 pct)

o NPS2

Test:         tip                     SIS_NODE             SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
 Copy:   277761.29 (0.00 pct)    301816.34 (8.66 pct)    283260.62 (1.97 pct)    308218.80 (10.96 pct)
Scale:   215193.83 (0.00 pct)    212522.72 (-1.24 pct)   215479.38 (0.13 pct)    205678.94 (-4.42 pct)
  Add:   242725.75 (0.00 pct)    242695.13 (-0.01 pct)   240395.33 (-0.96 pct)   238089.46 (-1.91 pct)
Triad:   237253.44 (0.00 pct)    250618.57 (5.63 pct)    239077.27 (0.76 pct)    249652.73 (5.22 pct)

o NPS4

Test:         tip                     SIS_NODE             SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
 Copy:   273307.14 (0.00 pct)    255091.78 (-6.66 pct)   255965.91 (-6.34 pct)   262007.26 (-4.13 pct)
Scale:   235715.23 (0.00 pct)    222018.36 (-5.81 pct)   223368.73 (-5.23 pct)   222282.64 (-5.69 pct)
  Add:   244500.40 (0.00 pct)    230468.21 (-5.73 pct)   223901.86 (-8.42 pct)   227146.80 (-7.09 pct)
Triad:   250600.04 (0.00 pct)    236229.50 (-5.73 pct)   221054.69 (-11.78 pct)  231772.02 (-7.51 pct)

- 100 Runs

o NPS1

Test:         tip                     SIS_NODE             SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
Copy:    317381.65 (0.00 pct)    318827.08 (0.45 pct)    316606.07 (-0.24 pct)   318922.96 (0.48 pct)
Scale:   214145.00 (0.00 pct)    206213.69 (-3.70 pct)   216937.82 (1.30 pct)    210384.47 (-1.75 pct)
  Add:   239243.29 (0.00 pct)    229791.67 (-3.95 pct)   243660.76 (1.84 pct)    236659.48 (-1.07 pct)
Triad:   249477.76 (0.00 pct)    236843.06 (-5.06 pct)   237186.61 (-4.92 pct)   235990.67 (-5.40 pct)

o NPS2

Test:         tip                     SIS_NODE             SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
 Copy:   318082.10 (0.00 pct)    322844.91 (1.49 pct)    317697.43 (-0.12 pct)   322495.84 (1.38 pct)
Scale:   219338.56 (0.00 pct)    218139.90 (-0.54 pct)   220850.47 (0.68 pct)    221040.27 (0.77 pct)
  Add:   248118.20 (0.00 pct)    249826.98 (0.68 pct)    249156.94 (0.41 pct)    253006.79 (1.97 pct)
Triad:   247088.55 (0.00 pct)    260488.38 (5.42 pct)    255260.30 (3.30 pct)    249081.33 (0.80 pct)

o NPS4

Test:         tip                     SIS_NODE             SIS_NODE_LIMIT          SIS_NODE_TOPOEXT
 Copy:   345396.19 (0.00 pct)    343675.74 (-0.49 pct)   338130.49 (-2.10 pct)   334677.55 (-3.10 pct)
Scale:   241521.63 (0.00 pct)    231494.70 (-4.15 pct)   239206.00 (-0.95 pct)   229159.01 (-5.11 pct)
  Add:   261157.86 (0.00 pct)    249663.86 (-4.40 pct)   255752.77 (-2.06 pct)   242257.98 (-7.23 pct)
Triad:   267804.99 (0.00 pct)    263071.00 (-1.76 pct)   253672.27 (-5.27 pct)   256978.50 (-4.04 pct)

~~~~~~~~~~~
~ netperf ~
~~~~~~~~~~~

tl;dr

- Similar trend as hackbench with a few outliers.

o NPS1

                        tip                  SIS_NODE              SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
 1-clients:      102839.97 (0.00 pct)    103540.33 (0.68 pct)    102890.49 (0.04 pct)    103271.77 (0.41 pct)
 2-clients:      98428.08 (0.00 pct)     100431.67 (2.03 pct)    100804.93 (2.41 pct)    100417.11 (2.02 pct)
 4-clients:      92298.45 (0.00 pct)     94800.51 (2.71 pct)     94137.00 (1.99 pct)     94981.10 (2.90 pct)
 8-clients:      85618.41 (0.00 pct)     89130.14 (4.10 pct)     85573.71 (-0.05 pct)    88284.61 (3.11 pct)
16-clients:      78722.18 (0.00 pct)     79715.38 (1.26 pct)     79422.52 (0.88 pct)     78980.88 (0.32 pct)
32-clients:      73610.75 (0.00 pct)     72801.41 (-1.09 pct)    74763.95 (1.56 pct)     75077.55 (1.99 pct)
64-clients:      55285.07 (0.00 pct)     56184.38 (1.62 pct)     58976.82 (6.67 pct)     60689.05 (9.77 pct)
128-clients:     31176.92 (0.00 pct)     32830.06 (5.30 pct)     39403.20 (26.38 pct)    35638.50 (14.31 pct)
256-clients:     20011.44 (0.00 pct)     15135.39 (-24.36 pct)   17117.22 (-14.46 pct)   18219.29 (-8.95 pct)
512-clients:     10012.28 (0.00 pct)     9726.37 (-2.85 pct)     10492.91 (4.80 pct)     9395.51 (-6.16 pct)


o NPS2

                        tip                  SIS_NODE              SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
1-clients:      103105.55 (0.00 pct)    101582.75 (-1.47 pct)   101709.57 (-1.35 pct)   102233.63 (-0.84 pct)
2-clients:       98720.29 (0.00 pct)     98537.46 (-0.18 pct)    98469.37 (-0.25 pct)    99211.39 (0.49 pct)
4-clients:       92289.39 (0.00 pct)     94332.45 (2.21 pct)     95033.41 (2.97 pct)     93321.77 (1.11 pct)
8-clients:       84998.63 (0.00 pct)     87180.90 (2.56 pct)     88343.42 (3.93 pct)     86076.75 (1.26 pct)
16-clients:      76395.81 (0.00 pct)     80017.06 (4.74 pct)     79511.47 (4.07 pct)     75090.85 (-1.70 pct)
32-clients:      71110.89 (0.00 pct)     69445.86 (-2.34 pct)    67564.76 (-4.98 pct)    66885.99 (-5.94 pct)
64-clients:      49526.21 (0.00 pct)     50004.13 (0.96 pct)     53450.34 (7.92 pct)     51100.52 (3.17 pct)
128-clients:     27917.51 (0.00 pct)     30581.70 (9.54 pct)     32451.97 (16.24 pct)    33477.65 (19.91 pct)
256-clients:     20067.17 (0.00 pct)     26002.42 (29.57 pct)    25011.83 (24.64 pct)    18144.96 (-9.57 pct)

o NPS4

                        tip                  SIS_NODE              SIS_NODE_LIMIT         SIS_NODE_TOPOEXT
1-clients:      102139.49 (0.00 pct)    103578.02 (1.40 pct)    103036.47 (0.87 pct)    101656.07 (-0.47 pct)
2-clients:       98259.53 (0.00 pct)     99336.70 (1.09 pct)     99442.86 (1.20 pct)     98812.86 (0.56 pct)
4-clients:       91576.79 (0.00 pct)     95278.30 (4.04 pct)     94412.90 (3.09 pct)     93848.94 (2.48 pct)
8-clients:       84742.30 (0.00 pct)     89005.65 (5.03 pct)     86045.25 (1.53 pct)     86709.29 (2.32 pct)
16-clients:      79540.75 (0.00 pct)     85478.97 (7.46 pct)     81959.57 (3.04 pct)     81016.24 (1.85 pct)
32-clients:      71166.14 (0.00 pct)     74254.01 (4.33 pct)     71476.02 (0.43 pct)     71391.62 (0.31 pct)
64-clients:      51763.24 (0.00 pct)     52565.56 (1.54 pct)     50651.09 (-2.14 pct)    52472.91 (1.37 pct)
128-clients:     27829.29 (0.00 pct)     35774.61 (28.55 pct)    35541.25 (27.71 pct)    34564.10 (24.20 pct)
256-clients:     24185.37 (0.00 pct)     27215.35 (12.52 pct)    31619.39 (30.73 pct)    24937.66 (3.11 pct)


~~~~~~~~~~~~~~~~
~ ycsb-mongodb ~
~~~~~~~~~~~~~~~~

tl;dr

- SIS_NODE_LIMIT shows good improvements in NPS1 mode. Other modes
  are more or less similar.

o NPS1

tip:			131070.33 (var: 2.84%)
SIS_NODE:		131070.33 (var: 2.84%) (0.00%)
SIS_NODE_LIMIT:		137525.00 (var: 1.24%) (4.92%)
SIS_NODE_TOPOEXT:	133529.67 (var: 0.98%) (1.87%)

o NPS2

tip:			133693.67 (var: 1.69%)
SIS_NODE:		134173.00 (var: 4.07%) (0.35%)
SIS_NODE_LIMIT:		134068.33 (var: 2.10%) (0.28%)
SIS_NODE_TOPOEXT:	133747.33 (var: 2.49%) (0.04%)

o NPS4

tip:			132913.67 (var: 1.97%)
SIS_NODE:		133697.33 (var: 1.69%) (0.58%)
SIS_NODE_LIMIT:		133545.33 (var: 1.03%) (0.47%)
SIS_NODE_TOPOEXT:	133426.67 (var: 3.60%) (0.38%)

~~~~~~~~~~~~~
~ unixbench ~
~~~~~~~~~~~~~

tl;dr

- unixbench-syscall-512 in NPS1 and NPS2 seems to be the only gain,
  while the others stay more or less the same with a few outliers as
  usual.

o NPS1

kernel                        			tip                  SIS_NODE               SIS_NODE_LIMIT            SIS_NODE_TOPOEXT
Hmean     unixbench-dhry2reg-1            41322625.19 (   0.00%)   41224388.33 (  -0.24%)    41064084.38 (  -0.63%)    41222168.97 (  -0.24%)
Hmean     unixbench-dhry2reg-512        6252491108.60 (   0.00%) 6240160851.68 (  -0.20%)  6245401623.65 (  -0.11%)  6259553403.67 (   0.11%)
Amean     unixbench-syscall-1              2501398.27 (   0.00%)    2577323.43 *  -3.04%*     2555415.40 *  -2.16%*     2541279.77 *  -1.59%*
Amean     unixbench-syscall-512            8120524.00 (   0.00%)    7512955.87 *   7.48%*     7443914.67 *   8.33%*     7477129.17 *   7.92%*
Hmean     unixbench-pipe-1                 2359346.02 (   0.00%)    2392308.62 *   1.40%*     2387202.32 *   1.18%*     2334146.94 *  -1.07%*
Hmean     unixbench-pipe-512             338790322.61 (   0.00%)  337711432.92 (  -0.32%)   337228635.73 *  -0.46%*   339008490.26 (   0.06%)
Hmean     unixbench-spawn-1                   4261.52 (   0.00%)       4164.90 (  -2.27%)        4351.26 (   2.11%)        5111.16 *  19.94%*
Hmean     unixbench-spawn-512                64328.93 (   0.00%)      62257.64 *  -3.22%*       63966.41 (  -0.56%)       63291.18 *  -1.61%*
Hmean     unixbench-execl-1                   3677.73 (   0.00%)       3652.08 (  -0.70%)        3599.76 *  -2.12%*        3671.98 (  -0.16%)
Hmean     unixbench-execl-512                11984.83 (   0.00%)      13585.65 *  13.36%*       13521.41 *  12.82%*       12306.01 (   2.68%)

o NPS2

kernel                        			tip                  SIS_NODE               SIS_NODE_LIMIT            SIS_NODE_TOPOEXT
Hmean     unixbench-dhry2reg-1            41311787.29 (   0.00%)    41412946.27 (   0.24%)    41305805.50 (  -0.01%)    41371003.93 (   0.14%)
Hmean     unixbench-dhry2reg-512        6243873272.76 (   0.00%)  6256893083.32 (   0.21%)  6246099911.20 (   0.04%)  6235047089.83 (  -0.14%)
Amean     unixbench-syscall-1              2503190.70 (   0.00%)     2576854.30 *  -2.94%*     2554603.93 *  -2.05%*     2540298.77 *  -1.48%*
Amean     unixbench-syscall-512            8012388.13 (   0.00%)     7503196.87 *   6.36%*     8076005.50 *  -0.79%*     7495117.73 *   6.46%*
Hmean     unixbench-pipe-1                 2340486.25 (   0.00%)     2388946.63 (   2.07%)     2397554.45 *   2.44%*     2360277.30 (   0.85%)
Hmean     unixbench-pipe-512             338965319.79 (   0.00%)   337225630.07 (  -0.51%)   336766051.70 *  -0.65%*   336939353.18 *  -0.60%*
Hmean     unixbench-spawn-1                   5241.83 (   0.00%)        5246.00 (   0.08%)        5248.48 (   0.13%)        4967.96 *  -5.22%*
Hmean     unixbench-spawn-512                65799.86 (   0.00%)       64817.15 *  -1.49%*       64569.20 *  -1.87%*       66820.63 *   1.55%*
Hmean     unixbench-execl-1                   3670.65 (   0.00%)        3622.36 *  -1.32%*        3633.71 (  -1.01%)        3660.08 (  -0.29%)
Hmean     unixbench-execl-512                13682.00 (   0.00%)       13699.90 (   0.13%)       13969.40 (   2.10%)       12960.11 (  -5.28%)

o NPS4

kernel                        			tip                  SIS_NODE               SIS_NODE_LIMIT            SIS_NODE_TOPOEXT
Hmean     unixbench-dhry2reg-1            41025577.99 (   0.00%)    40879469.78 (  -0.36%)    41209188.37 *   0.45%*    41260407.54 (   0.57%)
Hmean     unixbench-dhry2reg-512        6255568261.91 (   0.00%)  6258326086.80 (   0.04%)  6256509267.44 (   0.02%)  6259088809.43 (   0.06%)
Amean     unixbench-syscall-1              2507165.37 (   0.00%)     2579108.77 *  -2.87%*     2554902.63 *  -1.90%*     2517574.40 (  -0.42%)
Amean     unixbench-syscall-512            7458476.50 (   0.00%)     7502528.67 *  -0.59%*     7587645.17 *  -1.73%*     7580369.27 *  -1.63%*
Hmean     unixbench-pipe-1                 2369301.21 (   0.00%)     2392905.29 *   1.00%*     2383343.22 (   0.59%)     2347814.20 (  -0.91%)
Hmean     unixbench-pipe-512             340299405.72 (   0.00%)   339139980.01 *  -0.34%*   337309478.87 *  -0.88%*   338708678.82 *  -0.47%*
Hmean     unixbench-spawn-1                   5571.78 (   0.00%)        5423.03 (  -2.67%)        5076.02 *  -8.90%*        5543.08 (  -0.52%)
Hmean     unixbench-spawn-512                63999.96 (   0.00%)       63485.41 (  -0.80%)       63705.16 (  -0.46%)       67486.34 *   5.45%*
Hmean     unixbench-execl-1                   3587.15 (   0.00%)        3624.44 *   1.04%*        3614.43 *   0.76%*        3639.57 *   1.46%*
Hmean     unixbench-execl-512                14184.17 (   0.00%)       13784.17 (  -2.82%)       13985.35 (  -1.40%)       13598.22 (  -4.13%)

~~~~~~~~~~~~~~~~~~
~ DeathStarBench ~
~~~~~~~~~~~~~~~~~~

tl;dr

- Similar trend as hackbench, except in NPS4 mode with 8 CCXs (one
  socket), where SIS_NODE_LIMIT and SIS_NODE_TOPOEXT are noticeably
  worse than SIS_NODE, but all are bad.

o NPS1

CCD	Scaling	       tip    SIS_NODE 		SIS_NODE_LIMIT 		SIS_NODE_TOPOEXT
1	1		0%    0.30%             0.34%                   0.79%
2       2		0%    0.17%             0.78%                   0.91%
4       4		0%    -0.40%            0.91%                   1.61%
8       8		0%    -7.95%            -6.73%                  -1.56%

o NPS2

CCD	Scaling	       tip    SIS_NODE 		SIS_NODE_LIMIT 		SIS_NODE_TOPOEXT
1	1		0%    0.34%             0.07%                   -0.62%
2       2		0%    -0.02%            0.24%                   -1.15%
4       4		0%    -12.34%           -6.47%                  -7.80% * High Run to Run Variance (~10%)
8       8		0%    -12.41%           -12.83%                 -9.85%

o NPS4

CCD	Scaling	       tip    SIS_NODE 		SIS_NODE_LIMIT 		SIS_NODE_TOPOEXT
1	1		0%      -1.32%          -1.25%                  -1.09%
2       2		0%      -1.53%          -1.62%                  -1.73%
4       4		0%       7.19%           9.08%                   5.75%
8       8		0%      -4.66%          -9.68%                  -7.52%

--

The conclusion seems to be that most workloads would like to run on an idle
thread as quickly as possible; however, once the system becomes
overloaded, even iterating over the groups to find an idle CPU outside
of the target group can affect workload performance. TOPOEXT is a
clean way to limit the search (as long as marking the boundaries can be
done in a clean way) but there are concerns about the load-balancing
jitter the new domain will introduce. There will also be an increase
in the amount of C2C transfers as some of the shared data structures are
accessed and modified (for example, sched_domain_shared->has_idle_cores
updates).

I'll go get some additional data for some of the data points mentioned
above. Meanwhile, if you would like any additional data, please do let
me know. I'll also try to see if a dynamic limit for SIS_NODE_LIMIT
based on Chenyu's SIS_UTIL / ILB_UTIL [3] logic helps (a rough sketch of
that idea is included after the reference below).

[3] https://lore.kernel.org/all/bc96b53efb712337a645e2c2a340975545ed5a28.1686554037.git.yu.c.chen@intel.com/
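For illustration only, here is a rough, untested sketch of what such a dynamic
limit could look like on top of select_idle_node() from this patch. Reusing the
SIS_UTIL nr_idle_scan value of the local LLC as the sibling-group budget, and
the /16 scaling and nr=4 fallback, are purely my assumptions:

static int
select_idle_node(struct task_struct *p, struct sched_domain *sd, int target)
{
	struct sched_domain_shared *sd_share =
		rcu_dereference(per_cpu(sd_llc_shared, target));
	struct sched_domain *parent = sd->parent;
	struct sched_group *sg;
	int nr = 4;	/* static SIS_NODE_LIMIT default */

	/* Make sure to not cross nodes. */
	if (!parent || parent->flags & SD_NUMA)
		return -1;

	/* Scale the number of sibling LLCs scanned with how idle the local LLC looks. */
	if (sd_share)
		nr = clamp(READ_ONCE(sd_share->nr_idle_scan) / 16, 1, 4);

	sg = parent->groups;
	do {
		int cpu = cpumask_first(sched_group_span(sg));
		struct sched_domain *sd_child = per_cpu(sd_llc, cpu);

		if (!cpus_share_cache(cpu, target) && sd_child) {
			int i = select_idle_cpu(p, sd_child,
						test_idle_cores(cpu), cpu);
			if ((unsigned)i < nr_cpumask_bits)
				return i;
		}

		if (!--nr)
			break;

		sg = sg->next;
	} while (sg != parent->groups);

	return -1;
}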

--
Thanks and Regards,
Prateek
  
Chen Yu June 21, 2023, 7:16 a.m. UTC | #31
On 2023-06-14 at 17:13:48 +0200, Peter Zijlstra wrote:
> On Wed, Jun 14, 2023 at 10:58:20PM +0800, Chen Yu wrote:
> > On 2023-06-14 at 10:17:57 +0200, Peter Zijlstra wrote:
> > > On Tue, Jun 13, 2023 at 04:00:39PM +0530, K Prateek Nayak wrote:
> > > 
> > > > >> - SIS_NODE_TOPOEXT - tip:sched/core + this patch
> > > > >>                      + new sched domain (Multi-Multi-Core or MMC)
> > > > >> 		     (https://lore.kernel.org/all/20230601153522.GB559993@hirez.programming.kicks-ass.net/)
> > > > >> 		     MMC domain groups 2 nearby CCX.
> > > > > 
> > > > > OK, so you managed to get the NPS4 topology in NPS1 mode?
> > > > 
> > > > Yup! But it is a hack. I'll leave the patch at the end.
> > > 
> > > Chen Yu, could we do the reverse? Instead of building a bigger LLC
> > > domain, can we split our LLC based on SNC (sub-numa-cluster) topologies?
> > >
> > Hi Peter,
> > Do you mean with SNC enabled, if the LLC domain gets smaller? 
> > According to the test, the answer seems to be yes.
> 
> No, I mean to build smaller LLC domains even with SNC disabled, as-if
> SNC were active.
> 
>
The topology on Sapphire Rapids is that there are 4 memory controllers within
1 package per the lstopo result, and the LLCs can have slightly different distances
to the 4 memory controllers with SNC disabled. Unfortunately there is no interface
for the OS to query this partition. I used a hack to split the LLC into 4 smaller ones
with SNC disabled, according to the topology in SNC4. Then I ran a test on this
platform with/without this LLC split, both with SIS_NODE enabled and with
this issue fixed[1], by skipping the local group when iterating the groups in select_idle_node():

if (cpumask_test_cpu(target, sched_group_span(sg)))
	continue;

SIS_NODE should have no impact on the non-LLC-split version on
Sapphire Rapids, so the baseline is vanilla+SIS_NODE.

In summary, a huge improvement in netperf was observed, but also a regression in
hackbench/schbench when the system is under load. I'll collect some
schedstats to check the scan depth in the problematic cases.


With SNC disabled and with the hack LLC-split patch applied, a new
DIE domain is generated, and the LLC is divided into 4 sub-LLC groups:

 grep  . domain*/{name,flags}
domain0/name:SMT
domain1/name:MC
domain2/name:DIE
domain3/name:NUMA
domain0/flags:SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK SD_WAKE_AFFINE SD_SHARE_CPUCAPACITY SD_SHARE_PKG_RESOURCES SD_PREFER_SIBLING
domain1/flags:SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK SD_WAKE_AFFINE SD_SHARE_PKG_RESOURCES SD_PREFER_SIBLING
domain2/flags:SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK SD_WAKE_AFFINE SD_PREFER_SIBLING
domain3/flags:SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK SD_WAKE_AFFINE SD_SERIALIZE SD_OVERLAP SD_NUMA

cat /proc/schedstat | grep cpu0 -A 4
cpu0 0 0 0 0 0 0 15968391465 3630455022 18084
domain0 00000000,00000000,00000000,00010000,00000000,00000000,00000001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 00000000,00000000,00000000,3fff0000,00000000,00000000,00003fff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 00000000,000000ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain3 ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


hackbench
=========
case                    load            baseline(std%)  compare%( std%)
process-pipe            1-groups         1.00 (  3.81)  -100.18 (  0.19)
process-pipe            2-groups         1.00 ( 10.74)  -59.21 (  0.91)
process-pipe            4-groups         1.00 (  5.37)  -56.37 (  0.56)
process-pipe            8-groups         1.00 (  0.36)  +17.11 (  0.82)
process-sockets         1-groups         1.00 (  0.09)  -26.53 (  1.45)
process-sockets         2-groups         1.00 (  0.82)  -26.45 (  0.40)
process-sockets         4-groups         1.00 (  0.21)   -4.09 (  0.19)
process-sockets         8-groups         1.00 (  0.13)   -5.31 (  0.36)
threads-pipe            1-groups         1.00 (  2.14)  -62.87 (  1.11)
threads-pipe            2-groups         1.00 (  3.18)  -55.82 (  1.14)
threads-pipe            4-groups         1.00 (  4.68)  -54.92 (  0.34)
threads-pipe            8-groups         1.00 (  5.08)  +15.81 (  3.08)
threads-sockets         1-groups         1.00 (  2.60)  -18.28 (  6.03)
threads-sockets         2-groups         1.00 (  0.83)  -30.17 (  0.60)
threads-sockets         4-groups         1.00 (  0.16)   -4.15 (  0.27)
threads-sockets         8-groups         1.00 (  0.36)   -5.92 (  0.94)

The 1-group, 2-group and 4-group cases suffered.

netperf
=======
case                    load            baseline(std%)  compare%( std%)
TCP_RR                  56-threads       1.00 (  2.75)  +10.49 ( 10.88)
TCP_RR                  112-threads      1.00 (  2.39)   -1.88 (  2.82)
TCP_RR                  168-threads      1.00 (  2.05)   +8.31 (  9.73)
TCP_RR                  224-threads      1.00 (  2.32)  +788.25 (  1.94)
TCP_RR                  280-threads      1.00 ( 59.77)  +83.07 ( 12.38)
TCP_RR                  336-threads      1.00 ( 21.61)   -0.22 ( 28.72)
TCP_RR                  392-threads      1.00 ( 31.26)   -0.13 ( 36.11)
TCP_RR                  448-threads      1.00 ( 39.93)   -0.14 ( 45.71)
UDP_RR                  56-threads       1.00 (  5.57)   +2.38 (  7.41)
UDP_RR                  112-threads      1.00 ( 24.53)   +1.51 (  8.43)
UDP_RR                  168-threads      1.00 ( 11.83)   +7.34 ( 20.20)
UDP_RR                  224-threads      1.00 ( 10.55)  +163.81 ( 20.64)
UDP_RR                  280-threads      1.00 ( 11.32)  +176.04 ( 21.83)
UDP_RR                  336-threads      1.00 ( 31.79)  +12.87 ( 37.23)
UDP_RR                  392-threads      1.00 ( 34.06)  +15.64 ( 44.62)
UDP_RR                  448-threads      1.00 ( 59.09)  +14.00 ( 52.93)

The 224-thread/280-thread cases show good improvement.

tbench
======
case                    load            baseline(std%)  compare%( std%)
loopback                56-threads       1.00 (  0.83)   +1.38 (  1.56)
loopback                112-threads      1.00 (  0.19)   -4.25 (  0.90)
loopback                168-threads      1.00 ( 56.43)  -31.12 (  0.37)
loopback                224-threads      1.00 (  0.28)   -2.50 (  0.44)
loopback                280-threads      1.00 (  0.10)   -1.64 (  0.81)
loopback                336-threads      1.00 (  0.19)   -2.10 (  0.10)
loopback                392-threads      1.00 (  0.13)   -2.15 (  0.39)
loopback                448-threads      1.00 (  0.45)   -2.14 (  0.43)

There might be no impact on tbench (the 168-thread result is unstable and could
be ignored).

schbench
========
case                    load            baseline(std%)  compare%( std%)
normal                  1-mthreads       1.00 (  0.42)   -0.59 (  0.72)
normal                  2-mthreads       1.00 (  2.72)   +1.76 (  0.42)
normal                  4-mthreads       1.00 (  0.75)   -1.22 (  1.86)
normal                  8-mthreads       1.00 (  6.44)  -14.56 (  5.64)

The 8-mthread case is not good for schbench.


diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 352f0ce1ece4..ffc44639447e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -511,6 +511,30 @@ static const struct x86_cpu_id intel_cod_cpu[] = {
 	{}
 };
 
+static unsigned int sub_llc_nr;
+
+static int __init parse_sub_llc(char *str)
+{
+	get_option(&str, &sub_llc_nr);
+
+	return 0;
+}
+early_param("sub_llc_nr", parse_sub_llc);
+
+static bool
+topology_same_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+	int idx1, idx2;
+
+	if (!sub_llc_nr)
+		return true;
+
+	idx1 = c->apicid / sub_llc_nr;
+	idx2 = o->apicid / sub_llc_nr;
+
+	return idx1 == idx2;
+}
+
 static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 {
 	const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu);
@@ -530,7 +554,7 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 	 * means 'c' does not share the LLC of 'o'. This will be
 	 * reflected to userspace.
 	 */
-	if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc)
+	if (match_pkg(c, o) && (!topology_same_node(c, o) || !topology_same_llc(c, o)) && intel_snc)
 		return false;
 
 	return topology_sane(c, o, "llc");
  
Peter Zijlstra July 5, 2023, 11:57 a.m. UTC | #32
On Fri, Jun 16, 2023 at 12:04:48PM +0530, K Prateek Nayak wrote:

> [2] https://lore.kernel.org/all/3402dcc4-d52f-d99f-e6ce-b435478a5a59@amd.com/

With the crucial bit being:

 		per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+		per_cpu(cpu_mc_id, cpu) = c->apicid >> 4;

Would need some adjustments for <Zen3 I would think, because this simply
groups two LLCs, but those chips have a 4 core LLC and might be better
off with something like >> 5 instead.
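As a purely illustrative sketch of that (the small_llc predicate is made up, and
it assumes the >> 3 LLC id from the hunk above also holds for the smaller CCXs):

	/*
	 * Every extra bit shifted off the APIC id doubles the number of LLCs
	 * one MC id spans: with cpu_llc_id = apicid >> 3, a >> 4 MC id groups
	 * two LLCs, while >> 5 would group four of the smaller <Zen3 LLCs.
	 */
	per_cpu(cpu_mc_id, cpu) = c->apicid >> (small_llc ? 5 : 4);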

> Conclusion seems to be that most workloads would like to run on an idle
> thread as quickly as possible, however, once the system becomes
> overloaded, even iterating over the groups to find an idle CPU outside
> of the target group can affect the workload performance. TOPOEXT is a
> clean way to limit search (as long as marking the boundaries can be
> done in a clean way) but there are concerns about the load balancing
> jitters the new domain will introduce. There will also be an increase
> in amount of C2C transfers as some of the shared data structures are
> accessed and modified (for example sched_domain_shared->has_idle_cores
> updates).

So per the parent of all this, I do think we want something like
SIS_NODE, at the very least for the desktop parts, doubly so for the
<Zen3 parts that have super dinky LLCs (like TJs desktop).

It's just that your big-ass chips need a little 'help' and in that
regard the TOPOEXT thing does look the most reasonable of the bunch.

One variant I did consider was to make SIS_NODE a domain flag, that
way the architecture can decide and we run less risk of randomly
regressing other archs that might not want this.

(did not yet test the SD flag version below)

---
Subject: sched/fair: Multi-LLC select_idle_sibling()

Tejun reported that when he targets workqueues towards a specific LLC
on his Zen2 machine with 3 cores / LLC and 4 LLCs in total, he gets
significant idle time.

This is, of course, because of how select_idle_sibling() will not
consider anything outside of the local LLC, and since all these tasks
are short running the periodic idle load balancer is ineffective.

And while it is good to keep work cache local, it is better to not
have significant idle time. Therefore, have select_idle_sibling() try
other LLCs inside the same node when the local one comes up empty.

Reported-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/168560901866.404.8439301702539997013.tip-bot2@tip-bot2
---
 arch/x86/kernel/smpboot.c      |    2 +-
 include/linux/sched/sd_flags.h |    7 +++++++
 kernel/sched/fair.c            |   38 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 46 insertions(+), 1 deletion(-)

--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -596,7 +596,7 @@ static inline int x86_sched_itmt_flags(v
 #ifdef CONFIG_SCHED_MC
 static int x86_core_flags(void)
 {
-	return cpu_core_flags() | x86_sched_itmt_flags();
+	return cpu_core_flags() | x86_sched_itmt_flags() | SD_IDLE_SIBLING;
 }
 #endif
 #ifdef CONFIG_SCHED_SMT
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -161,3 +161,10 @@ SD_FLAG(SD_OVERLAP, SDF_SHARED_PARENT |
  * NEEDS_GROUPS: No point in preserving domain if it has a single group.
  */
 SD_FLAG(SD_NUMA, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
+
+/*
+ * Search for idle CPUs in sibling groups
+ *
+ * NEEDS_GROUPS: Load balancing flag.
+ */
+SD_FLAG(SD_IDLE_SIBLING, SDF_NEEDS_GROUPS)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7046,6 +7046,38 @@ static int select_idle_cpu(struct task_s
 }
 
 /*
+ * For the multiple-LLC per node case, make sure to try the other LLC's if the
+ * local LLC comes up empty.
+ */
+static int
+select_idle_node(struct task_struct *p, struct sched_domain *sd, int target)
+{
+	struct sched_domain *parent = sd->parent;
+	struct sched_group *sg;
+
+	/* Make sure to not cross nodes. */
+	if (!parent || parent->flags & SD_NUMA)
+		return -1;
+
+	sg = parent->groups;
+	do {
+		int cpu = cpumask_first(sched_group_span(sg));
+		struct sched_domain *sd_child = per_cpu(sd_llc, cpu);
+
+		if (!cpus_share_cache(cpu, target) && sd_child) {
+			int i = select_idle_cpu(p, sd_child,
+						test_idle_cores(cpu), cpu);
+			if ((unsigned)i < nr_cpumask_bits)
+				return i;
+		}
+
+		sg = sg->next;
+	} while (sg != parent->groups);
+
+	return -1;
+}
+
+/*
  * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
  * the task fits. If no CPU is big enough, but there are idle ones, try to
  * maximize capacity.
@@ -7217,6 +7249,12 @@ static int select_idle_sibling(struct ta
 	if ((unsigned)i < nr_cpumask_bits)
 		return i;
 
+	if (sd->flags & SD_IDLE_SIBLING) {
+		i = select_idle_node(p, sd, target);
+		if ((unsigned)i < nr_cpumask_bits)
+			return i;
+	}
+
 	return target;
 }
  
Chen Yu July 8, 2023, 1:17 p.m. UTC | #33
On 2023-07-05 at 13:57:02 +0200, Peter Zijlstra wrote:
> On Fri, Jun 16, 2023 at 12:04:48PM +0530, K Prateek Nayak wrote:
> 
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -596,7 +596,7 @@ static inline int x86_sched_itmt_flags(v
>  #ifdef CONFIG_SCHED_MC
>  static int x86_core_flags(void)
>  {
> -	return cpu_core_flags() | x86_sched_itmt_flags();
> +	return cpu_core_flags() | x86_sched_itmt_flags() | SD_IDLE_SIBLING;
>  }
I guess this flag might need to be added into the valid mask:

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index d3a3b2646ec4..4a563e9f7b10 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1540,6 +1540,7 @@ static struct cpumask		***sched_domains_numa_masks;
 #define TOPOLOGY_SD_FLAGS		\
 	(SD_SHARE_CPUCAPACITY	|	\
 	 SD_SHARE_PKG_RESOURCES |	\
+	 SD_IDLE_SIBLING	|	\
 	 SD_NUMA		|	\
 	 SD_ASYM_PACKING)
>  #endif
>  #ifdef CONFIG_SCHED_SMT
> --- a/include/linux/sched/sd_flags.h
> +++ b/include/linux/sched/sd_flags.h
> @@ -161,3 +161,10 @@ SD_FLAG(SD_OVERLAP, SDF_SHARED_PARENT |
>   * NEEDS_GROUPS: No point in preserving domain if it has a single group.
>   */
>  SD_FLAG(SD_NUMA, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
> +
> +/*
> + * Search for idle CPUs in sibling groups
> + *
> + * NEEDS_GROUPS: Load balancing flag.
> + */
> +SD_FLAG(SD_IDLE_SIBLING, SDF_NEEDS_GROUPS)
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7046,6 +7046,38 @@ static int select_idle_cpu(struct task_s
>  }
>  
>  /*
> + * For the multiple-LLC per node case, make sure to try the other LLC's if the
> + * local LLC comes up empty.
> + */
> +static int
> +select_idle_node(struct task_struct *p, struct sched_domain *sd, int target)
> +{
> +	struct sched_domain *parent = sd->parent;
> +	struct sched_group *sg;
> +
> +	/* Make sure to not cross nodes. */
> +	if (!parent || parent->flags & SD_NUMA)
> +		return -1;
> +
> +	sg = parent->groups;
> +	do {
> +		int cpu = cpumask_first(sched_group_span(sg));
> +		struct sched_domain *sd_child = per_cpu(sd_llc, cpu);
>
I wonder if we can use rcu_dereference() here, in case CPU hotplug
changes the content sd_llc points to. (I'm still thinking of the
symptom you described here:)
https://lore.kernel.org/lkml/20230605190746.GX83892@hirez.programming.kicks-ass.net/
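A minimal sketch of what I mean (illustrative only, assuming the group walk
still runs under the rcu_read_lock() held by the wakeup path):

	sg = parent->groups;
	do {
		int cpu = cpumask_first(sched_group_span(sg));
		/* pairs with rcu_assign_pointer() in update_top_cache_domain() */
		struct sched_domain *sd_child = rcu_dereference(per_cpu(sd_llc, cpu));

		if (!cpus_share_cache(cpu, target) && sd_child) {
			int i = select_idle_cpu(p, sd_child,
						test_idle_cores(cpu), cpu);
			if ((unsigned)i < nr_cpumask_bits)
				return i;
		}

		sg = sg->next;
	} while (sg != parent->groups);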

I'll launch some tests with this version on Sapphire Rapids (with/without the LLC-split hack patch).

thanks,
Chenyu
  
Chen Yu July 12, 2023, 5:19 p.m. UTC | #34
On 2023-07-08 at 21:17:10 +0800, Chen Yu wrote:
> On 2023-07-05 at 13:57:02 +0200, Peter Zijlstra wrote:
> > On Fri, Jun 16, 2023 at 12:04:48PM +0530, K Prateek Nayak wrote:
> > 
> > --- a/arch/x86/kernel/smpboot.c
> > +++ b/arch/x86/kernel/smpboot.c
> > @@ -596,7 +596,7 @@ static inline int x86_sched_itmt_flags(v
> >  #ifdef CONFIG_SCHED_MC
> >  static int x86_core_flags(void)
> >  {
> > -	return cpu_core_flags() | x86_sched_itmt_flags();
> > +	return cpu_core_flags() | x86_sched_itmt_flags() | SD_IDLE_SIBLING;
> >  }
> I guess this flag might need to be added into the valid mask:
> 
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index d3a3b2646ec4..4a563e9f7b10 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -1540,6 +1540,7 @@ static struct cpumask		***sched_domains_numa_masks;
>  #define TOPOLOGY_SD_FLAGS		\
>  	(SD_SHARE_CPUCAPACITY	|	\
>  	 SD_SHARE_PKG_RESOURCES |	\
> +	 SD_IDLE_SIBLING	|	\
>  	 SD_NUMA		|	\
>  	 SD_ASYM_PACKING)
> >  #endif
> >  #ifdef CONFIG_SCHED_SMT
> > --- a/include/linux/sched/sd_flags.h
> > +++ b/include/linux/sched/sd_flags.h
> > @@ -161,3 +161,10 @@ SD_FLAG(SD_OVERLAP, SDF_SHARED_PARENT |
> >   * NEEDS_GROUPS: No point in preserving domain if it has a single group.
> >   */
> >  SD_FLAG(SD_NUMA, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
> > +
> > +/*
> > + * Search for idle CPUs in sibling groups
> > + *
> > + * NEEDS_GROUPS: Load balancing flag.
> > + */
> > +SD_FLAG(SD_IDLE_SIBLING, SDF_NEEDS_GROUPS)
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -7046,6 +7046,38 @@ static int select_idle_cpu(struct task_s
> >  }
> >  
> >  /*
> > + * For the multiple-LLC per node case, make sure to try the other LLC's if the
> > + * local LLC comes up empty.
> > + */
> > +static int
> > +select_idle_node(struct task_struct *p, struct sched_domain *sd, int target)
> > +{
> > +	struct sched_domain *parent = sd->parent;
> > +	struct sched_group *sg;
> > +
> > +	/* Make sure to not cross nodes. */
> > +	if (!parent || parent->flags & SD_NUMA)
> > +		return -1;
> > +
> > +	sg = parent->groups;
> > +	do {
> > +		int cpu = cpumask_first(sched_group_span(sg));
> > +		struct sched_domain *sd_child = per_cpu(sd_llc, cpu);
> >
> I wonder if we can use rcu_dereference() in case the cpu hotplug
> changes the content sd_llc points to. (I'm still thinking of the
> symptom you described here:)
> https://lore.kernel.org/lkml/20230605190746.GX83892@hirez.programming.kicks-ass.net/
> 
> I'll launch some tests with this version on Sapphire Rapids(and with/without LLC-split hack patch).

Tested on Sapphire Rapids, which has 2 x 56C/112T and 224 CPUs in total. C-states
deeper than C1E are disabled. Turbo is disabled. CPU frequency governor is performance.

The baseline is v6.4-rc1 tip:sched/core, on top of
commit 637c9509f3db ("sched/core: Avoid multiple calling update_rq_clock() in __cfsb_csd_unthrottle()")

patch0: this SD_IDLE_SIBLING patch with above change to TOPOLOGY_SD_FLAGS
patch1: hack patch to split 1 LLC domain into 4 smaller LLC domains (with some fixes on top of
        https://lore.kernel.org/lkml/ZJKjvx%2FNxooM5z1Y@chenyu5-mobl2.ccr.corp.intel.com/;
        the test data in the above link is invalid due to bugs in the hack patch, fixed in this version)


Baseline vs Baseline+patch0:
There is not much difference between the two, which is expected because Sapphire Rapids
does not have multiple LLC domains within 1 NUMA node (also considering the run-to-run variation):

hackbench
=========
case            	load    	baseline(std%)	compare%( std%)
process-pipe    	1-groups	 1.00 (  2.66)	+13.84 ( 12.80)
process-pipe    	2-groups	 1.00 (  3.67)	 -8.37 (  2.33)
process-pipe    	4-groups	 1.00 (  6.45)	 +4.17 (  6.36)
process-pipe    	8-groups	 1.00 (  1.69)	 +2.28 (  1.72)
process-sockets 	1-groups	 1.00 (  1.73)	 +0.61 (  0.69)
process-sockets 	2-groups	 1.00 (  2.68)	 -2.20 (  0.55)
process-sockets 	4-groups	 1.00 (  0.03)	 -0.34 (  0.17)
process-sockets 	8-groups	 1.00 (  0.09)	 -0.28 (  0.09)
threads-pipe    	1-groups	 1.00 (  2.42)	 +6.95 (  3.86)
threads-pipe    	2-groups	 1.00 (  2.26)	 +2.68 (  6.56)
threads-pipe    	4-groups	 1.00 (  5.08)	 +3.57 (  4.61)
threads-pipe    	8-groups	 1.00 (  7.89)	 -2.52 (  3.45)
threads-sockets 	1-groups	 1.00 (  1.15)	 +0.87 (  3.13)
threads-sockets 	2-groups	 1.00 (  0.63)	 -0.02 (  1.27)
threads-sockets 	4-groups	 1.00 (  0.27)	 +0.29 (  0.17)
threads-sockets 	8-groups	 1.00 (  0.07)	 -0.42 (  0.40)

netperf
=======
case            	load    	baseline(std%)	compare%( std%)
TCP_RR          	56-threads	 1.00 (  2.56)	 -0.25 (  3.27)
TCP_RR          	112-threads	 1.00 (  2.26)	 +0.04 (  2.18)
TCP_RR          	168-threads	 1.00 (  0.81)	 +0.01 (  0.74)
TCP_RR          	224-threads	 1.00 (  0.65)	 +0.04 (  0.66)
TCP_RR          	280-threads	 1.00 ( 64.56)	+69.47 ( 56.78)
TCP_RR          	336-threads	 1.00 ( 20.39)	 +0.08 ( 19.58)
TCP_RR          	392-threads	 1.00 ( 31.63)	 +0.17 ( 31.08)
TCP_RR          	448-threads	 1.00 ( 39.72)	 -0.14 ( 39.14)
UDP_RR          	56-threads	 1.00 (  8.94)	 -0.71 ( 12.03)
UDP_RR          	112-threads	 1.00 ( 18.72)	 +0.78 ( 16.71)
UDP_RR          	168-threads	 1.00 ( 11.39)	 -0.18 (  8.34)
UDP_RR          	224-threads	 1.00 (  9.02)	 +0.81 ( 11.47)
UDP_RR          	280-threads	 1.00 ( 15.87)	 -0.12 ( 12.87)
UDP_RR          	336-threads	 1.00 ( 39.89)	 +2.25 ( 32.35)
UDP_RR          	392-threads	 1.00 ( 28.17)	 +3.47 ( 25.99)
UDP_RR          	448-threads	 1.00 ( 58.68)	 +0.35 ( 56.16)

tbench
======
case            	load    	baseline(std%)	compare%( std%)
loopback        	56-threads	 1.00 (  0.94)	 +0.24 (  0.69)
loopback        	112-threads	 1.00 (  0.19)	 +0.18 (  0.25)
loopback        	168-threads	 1.00 ( 52.17)	 -1.42 ( 50.95)
loopback        	224-threads	 1.00 (  0.86)	 -0.38 (  0.19)
loopback        	280-threads	 1.00 (  0.12)	 -0.28 (  0.17)
loopback        	336-threads	 1.00 (  0.10)	 -0.33 (  0.19)
loopback        	392-threads	 1.00 (  0.27)	 -0.49 (  0.26)
loopback        	448-threads	 1.00 (  0.06)	 -0.88 (  0.59)

schbench
========
case            	load    	baseline(std%)	compare%( std%)
normal          	1-mthreads	 1.00 (  0.72)	 -1.47 (  0.41)
normal          	2-mthreads	 1.00 (  1.66)	 +1.18 (  2.63)
normal          	4-mthreads	 1.00 (  1.12)	 +1.20 (  4.52)
normal          	8-mthreads	 1.00 ( 11.03)	 -3.87 (  5.14)


Baseline+patch1    vs    Baseline+patch0+patch1:

With multiple LLC domains in 1 NUMA node, SD_IDLE_SIBLING brings improvement
to hackbench/schbench, while it degrades netperf/tbench. This is aligned
with what was observed previously: if the waker and wakee wake each other up
frequently, they would like to be put together for cache locality, while for
tasks that do not share resources, always choosing an idle CPU is better.
Maybe in the future we can look back at SIS_SHORT and terminate the scan in
select_idle_node() if the waker and wakee have a close relationship with
each other, something along the lines of the sketch below.
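Purely as an illustration of that idea (the mutual last_wakee check is just one
possible notion of a 'close relationship', not part of this patch or of
SIS_SHORT as posted):

	static bool wake_pair_is_tight(struct task_struct *p)
	{
		/* waker and wakee have been waking each other back and forth */
		return p->last_wakee == current && current->last_wakee == p;
	}

and then bail out at the top of select_idle_node():

	if (wake_pair_is_tight(p))
		return -1;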


hackbench
=========
case            	load    	baseline(std%)	compare%( std%)
process-pipe    	1-groups	 1.00 (  0.25)	+31.65 (  6.77)
process-pipe    	2-groups	 1.00 (  0.28)	+29.50 (  5.35)
process-pipe    	4-groups	 1.00 (  0.08)	+16.77 (  1.30)
process-pipe    	8-groups	 1.00 (  0.20)	 -5.18 (  0.04)
process-sockets 	1-groups	 1.00 (  0.23)	+13.68 (  1.28)
process-sockets 	2-groups	 1.00 (  0.16)	+11.18 (  1.87)
process-sockets 	4-groups	 1.00 (  0.23)	 -0.06 (  0.21)
process-sockets 	8-groups	 1.00 (  0.36)	 +2.34 (  0.15)
threads-pipe    	1-groups	 1.00 (  5.23)	+16.38 ( 12.10)
threads-pipe    	2-groups	 1.00 (  1.63)	+28.52 (  5.17)
threads-pipe    	4-groups	 1.00 (  0.77)	+23.28 (  2.42)
threads-pipe    	8-groups	 1.00 (  2.27)	 +2.35 (  5.75)
threads-sockets 	1-groups	 1.00 (  2.31)	 +0.42 (  1.68)
threads-sockets 	2-groups	 1.00 (  0.56)	 +3.98 (  0.65)
threads-sockets 	4-groups	 1.00 (  0.12)	 +0.29 (  0.32)
threads-sockets 	8-groups	 1.00 (  0.86)	 +1.92 (  0.27)

netperf
=======
case            	load    	baseline(std%)	compare%( std%)
TCP_RR          	56-threads	 1.00 ( 12.46)	 -1.62 ( 12.14)
TCP_RR          	112-threads	 1.00 (  1.34)	 -0.16 (  1.42)
TCP_RR          	168-threads	 1.00 (  6.26)	 -0.88 (  6.08)
TCP_RR          	224-threads	 1.00 (  2.19)	-90.18 (  6.12)
TCP_RR          	280-threads	 1.00 ( 12.27)	-63.81 ( 74.25)
TCP_RR          	336-threads	 1.00 ( 29.28)	 -6.21 ( 18.48)
TCP_RR          	392-threads	 1.00 ( 39.39)	 -3.87 ( 26.63)
TCP_RR          	448-threads	 1.00 ( 47.45)	 -2.34 ( 32.37)
UDP_RR          	56-threads	 1.00 (  3.28)	 -0.31 (  2.81)
UDP_RR          	112-threads	 1.00 (  7.03)	 +0.55 (  7.03)
UDP_RR          	168-threads	 1.00 ( 17.42)	 -0.51 ( 15.63)
UDP_RR          	224-threads	 1.00 ( 20.79)	-68.28 ( 14.32)
UDP_RR          	280-threads	 1.00 ( 26.23)	-68.58 ( 18.60)
UDP_RR          	336-threads	 1.00 ( 38.99)	 -0.55 ( 21.19)
UDP_RR          	392-threads	 1.00 ( 44.22)	 -1.91 ( 27.44)
UDP_RR          	448-threads	 1.00 ( 55.11)	 -2.74 ( 38.55)

tbench
======
case            	load    	baseline(std%)	compare%( std%)
loopback        	56-threads	 1.00 (  2.69)	 -2.30 (  2.69)
loopback        	112-threads	 1.00 (  1.92)	 +0.62 (  1.46)
loopback        	168-threads	 1.00 (  0.97)	-67.69 (  0.06)
loopback        	224-threads	 1.00 (  0.24)	 -6.79 (  8.81)
loopback        	280-threads	 1.00 (  0.10)	 +0.47 (  0.62)
loopback        	336-threads	 1.00 (  0.85)	 -0.05 (  0.05)
loopback        	392-threads	 1.00 (  0.62)	 +0.77 (  0.50)
loopback        	448-threads	 1.00 (  0.36)	 +0.77 (  0.77)

schbench
========
case            	load    	baseline(std%)	compare%( std%)
normal          	1-mthreads	 1.00 (  0.82)	 +1.44 (  1.24)
normal          	2-mthreads	 1.00 (  2.13)	 +1.16 (  0.41)
normal          	4-mthreads	 1.00 (  3.82)	 -0.30 (  1.48)
normal          	8-mthreads	 1.00 (  4.80)	+22.43 ( 13.03)

But since the multiple LLCs are just a simulation on the Intel platform for now,
the patch is OK, and:

Tested-by: Chen Yu <yu.c.chen@intel.com>

thanks,
Chenyu
  
K Prateek Nayak July 13, 2023, 3:43 a.m. UTC | #35
Hello Chenyu,

On 7/12/2023 10:49 PM, Chen Yu wrote:
> On 2023-07-08 at 21:17:10 +0800, Chen Yu wrote:
>> On 2023-07-05 at 13:57:02 +0200, Peter Zijlstra wrote:
>>> On Fri, Jun 16, 2023 at 12:04:48PM +0530, K Prateek Nayak wrote:
>>>
>>> --- a/arch/x86/kernel/smpboot.c
>>> +++ b/arch/x86/kernel/smpboot.c
>>> @@ -596,7 +596,7 @@ static inline int x86_sched_itmt_flags(v
>>>  #ifdef CONFIG_SCHED_MC
>>>  static int x86_core_flags(void)
>>>  {
>>> -	return cpu_core_flags() | x86_sched_itmt_flags();
>>> +	return cpu_core_flags() | x86_sched_itmt_flags() | SD_IDLE_SIBLING;
>>>  }
>> I guess this flag might need to be added into the valid mask:
>>
>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>> index d3a3b2646ec4..4a563e9f7b10 100644
>> --- a/kernel/sched/topology.c
>> +++ b/kernel/sched/topology.c
>> @@ -1540,6 +1540,7 @@ static struct cpumask		***sched_domains_numa_masks;
>>  #define TOPOLOGY_SD_FLAGS		\
>>  	(SD_SHARE_CPUCAPACITY	|	\
>>  	 SD_SHARE_PKG_RESOURCES |	\
>> +	 SD_IDLE_SIBLING	|	\
>>  	 SD_NUMA		|	\
>>  	 SD_ASYM_PACKING)
>>>  #endif
>>>  #ifdef CONFIG_SCHED_SMT
>>> --- a/include/linux/sched/sd_flags.h
>>> +++ b/include/linux/sched/sd_flags.h
>>> @@ -161,3 +161,10 @@ SD_FLAG(SD_OVERLAP, SDF_SHARED_PARENT |
>>>   * NEEDS_GROUPS: No point in preserving domain if it has a single group.
>>>   */
>>>  SD_FLAG(SD_NUMA, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
>>> +
>>> +/*
>>> + * Search for idle CPUs in sibling groups
>>> + *
>>> + * NEEDS_GROUPS: Load balancing flag.
>>> + */
>>> +SD_FLAG(SD_IDLE_SIBLING, SDF_NEEDS_GROUPS)
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -7046,6 +7046,38 @@ static int select_idle_cpu(struct task_s
>>>  }
>>>  
>>>  /*
>>> + * For the multiple-LLC per node case, make sure to try the other LLC's if the
>>> + * local LLC comes up empty.
>>> + */
>>> +static int
>>> +select_idle_node(struct task_struct *p, struct sched_domain *sd, int target)
>>> +{
>>> +	struct sched_domain *parent = sd->parent;
>>> +	struct sched_group *sg;
>>> +
>>> +	/* Make sure to not cross nodes. */
>>> +	if (!parent || parent->flags & SD_NUMA)
>>> +		return -1;
>>> +
>>> +	sg = parent->groups;
>>> +	do {
>>> +		int cpu = cpumask_first(sched_group_span(sg));
>>> +		struct sched_domain *sd_child = per_cpu(sd_llc, cpu);
>>>
>> I wonder if we can use rcu_dereference() in case the cpu hotplug
>> changes the content sd_llc points to. (I'm still thinking of the
>> symptom you described here:)
>> https://lore.kernel.org/lkml/20230605190746.GX83892@hirez.programming.kicks-ass.net/
>>
>> I'll launch some tests with this version on Sapphire Rapids(and with/without LLC-split hack patch).
> 
> Tested on Sapphire Rapids, which has 2 x 56C/112T and 224 CPUs in total. C-states
> deeper than C1E are disabled. Turbo is disabled. CPU frequency governor is performance.
> 
> The baseline is v6.4-rc1 tip:sched/core, on top of
> commit 637c9509f3db ("sched/core: Avoid multiple calling update_rq_clock() in __cfsb_csd_unthrottle()")
> 
> patch0: this SD_IDLE_SIBLING patch with above change to TOPOLOGY_SD_FLAGS
> patch1: hack patch to split 1 LLC domain into 4 smaller LLC domains(with some fixes on top of
>         https://lore.kernel.org/lkml/ZJKjvx%2FNxooM5z1Y@chenyu5-mobl2.ccr.corp.intel.com/)
>         The test data in above link is invalid due to bugs in the hack patch, fixed in this version)
> 
> 
> Baseline vs Baseline+patch0:
> There is not much difference between the two, which is expected because Sapphire Rapids
> does not have multiple LLC domains within 1 NUMA node (also considering the run-to-run variation):
> 
> hackbench
> =========
> case            	load    	baseline(std%)	compare%( std%)
> process-pipe    	1-groups	 1.00 (  2.66)	+13.84 ( 12.80)
> process-pipe    	2-groups	 1.00 (  3.67)	 -8.37 (  2.33)
> process-pipe    	4-groups	 1.00 (  6.45)	 +4.17 (  6.36)
> process-pipe    	8-groups	 1.00 (  1.69)	 +2.28 (  1.72)
> process-sockets 	1-groups	 1.00 (  1.73)	 +0.61 (  0.69)
> process-sockets 	2-groups	 1.00 (  2.68)	 -2.20 (  0.55)
> process-sockets 	4-groups	 1.00 (  0.03)	 -0.34 (  0.17)
> process-sockets 	8-groups	 1.00 (  0.09)	 -0.28 (  0.09)
> threads-pipe    	1-groups	 1.00 (  2.42)	 +6.95 (  3.86)
> threads-pipe    	2-groups	 1.00 (  2.26)	 +2.68 (  6.56)
> threads-pipe    	4-groups	 1.00 (  5.08)	 +3.57 (  4.61)
> threads-pipe    	8-groups	 1.00 (  7.89)	 -2.52 (  3.45)
> threads-sockets 	1-groups	 1.00 (  1.15)	 +0.87 (  3.13)
> threads-sockets 	2-groups	 1.00 (  0.63)	 -0.02 (  1.27)
> threads-sockets 	4-groups	 1.00 (  0.27)	 +0.29 (  0.17)
> threads-sockets 	8-groups	 1.00 (  0.07)	 -0.42 (  0.40)
> 
> netperf
> =======
> case            	load    	baseline(std%)	compare%( std%)
> TCP_RR          	56-threads	 1.00 (  2.56)	 -0.25 (  3.27)
> TCP_RR          	112-threads	 1.00 (  2.26)	 +0.04 (  2.18)
> TCP_RR          	168-threads	 1.00 (  0.81)	 +0.01 (  0.74)
> TCP_RR          	224-threads	 1.00 (  0.65)	 +0.04 (  0.66)
> TCP_RR          	280-threads	 1.00 ( 64.56)	+69.47 ( 56.78)
> TCP_RR          	336-threads	 1.00 ( 20.39)	 +0.08 ( 19.58)
> TCP_RR          	392-threads	 1.00 ( 31.63)	 +0.17 ( 31.08)
> TCP_RR          	448-threads	 1.00 ( 39.72)	 -0.14 ( 39.14)
> UDP_RR          	56-threads	 1.00 (  8.94)	 -0.71 ( 12.03)
> UDP_RR          	112-threads	 1.00 ( 18.72)	 +0.78 ( 16.71)
> UDP_RR          	168-threads	 1.00 ( 11.39)	 -0.18 (  8.34)
> UDP_RR          	224-threads	 1.00 (  9.02)	 +0.81 ( 11.47)
> UDP_RR          	280-threads	 1.00 ( 15.87)	 -0.12 ( 12.87)
> UDP_RR          	336-threads	 1.00 ( 39.89)	 +2.25 ( 32.35)
> UDP_RR          	392-threads	 1.00 ( 28.17)	 +3.47 ( 25.99)
> UDP_RR          	448-threads	 1.00 ( 58.68)	 +0.35 ( 56.16)
> 
> tbench
> ======
> case            	load    	baseline(std%)	compare%( std%)
> loopback        	56-threads	 1.00 (  0.94)	 +0.24 (  0.69)
> loopback        	112-threads	 1.00 (  0.19)	 +0.18 (  0.25)
> loopback        	168-threads	 1.00 ( 52.17)	 -1.42 ( 50.95)
> loopback        	224-threads	 1.00 (  0.86)	 -0.38 (  0.19)
> loopback        	280-threads	 1.00 (  0.12)	 -0.28 (  0.17)
> loopback        	336-threads	 1.00 (  0.10)	 -0.33 (  0.19)
> loopback        	392-threads	 1.00 (  0.27)	 -0.49 (  0.26)
> loopback        	448-threads	 1.00 (  0.06)	 -0.88 (  0.59)
> 
> schbench
> ========
> case            	load    	baseline(std%)	compare%( std%)
> normal          	1-mthreads	 1.00 (  0.72)	 -1.47 (  0.41)
> normal          	2-mthreads	 1.00 (  1.66)	 +1.18 (  2.63)
> normal          	4-mthreads	 1.00 (  1.12)	 +1.20 (  4.52)
> normal          	8-mthreads	 1.00 ( 11.03)	 -3.87 (  5.14)
> 
> 
> Baseline+patch1    vs    Baseline+patch0+patch1:
> 
> With multiple LLC domains in 1 NUMA node, SD_IDLE_SIBLING brings improvement
> to hackbench/schbench, while bringing regressions to netperf/tbench. This is aligned
> with what was observed previously: if the waker and wakee wake each other up
> frequently, they prefer to be put together for cache locality, while for
> tasks that do not share resources, always choosing an idle CPU is better.
> Maybe in the future we can look back at SIS_SHORT and terminate the scan in
> select_idle_node() if the waker and wakee have a close relationship with
> each other.

Gautham and I were discussing this and realized that when calling
ttwu_queue_wakelist(), in a simulated split-LLC case, ttwu_queue_cond()
will recommend using the wakelist and send an IPI despite the
groups of the DIE domain sharing the cache in your case.
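For context, cpus_share_cache() only compares the two CPUs' LLC ids, so once the LLC is
split into smaller domains the check fails even for CPUs on the same node; roughly
(sketch of the helper in kernel/sched/core.c, details may differ per tree):

	bool cpus_share_cache(int this_cpu, int that_cpu)
	{
		if (this_cpu == that_cpu)
			return true;

		return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
	}

Hence the suggestion below to key the wakelist decision off the node instead.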

Can you check if the following change helps the regression?
(Note: Completely untested and there may be other such cases lurking
around that we've not yet considered)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a68d1276bab0..a8cab1c81aca 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3929,7 +3929,7 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
 	 * If the CPU does not share cache, then queue the task on the
 	 * remote rqs wakelist to avoid accessing remote data.
 	 */
-	if (!cpus_share_cache(smp_processor_id(), cpu))
+	if (cpu_to_node(smp_processor_id()) !=  cpu_to_node(cpu))
 		return true;
 
 	if (cpu == smp_processor_id())
--

> 
> 
> hackbench
> =========
> case            	load    	baseline(std%)	compare%( std%)
> process-pipe    	1-groups	 1.00 (  0.25)	+31.65 (  6.77)
> process-pipe    	2-groups	 1.00 (  0.28)	+29.50 (  5.35)
> process-pipe    	4-groups	 1.00 (  0.08)	+16.77 (  1.30)
> process-pipe    	8-groups	 1.00 (  0.20)	 -5.18 (  0.04)
> process-sockets 	1-groups	 1.00 (  0.23)	+13.68 (  1.28)
> process-sockets 	2-groups	 1.00 (  0.16)	+11.18 (  1.87)
> process-sockets 	4-groups	 1.00 (  0.23)	 -0.06 (  0.21)
> process-sockets 	8-groups	 1.00 (  0.36)	 +2.34 (  0.15)
> threads-pipe    	1-groups	 1.00 (  5.23)	+16.38 ( 12.10)
> threads-pipe    	2-groups	 1.00 (  1.63)	+28.52 (  5.17)
> threads-pipe    	4-groups	 1.00 (  0.77)	+23.28 (  2.42)
> threads-pipe    	8-groups	 1.00 (  2.27)	 +2.35 (  5.75)
> threads-sockets 	1-groups	 1.00 (  2.31)	 +0.42 (  1.68)
> threads-sockets 	2-groups	 1.00 (  0.56)	 +3.98 (  0.65)
> threads-sockets 	4-groups	 1.00 (  0.12)	 +0.29 (  0.32)
> threads-sockets 	8-groups	 1.00 (  0.86)	 +1.92 (  0.27)
> 
> netperf
> =======
> case            	load    	baseline(std%)	compare%( std%)
> TCP_RR          	56-threads	 1.00 ( 12.46)	 -1.62 ( 12.14)
> TCP_RR          	112-threads	 1.00 (  1.34)	 -0.16 (  1.42)
> TCP_RR          	168-threads	 1.00 (  6.26)	 -0.88 (  6.08)
> TCP_RR          	224-threads	 1.00 (  2.19)	-90.18 (  6.12)
> TCP_RR          	280-threads	 1.00 ( 12.27)	-63.81 ( 74.25)
> TCP_RR          	336-threads	 1.00 ( 29.28)	 -6.21 ( 18.48)
> TCP_RR          	392-threads	 1.00 ( 39.39)	 -3.87 ( 26.63)
> TCP_RR          	448-threads	 1.00 ( 47.45)	 -2.34 ( 32.37)
> UDP_RR          	56-threads	 1.00 (  3.28)	 -0.31 (  2.81)
> UDP_RR          	112-threads	 1.00 (  7.03)	 +0.55 (  7.03)
> UDP_RR          	168-threads	 1.00 ( 17.42)	 -0.51 ( 15.63)
> UDP_RR          	224-threads	 1.00 ( 20.79)	-68.28 ( 14.32)
> UDP_RR          	280-threads	 1.00 ( 26.23)	-68.58 ( 18.60)
> UDP_RR          	336-threads	 1.00 ( 38.99)	 -0.55 ( 21.19)
> UDP_RR          	392-threads	 1.00 ( 44.22)	 -1.91 ( 27.44)
> UDP_RR          	448-threads	 1.00 ( 55.11)	 -2.74 ( 38.55)
> 
> tbench
> ======
> case            	load    	baseline(std%)	compare%( std%)
> loopback        	56-threads	 1.00 (  2.69)	 -2.30 (  2.69)
> loopback        	112-threads	 1.00 (  1.92)	 +0.62 (  1.46)
> loopback        	168-threads	 1.00 (  0.97)	-67.69 (  0.06)
> loopback        	224-threads	 1.00 (  0.24)	 -6.79 (  8.81)
> loopback        	280-threads	 1.00 (  0.10)	 +0.47 (  0.62)
> loopback        	336-threads	 1.00 (  0.85)	 -0.05 (  0.05)
> loopback        	392-threads	 1.00 (  0.62)	 +0.77 (  0.50)
> loopback        	448-threads	 1.00 (  0.36)	 +0.77 (  0.77)
> 
> schbench
> ========
> case            	load    	baseline(std%)	compare%( std%)
> normal          	1-mthreads	 1.00 (  0.82)	 +1.44 (  1.24)
> normal          	2-mthreads	 1.00 (  2.13)	 +1.16 (  0.41)
> normal          	4-mthreads	 1.00 (  3.82)	 -0.30 (  1.48)
> normal          	8-mthreads	 1.00 (  4.80)	+22.43 ( 13.03)
> 
> But since the multiple-LLC topology is just a simulation on the Intel platform for now,
> the patch is OK and:
> 
> Tested-by: Chen Yu <yu.c.chen@intel.com>
> 
> thanks,
> Chenyu

--
Thanks and Regards,
Prateek
  
Chen Yu July 17, 2023, 1:09 a.m. UTC | #36
Hi Prateek,

On 2023-07-13 at 09:13:29 +0530, K Prateek Nayak wrote:
> Hello Chenyu,
> 
> > 
> > Tested on Sapphire Rapids, which has 2 x 56C/112T and 224 CPUs in total. C-states
> > deeper than C1E are disabled. Turbo is disabled. CPU frequency governor is performance.
> > 
> > The baseline is v6.4-rc1 tip:sched/core, on top of
> > commit 637c9509f3db ("sched/core: Avoid multiple calling update_rq_clock() in __cfsb_csd_unthrottle()")
> > 
> > patch0: this SD_IDLE_SIBLING patch with above change to TOPOLOGY_SD_FLAGS
> > patch1: hack patch to split 1 LLC domain into 4 smaller LLC domains (with some fixes on top of
> >         https://lore.kernel.org/lkml/ZJKjvx%2FNxooM5z1Y@chenyu5-mobl2.ccr.corp.intel.com/)
> >         The test data in the above link is invalid due to bugs in the hack patch, which are fixed in this version.
> > 
> > 
> > Baseline vs Baseline+patch0:
> > There is not much difference between the two, which is expected because Sapphire Rapids
> > does not have multiple LLC domains within 1 NUMA node (also considering the run-to-run variation):
> >

[snip] 

> > 
> > Baseline+patch1    vs    Baseline+patch0+patch1:
> > 
> > With multiple LLC domains in 1 NUMA node, SD_IDLE_SIBLING brings improvement
> > to hackbench/schbench, while bringing regressions to netperf/tbench. This is aligned
> > with what was observed previously: if the waker and wakee wake each other up
> > frequently, they prefer to be put together for cache locality, while for
> > tasks that do not share resources, always choosing an idle CPU is better.
> > Maybe in the future we can look back at SIS_SHORT and terminate the scan in
> > select_idle_node() if the waker and wakee have a close relationship with
> > each other.
> 
> Gautham and I were discussing this and realized that when calling
> ttwu_queue_wakelist(), in a simulated split-LLC case, ttwu_queue_cond()
> will recommend using the wakelist and send an IPI despite the
> groups of the DIE domain sharing the cache in your case.
> 
> Can you check if the following change helps the regression?
> (Note: Completely untested and there may be other such cases lurking
> around that we've not yet considered)
> 

Good point. There are quite a few cpus_share_cache() callers in the code, and they
could behave differently if the simulated split-LLC is enabled. For example,
the chance of choosing the previous CPU, or a recent_used_cpu, is lower in
select_idle_sibling(), because the range covered by cpus_share_cache() shrinks.
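Concretely, the prev-CPU fast path is gated on cpus_share_cache(); roughly (paraphrased
from select_idle_sibling(), asym-capacity checks omitted):

	/*
	 * With a smaller LLC span, prev shares cache with target less often,
	 * so this early exit is taken less frequently.
	 */
	if (prev != target && cpus_share_cache(prev, target) &&
	    (available_idle_cpu(prev) || sched_idle_cpu(prev)))
		return prev;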

I launched netperf (224 threads) and hackbench (2 groups) with the below patch
applied; it seems there was not much difference (considering the run-to-run variation).

patch2: the cpus_share_cache() change below.


Baseline+patch1    vs    Baseline+patch0+patch1+patch2:


netperf
=======
case            	load    	baseline(std%)	compare%( std%)
TCP_RR          	224-threads	 1.00 (  2.36)	 -0.19 (  2.30)

hackbench
=========
case            	load    	baseline(std%)	compare%( std%)
process-pipe    	2-groups	 1.00 (  4.78)	 -6.28 (  9.42)

> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index a68d1276bab0..a8cab1c81aca 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3929,7 +3929,7 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
>  	 * If the CPU does not share cache, then queue the task on the
>  	 * remote rqs wakelist to avoid accessing remote data.
>  	 */
> -	if (!cpus_share_cache(smp_processor_id(), cpu))
> +	if (cpu_to_node(smp_processor_id()) !=  cpu_to_node(cpu))
>  		return true;
>  
>  	if (cpu == smp_processor_id())
> --
>

Then I did a hack, patch3, in select_idle_node() to keep 1:1 client/server wakeup workloads together.
For netperf it is a 1:1 waker/wakee relationship; for hackbench it is 1:16 waker/wakee
by default (verified by bpftrace).
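For reference, the last_wakee/wakee_flips state that patch3 tests is maintained at
wakeup time by record_wakee() in fair.c; roughly (a sketch, details may differ per tree):

	static void record_wakee(struct task_struct *p)
	{
		/*
		 * Only decay a single time; tasks that have less than 1 wakeup
		 * per jiffy will not have built up many flips.
		 */
		if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
			current->wakee_flips >>= 1;
			current->wakee_flip_decay_ts = jiffies;
		}

		if (current->last_wakee != p) {
			current->last_wakee = p;
			current->wakee_flips++;
		}
	}

For a stable 1:1 pair the two tasks keep waking only each other, so last_wakee stays
fixed and wakee_flips decays to zero, which is what the check added in patch3 below
relies on.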


patch3:

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5904da690f59..3bdfbd546f14 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7161,6 +7161,11 @@ select_idle_node(struct task_struct *p, struct sched_domain *sd, int target)
 	if (!parent || parent->flags & SD_NUMA)
 		return -1;
 
+	/* Tasks pair should be put on local LLC as much as possible. */
+	if (current->last_wakee == p && p->last_wakee == current &&
+	    !current->wakee_flips && !p->wakee_flips)
+		return -1;
+
 	sg = parent->groups;
 	do {
 		int cpu = cpumask_first(sched_group_span(sg));
  

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 48b6f0c..0172458 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7028,6 +7028,38 @@  static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool 
 }
 
 /*
+ * For the multiple-LLC per node case, make sure to try the other LLC's if the
+ * local LLC comes up empty.
+ */
+static int
+select_idle_node(struct task_struct *p, struct sched_domain *sd, int target)
+{
+	struct sched_domain *parent = sd->parent;
+	struct sched_group *sg;
+
+	/* Make sure to not cross nodes. */
+	if (!parent || parent->flags & SD_NUMA)
+		return -1;
+
+	sg = parent->groups;
+	do {
+		int cpu = cpumask_first(sched_group_span(sg));
+		struct sched_domain *sd_child;
+
+		sd_child = per_cpu(sd_llc, cpu);
+		if (sd_child != sd) {
+			int i = select_idle_cpu(p, sd_child, test_idle_cores(cpu), cpu);
+			if ((unsigned)i < nr_cpumask_bits)
+				return i;
+		}
+
+		sg = sg->next;
+	} while (sg != parent->groups);
+
+	return -1;
+}
+
+/*
  * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
  * the task fits. If no CPU is big enough, but there are idle ones, try to
  * maximize capacity.
@@ -7199,6 +7231,12 @@  static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	if ((unsigned)i < nr_cpumask_bits)
 		return i;
 
+	if (sched_feat(SIS_NODE)) {
+		i = select_idle_node(p, sd, target);
+		if ((unsigned)i < nr_cpumask_bits)
+			return i;
+	}
+
 	return target;
 }
 
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index ee7f23c..9e390eb 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -62,6 +62,7 @@  SCHED_FEAT(TTWU_QUEUE, true)
  */
 SCHED_FEAT(SIS_PROP, false)
 SCHED_FEAT(SIS_UTIL, true)
+SCHED_FEAT(SIS_NODE, true)
 
 /*
  * Issue a WARN when we do multiple update_rq_clock() calls