[v2,3/4] sched: add sched_numa_find_nth_cpu()

Message ID 20221112190946.728270-4-yury.norov@gmail.com
State New
Headers
Series cpumask: improve on cpumask_local_spread() locality |

Commit Message

Yury Norov Nov. 12, 2022, 7:09 p.m. UTC
  The function finds the Nth set CPU in a given cpumask, starting from a given
node.

Leveraging the fact that each hop in sched_domains_numa_masks includes the
same or greater number of CPUs than the previous one, we can use binary
search on hops instead of a linear walk, which makes the overall complexity
O(log n) in terms of the number of cpumask_weight() calls.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/topology.h |  8 ++++++
 kernel/sched/topology.c  | 55 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+)
  

Comments

Andy Shevchenko Nov. 14, 2022, 2:32 p.m. UTC | #1
On Sat, Nov 12, 2022 at 11:09:45AM -0800, Yury Norov wrote:
> The function finds Nth set CPU in a given cpumask starting from a given
> node.
> 
> Leveraging the fact that each hop in sched_domains_numa_masks includes the
> same or greater number of CPUs than the previous one, we can use binary
> search on hops instead of linear walk, which makes the overall complexity
> of O(log n) in terms of number of cpumask_weight() calls.

...

> +struct __cmp_key {
> +	const struct cpumask *cpus;
> +	struct cpumask ***masks;
> +	int node;
> +	int cpu;
> +	int w;
> +};
> +
> +static int cmp(const void *a, const void *b)

Calling them key and pivot (as in the caller), would make more sense.

> +{

What about

	const (?) struct cpumask ***masks = (...)pivot;

> +	struct cpumask **prev_hop = *((struct cpumask ***)b - 1);

	= masks[-1];

> +	struct cpumask **cur_hop = *(struct cpumask ***)b;

	= masks[0];

?

> +	struct __cmp_key *k = (struct __cmp_key *)a;

> +	if (cpumask_weight_and(k->cpus, cur_hop[k->node]) <= k->cpu)
> +		return 1;

> +	k->w = (b == k->masks) ? 0 : cpumask_weight_and(k->cpus, prev_hop[k->node]);
> +	if (k->w <= k->cpu)
> +		return 0;

Can k->cpu be negative? If no, we can rewrite above as

	k->w = 0;
	if (b == k->masks)
		return 0;

	k->w = cpumask_weight_and(k->cpus, prev_hop[k->node]);

> +	return -1;
> +}

...

> +int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
> +{
> +	struct __cmp_key k = { cpus, NULL, node, cpu, 0 };

You can drop NULL and 0 by using C99 designated initializers.

> +	int hop, ret = nr_cpu_ids;

> +	rcu_read_lock();

+ Blank line?

> +	k.masks = rcu_dereference(sched_domains_numa_masks);
> +	if (!k.masks)
> +		goto unlock;

> +	hop = (struct cpumask ***)
> +		bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), cmp) - k.masks;

Strange indentation. I would rather see the split on parameters and
maybe '-' operator.

sizeof(*k.masks) is a bit shorter, right?

Also we may go with


	struct cpumask ***masks;
	struct __cmp_key k = { .cpus = cpus, .node = node, .cpu = cpu };



> +	ret = hop ?
> +		cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) :
> +		cpumask_nth_and(cpu - k.w, cpus, k.masks[0][node]);

> +unlock:

out_unlock: shows the intention more clearly, no?

> +	rcu_read_unlock();
> +	return ret;
> +}
  
Andy Shevchenko Nov. 14, 2022, 3:02 p.m. UTC | #2
On Mon, Nov 14, 2022 at 04:32:10PM +0200, Andy Shevchenko wrote:
> On Sat, Nov 12, 2022 at 11:09:45AM -0800, Yury Norov wrote:
> > The function finds Nth set CPU in a given cpumask starting from a given
> > node.
> > 
> > Leveraging the fact that each hop in sched_domains_numa_masks includes the
> > same or greater number of CPUs than the previous one, we can use binary
> > search on hops instead of linear walk, which makes the overall complexity
> > of O(log n) in terms of number of cpumask_weight() calls.
> 
> ...
> 
> > +struct __cmp_key {
> > +	const struct cpumask *cpus;
> > +	struct cpumask ***masks;
> > +	int node;
> > +	int cpu;
> > +	int w;
> > +};
> > +
> > +static int cmp(const void *a, const void *b)
> 
> Calling them key and pivot (as in the caller), would make more sense.
> 
> > +{
> 
> What about
> 
> 	const (?) struct cpumask ***masks = (...)pivot;
> 
> > +	struct cpumask **prev_hop = *((struct cpumask ***)b - 1);
> 
> 	= masks[-1];
> 
> > +	struct cpumask **cur_hop = *(struct cpumask ***)b;
> 
> 	= masks[0];
> 
> ?
> 
> > +	struct __cmp_key *k = (struct __cmp_key *)a;
> 
> > +	if (cpumask_weight_and(k->cpus, cur_hop[k->node]) <= k->cpu)
> > +		return 1;
> 
> > +	k->w = (b == k->masks) ? 0 : cpumask_weight_and(k->cpus, prev_hop[k->node]);
> > +	if (k->w <= k->cpu)
> > +		return 0;
> 
> Can k->cpu be negative? If no, we can rewrite above as
> 
> 	k->w = 0;
> 	if (b == k->masks)
> 		return 0;
> 
> 	k->w = cpumask_weight_and(k->cpus, prev_hop[k->node]);
> 
> > +	return -1;
> > +}
> 
> ...
> 
> > +int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
> > +{
> > +	struct __cmp_key k = { cpus, NULL, node, cpu, 0 };
> 
> You can drop NULL and 0 while using C99 assignments.
> 
> > +	int hop, ret = nr_cpu_ids;
> 
> > +	rcu_read_lock();
> 
> + Blank line?
> 
> > +	k.masks = rcu_dereference(sched_domains_numa_masks);
> > +	if (!k.masks)
> > +		goto unlock;
> 
> > +	hop = (struct cpumask ***)
> > +		bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), cmp) - k.masks;
> 
> Strange indentation. I would rather see the split on parameters and
> maybe '-' operator.
> 
> sizeof(*k.masks) is a bit shorter, right?
> 
> Also we may go with
> 
> 
> 	struct cpumask ***masks;
> 	struct __cmp_key k = { .cpus = cpus, .node = node, .cpu = cpu };
> 
> 
> 
> > +	ret = hop ?
> > +		cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) :
> > +		cpumask_nth_and(cpu - k.w, cpus, k.masks[0][node]);
> 
> > +unlock:
> 
> out_unlock: shows the intention more clearly, no?
> 
> > +	rcu_read_unlock();
> > +	return ret;
> > +}

Below is a diff I have got on top of your patch, only compile tested:

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 024f1da0e941..e04262578b52 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2070,26 +2070,28 @@ int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
 }
 
 struct __cmp_key {
-	const struct cpumask *cpus;
 	struct cpumask ***masks;
+	const struct cpumask *cpus;
 	int node;
 	int cpu;
 	int w;
 };
 
-static int cmp(const void *a, const void *b)
+static int cmp(const void *key, const void *pivot)
 {
-	struct cpumask **prev_hop = *((struct cpumask ***)b - 1);
-	struct cpumask **cur_hop = *(struct cpumask ***)b;
-	struct __cmp_key *k = (struct __cmp_key *)a;
+	struct __cmp_key *k = container_of(key, struct __cmp_key, masks);
+	const struct cpumask ***masks = (const struct cpumask ***)pivot;
+	const struct cpumask **prev = masks[-1];
+	const struct cpumask **cur = masks[0];
 
-	if (cpumask_weight_and(k->cpus, cur_hop[k->node]) <= k->cpu)
+	if (cpumask_weight_and(k->cpus, cur[k->node]) <= k->cpu)
 		return 1;
 
-	k->w = (b == k->masks) ? 0 : cpumask_weight_and(k->cpus, prev_hop[k->node]);
-	if (k->w <= k->cpu)
+	k->w = 0;
+	if (masks == (const struct cpumask ***)k->masks)
 		return 0;
 
+	k->w = cpumask_weight_and(k->cpus, prev[k->node]);
 	return -1;
 }
 
@@ -2103,17 +2105,17 @@ static int cmp(const void *a, const void *b)
  */
 int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
 {
-	struct __cmp_key k = { cpus, NULL, node, cpu, 0 };
+	struct __cmp_key k = { .cpus = cpus, .node = node, .cpu = cpu };
 	int hop, ret = nr_cpu_ids;
+	struct cpumask ***masks;
 
 	rcu_read_lock();
 	k.masks = rcu_dereference(sched_domains_numa_masks);
 	if (!k.masks)
 		goto unlock;
 
-	hop = (struct cpumask ***)
-		bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), cmp) - k.masks;
-
+	masks = bsearch(&k.masks, k.masks, sched_domains_numa_levels, sizeof(*k.masks), cmp);
+	hop = masks - k.masks;
 	ret = hop ?
 		cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) :
 		cpumask_nth_and(cpu - k.w, cpus, k.masks[0][node]);
  
Valentin Schneider Nov. 15, 2022, 5:25 p.m. UTC | #3
On 12/11/22 11:09, Yury Norov wrote:
> The function finds Nth set CPU in a given cpumask starting from a given
> node.
>
> Leveraging the fact that each hop in sched_domains_numa_masks includes the
> same or greater number of CPUs than the previous one, we can use binary
> search on hops instead of linear walk, which makes the overall complexity
> of O(log n) in terms of number of cpumask_weight() calls.
>

So one thing regarding the bsearch and NUMA levels; until not so long ago
we couldn't even support 3 hops [1], and this only got detected when such
machines started showing up.

Your bsearch here operates on NUMA levels, which represent hops, and so far
we know of systems that have up to 4 levels. I'd be surprised (and also
appalled) if we even doubled that in the next decade, so with that in mind,
a linear walk might not be so horrible.

[1]: https://lore.kernel.org/all/20210224030944.15232-1-song.bao.hua@hisilicon.com/


> Signed-off-by: Yury Norov <yury.norov@gmail.com>
> ---
> +int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
> +{
> +	struct __cmp_key k = { cpus, NULL, node, cpu, 0 };
> +	int hop, ret = nr_cpu_ids;
> +
> +	rcu_read_lock();
> +	k.masks = rcu_dereference(sched_domains_numa_masks);
> +	if (!k.masks)
> +		goto unlock;
> +
> +	hop = (struct cpumask ***)
> +		bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), cmp) - k.masks;
> +
> +	ret = hop ?
> +		cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) :
> +		cpumask_nth_and(cpu - k.w, cpus, k.masks[0][node]);
                                      ^^^
                  wouldn't this always be 0 here?

> +unlock:
> +	rcu_read_unlock();
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu);
>  #endif /* CONFIG_NUMA */
>
>  static int __sdt_alloc(const struct cpumask *cpu_map)
> --
> 2.34.1
  
Yury Norov Dec. 8, 2022, 2:55 a.m. UTC | #4
On Mon, Nov 14, 2022 at 04:32:09PM +0200, Andy Shevchenko wrote:
> On Sat, Nov 12, 2022 at 11:09:45AM -0800, Yury Norov wrote:
> > The function finds Nth set CPU in a given cpumask starting from a given
> > node.
> > 
> > Leveraging the fact that each hop in sched_domains_numa_masks includes the
> > same or greater number of CPUs than the previous one, we can use binary
> > search on hops instead of linear walk, which makes the overall complexity
> > of O(log n) in terms of number of cpumask_weight() calls.
> 
> ...
> 
> > +struct __cmp_key {
> > +	const struct cpumask *cpus;
> > +	struct cpumask ***masks;
> > +	int node;
> > +	int cpu;
> > +	int w;
> > +};
> > +
> > +static int cmp(const void *a, const void *b)
> 
> Calling them key and pivot (as in the caller), would make more sense.

I think they are named opaquely on purpose, so that the user (me) would
cast them to proper data structures and give them meaningful names. So I did.
 
> > +{
> 
> What about
> 
> 	const (?) struct cpumask ***masks = (...)pivot;
> 
> > +	struct cpumask **prev_hop = *((struct cpumask ***)b - 1);
> 
> 	= masks[-1];
> 
> > +	struct cpumask **cur_hop = *(struct cpumask ***)b;
> 
> 	= masks[0];
> 
> ?

It would work as well. Neither better nor worse.

> > +	struct __cmp_key *k = (struct __cmp_key *)a;
> 
> > +	if (cpumask_weight_and(k->cpus, cur_hop[k->node]) <= k->cpu)
> > +		return 1;
> 
> > +	k->w = (b == k->masks) ? 0 : cpumask_weight_and(k->cpus, prev_hop[k->node]);
> > +	if (k->w <= k->cpu)
> > +		return 0;
> 
> Can k->cpu be negative?

User may pass negative value. Currently cpumask_local_spread() will
return nr_cpu_ids.

After the rework, bsearch() will return hop #0. After that, cpumask_nth_and()
will cast the negative cpu to unsigned long, and because that is too big a
number, it will again return nr_cpu_ids.

> If no, we can rewrite above as
> 
> 	k->w = 0;
> 	if (b == k->masks)
> 		return 0;
> 
> 	k->w = cpumask_weight_and(k->cpus, prev_hop[k->node]);

Here we still need to compare weight of prev_hop against k->cpu.
Returning -1 unconditionally is wrong.

> > +	return -1;
> > +}
> 
> ...
> 
> > +int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
> > +{
> > +	struct __cmp_key k = { cpus, NULL, node, cpu, 0 };
> 
> You can drop NULL and 0 while using C99 assignments.
> 
> > +	int hop, ret = nr_cpu_ids;
> 
> > +	rcu_read_lock();
> 
> + Blank line?
> 
> > +	k.masks = rcu_dereference(sched_domains_numa_masks);
> > +	if (!k.masks)
> > +		goto unlock;
> 
> > +	hop = (struct cpumask ***)
> > +		bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), cmp) - k.masks;
> 
> Strange indentation. I would rather see the split on parameters and
> maybe '-' operator.
> 
> sizeof(*k.masks) is a bit shorter, right?
> 
> Also we may go with
> 
> 
> 	struct cpumask ***masks;
> 	struct __cmp_key k = { .cpus = cpus, .node = node, .cpu = cpu };
> 
> 
> 
> > +	ret = hop ?
> > +		cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) :
> > +		cpumask_nth_and(cpu - k.w, cpus, k.masks[0][node]);
> 
> > +unlock:
> 
> out_unlock: shows the intention more clearly, no?

No

> > +	rcu_read_unlock();
> > +	return ret;
> > +}
> 
> -- 
> With Best Regards,
> Andy Shevchenko
>
  

Patch

diff --git a/include/linux/topology.h b/include/linux/topology.h
index 4564faafd0e1..b2e87728caea 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -245,5 +245,13 @@  static inline const struct cpumask *cpu_cpu_mask(int cpu)
 	return cpumask_of_node(cpu_to_node(cpu));
 }
 
+#ifdef CONFIG_NUMA
+int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node);
+#else
+static inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
+{
+	return cpumask_nth(cpu, cpus);
+}
+#endif	/* CONFIG_NUMA */
 
 #endif /* _LINUX_TOPOLOGY_H */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8739c2a5a54e..024f1da0e941 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1764,6 +1764,8 @@  bool find_numa_distance(int distance)
  *   there is an intermediary node C, which is < N hops away from both
  *   nodes A and B, the system is a glueless mesh.
  */
+#include <linux/bsearch.h>
+
 static void init_numa_topology_type(int offline_node)
 {
 	int a, b, c, n;
@@ -2067,6 +2069,59 @@  int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
 	return found;
 }
 
+struct __cmp_key {
+	const struct cpumask *cpus;
+	struct cpumask ***masks;
+	int node;
+	int cpu;
+	int w;
+};
+
+static int cmp(const void *a, const void *b)
+{
+	struct cpumask **prev_hop = *((struct cpumask ***)b - 1);
+	struct cpumask **cur_hop = *(struct cpumask ***)b;
+	struct __cmp_key *k = (struct __cmp_key *)a;
+
+	if (cpumask_weight_and(k->cpus, cur_hop[k->node]) <= k->cpu)
+		return 1;
+
+	k->w = (b == k->masks) ? 0 : cpumask_weight_and(k->cpus, prev_hop[k->node]);
+	if (k->w <= k->cpu)
+		return 0;
+
+	return -1;
+}
+
+/*
+ * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth cpu in
+ *                             @cpus, starting from @node.
+ * cpus: cpumask to find a cpu from
+ * cpu: Nth cpu to find
+ * node: NUMA node to start the search from
+ * returns: cpu, or nr_cpu_ids when nothing found.
+ */
+int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
+{
+	struct __cmp_key k = { cpus, NULL, node, cpu, 0 };
+	int hop, ret = nr_cpu_ids;
+
+	rcu_read_lock();
+	k.masks = rcu_dereference(sched_domains_numa_masks);
+	if (!k.masks)
+		goto unlock;
+
+	hop = (struct cpumask ***)
+		bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), cmp) - k.masks;
+
+	ret = hop ?
+		cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) :
+		cpumask_nth_and(cpu - k.w, cpus, k.masks[0][node]);
+unlock:
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu);
 #endif /* CONFIG_NUMA */
 
 static int __sdt_alloc(const struct cpumask *cpu_map)