[RFC,2/4] sched/fair: Make tg->load_avg per node

Message ID 20230718134120.81199-3-aaron.lu@intel.com
State New
Series Reduce cost of accessing tg->load_avg

Commit Message

Aaron Lu July 18, 2023, 1:41 p.m. UTC
  When using sysbench to benchmark Postgres in a single docker instance
with sysbench's nr_threads set to nr_cpu, it is observed that at times
update_cfs_group() and update_load_avg() show noticeable overhead on
a 2-socket/112-core/224-CPU Intel Sapphire Rapids (SPR) machine:

        13.75%    13.74%  [kernel.vmlinux]           [k] update_cfs_group
        10.63%    10.04%  [kernel.vmlinux]           [k] update_load_avg

Annotation shows the cycles are mostly spent on accessing tg->load_avg,
with update_load_avg() being the write side and update_cfs_group() being
the read side.

Tim Chen told me that PeterZ once mentioned a way to solve a similar
problem by making the counter per node, so do the same for tg->load_avg.
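
In a minimal sketch (tg_add_load_avg() is only an illustrative name here;
the real change is the patch at the bottom of this page), the write side
adds its delta to the counter of its local node and the read side sums
over all nodes:

struct tg_node_info {
	/* in its own cacheline: heavily written from CPUs of this node */
	struct {
		atomic_long_t	load_avg;
	} ____cacheline_aligned_in_smp;
};

/* write side, e.g. from update_tg_load_avg() */
static inline void tg_add_load_avg(struct task_group *tg, long delta)
{
	int node = cpu_to_node(smp_processor_id());

	atomic_long_add(delta, &tg->node_info[node]->load_avg);
}

/* read side, e.g. from calc_group_shares() */
static inline long tg_load_avg(struct task_group *tg)
{
	long load_avg = 0;
	int i;

	for_each_node(i)
		load_avg += atomic_long_read(&tg->node_info[i]->load_avg);

	return load_avg;
}
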
After this change, the cost of the two functions is reduced and
sysbench transactions are increased on SPR. Below are the test results.

===============================================
postgres_sysbench (transactions, higher is better)
nr_thread=100%/75%/50% were tested on 2-socket SPR and Icelake machines
and the results that have a measurable difference are:

nr_thread=100% on SPR
base:  90569.11±1.15%
node: 104152.26±0.34%  +15.0%

nr_thread=75% on SPR
base: 100803.96±0.57%
node: 107333.58±0.44%   +6.5%

=======================================================================
hackbench/pipe/threads/fd=20/loop=1000000 (throughput, higher is better)
group=1/4/8/16 were tested on 2-socket SPR and Cascade Lake machines and
the results that have a measurable difference are:

group=8 on SPR:
base:  437163±2.6%
node:  471203±1.2%   +7.8%

group=16 on SPR:
base:  468279±1.9%
node:  580385±1.7%  +23.9%

=============================================
netperf/TCP_STREAM
nr_thread=1/25%/50%/75%/100% were tested on 2-socket SPR and Cascade
Lake machines and there is no measurable difference.

=============================================
netperf/UDP_RR (throughput, higher is better)
nr_thread=1/25%/50%/75%/100% were tested on 2-socket SPR and Cascade
Lake machines and the results that have a measurable difference are:

nr_thread=75% on Cascade Lake:
base:  36701±1.7%
node:  39949±1.4%   +8.8%

nr_thread=75% on SPR:
base:  14249±3.8%
node:  19890±2.0%   +39.6%

nr_thread=100% on Cascade Lake
base:  52275±0.6%
node:  53827±0.4%   +3.0%

nr_thread=100% on SPR
base:   9560±1.6%
node:  14186±3.9%   +48.4%

Reported-by: Nitin Tekchandani <nitin.tekchandani@intel.com>
Signed-off-by: Aaron Lu <aaron.lu@intel.com>
---
 kernel/sched/debug.c |  2 +-
 kernel/sched/fair.c  | 29 ++++++++++++++++++++++++++---
 kernel/sched/sched.h | 43 +++++++++++++++++++++++++++++++++----------
 3 files changed, 60 insertions(+), 14 deletions(-)
  

Comments

Peter Zijlstra July 19, 2023, 11:53 a.m. UTC | #1
On Tue, Jul 18, 2023 at 09:41:18PM +0800, Aaron Lu wrote:
> +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
> +static inline long tg_load_avg(struct task_group *tg)
> +{
> +	long load_avg = 0;
> +	int i;
> +
> +	/*
> +	 * The only path that can give us a root_task_group
> +	 * here is from print_cfs_rq() thus unlikely.
> +	 */
> +	if (unlikely(tg == &root_task_group))
> +		return 0;
> +
> +	for_each_node(i)
> +		load_avg += atomic_long_read(&tg->node_info[i]->load_avg);
> +
> +	return load_avg;
> +}
> +#endif

So I was working on something else numa and noticed that for_each_node()
(and most of the nodemask stuff) is quite moronic, afaict we should do
something like the below.

I now see Mike added the nr_node_ids thing fairly recent, but given
distros have NODES_SHIFT=10 and actual machines typically only have <=4
nodes, this would save a factor of 256 scanning.

Specifically, your for_each_node() would scan the full 1024 bit bitmap
looking for more bits that would never be there.

---

diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 8d07116caaf1..c23c0889b8cf 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -109,7 +109,7 @@ extern nodemask_t _unused_nodemask_arg_;
 				__nodemask_pr_bits(maskp)
 static inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m)
 {
-	return m ? MAX_NUMNODES : 0;
+	return m ? nr_node_ids : 0;
 }
 static inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m)
 {
@@ -137,13 +137,13 @@ static inline void __node_clear(int node, volatile nodemask_t *dstp)
 	clear_bit(node, dstp->bits);
 }
 
-#define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES)
+#define nodes_setall(dst) __nodes_setall(&(dst), nr_node_ids)
 static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits)
 {
 	bitmap_fill(dstp->bits, nbits);
 }
 
-#define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES)
+#define nodes_clear(dst) __nodes_clear(&(dst), nr_node_ids)
 static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits)
 {
 	bitmap_zero(dstp->bits, nbits);
@@ -160,7 +160,7 @@ static inline bool __node_test_and_set(int node, nodemask_t *addr)
 }
 
 #define nodes_and(dst, src1, src2) \
-			__nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES)
+			__nodes_and(&(dst), &(src1), &(src2), nr_node_ids)
 static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p,
 					const nodemask_t *src2p, unsigned int nbits)
 {
@@ -168,7 +168,7 @@ static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p,
 }
 
 #define nodes_or(dst, src1, src2) \
-			__nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES)
+			__nodes_or(&(dst), &(src1), &(src2), nr_node_ids)
 static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p,
 					const nodemask_t *src2p, unsigned int nbits)
 {
@@ -176,7 +176,7 @@ static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p,
 }
 
 #define nodes_xor(dst, src1, src2) \
-			__nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES)
+			__nodes_xor(&(dst), &(src1), &(src2), nr_node_ids)
 static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p,
 					const nodemask_t *src2p, unsigned int nbits)
 {
@@ -184,7 +184,7 @@ static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p,
 }
 
 #define nodes_andnot(dst, src1, src2) \
-			__nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES)
+			__nodes_andnot(&(dst), &(src1), &(src2), nr_node_ids)
 static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p,
 					const nodemask_t *src2p, unsigned int nbits)
 {
@@ -192,7 +192,7 @@ static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p,
 }
 
 #define nodes_complement(dst, src) \
-			__nodes_complement(&(dst), &(src), MAX_NUMNODES)
+			__nodes_complement(&(dst), &(src), nr_node_ids)
 static inline void __nodes_complement(nodemask_t *dstp,
 					const nodemask_t *srcp, unsigned int nbits)
 {
@@ -200,7 +200,7 @@ static inline void __nodes_complement(nodemask_t *dstp,
 }
 
 #define nodes_equal(src1, src2) \
-			__nodes_equal(&(src1), &(src2), MAX_NUMNODES)
+			__nodes_equal(&(src1), &(src2), nr_node_ids)
 static inline bool __nodes_equal(const nodemask_t *src1p,
 					const nodemask_t *src2p, unsigned int nbits)
 {
@@ -208,7 +208,7 @@ static inline bool __nodes_equal(const nodemask_t *src1p,
 }
 
 #define nodes_intersects(src1, src2) \
-			__nodes_intersects(&(src1), &(src2), MAX_NUMNODES)
+			__nodes_intersects(&(src1), &(src2), nr_node_ids)
 static inline bool __nodes_intersects(const nodemask_t *src1p,
 					const nodemask_t *src2p, unsigned int nbits)
 {
@@ -216,33 +216,33 @@ static inline bool __nodes_intersects(const nodemask_t *src1p,
 }
 
 #define nodes_subset(src1, src2) \
-			__nodes_subset(&(src1), &(src2), MAX_NUMNODES)
+			__nodes_subset(&(src1), &(src2), nr_node_ids)
 static inline bool __nodes_subset(const nodemask_t *src1p,
 					const nodemask_t *src2p, unsigned int nbits)
 {
 	return bitmap_subset(src1p->bits, src2p->bits, nbits);
 }
 
-#define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES)
+#define nodes_empty(src) __nodes_empty(&(src), nr_node_ids)
 static inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits)
 {
 	return bitmap_empty(srcp->bits, nbits);
 }
 
-#define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES)
+#define nodes_full(nodemask) __nodes_full(&(nodemask), nr_node_ids)
 static inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits)
 {
 	return bitmap_full(srcp->bits, nbits);
 }
 
-#define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES)
+#define nodes_weight(nodemask) __nodes_weight(&(nodemask), nr_node_ids)
 static inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits)
 {
 	return bitmap_weight(srcp->bits, nbits);
 }
 
 #define nodes_shift_right(dst, src, n) \
-			__nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES)
+			__nodes_shift_right(&(dst), &(src), (n), nr_node_ids)
 static inline void __nodes_shift_right(nodemask_t *dstp,
 					const nodemask_t *srcp, int n, int nbits)
 {
@@ -250,7 +250,7 @@ static inline void __nodes_shift_right(nodemask_t *dstp,
 }
 
 #define nodes_shift_left(dst, src, n) \
-			__nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES)
+			__nodes_shift_left(&(dst), &(src), (n), nr_node_ids)
 static inline void __nodes_shift_left(nodemask_t *dstp,
 					const nodemask_t *srcp, int n, int nbits)
 {
@@ -385,7 +385,7 @@ static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp,
 #if MAX_NUMNODES > 1
 #define for_each_node_mask(node, mask)				    \
 	for ((node) = first_node(mask);				    \
-	     (node) < MAX_NUMNODES;				    \
+	     (node) < nr_node_ids;				    \
 	     (node) = next_node((node), (mask)))
 #else /* MAX_NUMNODES == 1 */
 #define for_each_node_mask(node, mask)                                  \
  
Aaron Lu July 19, 2023, 1:45 p.m. UTC | #2
On Wed, Jul 19, 2023 at 01:53:58PM +0200, Peter Zijlstra wrote:
> On Tue, Jul 18, 2023 at 09:41:18PM +0800, Aaron Lu wrote:
> > +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
> > +static inline long tg_load_avg(struct task_group *tg)
> > +{
> > +	long load_avg = 0;
> > +	int i;
> > +
> > +	/*
> > +	 * The only path that can give us a root_task_group
> > +	 * here is from print_cfs_rq() thus unlikely.
> > +	 */
> > +	if (unlikely(tg == &root_task_group))
> > +		return 0;
> > +
> > +	for_each_node(i)
> > +		load_avg += atomic_long_read(&tg->node_info[i]->load_avg);
> > +
> > +	return load_avg;
> > +}
> > +#endif
> 
> So I was working on something else numa and noticed that for_each_node()
> (and most of the nodemask stuff) is quite moronic, afaict we should do
> something like the below.
> 
> I now see Mike added the nr_node_ids thing fairly recent, but given
> distros have NODES_SHIFT=10 and actual machines typically only have <=4
> nodes, this would save a factor of 256 scanning.

Nice :-)

> 
> Specifically, your for_each_node() would scan the full 1024 bit bitmap
> looking for more bits that would never be there.

Yes indeed.
I'll rebase this per-node patch on top of below diff.

Thanks for the info.

> [ Peter's nodemask.h diff quoted in full; snipped ]
  
Peter Zijlstra July 19, 2023, 1:53 p.m. UTC | #3
On Wed, Jul 19, 2023 at 09:45:00PM +0800, Aaron Lu wrote:
> I'll rebase this per-node patch on top of below diff.

Oh, please double check I didn't wreck anything. I skipped 'converting'
the find_*_bit() functions because the users of those iterators might be
expecting MAX_NUMNODES when not found.

But perhaps there's more I overlooked.
  
Aaron Lu July 19, 2023, 2:22 p.m. UTC | #4
On Wed, Jul 19, 2023 at 03:53:05PM +0200, Peter Zijlstra wrote:
> On Wed, Jul 19, 2023 at 09:45:00PM +0800, Aaron Lu wrote:
> > I'll rebase this per-node patch on top of below diff.
> 
> Oh, please double check I didn't wreck anything. I skipped 'converting'
> the find_*_bit() functions because the users of those iterators might be
> expecting MAX_NUMNODES when not found.
>

Sounds like more work than I had expected :)

> But perhaps there's more I overlooked.

Got it, will see if I can make it work.
  
Yury Norov July 19, 2023, 3:59 p.m. UTC | #5
On Wed, Jul 19, 2023 at 01:53:58PM +0200, Peter Zijlstra wrote:
> On Tue, Jul 18, 2023 at 09:41:18PM +0800, Aaron Lu wrote:
> > +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
> > +static inline long tg_load_avg(struct task_group *tg)
> > +{
> > +	long load_avg = 0;
> > +	int i;
> > +
> > +	/*
> > +	 * The only path that can give us a root_task_group
> > +	 * here is from print_cfs_rq() thus unlikely.
> > +	 */
> > +	if (unlikely(tg == &root_task_group))
> > +		return 0;
> > +
> > +	for_each_node(i)
> > +		load_avg += atomic_long_read(&tg->node_info[i]->load_avg);
> > +
> > +	return load_avg;
> > +}
> > +#endif
> 
> So I was working on something else numa and noticed that for_each_node()
> (and most of the nodemask stuff) is quite moronic, afaict we should do
> something like the below.
> 
> I now see Mike added the nr_node_ids thing fairly recent, but given
> distros have NODES_SHIFT=10 and actual machines typically only have <=4
> nodes, this would save a factor of 256 scanning.
> 
> Specifically, your for_each_node() would scan the full 1024 bit bitmap
> looking for more bits that would never be there.
> 
> ---
> 
> diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
> index 8d07116caaf1..c23c0889b8cf 100644
> --- a/include/linux/nodemask.h
> +++ b/include/linux/nodemask.h
> @@ -109,7 +109,7 @@ extern nodemask_t _unused_nodemask_arg_;
>  				__nodemask_pr_bits(maskp)
>  static inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m)
>  {
> -	return m ? MAX_NUMNODES : 0;
> +	return m ? nr_node_ids : 0;
>  }
>  static inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m)
>  {
> @@ -137,13 +137,13 @@ static inline void __node_clear(int node, volatile nodemask_t *dstp)
>  	clear_bit(node, dstp->bits);
>  }
>  
> -#define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES)
> +#define nodes_setall(dst) __nodes_setall(&(dst), nr_node_ids)
>  static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits)
>  {
>  	bitmap_fill(dstp->bits, nbits);
>  }
>  
> -#define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES)
> +#define nodes_clear(dst) __nodes_clear(&(dst), nr_node_ids)
>  static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits)
>  {
>  	bitmap_zero(dstp->bits, nbits);
> @@ -160,7 +160,7 @@ static inline bool __node_test_and_set(int node, nodemask_t *addr)
>  }
>  
>  #define nodes_and(dst, src1, src2) \
> -			__nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES)
> +			__nodes_and(&(dst), &(src1), &(src2), nr_node_ids)
>  static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p,
>  					const nodemask_t *src2p, unsigned int nbits)
>  {

This would break the small_const_nbits() optimization for those configuring
their kernels properly. This is very similar to the cpumask and nr_cpu_ids
problem.

See 596ff4a09b8 ("cpumask: re-introduce constant-sized cpumask optimizations")
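
For illustration, a rough sketch of what that optimization buys (the my_*
names are made up; this is not the kernel's actual code): when the size is
a compile-time constant that fits in one long, the helper collapses to a
single-word operation, which a runtime nr_node_ids cannot do:

#define my_small_const_nbits(nbits) \
	(__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0)

static inline bool my_nodes_empty(const nodemask_t *srcp, unsigned int nbits)
{
	if (my_small_const_nbits(nbits))
		/* constant size: one masked word compare, no loop */
		return !(srcp->bits[0] & BITMAP_LAST_WORD_MASK(nbits));

	/* runtime size (e.g. nr_node_ids): generic word-by-word scan */
	return bitmap_empty(srcp->bits, nbits);
}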

Thanks,
Yury
  
Peter Zijlstra Aug. 2, 2023, 11:28 a.m. UTC | #6
On Wed, Jul 19, 2023 at 09:45:00PM +0800, Aaron Lu wrote:
> On Wed, Jul 19, 2023 at 01:53:58PM +0200, Peter Zijlstra wrote:
> > On Tue, Jul 18, 2023 at 09:41:18PM +0800, Aaron Lu wrote:
> > > +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
> > > +static inline long tg_load_avg(struct task_group *tg)
> > > +{
> > > +	long load_avg = 0;
> > > +	int i;
> > > +
> > > +	/*
> > > +	 * The only path that can give us a root_task_group
> > > +	 * here is from print_cfs_rq() thus unlikely.
> > > +	 */
> > > +	if (unlikely(tg == &root_task_group))
> > > +		return 0;
> > > +
> > > +	for_each_node(i)
> > > +		load_avg += atomic_long_read(&tg->node_info[i]->load_avg);
> > > +
> > > +	return load_avg;
> > > +}
> > > +#endif
> > 
> > So I was working on something else numa and noticed that for_each_node()
> > (and most of the nodemask stuff) is quite moronic, afaict we should do
> > something like the below.
> > 
> > I now see Mike added the nr_node_ids thing fairly recent, but given
> > distros have NODES_SHIFT=10 and actual machines typically only have <=4
> > nodes, this would save a factor of 256 scanning.

More complete nodemask patch here:

  https://lkml.kernel.org/r/20230802112458.230221601%40infradead.org
  
Aaron Lu Aug. 11, 2023, 9:48 a.m. UTC | #7
On Wed, Aug 02, 2023 at 01:28:36PM +0200, Peter Zijlstra wrote:
> On Wed, Jul 19, 2023 at 09:45:00PM +0800, Aaron Lu wrote:
> > On Wed, Jul 19, 2023 at 01:53:58PM +0200, Peter Zijlstra wrote:
> > > On Tue, Jul 18, 2023 at 09:41:18PM +0800, Aaron Lu wrote:
> > > > +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
> > > > +static inline long tg_load_avg(struct task_group *tg)
> > > > +{
> > > > +	long load_avg = 0;
> > > > +	int i;
> > > > +
> > > > +	/*
> > > > +	 * The only path that can give us a root_task_group
> > > > +	 * here is from print_cfs_rq() thus unlikely.
> > > > +	 */
> > > > +	if (unlikely(tg == &root_task_group))
> > > > +		return 0;
> > > > +
> > > > +	for_each_node(i)
> > > > +		load_avg += atomic_long_read(&tg->node_info[i]->load_avg);
> > > > +
> > > > +	return load_avg;
> > > > +}
> > > > +#endif
> > > 
> > > So I was working on something else numa and noticed that for_each_node()
> > > (and most of the nodemask stuff) is quite moronic, afaict we should do
> > > something like the below.
> > > 
> > > I now see Mike added the nr_node_ids thing fairly recent, but given
> > > distros have NODES_SHIFT=10 and actual machines typically only have <=4
> > > nodes, this would save a factor of 256 scanning.
> 
> More complete nodemask patch here:
> 
>   https://lkml.kernel.org/r/20230802112458.230221601%40infradead.org

Thanks for the update.

I incorporated this numa change and collected some data and found that
with the newly proposed approach of rate limiting updates to tg->load_avg
to at most once per ms, the cost of accessing tg->load_avg drops so much
that adding other optimizations doesn't make much difference.

So I was thinking maybe I just need that one ratelimit patch to reduce
the cost of accessing tg->load_avg. The detailed data is here:
https://lore.kernel.org/lkml/20230811092811.GA399195@ziqianlu-dell/
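
For reference, a rough sketch of that ratelimit idea (the field name
last_update_tg_load_avg and the clock used are assumptions here; the
actual patch is the one in the link above):

static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
	long delta;
	u64 now;

	/* No need to update load_avg for root_task_group, it is not used. */
	if (cfs_rq->tg == &root_task_group)
		return;

	/* Rate limit updates of the shared counter to once per ms. */
	now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
	if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC)
		return;

	delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
	if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
		atomic_long_add(delta, &cfs_rq->tg->load_avg);
		cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
		cfs_rq->last_update_tg_load_avg = now;
	}
}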
  

Patch

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 066ff1c8ae4e..3af965a18866 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -691,7 +691,7 @@  void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	SEQ_printf(m, "  .%-30s: %lu\n", "tg_load_avg_contrib",
 			cfs_rq->tg_load_avg_contrib);
 	SEQ_printf(m, "  .%-30s: %ld\n", "tg_load_avg",
-			atomic_long_read(&cfs_rq->tg->load_avg));
+			tg_load_avg(cfs_rq->tg));
 #endif
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0f913487928d..aceb8f5922cb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3496,7 +3496,7 @@  static long calc_group_shares(struct cfs_rq *cfs_rq)
 
 	load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
 
-	tg_weight = atomic_long_read(&tg->load_avg);
+	tg_weight = tg_load_avg(tg);
 
 	/* Ensure tg_weight >= load */
 	tg_weight -= cfs_rq->tg_load_avg_contrib;
@@ -3665,6 +3665,7 @@  static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 {
 	long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
+	int node = cpu_to_node(smp_processor_id());
 
 	/*
 	 * No need to update load_avg for root_task_group as it is not used.
@@ -3673,7 +3674,7 @@  static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 		return;
 
 	if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
-		atomic_long_add(delta, &cfs_rq->tg->load_avg);
+		atomic_long_add(delta, &cfs_rq->tg->node_info[node]->load_avg);
 		cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
 	}
 }
@@ -12439,7 +12440,7 @@  int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	struct sched_entity *se;
 	struct cfs_rq *cfs_rq;
-	int i;
+	int i, nodes;
 
 	tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
 	if (!tg->cfs_rq)
@@ -12468,8 +12469,30 @@  int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 		init_entity_runnable_average(se);
 	}
 
+#ifdef CONFIG_SMP
+	nodes = num_possible_nodes();
+	tg->node_info = kcalloc(nodes, sizeof(struct tg_node_info *), GFP_KERNEL);
+	if (!tg->node_info)
+		goto err_free;
+
+	for_each_node(i) {
+		tg->node_info[i] = kzalloc_node(sizeof(struct tg_node_info), GFP_KERNEL, i);
+		if (!tg->node_info[i])
+			goto err_free_node;
+	}
+#endif
+
 	return 1;
 
+#ifdef CONFIG_SMP
+err_free_node:
+	for_each_node(i) {
+		kfree(tg->node_info[i]);
+		if (!tg->node_info[i])
+			break;
+	}
+	kfree(tg->node_info);
+#endif
 err_free:
 	for_each_possible_cpu(i) {
 		kfree(tg->cfs_rq[i]);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 14dfaafb3a8f..9cece2dbc95b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -359,6 +359,17 @@  struct cfs_bandwidth {
 #endif
 };
 
+struct tg_node_info {
+	/*
+	 * load_avg can be heavily contended at clock tick time and task
+	 * enqueue/dequeue time, so put it in its own cacheline separated
+	 * from other fields.
+	 */
+	struct {
+		atomic_long_t		load_avg;
+	} ____cacheline_aligned_in_smp;
+};
+
 /* Task group related information */
 struct task_group {
 	struct cgroup_subsys_state css;
@@ -373,15 +384,8 @@  struct task_group {
 	/* A positive value indicates that this is a SCHED_IDLE group. */
 	int			idle;
 
-#ifdef	CONFIG_SMP
-	/*
-	 * load_avg can be heavily contended at clock tick time, so put
-	 * it in its own cacheline separated from the fields above which
-	 * will also be accessed at each tick.
-	 */
-	struct {
-		atomic_long_t		load_avg;
-	} ____cacheline_aligned_in_smp;
+#ifdef CONFIG_SMP
+	struct tg_node_info	**node_info;
 #endif
 #endif
 
@@ -413,9 +417,28 @@  struct task_group {
 	/* Effective clamp values used for a task group */
 	struct uclamp_se	uclamp[UCLAMP_CNT];
 #endif
-
 };
 
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+static inline long tg_load_avg(struct task_group *tg)
+{
+	long load_avg = 0;
+	int i;
+
+	/*
+	 * The only path that can give us a root_task_group
+	 * here is from print_cfs_rq() thus unlikely.
+	 */
+	if (unlikely(tg == &root_task_group))
+		return 0;
+
+	for_each_node(i)
+		load_avg += atomic_long_read(&tg->node_info[i]->load_avg);
+
+	return load_avg;
+}
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #define ROOT_TASK_GROUP_LOAD	NICE_0_LOAD