sched: async unthrottling for cfs bandwidth

Message ID: 20221017234750.454419-1-joshdon@google.com
State: New
Series: sched: async unthrottling for cfs bandwidth

Commit Message

Josh Don Oct. 17, 2022, 11:47 p.m. UTC
  CFS bandwidth currently distributes new runtime and unthrottles cfs_rq's
inline in an hrtimer callback. Runtime distribution is a per-cpu
operation, and unthrottling is a per-cgroup operation, since a tg walk
is required. On machines with a large number of cpus and large cgroup
hierarchies, this cpus*cgroups work can be too much to do in a single
hrtimer callback: since IRQs are disabled, hard lockups may easily occur.
Specifically, we've found this scalability issue on configurations with
256 cpus, O(1000) cgroups in the hierarchy being throttled, and high
memory bandwidth usage.

To fix this, we can instead unthrottle cfs_rq's asynchronously via a
CSD. Each cpu is responsible for unthrottling itself, thus sharding the
total work more fairly across the system, and avoiding hard lockups.

Signed-off-by: Josh Don <joshdon@google.com>
---
 kernel/sched/fair.c  | 119 +++++++++++++++++++++++++++++++++++++++----
 kernel/sched/sched.h |   9 ++++
 2 files changed, 119 insertions(+), 9 deletions(-)
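
For context, the sketch below is a minimal, hypothetical illustration of the generic
call_single_data (CSD) pattern the commit message relies on; it is not part of the
patch, and all demo_* names are made up. A per-cpu call_single_data_t is initialized
once with INIT_CSD(), and smp_call_function_single_async() later queues the callback
to run on the target cpu, so the caller returns immediately and each cpu does its own
share of the work.

        #include <linux/smp.h>
        #include <linux/percpu.h>

        struct demo_state {
                int pending;            /* hypothetical per-cpu "work to do" marker */
        };

        static DEFINE_PER_CPU(struct demo_state, demo_state);
        static DEFINE_PER_CPU(call_single_data_t, demo_csd);

        /* Runs on the target cpu, in IPI context (IRQs disabled). */
        static void demo_remote_work(void *info)
        {
                struct demo_state *s = info;

                s->pending = 0;         /* the target cpu handles its own work */
        }

        /* One-time setup, e.g. from an __init function. */
        static void demo_init_csds(void)
        {
                int cpu;

                for_each_possible_cpu(cpu)
                        INIT_CSD(&per_cpu(demo_csd, cpu), demo_remote_work,
                                 &per_cpu(demo_state, cpu));
        }

        /* Ask @cpu to do its share of the work; returns without waiting. */
        static void demo_kick_cpu(int cpu)
        {
                per_cpu(demo_state, cpu).pending = 1;
                smp_call_function_single_async(cpu, &per_cpu(demo_csd, cpu));
        }

The patch below applies the same pattern per-rq: rq->cfsb_csd plays the role of
demo_csd, and rq->cfsb_csd_list carries the cfs_rq's that the remote cpu should
unthrottle.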
  

Comments

kernel test robot Oct. 19, 2022, 2:43 a.m. UTC | #1
Hi Josh,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on tip/sched/core]
[also build test ERROR on tip/master linus/master v6.1-rc1 next-20221018]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Josh-Don/sched-async-unthrottling-for-cfs-bandwidth/20221018-074836
patch link:    https://lore.kernel.org/r/20221017234750.454419-1-joshdon%40google.com
patch subject: [PATCH] sched: async unthrottling for cfs bandwidth
config: x86_64-randconfig-a001
compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/2d8aac2b02e92a201802872a33cb4f1c46ee54b9
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Josh-Don/sched-async-unthrottling-for-cfs-bandwidth/20221018-074836
        git checkout 2d8aac2b02e92a201802872a33cb4f1c46ee54b9
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   kernel/sched/fair.c:672:5: warning: no previous prototype for function 'sched_update_scaling' [-Wmissing-prototypes]
   int sched_update_scaling(void)
       ^
   kernel/sched/fair.c:672:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   int sched_update_scaling(void)
   ^
   static 
>> kernel/sched/fair.c:5236:28: error: no member named 'throttled_csd_list' in 'struct cfs_rq'; did you mean 'throttled_list'?
                   if (!list_empty(&cfs_rq->throttled_csd_list))
                                            ^~~~~~~~~~~~~~~~~~
                                            throttled_list
   kernel/sched/sched.h:648:19: note: 'throttled_list' declared here
           struct list_head        throttled_list;
                                   ^
>> kernel/sched/fair.c:5627:4: error: implicit declaration of function '__cfsb_csd_unthrottle' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
                           __cfsb_csd_unthrottle(rq);
                           ^
   1 warning and 2 errors generated.


vim +5236 kernel/sched/fair.c

  5213	
  5214	static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
  5215	{
  5216		struct cfs_rq *cfs_rq;
  5217		u64 runtime, remaining = 1;
  5218		bool throttled = false;
  5219	
  5220		rcu_read_lock();
  5221		list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
  5222					throttled_list) {
  5223			struct rq *rq = rq_of(cfs_rq);
  5224			struct rq_flags rf;
  5225	
  5226			if (!remaining) {
  5227				throttled = true;
  5228				break;
  5229			}
  5230	
  5231			rq_lock_irqsave(rq, &rf);
  5232			if (!cfs_rq_throttled(cfs_rq))
  5233				goto next;
  5234	
  5235			/* Already queued for async unthrottle */
> 5236			if (!list_empty(&cfs_rq->throttled_csd_list))
  5237				goto next;
  5238	
  5239			/* By the above checks, this should never be true */
  5240			SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
  5241	
  5242			raw_spin_lock(&cfs_b->lock);
  5243			runtime = -cfs_rq->runtime_remaining + 1;
  5244			if (runtime > cfs_b->runtime)
  5245				runtime = cfs_b->runtime;
  5246			cfs_b->runtime -= runtime;
  5247			remaining = cfs_b->runtime;
  5248			raw_spin_unlock(&cfs_b->lock);
  5249	
  5250			cfs_rq->runtime_remaining += runtime;
  5251	
  5252			/* we check whether we're throttled above */
  5253			if (cfs_rq->runtime_remaining > 0)
  5254				unthrottle_cfs_rq_async(cfs_rq);
  5255	
  5256	next:
  5257			rq_unlock_irqrestore(rq, &rf);
  5258		}
  5259		rcu_read_unlock();
  5260	
  5261		return throttled;
  5262	}
  5263
  
kernel test robot Oct. 19, 2022, 6:46 a.m. UTC | #2
Hi Josh,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on tip/sched/core]
[also build test ERROR on tip/master linus/master v6.1-rc1 next-20221019]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Josh-Don/sched-async-unthrottling-for-cfs-bandwidth/20221018-074836
patch link:    https://lore.kernel.org/r/20221017234750.454419-1-joshdon%40google.com
patch subject: [PATCH] sched: async unthrottling for cfs bandwidth
config: x86_64-randconfig-a013
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/2d8aac2b02e92a201802872a33cb4f1c46ee54b9
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Josh-Don/sched-async-unthrottling-for-cfs-bandwidth/20221018-074836
        git checkout 2d8aac2b02e92a201802872a33cb4f1c46ee54b9
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   kernel/sched/fair.c:672:5: warning: no previous prototype for 'sched_update_scaling' [-Wmissing-prototypes]
     672 | int sched_update_scaling(void)
         |     ^~~~~~~~~~~~~~~~~~~~
   kernel/sched/fair.c: In function 'distribute_cfs_runtime':
>> kernel/sched/fair.c:5236:42: error: 'struct cfs_rq' has no member named 'throttled_csd_list'; did you mean 'throttled_list'?
    5236 |                 if (!list_empty(&cfs_rq->throttled_csd_list))
         |                                          ^~~~~~~~~~~~~~~~~~
         |                                          throttled_list
   kernel/sched/fair.c: In function 'destroy_cfs_bandwidth':
>> kernel/sched/fair.c:5627:25: error: implicit declaration of function '__cfsb_csd_unthrottle' [-Werror=implicit-function-declaration]
    5627 |                         __cfsb_csd_unthrottle(rq);
         |                         ^~~~~~~~~~~~~~~~~~~~~
   cc1: some warnings being treated as errors


vim +5236 kernel/sched/fair.c

  5213	
  5214	static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
  5215	{
  5216		struct cfs_rq *cfs_rq;
  5217		u64 runtime, remaining = 1;
  5218		bool throttled = false;
  5219	
  5220		rcu_read_lock();
  5221		list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
  5222					throttled_list) {
  5223			struct rq *rq = rq_of(cfs_rq);
  5224			struct rq_flags rf;
  5225	
  5226			if (!remaining) {
  5227				throttled = true;
  5228				break;
  5229			}
  5230	
  5231			rq_lock_irqsave(rq, &rf);
  5232			if (!cfs_rq_throttled(cfs_rq))
  5233				goto next;
  5234	
  5235			/* Already queued for async unthrottle */
> 5236			if (!list_empty(&cfs_rq->throttled_csd_list))
  5237				goto next;
  5238	
  5239			/* By the above checks, this should never be true */
  5240			SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
  5241	
  5242			raw_spin_lock(&cfs_b->lock);
  5243			runtime = -cfs_rq->runtime_remaining + 1;
  5244			if (runtime > cfs_b->runtime)
  5245				runtime = cfs_b->runtime;
  5246			cfs_b->runtime -= runtime;
  5247			remaining = cfs_b->runtime;
  5248			raw_spin_unlock(&cfs_b->lock);
  5249	
  5250			cfs_rq->runtime_remaining += runtime;
  5251	
  5252			/* we check whether we're throttled above */
  5253			if (cfs_rq->runtime_remaining > 0)
  5254				unthrottle_cfs_rq_async(cfs_rq);
  5255	
  5256	next:
  5257			rq_unlock_irqrestore(rq, &rf);
  5258		}
  5259		rcu_read_unlock();
  5260	
  5261		return throttled;
  5262	}
  5263
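
Both build reports above point at the same !CONFIG_SMP gap: struct cfs_rq gains
throttled_csd_list, and fair.c gains __cfsb_csd_unthrottle(), only under CONFIG_SMP,
yet distribute_cfs_runtime() and destroy_cfs_bandwidth() reference them
unconditionally. A hypothetical sketch of one way to make the UP build compile again
(not a fix actually posted in this thread) is to guard the new references, for
example:

        /* In distribute_cfs_runtime(): only consult the CSD list on SMP. */
        #ifdef CONFIG_SMP
                /* Already queued for async unthrottle */
                if (!list_empty(&cfs_rq->throttled_csd_list))
                        goto next;
        #endif

        /*
         * For destroy_cfs_bandwidth(): either wrap the CSD drain loop in
         * #ifdef CONFIG_SMP too, or give UP builds a no-op stub so the call
         * compiles away. On UP the throttled_csd_count can never be non-zero,
         * since the UP variant of __unthrottle_cfs_rq_async() unthrottles
         * synchronously and never enqueues CSD work.
         */
        #ifndef CONFIG_SMP
        static inline void __cfsb_csd_unthrottle(void *arg) { }
        #endif

Another option would be to declare throttled_csd_list unconditionally in struct
cfs_rq; which shape the next revision takes is up to the author.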
  

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e4a0b8bd941c..b4c2c5728292 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5318,10 +5318,73 @@  void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 		resched_curr(rq);
 }
 
-static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
+#ifdef CONFIG_SMP
+static void __cfsb_csd_unthrottle(void *arg)
+{
+	struct rq *rq = arg;
+	struct rq_flags rf;
+	struct cfs_rq *cursor, *tmp;
+
+	rq_lock(rq, &rf);
+
+	list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
+				 throttled_csd_list) {
+		struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cursor->tg);
+
+		list_del_init(&cursor->throttled_csd_list);
+		atomic_dec(&cfs_b->throttled_csd_count);
+
+		if (cfs_rq_throttled(cursor))
+			unthrottle_cfs_rq(cursor);
+	}
+
+	rq_unlock(rq, &rf);
+}
+
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+	struct cfs_bandwidth *cfs_b;
+
+	if (rq == this_rq()) {
+		unthrottle_cfs_rq(cfs_rq);
+		return;
+	}
+
+	/* Already enqueued */
+	if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
+		return;
+
+	cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+
+	list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
+	atomic_inc(&cfs_b->throttled_csd_count);
+
+	smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
+}
+#else
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+	unthrottle_cfs_rq(cfs_rq);
+}
+#endif
+
+static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+	lockdep_assert_rq_held(rq_of(cfs_rq));
+
+	if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) ||
+	    cfs_rq->runtime_remaining <= 0))
+		return;
+
+	__unthrottle_cfs_rq_async(cfs_rq);
+}
+
+static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 {
 	struct cfs_rq *cfs_rq;
 	u64 runtime, remaining = 1;
+	bool throttled = false;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -5329,11 +5392,20 @@  static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 		struct rq *rq = rq_of(cfs_rq);
 		struct rq_flags rf;
 
+		if (!remaining) {
+			throttled = true;
+			break;
+		}
+
 		rq_lock_irqsave(rq, &rf);
 		if (!cfs_rq_throttled(cfs_rq))
 			goto next;
 
-		/* By the above check, this should never be true */
+		/* Already queued for async unthrottle */
+		if (!list_empty(&cfs_rq->throttled_csd_list))
+			goto next;
+
+		/* By the above checks, this should never be true */
 		SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
 
 		raw_spin_lock(&cfs_b->lock);
@@ -5348,15 +5420,14 @@  static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 
 		/* we check whether we're throttled above */
 		if (cfs_rq->runtime_remaining > 0)
-			unthrottle_cfs_rq(cfs_rq);
+			unthrottle_cfs_rq_async(cfs_rq);
 
 next:
 		rq_unlock_irqrestore(rq, &rf);
-
-		if (!remaining)
-			break;
 	}
 	rcu_read_unlock();
+
+	return throttled;
 }
 
 /*
@@ -5401,10 +5472,8 @@  static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
 	while (throttled && cfs_b->runtime > 0) {
 		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
 		/* we can't nest cfs_b->lock while distributing bandwidth */
-		distribute_cfs_runtime(cfs_b);
+		throttled = distribute_cfs_runtime(cfs_b);
 		raw_spin_lock_irqsave(&cfs_b->lock, flags);
-
-		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 	}
 
 	/*
@@ -5675,12 +5744,16 @@  void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
 	cfs_b->slack_started = false;
+	atomic_set(&cfs_b->throttled_csd_count, 0);
 }
 
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->runtime_enabled = 0;
 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
+#ifdef CONFIG_SMP
+	INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
+#endif
 }
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -5703,6 +5776,29 @@  static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 
 	hrtimer_cancel(&cfs_b->period_timer);
 	hrtimer_cancel(&cfs_b->slack_timer);
+
+	/*
+	 * It is possible that we still have some cfs_rq's pending on the CSD
+	 * list, but this race is very rare. In order for this to occur, we must
+	 * have raced with the last task leaving the group while there exist throttled
+	 * cfs_rq(s), and the period_timer must have queued the CSD item but the remote
+	 * cpu has not yet processed it. In this case, we can simply process all CSD
+	 * work inline here.
+	 */
+	if (unlikely(atomic_read(&cfs_b->throttled_csd_count) > 0)) {
+		unsigned long flags;
+		int i;
+
+		for_each_possible_cpu(i) {
+			struct rq *rq = cpu_rq(i);
+
+			local_irq_save(flags);
+			__cfsb_csd_unthrottle(rq);
+			local_irq_restore(flags);
+		}
+
+		SCHED_WARN_ON(atomic_read(&cfs_b->throttled_csd_count) > 0);
+	}
 }
 
 /*
@@ -12237,6 +12333,11 @@  __init void init_sched_fair_class(void)
 	for_each_possible_cpu(i) {
 		zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
 		zalloc_cpumask_var_node(&per_cpu(select_rq_mask,    i), GFP_KERNEL, cpu_to_node(i));
+
+#ifdef CONFIG_CFS_BANDWIDTH
+		INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
+		INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list);
+#endif
 	}
 
 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1644242ecd11..e6f505f8c351 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -355,6 +355,7 @@  struct cfs_bandwidth {
 	struct hrtimer		period_timer;
 	struct hrtimer		slack_timer;
 	struct list_head	throttled_cfs_rq;
+	atomic_t		throttled_csd_count;
 
 	/* Statistics: */
 	int			nr_periods;
@@ -645,6 +646,9 @@  struct cfs_rq {
 	int			throttled;
 	int			throttle_count;
 	struct list_head	throttled_list;
+#ifdef CONFIG_SMP
+	struct list_head	throttled_csd_list;
+#endif
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 };
@@ -1144,6 +1148,11 @@  struct rq {
 	unsigned int		core_forceidle_occupation;
 	u64			core_forceidle_start;
 #endif
+
+#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP)
+	call_single_data_t	cfsb_csd;
+	struct list_head	cfsb_csd_list;
+#endif
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED