[2/2] sched: add throttled time stat for throttled children

Message ID: 20230518013414.3053254-2-joshdon@google.com
State: New
Series: [1/2] sched: don't account throttle time for empty groups

Commit Message

Josh Don May 18, 2023, 1:34 a.m. UTC
We currently export the total throttled time for cgroups that are given
a bandwidth limit. This patch extends this accounting to also account
for the total time that each child cgroup has been throttled.

This is useful for understanding the degree to which children have been
affected by the throttling control. Children that are not runnable
during the entire throttled period, for example, will not show any
self-throttling time during this period.

Signed-off-by: Josh Don <joshdon@google.com>
---
 kernel/sched/core.c  | 21 +++++++++++++++++++--
 kernel/sched/fair.c  | 17 +++++++++++++++++
 kernel/sched/sched.h |  2 ++
 3 files changed, 38 insertions(+), 2 deletions(-)
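
For a concrete picture of what the new statistic exposes, here is a small
usage sketch (hypothetical cgroup names, cgroup v2 mounted at /sys/fs/cgroup)
that puts a CPU quota on a parent group and then reads a child's cpu.stat.
A child with no quota of its own reports throttled_usec of 0, while the new
self field counts the time the child was runnable but held back by the
parent's throttling; note that the exact key name is revisited in the review
discussion below.

        # Limit the parent to 10ms of CPU per 100ms period (format: $MAX $PERIOD).
        echo "10000 100000" > /sys/fs/cgroup/parent/cpu.max

        # The existing throttled_usec (the group's own bandwidth limit)
        # alongside the new per-group self stat added by this patch:
        grep -E '^(throttled_usec|throttled_self_usec)' /sys/fs/cgroup/parent/child/cpu.stat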
  

Comments

kernel test robot May 18, 2023, 12:04 p.m. UTC | #1
Hi Josh,

kernel test robot noticed the following build errors:

[auto build test ERROR on tip/sched/core]
[also build test ERROR on tip/master tip/auto-latest linus/master v6.4-rc2 next-20230518]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Josh-Don/sched-add-throttled-time-stat-for-throttled-children/20230518-095541
base:   tip/sched/core
patch link:    https://lore.kernel.org/r/20230518013414.3053254-2-joshdon%40google.com
patch subject: [PATCH 2/2] sched: add throttled time stat for throttled children
config: mips-decstation_defconfig
compiler: mipsel-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/557f81b701a0759daea26b798b7c00ac5aa027ae
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Josh-Don/sched-add-throttled-time-stat-for-throttled-children/20230518-095541
        git checkout 557f81b701a0759daea26b798b7c00ac5aa027ae
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=mips olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=mips SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202305181921.IsyatAmG-lkp@intel.com/

All errors (new ones prefixed by >>):

   kernel/sched/fair.c: In function 'enqueue_entity':
   kernel/sched/fair.c:4880:36: error: 'struct cfs_rq' has no member named 'throttled_clock'
    4880 |                         if (!cfs_rq->throttled_clock)
         |                                    ^~
   kernel/sched/fair.c:4881:39: error: 'struct cfs_rq' has no member named 'throttled_clock'
    4881 |                                 cfs_rq->throttled_clock = rq_clock(rq);
         |                                       ^~
>> kernel/sched/fair.c:4882:36: error: 'struct cfs_rq' has no member named 'throttled_clock_self'
    4882 |                         if (!cfs_rq->throttled_clock_self)
         |                                    ^~
   kernel/sched/fair.c:4883:39: error: 'struct cfs_rq' has no member named 'throttled_clock_self'
    4883 |                                 cfs_rq->throttled_clock_self = rq_clock(rq);
         |                                       ^~
   kernel/sched/fair.c: At top level:
   kernel/sched/fair.c:6198:6: warning: no previous prototype for 'init_cfs_bandwidth' [-Wmissing-prototypes]
    6198 | void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
         |      ^~~~~~~~~~~~~~~~~~
   kernel/sched/fair.c:12634:6: warning: no previous prototype for 'free_fair_sched_group' [-Wmissing-prototypes]
   12634 | void free_fair_sched_group(struct task_group *tg) { }
         |      ^~~~~~~~~~~~~~~~~~~~~
   kernel/sched/fair.c:12636:5: warning: no previous prototype for 'alloc_fair_sched_group' [-Wmissing-prototypes]
   12636 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
         |     ^~~~~~~~~~~~~~~~~~~~~~
   kernel/sched/fair.c:12641:6: warning: no previous prototype for 'online_fair_sched_group' [-Wmissing-prototypes]
   12641 | void online_fair_sched_group(struct task_group *tg) { }
         |      ^~~~~~~~~~~~~~~~~~~~~~~
   kernel/sched/fair.c:12643:6: warning: no previous prototype for 'unregister_fair_sched_group' [-Wmissing-prototypes]
   12643 | void unregister_fair_sched_group(struct task_group *tg) { }
         |      ^~~~~~~~~~~~~~~~~~~~~~~~~~~


vim +4882 kernel/sched/fair.c

  4792	
  4793	/*
  4794	 * MIGRATION
  4795	 *
  4796	 *	dequeue
  4797	 *	  update_curr()
  4798	 *	    update_min_vruntime()
  4799	 *	  vruntime -= min_vruntime
  4800	 *
  4801	 *	enqueue
  4802	 *	  update_curr()
  4803	 *	    update_min_vruntime()
  4804	 *	  vruntime += min_vruntime
  4805	 *
  4806	 * this way the vruntime transition between RQs is done when both
  4807	 * min_vruntime are up-to-date.
  4808	 *
  4809	 * WAKEUP (remote)
  4810	 *
  4811	 *	->migrate_task_rq_fair() (p->state == TASK_WAKING)
  4812	 *	  vruntime -= min_vruntime
  4813	 *
  4814	 *	enqueue
  4815	 *	  update_curr()
  4816	 *	    update_min_vruntime()
  4817	 *	  vruntime += min_vruntime
  4818	 *
  4819	 * this way we don't have the most up-to-date min_vruntime on the originating
  4820	 * CPU and an up-to-date min_vruntime on the destination CPU.
  4821	 */
  4822	
  4823	static void
  4824	enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  4825	{
  4826		bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
  4827		bool curr = cfs_rq->curr == se;
  4828		struct rq *rq = rq_of(cfs_rq);
  4829	
  4830		/*
  4831		 * If we're the current task, we must renormalise before calling
  4832		 * update_curr().
  4833		 */
  4834		if (renorm && curr)
  4835			se->vruntime += cfs_rq->min_vruntime;
  4836	
  4837		update_curr(cfs_rq);
  4838	
  4839		/*
  4840		 * Otherwise, renormalise after, such that we're placed at the current
  4841		 * moment in time, instead of some random moment in the past. Being
  4842		 * placed in the past could significantly boost this task to the
  4843		 * fairness detriment of existing tasks.
  4844		 */
  4845		if (renorm && !curr)
  4846			se->vruntime += cfs_rq->min_vruntime;
  4847	
  4848		/*
  4849		 * When enqueuing a sched_entity, we must:
  4850		 *   - Update loads to have both entity and cfs_rq synced with now.
  4851		 *   - For group_entity, update its runnable_weight to reflect the new
  4852		 *     h_nr_running of its group cfs_rq.
  4853		 *   - For group_entity, update its weight to reflect the new share of
  4854		 *     its group cfs_rq
  4855		 *   - Add its new weight to cfs_rq->load.weight
  4856		 */
  4857		update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
  4858		se_update_runnable(se);
  4859		update_cfs_group(se);
  4860		account_entity_enqueue(cfs_rq, se);
  4861	
  4862		if (flags & ENQUEUE_WAKEUP)
  4863			place_entity(cfs_rq, se, 0);
  4864		/* Entity has migrated, no longer consider this task hot */
  4865		if (flags & ENQUEUE_MIGRATED)
  4866			se->exec_start = 0;
  4867	
  4868		check_schedstat_required();
  4869		update_stats_enqueue_fair(cfs_rq, se, flags);
  4870		check_spread(cfs_rq, se);
  4871		if (!curr)
  4872			__enqueue_entity(cfs_rq, se);
  4873		se->on_rq = 1;
  4874	
  4875		if (cfs_rq->nr_running == 1) {
  4876			check_enqueue_throttle(cfs_rq);
  4877			if (!throttled_hierarchy(cfs_rq)) {
  4878				list_add_leaf_cfs_rq(cfs_rq);
  4879			} else {
  4880				if (!cfs_rq->throttled_clock)
  4881					cfs_rq->throttled_clock = rq_clock(rq);
> 4882				if (!cfs_rq->throttled_clock_self)
  4883					cfs_rq->throttled_clock_self = rq_clock(rq);
  4884			}
  4885		}
  4886	}
  4887
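
The members the robot trips over only exist in struct cfs_rq when
CONFIG_CFS_BANDWIDTH is enabled, which the decstation defconfig above
evidently does not set. One possible shape of a fix (a sketch only, not
necessarily how the patch was actually respun) is to compile the new clock
updates conditionally, moving the rq local inside the guard (assuming it has
no other user in the function) so it does not become unused:

	if (cfs_rq->nr_running == 1) {
		check_enqueue_throttle(cfs_rq);
		if (!throttled_hierarchy(cfs_rq)) {
			list_add_leaf_cfs_rq(cfs_rq);
		} else {
#ifdef CONFIG_CFS_BANDWIDTH
			struct rq *rq = rq_of(cfs_rq);

			/*
			 * Record when this cfs_rq first became runnable
			 * inside a throttled hierarchy; cleared again on
			 * unthrottle.
			 */
			if (!cfs_rq->throttled_clock)
				cfs_rq->throttled_clock = rq_clock(rq);
			if (!cfs_rq->throttled_clock_self)
				cfs_rq->throttled_clock_self = rq_clock(rq);
#endif
		}
	}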
  
Tejun Heo May 23, 2023, 9:44 p.m. UTC | #2
Hello,

(cc'ing Johannes)

On Wed, May 17, 2023 at 06:34:14PM -0700, Josh Don wrote:
> We currently export the total throttled time for cgroups that are given
> a bandwidth limit. This patch extends this accounting to also account
> for the total time that each child cgroup has been throttled.
> 
> This is useful for understanding the degree to which children have been
> affected by the throttling control. Children that are not runnable
> during the entire throttled period, for example, will not show any
> self-throttling time during this period.
...
> @@ -11204,20 +11217,24 @@ static int cpu_extra_stat_show(struct seq_file *sf,
>  	{
>  		struct task_group *tg = css_tg(css);
>  		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
> -		u64 throttled_usec, burst_usec;
> +		u64 throttled_usec, burst_usec, throttled_self_usec;
>  
>  		throttled_usec = cfs_b->throttled_time;
>  		do_div(throttled_usec, NSEC_PER_USEC);
> +		throttled_self_usec = throttled_time_self(tg);
> +		do_div(throttled_self_usec, NSEC_PER_USEC);
>  		burst_usec = cfs_b->burst_time;
>  		do_div(burst_usec, NSEC_PER_USEC);
>  
>  		seq_printf(sf, "nr_periods %d\n"
>  			   "nr_throttled %d\n"
>  			   "throttled_usec %llu\n"
> +			   "throttled_self_usec %llu\n"

This is fine in principle, but I think it'd be better to keep it consistent
with how non-hierarchical events are exposed in memory.events.local. I.e.,
can we please add cpu.stat.local instead of adding the _self key to cpu.stat?

Thanks.
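
For reference, the memory controller precedent mentioned here keeps
hierarchical and purely local counts in two separate files rather than
mixing key suffixes within one file (paths below assume a cgroup v2 mount
at /sys/fs/cgroup and a hypothetical group "foo"):

        # Events including those raised in descendant cgroups:
        cat /sys/fs/cgroup/foo/memory.events
        # Events that happened to this cgroup itself:
        cat /sys/fs/cgroup/foo/memory.events.local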
  
Josh Don May 25, 2023, 12:06 a.m. UTC | #3
Hey Tejun,

On Tue, May 23, 2023 at 2:44 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> (cc'ing Johannes)
>
> On Wed, May 17, 2023 at 06:34:14PM -0700, Josh Don wrote:
> > We currently export the total throttled time for cgroups that are given
> > a bandwidth limit. This patch extends this accounting to also account
> > for the total time that each child cgroup has been throttled.
> >
> > This is useful for understanding the degree to which children have been
> > affected by the throttling control. Children that are not runnable
> > during the entire throttled period, for example, will not show any
> > self-throttling time during this period.
> ...
> > @@ -11204,20 +11217,24 @@ static int cpu_extra_stat_show(struct seq_file *sf,
> >       {
> >               struct task_group *tg = css_tg(css);
> >               struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
> > -             u64 throttled_usec, burst_usec;
> > +             u64 throttled_usec, burst_usec, throttled_self_usec;
> >
> >               throttled_usec = cfs_b->throttled_time;
> >               do_div(throttled_usec, NSEC_PER_USEC);
> > +             throttled_self_usec = throttled_time_self(tg);
> > +             do_div(throttled_self_usec, NSEC_PER_USEC);
> >               burst_usec = cfs_b->burst_time;
> >               do_div(burst_usec, NSEC_PER_USEC);
> >
> >               seq_printf(sf, "nr_periods %d\n"
> >                          "nr_throttled %d\n"
> >                          "throttled_usec %llu\n"
> > +                        "throttled_self_usec %llu\n"
>
> This is fine in principle, but I think it'd be better to keep it consistent
> with how non-hierarchical events are exposed in memory.events.local. I.e.,
> can we please add cpu.stat.local instead of adding the _self key to cpu.stat?

It seemed unfortunate to split it out into a separate interface, since
there wouldn't be much to put there, but I don't have a strong
objection to this. I'll go ahead and make that change for v2.

Best,
Josh
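
For orientation, a rough sketch of what the cpu.stat.local variant agreed on
above could look like, reusing throttled_time_self() from this patch (the
function and cftype names here are illustrative rather than taken from an
actual v2, and cpu.stat itself is produced via the cgroup core's extra-stat
callback, so the real hookup may well differ):

	static int cpu_local_stat_show(struct seq_file *sf, void *v)
	{
	#ifdef CONFIG_CFS_BANDWIDTH
		struct task_group *tg = css_tg(seq_css(sf));
		u64 throttled_self_usec = throttled_time_self(tg);

		do_div(throttled_self_usec, NSEC_PER_USEC);
		seq_printf(sf, "throttled_usec %llu\n", throttled_self_usec);
	#endif
		return 0;
	}

	/* Added alongside the existing cgroup v2 "cpu" interface files: */
	{
		.name		= "stat.local",
		.seq_show	= cpu_local_stat_show,
	},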
  

Patch

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a384344ec9ee..fcd702889fcb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11064,6 +11064,18 @@  static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
 	return ret;
 }
 
+static u64 throttled_time_self(struct task_group *tg)
+{
+	int i;
+	u64 total = 0;
+
+	for_each_possible_cpu(i) {
+		total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time);
+	}
+
+	return total;
+}
+
 static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
 {
 	struct task_group *tg = css_tg(seq_css(sf));
@@ -11072,6 +11084,7 @@  static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
 	seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
 	seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
 	seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
+	seq_printf(sf, "throttled_time_self %llu\n", throttled_time_self(tg));
 
 	if (schedstat_enabled() && tg != &root_task_group) {
 		struct sched_statistics *stats;
@@ -11204,20 +11217,24 @@  static int cpu_extra_stat_show(struct seq_file *sf,
 	{
 		struct task_group *tg = css_tg(css);
 		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
-		u64 throttled_usec, burst_usec;
+		u64 throttled_usec, burst_usec, throttled_self_usec;
 
 		throttled_usec = cfs_b->throttled_time;
 		do_div(throttled_usec, NSEC_PER_USEC);
+		throttled_self_usec = throttled_time_self(tg);
+		do_div(throttled_self_usec, NSEC_PER_USEC);
 		burst_usec = cfs_b->burst_time;
 		do_div(burst_usec, NSEC_PER_USEC);
 
 		seq_printf(sf, "nr_periods %d\n"
 			   "nr_throttled %d\n"
 			   "throttled_usec %llu\n"
+			   "throttled_self_usec %llu\n"
 			   "nr_bursts %d\n"
 			   "burst_usec %llu\n",
 			   cfs_b->nr_periods, cfs_b->nr_throttled,
-			   throttled_usec, cfs_b->nr_burst, burst_usec);
+			   throttled_usec, throttled_self_usec, cfs_b->nr_burst,
+			   burst_usec);
 	}
 #endif
 	return 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 85c2c0c3cab6..fd348b9421c1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4822,6 +4822,8 @@  enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		} else {
 			if (!cfs_rq->throttled_clock)
 				cfs_rq->throttled_clock = rq_clock(rq);
+			if (!cfs_rq->throttled_clock_self)
+				cfs_rq->throttled_clock_self = rq_clock(rq);
 		}
 	}
 }
@@ -5327,6 +5329,17 @@  static int tg_unthrottle_up(struct task_group *tg, void *data)
 			list_add_leaf_cfs_rq(cfs_rq);
 	}
 
+	if (cfs_rq->throttled_clock_self) {
+		u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+
+		cfs_rq->throttled_clock_self = 0;
+
+		if (SCHED_WARN_ON((s64)delta < 0))
+			delta = 0;
+
+		cfs_rq->throttled_clock_self_time += delta;
+	}
+
 	return 0;
 }
 
@@ -5342,6 +5355,10 @@  static int tg_throttle_down(struct task_group *tg, void *data)
 	}
 	cfs_rq->throttle_count++;
 
+	SCHED_WARN_ON(cfs_rq->throttled_clock_self);
+	if (cfs_rq->nr_running)
+		cfs_rq->throttled_clock_self = rq_clock(rq);
+
 	return 0;
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 060616944d7a..ddc48b2f1fd0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -642,6 +642,8 @@  struct cfs_rq {
 	u64			throttled_clock;
 	u64			throttled_clock_pelt;
 	u64			throttled_clock_pelt_time;
+	u64			throttled_clock_self;
+	u64			throttled_clock_self_time;
 	int			throttled;
 	int			throttle_count;
 	struct list_head	throttled_list;