[2/2] sched: add throttled time stat for throttled children
Commit Message
We currently export the total throttled time for cgroups that are given
a bandwidth limit. This patch extends this accounting to also track the
total time that each child cgroup has been throttled.

This is useful for understanding the degree to which children have been
affected by the throttling control. Children that are not runnable
during the entire throttled period, for example, will not show any
self-throttling time during this period.
Signed-off-by: Josh Don <joshdon@google.com>
---
kernel/sched/core.c | 21 +++++++++++++++++++--
kernel/sched/fair.c | 17 +++++++++++++++++
kernel/sched/sched.h | 2 ++
3 files changed, 38 insertions(+), 2 deletions(-)
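For reference, a minimal userspace sketch (not part of the patch) of how the
proposed stat could be consumed: it reads a cgroup v2 cpu.stat file and picks
out the throttled_self_usec key as exported by this version of the patch. The
cgroup path is illustrative, and per the discussion below, v2 is expected to
move this value into a separate cpu.stat.local file instead.

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Example cgroup path; adjust to the cgroup of interest. */
	FILE *f = fopen("/sys/fs/cgroup/mygroup/cpu.stat", "r");
	char key[64];
	unsigned long long val;

	if (!f) {
		perror("fopen");
		return 1;
	}

	/* cpu.stat is a flat "key value" file, one entry per line. */
	while (fscanf(f, "%63s %llu", key, &val) == 2) {
		if (!strcmp(key, "throttled_self_usec"))
			printf("self-throttled time: %llu usec\n", val);
	}

	fclose(f);
	return 0;
}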
Comments
Hi Josh,
kernel test robot noticed the following build errors:
[auto build test ERROR on tip/sched/core]
[also build test ERROR on tip/master tip/auto-latest linus/master v6.4-rc2 next-20230518]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patches, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Josh-Don/sched-add-throttled-time-stat-for-throttled-children/20230518-095541
base: tip/sched/core
patch link: https://lore.kernel.org/r/20230518013414.3053254-2-joshdon%40google.com
patch subject: [PATCH 2/2] sched: add throttled time stat for throttled children
config: mips-decstation_defconfig
compiler: mipsel-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/557f81b701a0759daea26b798b7c00ac5aa027ae
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Josh-Don/sched-add-throttled-time-stat-for-throttled-children/20230518-095541
git checkout 557f81b701a0759daea26b798b7c00ac5aa027ae
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=mips olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=mips SHELL=/bin/bash
If you fix the issue, kindly add the following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202305181921.IsyatAmG-lkp@intel.com/
All errors (new ones prefixed by >>):
kernel/sched/fair.c: In function 'enqueue_entity':
kernel/sched/fair.c:4880:36: error: 'struct cfs_rq' has no member named 'throttled_clock'
4880 | if (!cfs_rq->throttled_clock)
| ^~
kernel/sched/fair.c:4881:39: error: 'struct cfs_rq' has no member named 'throttled_clock'
4881 | cfs_rq->throttled_clock = rq_clock(rq);
| ^~
>> kernel/sched/fair.c:4882:36: error: 'struct cfs_rq' has no member named 'throttled_clock_self'
4882 | if (!cfs_rq->throttled_clock_self)
| ^~
kernel/sched/fair.c:4883:39: error: 'struct cfs_rq' has no member named 'throttled_clock_self'
4883 | cfs_rq->throttled_clock_self = rq_clock(rq);
| ^~
kernel/sched/fair.c: At top level:
kernel/sched/fair.c:6198:6: warning: no previous prototype for 'init_cfs_bandwidth' [-Wmissing-prototypes]
6198 | void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
| ^~~~~~~~~~~~~~~~~~
kernel/sched/fair.c:12634:6: warning: no previous prototype for 'free_fair_sched_group' [-Wmissing-prototypes]
12634 | void free_fair_sched_group(struct task_group *tg) { }
| ^~~~~~~~~~~~~~~~~~~~~
kernel/sched/fair.c:12636:5: warning: no previous prototype for 'alloc_fair_sched_group' [-Wmissing-prototypes]
12636 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
| ^~~~~~~~~~~~~~~~~~~~~~
kernel/sched/fair.c:12641:6: warning: no previous prototype for 'online_fair_sched_group' [-Wmissing-prototypes]
12641 | void online_fair_sched_group(struct task_group *tg) { }
| ^~~~~~~~~~~~~~~~~~~~~~~
kernel/sched/fair.c:12643:6: warning: no previous prototype for 'unregister_fair_sched_group' [-Wmissing-prototypes]
12643 | void unregister_fair_sched_group(struct task_group *tg) { }
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~
vim +4882 kernel/sched/fair.c
4792
4793 /*
4794 * MIGRATION
4795 *
4796 * dequeue
4797 * update_curr()
4798 * update_min_vruntime()
4799 * vruntime -= min_vruntime
4800 *
4801 * enqueue
4802 * update_curr()
4803 * update_min_vruntime()
4804 * vruntime += min_vruntime
4805 *
4806 * this way the vruntime transition between RQs is done when both
4807 * min_vruntime are up-to-date.
4808 *
4809 * WAKEUP (remote)
4810 *
4811 * ->migrate_task_rq_fair() (p->state == TASK_WAKING)
4812 * vruntime -= min_vruntime
4813 *
4814 * enqueue
4815 * update_curr()
4816 * update_min_vruntime()
4817 * vruntime += min_vruntime
4818 *
4819 * this way we don't have the most up-to-date min_vruntime on the originating
4820 * CPU and an up-to-date min_vruntime on the destination CPU.
4821 */
4822
4823 static void
4824 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4825 {
4826 bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
4827 bool curr = cfs_rq->curr == se;
4828 struct rq *rq = rq_of(cfs_rq);
4829
4830 /*
4831 * If we're the current task, we must renormalise before calling
4832 * update_curr().
4833 */
4834 if (renorm && curr)
4835 se->vruntime += cfs_rq->min_vruntime;
4836
4837 update_curr(cfs_rq);
4838
4839 /*
4840 * Otherwise, renormalise after, such that we're placed at the current
4841 * moment in time, instead of some random moment in the past. Being
4842 * placed in the past could significantly boost this task to the
4843 * fairness detriment of existing tasks.
4844 */
4845 if (renorm && !curr)
4846 se->vruntime += cfs_rq->min_vruntime;
4847
4848 /*
4849 * When enqueuing a sched_entity, we must:
4850 * - Update loads to have both entity and cfs_rq synced with now.
4851 * - For group_entity, update its runnable_weight to reflect the new
4852 * h_nr_running of its group cfs_rq.
4853 * - For group_entity, update its weight to reflect the new share of
4854 * its group cfs_rq
4855 * - Add its new weight to cfs_rq->load.weight
4856 */
4857 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
4858 se_update_runnable(se);
4859 update_cfs_group(se);
4860 account_entity_enqueue(cfs_rq, se);
4861
4862 if (flags & ENQUEUE_WAKEUP)
4863 place_entity(cfs_rq, se, 0);
4864 /* Entity has migrated, no longer consider this task hot */
4865 if (flags & ENQUEUE_MIGRATED)
4866 se->exec_start = 0;
4867
4868 check_schedstat_required();
4869 update_stats_enqueue_fair(cfs_rq, se, flags);
4870 check_spread(cfs_rq, se);
4871 if (!curr)
4872 __enqueue_entity(cfs_rq, se);
4873 se->on_rq = 1;
4874
4875 if (cfs_rq->nr_running == 1) {
4876 check_enqueue_throttle(cfs_rq);
4877 if (!throttled_hierarchy(cfs_rq)) {
4878 list_add_leaf_cfs_rq(cfs_rq);
4879 } else {
4880 if (!cfs_rq->throttled_clock)
4881 cfs_rq->throttled_clock = rq_clock(rq);
> 4882 if (!cfs_rq->throttled_clock_self)
4883 cfs_rq->throttled_clock_self = rq_clock(rq);
4884 }
4885 }
4886 }
4887
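The errors above are consistent with a config where CONFIG_CFS_BANDWIDTH is
disabled: the throttled_clock* fields (including the new throttled_clock_self)
only exist in struct cfs_rq under that option, while the new references added
to enqueue_entity() are unconditional. A minimal sketch of one possible fix,
assuming that reading is correct (this is not the author's posted fix): guard
the bookkeeping the same way the fields are guarded in sched.h, and declare rq
inside the guarded block so it is not left unused on !CONFIG_CFS_BANDWIDTH
builds.

	/* Tail of enqueue_entity(), with the new bookkeeping guarded. */
	if (cfs_rq->nr_running == 1) {
		check_enqueue_throttle(cfs_rq);
		if (!throttled_hierarchy(cfs_rq)) {
			list_add_leaf_cfs_rq(cfs_rq);
		} else {
#ifdef CONFIG_CFS_BANDWIDTH
			struct rq *rq = rq_of(cfs_rq);

			if (!cfs_rq->throttled_clock)
				cfs_rq->throttled_clock = rq_clock(rq);
			if (!cfs_rq->throttled_clock_self)
				cfs_rq->throttled_clock_self = rq_clock(rq);
#endif
		}
	}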
Hello,
(cc'ing Johannes)
On Wed, May 17, 2023 at 06:34:14PM -0700, Josh Don wrote:
> We currently export the total throttled time for cgroups that are given
> a bandwidth limit. This patch extends this accounting to also track the
> total time that each child cgroup has been throttled.
>
> This is useful for understanding the degree to which children have been
> affected by the throttling control. Children that are not runnable
> during the entire throttled period, for example, will not show any
> self-throttling time during this period.
...
> @@ -11204,20 +11217,24 @@ static int cpu_extra_stat_show(struct seq_file *sf,
> {
> struct task_group *tg = css_tg(css);
> struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
> - u64 throttled_usec, burst_usec;
> + u64 throttled_usec, burst_usec, throttled_self_usec;
>
> throttled_usec = cfs_b->throttled_time;
> do_div(throttled_usec, NSEC_PER_USEC);
> + throttled_self_usec = throttled_time_self(tg);
> + do_div(throttled_self_usec, NSEC_PER_USEC);
> burst_usec = cfs_b->burst_time;
> do_div(burst_usec, NSEC_PER_USEC);
>
> seq_printf(sf, "nr_periods %d\n"
> "nr_throttled %d\n"
> "throttled_usec %llu\n"
> + "throttled_self_usec %llu\n"
This is fine in principle but I think it'd be better to keep it consistent
with how non-hierarchical events are in memory.events.local. ie. Can we
please add cpu.stat.local instead of adding the _self key to cpu.stat?
Thanks.
Hey Tejun,
On Tue, May 23, 2023 at 2:44 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> (cc'ing Johannes)
>
> On Wed, May 17, 2023 at 06:34:14PM -0700, Josh Don wrote:
> > We currently export the total throttled time for cgroups that are given
> > a bandwidth limit. This patch extends this accounting to also track the
> > total time that each child cgroup has been throttled.
> >
> > This is useful for understanding the degree to which children have been
> > affected by the throttling control. Children that are not runnable
> > during the entire throttled period, for example, will not show any
> > self-throttling time during this period.
> ...
> > @@ -11204,20 +11217,24 @@ static int cpu_extra_stat_show(struct seq_file *sf,
> > {
> > struct task_group *tg = css_tg(css);
> > struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
> > - u64 throttled_usec, burst_usec;
> > + u64 throttled_usec, burst_usec, throttled_self_usec;
> >
> > throttled_usec = cfs_b->throttled_time;
> > do_div(throttled_usec, NSEC_PER_USEC);
> > + throttled_self_usec = throttled_time_self(tg);
> > + do_div(throttled_self_usec, NSEC_PER_USEC);
> > burst_usec = cfs_b->burst_time;
> > do_div(burst_usec, NSEC_PER_USEC);
> >
> > seq_printf(sf, "nr_periods %d\n"
> > "nr_throttled %d\n"
> > "throttled_usec %llu\n"
> > + "throttled_self_usec %llu\n"
>
> This is fine in principle but I think it'd be better to keep it consistent
> with how non-hierarchical events are in memory.events.local. ie. Can we
> please add cpu.stat.local instead of adding the _self key to cpu.stat?
It seemed unfortunate to split it out into a separate interface, since
there wouldn't be much to put there, but I don't have a strong
objection to this. I'll go ahead and make that change for v2.
Best,
Josh
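A rough sketch of what that v2 change could look like, assuming a dedicated
show handler backing a new cpu.stat.local file; the function name, the key
name, and the wiring into the cpu controller's file definitions are
illustrative rather than the actual v2 implementation:

static int cpu_local_stat_show(struct seq_file *sf, void *v)
{
	struct task_group *tg = css_tg(seq_css(sf));
	u64 throttled_self_usec = throttled_time_self(tg);

	/* Only the non-hierarchical (self) throttled time is reported here. */
	do_div(throttled_self_usec, NSEC_PER_USEC);
	seq_printf(sf, "throttled_usec %llu\n", throttled_self_usec);

	return 0;
}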
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11064,6 +11064,18 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
return ret;
}
+static u64 throttled_time_self(struct task_group *tg)
+{
+ int i;
+ u64 total = 0;
+
+ for_each_possible_cpu(i) {
+ total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time);
+ }
+
+ return total;
+}
+
static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
{
struct task_group *tg = css_tg(seq_css(sf));
@@ -11072,6 +11084,7 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
+ seq_printf(sf, "throttled_time_self %llu\n", throttled_time_self(tg));
if (schedstat_enabled() && tg != &root_task_group) {
struct sched_statistics *stats;
@@ -11204,20 +11217,24 @@ static int cpu_extra_stat_show(struct seq_file *sf,
{
struct task_group *tg = css_tg(css);
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
- u64 throttled_usec, burst_usec;
+ u64 throttled_usec, burst_usec, throttled_self_usec;
throttled_usec = cfs_b->throttled_time;
do_div(throttled_usec, NSEC_PER_USEC);
+ throttled_self_usec = throttled_time_self(tg);
+ do_div(throttled_self_usec, NSEC_PER_USEC);
burst_usec = cfs_b->burst_time;
do_div(burst_usec, NSEC_PER_USEC);
seq_printf(sf, "nr_periods %d\n"
"nr_throttled %d\n"
"throttled_usec %llu\n"
+ "throttled_self_usec %llu\n"
"nr_bursts %d\n"
"burst_usec %llu\n",
cfs_b->nr_periods, cfs_b->nr_throttled,
- throttled_usec, cfs_b->nr_burst, burst_usec);
+ throttled_usec, throttled_self_usec, cfs_b->nr_burst,
+ burst_usec);
}
#endif
return 0;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4822,6 +4822,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
} else {
if (!cfs_rq->throttled_clock)
cfs_rq->throttled_clock = rq_clock(rq);
+ if (!cfs_rq->throttled_clock_self)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
}
}
}
@@ -5327,6 +5329,17 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
list_add_leaf_cfs_rq(cfs_rq);
}
+ if (cfs_rq->throttled_clock_self) {
+ u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+
+ cfs_rq->throttled_clock_self = 0;
+
+ if (SCHED_WARN_ON((s64)delta < 0))
+ delta = 0;
+
+ cfs_rq->throttled_clock_self_time += delta;
+ }
+
return 0;
}
@@ -5342,6 +5355,10 @@ static int tg_throttle_down(struct task_group *tg, void *data)
}
cfs_rq->throttle_count++;
+ SCHED_WARN_ON(cfs_rq->throttled_clock_self);
+ if (cfs_rq->nr_running)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
+
return 0;
}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -642,6 +642,8 @@ struct cfs_rq {
u64 throttled_clock;
u64 throttled_clock_pelt;
u64 throttled_clock_pelt_time;
+ u64 throttled_clock_self;
+ u64 throttled_clock_self_time;
int throttled;
int throttle_count;
struct list_head throttled_list;