[v10,6/9] sched/fair: Add sched group latency support
Commit Message
A task can set its latency priority with sched_setattr(), which is then used
to set the latency offset of its sched_entity, but sched group entities
still have the default latency offset value.
Add a latency.nice field to the cpu cgroup controller to set the latency
priority of the group, similarly to sched_setattr(). The latency priority
is then used to set the latency offset of the group's sched_entities.
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
---
Documentation/admin-guide/cgroup-v2.rst | 10 +++++
kernel/sched/core.c | 52 +++++++++++++++++++++++++
kernel/sched/fair.c | 33 ++++++++++++++++
kernel/sched/sched.h | 4 ++
4 files changed, 99 insertions(+)
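As a usage illustration (not part of the patch), a minimal userspace sketch
that writes the new knob for an existing cgroup could look as follows; the
cgroup path is hypothetical:

/* Hypothetical usage example: set cpu.latency.nice to -10 for a group.
 * The cgroup path below is an assumption; adjust it to your hierarchy.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/mygroup/cpu.latency.nice";
	const char *val = "-10";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, val, strlen(val)) < 0)
		perror("write");
	close(fd);
	return 0;
}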
Comments
On Fri, Jan 13, 2023 at 03:12:31PM +0100, Vincent Guittot wrote:
> +static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
> + struct cftype *cft)
> +{
> + int prio, delta, last_delta = INT_MAX;
> + s64 weight;
> +
> + weight = css_tg(css)->latency_offset * NICE_LATENCY_WEIGHT_MAX;
> + weight = div_s64(weight, get_sleep_latency(false));
> +
> + /* Find the closest nice value to the current weight */
This comment isn't entirely accurate: since we only have the nice_write
interface below, this will be an exact match. The thing with weight is
that we first had the raw weight value interface and then the nice
interface had to map random values back to a 'nice' value.
Arguably we can simply store the raw nice value in write and print it
out again here.
> + for (prio = 0; prio < ARRAY_SIZE(sched_latency_to_weight); prio++) {
> + delta = abs(sched_latency_to_weight[prio] - weight);
> + if (delta >= last_delta)
> + break;
> + last_delta = delta;
> + }
> +
> + return LATENCY_TO_NICE(prio-1);
> +}
> +
> +static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
> + struct cftype *cft, s64 nice)
> +{
> + s64 latency_offset;
> + long weight;
> + int idx;
> +
> + if (nice < MIN_LATENCY_NICE || nice > MAX_LATENCY_NICE)
> + return -ERANGE;
> +
> + idx = NICE_TO_LATENCY(nice);
> + idx = array_index_nospec(idx, LATENCY_NICE_WIDTH);
> + weight = sched_latency_to_weight[idx];
> +
> + latency_offset = weight * get_sleep_latency(false);
> + latency_offset = div_s64(latency_offset, NICE_LATENCY_WEIGHT_MAX);
> +
> + return sched_group_set_latency(css_tg(css), latency_offset);
> +}
On Tue, 21 Feb 2023 at 16:01, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Jan 13, 2023 at 03:12:31PM +0100, Vincent Guittot wrote:
>
> > +static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
> > + struct cftype *cft)
> > +{
> > + int prio, delta, last_delta = INT_MAX;
> > + s64 weight;
> > +
> > + weight = css_tg(css)->latency_offset * NICE_LATENCY_WEIGHT_MAX;
> > + weight = div_s64(weight, get_sleep_latency(false));
> > +
> > + /* Find the closest nice value to the current weight */
>
> This comment isn't entirely accurate: since we only have the nice_write
> interface below, this will be an exact match. The thing with weight is
> that we first had the raw weight value interface and then the nice
> interface had to map random values back to a 'nice' value.
Yes, there was a long discussion about the interface, and without any
simple raw value to share, we decided to only use latency_nice until
we find a generic metric.
>
> Arguably we can simply store the raw nice value in write and print it
> out again here.
Probably. I just wanted to prevent latency.nice from being the main
value saved in the cgroup. But I suppose it could be OK to save it
directly.
>
> > + for (prio = 0; prio < ARRAY_SIZE(sched_latency_to_weight); prio++) {
> > + delta = abs(sched_latency_to_weight[prio] - weight);
> > + if (delta >= last_delta)
> > + break;
> > + last_delta = delta;
> > + }
> > +
> > + return LATENCY_TO_NICE(prio-1);
> > +}
> > +
> > +static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
> > + struct cftype *cft, s64 nice)
> > +{
> > + s64 latency_offset;
> > + long weight;
> > + int idx;
> > +
> > + if (nice < MIN_LATENCY_NICE || nice > MAX_LATENCY_NICE)
> > + return -ERANGE;
> > +
> > + idx = NICE_TO_LATENCY(nice);
> > + idx = array_index_nospec(idx, LATENCY_NICE_WIDTH);
> > + weight = sched_latency_to_weight[idx];
> > +
> > + latency_offset = weight * get_sleep_latency(false);
> > + latency_offset = div_s64(latency_offset, NICE_LATENCY_WEIGHT_MAX);
> > +
> > + return sched_group_set_latency(css_tg(css), latency_offset);
> > +}
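A rough sketch of the alternative discussed above (storing the raw nice value
on write and returning it on read, instead of reverse-mapping the offset) might
look like the following; the tg->latency_nice field is hypothetical and not
part of this series:

/* Illustrative sketch only: keep the raw nice value in the task_group
 * so the read side is a plain copy-out rather than a reverse lookup of
 * the weight table. The latency_nice field is hypothetical.
 */
static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
				     struct cftype *cft)
{
	return css_tg(css)->latency_nice;	/* hypothetical field */
}

static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
				      struct cftype *cft, s64 nice)
{
	s64 latency_offset;
	long weight;
	int idx, ret;

	if (nice < MIN_LATENCY_NICE || nice > MAX_LATENCY_NICE)
		return -ERANGE;

	idx = array_index_nospec(NICE_TO_LATENCY(nice), LATENCY_NICE_WIDTH);
	weight = sched_latency_to_weight[idx];

	latency_offset = div_s64(weight * get_sleep_latency(false),
				 NICE_LATENCY_WEIGHT_MAX);

	ret = sched_group_set_latency(css_tg(css), latency_offset);
	if (!ret)
		css_tg(css)->latency_nice = nice;	/* hypothetical field */
	return ret;
}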
@@ -1121,6 +1121,16 @@ All time durations are in microseconds.
values similar to the sched_setattr(2). This maximum utilization
value is used to clamp the task specific maximum utilization clamp.
+ cpu.latency.nice
+ A read-write single value file which exists on non-root
+ cgroups. The default is "0".
+
+ The nice value is in the range [-20, 19].
+
+ This interface file allows reading and setting the latency nice value,
+ using the same values as sched_setattr(2). The latency_nice of a group
+ is used to limit the impact of the latency_nice of a task outside the
+ group.
Memory
@@ -11007,6 +11007,47 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
{
return sched_group_set_idle(css_tg(css), idle);
}
+
+static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ int prio, delta, last_delta = INT_MAX;
+ s64 weight;
+
+ weight = css_tg(css)->latency_offset * NICE_LATENCY_WEIGHT_MAX;
+ weight = div_s64(weight, get_sleep_latency(false));
+
+ /* Find the closest nice value to the current weight */
+ for (prio = 0; prio < ARRAY_SIZE(sched_latency_to_weight); prio++) {
+ delta = abs(sched_latency_to_weight[prio] - weight);
+ if (delta >= last_delta)
+ break;
+ last_delta = delta;
+ }
+
+ return LATENCY_TO_NICE(prio-1);
+}
+
+static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 nice)
+{
+ s64 latency_offset;
+ long weight;
+ int idx;
+
+ if (nice < MIN_LATENCY_NICE || nice > MAX_LATENCY_NICE)
+ return -ERANGE;
+
+ idx = NICE_TO_LATENCY(nice);
+ idx = array_index_nospec(idx, LATENCY_NICE_WIDTH);
+ weight = sched_latency_to_weight[idx];
+
+ latency_offset = weight * get_sleep_latency(false);
+ latency_offset = div_s64(latency_offset, NICE_LATENCY_WEIGHT_MAX);
+
+ return sched_group_set_latency(css_tg(css), latency_offset);
+}
+
#endif
static struct cftype cpu_legacy_files[] = {
@@ -11021,6 +11062,11 @@ static struct cftype cpu_legacy_files[] = {
.read_s64 = cpu_idle_read_s64,
.write_s64 = cpu_idle_write_s64,
},
+ {
+ .name = "latency.nice",
+ .read_s64 = cpu_latency_nice_read_s64,
+ .write_s64 = cpu_latency_nice_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
@@ -11238,6 +11284,12 @@ static struct cftype cpu_files[] = {
.read_s64 = cpu_idle_read_s64,
.write_s64 = cpu_idle_write_s64,
},
+ {
+ .name = "latency.nice",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = cpu_latency_nice_read_s64,
+ .write_s64 = cpu_latency_nice_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
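For reference, a small standalone model of the nice <-> offset conversion
implemented by the read/write handlers above shows why the value read back is
the exact nice value that was written, as noted in the review discussion. The
linear weight table and the 24 ms base sleep latency are assumptions for
illustration, not the kernel's actual values:

/* Standalone userspace model of the nice <-> offset round trip.
 * The linear weight table and the 24 ms base latency are assumptions;
 * the real values come from sched_latency_to_weight[] and
 * get_sleep_latency() in the scheduler.
 */
#include <stdio.h>
#include <stdlib.h>

#define WIDTH		40
#define WEIGHT_MAX	1024LL
#define BASE_NS		24000000LL	/* assumed sleep latency: 24 ms */

static long long table[WIDTH];		/* index 0 corresponds to nice -20 */

int main(void)
{
	long long offset, weight, delta, last_delta;
	int idx, prio, best;

	for (idx = 0; idx < WIDTH; idx++)
		table[idx] = (idx - 20) * WEIGHT_MAX / 20;

	for (idx = 0; idx < WIDTH; idx++) {
		/* write side: nice -> weight -> latency offset */
		offset = table[idx] * BASE_NS / WEIGHT_MAX;

		/* read side: offset -> weight -> closest table entry */
		weight = offset * WEIGHT_MAX / BASE_NS;
		last_delta = -1;
		best = 0;
		for (prio = 0; prio < WIDTH; prio++) {
			delta = llabs(table[prio] - weight);
			if (last_delta >= 0 && delta >= last_delta)
				break;
			last_delta = delta;
			best = prio;
		}
		printf("nice %3d -> offset %9lld ns -> nice %3d\n",
		       idx - 20, offset, best - 20);
	}
	return 0;
}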
@@ -12296,6 +12296,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
goto err;
tg->shares = NICE_0_LOAD;
+ tg->latency_offset = 0;
init_cfs_bandwidth(tg_cfs_bandwidth(tg));
@@ -12394,6 +12395,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
}
se->my_q = cfs_rq;
+
+ se->latency_offset = tg->latency_offset;
+
/* guarantee group entities always have weight */
update_load_set(&se->load, NICE_0_LOAD);
se->parent = parent;
@@ -12524,6 +12528,35 @@ int sched_group_set_idle(struct task_group *tg, long idle)
return 0;
}
+int sched_group_set_latency(struct task_group *tg, s64 latency)
+{
+ int i;
+
+ if (tg == &root_task_group)
+ return -EINVAL;
+
+ if (abs(latency) > sysctl_sched_latency)
+ return -EINVAL;
+
+ mutex_lock(&shares_mutex);
+
+ if (tg->latency_offset == latency) {
+ mutex_unlock(&shares_mutex);
+ return 0;
+ }
+
+ tg->latency_offset = latency;
+
+ for_each_possible_cpu(i) {
+ struct sched_entity *se = tg->se[i];
+
+ WRITE_ONCE(se->latency_offset, latency);
+ }
+
+ mutex_unlock(&shares_mutex);
+ return 0;
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
void free_fair_sched_group(struct task_group *tg) { }
@@ -383,6 +383,8 @@ struct task_group {
/* A positive value indicates that this is a SCHED_IDLE group. */
int idle;
+ /* latency constraint of the group. */
+ int latency_offset;
#ifdef CONFIG_SMP
/*
@@ -493,6 +495,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern int sched_group_set_idle(struct task_group *tg, long idle);
+extern int sched_group_set_latency(struct task_group *tg, s64 latency);
+
#ifdef CONFIG_SMP
extern void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next);