[v10,6/9] sched/fair: Add sched group latency support

Message ID 20230113141234.260128-7-vincent.guittot@linaro.org
State: New
Series: Add latency priority for CFS class

Commit Message

Vincent Guittot Jan. 13, 2023, 2:12 p.m. UTC
  A task can set its latency priority with sched_setattr(), which is then
used to set the latency offset of its sched_entity, but sched group
entities still have the default latency offset value.

Add a latency.nice field to the cpu cgroup controller to set the latency
priority of the group, similar to sched_setattr(). The latency priority
is then used to set the latency offset of the group's sched_entities.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
---
 Documentation/admin-guide/cgroup-v2.rst | 10 +++++
 kernel/sched/core.c                     | 52 +++++++++++++++++++++++++
 kernel/sched/fair.c                     | 33 ++++++++++++++++
 kernel/sched/sched.h                    |  4 ++
 4 files changed, 99 insertions(+)
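
For illustration only (not part of the patch): setting a group's latency
priority from userspace amounts to writing a nice value into the new
cgroup file. A minimal sketch, assuming cgroup2 is mounted at
/sys/fs/cgroup and a child group named "mygroup" already exists:

/*
 * Illustrative sketch: mark a cgroup as latency sensitive by writing
 * to cpu.latency.nice. The mount point and group name are assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/mygroup/cpu.latency.nice";
	const char *val = "-20";	/* most latency sensitive */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, val, strlen(val)) < 0) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}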
  

Comments

Peter Zijlstra Feb. 21, 2023, 3:01 p.m. UTC | #1
On Fri, Jan 13, 2023 at 03:12:31PM +0100, Vincent Guittot wrote:

> +static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
> +				    struct cftype *cft)
> +{
> +	int prio, delta, last_delta = INT_MAX;
> +	s64 weight;
> +
> +	weight = css_tg(css)->latency_offset * NICE_LATENCY_WEIGHT_MAX;
> +	weight = div_s64(weight, get_sleep_latency(false));
> +
> +	/* Find the closest nice value to the current weight */

This comment isn't entirely accurate: since we only have the nice_write
interface below, this will be an exact match. The thing with weight is
that we first had the raw weight value interface, and then the nice
interface had to map random values back to a 'nice' value.

Arguably we can simply store the raw nice value in write and print it
out again here.

> +	for (prio = 0; prio < ARRAY_SIZE(sched_latency_to_weight); prio++) {
> +		delta = abs(sched_latency_to_weight[prio] - weight);
> +		if (delta >= last_delta)
> +			break;
> +		last_delta = delta;
> +	}
> +
> +	return LATENCY_TO_NICE(prio-1);
> +}
> +
> +static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
> +				     struct cftype *cft, s64 nice)
> +{
> +	s64 latency_offset;
> +	long weight;
> +	int idx;
> +
> +	if (nice < MIN_LATENCY_NICE || nice > MAX_LATENCY_NICE)
> +		return -ERANGE;
> +
> +	idx = NICE_TO_LATENCY(nice);
> +	idx = array_index_nospec(idx, LATENCY_NICE_WIDTH);
> +	weight = sched_latency_to_weight[idx];
> +
> +	latency_offset = weight * get_sleep_latency(false);
> +	latency_offset = div_s64(latency_offset, NICE_LATENCY_WEIGHT_MAX);
> +
> +	return sched_group_set_latency(css_tg(css), latency_offset);
> +}
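
A minimal sketch of the variant suggested above, assuming a hypothetical
latency_nice field added to struct task_group to remember the raw value
(no such field exists in this patch):

/*
 * Hypothetical variant: remember the raw nice value on write and echo
 * it back on read, instead of reverse-mapping the stored offset.
 * tg->latency_nice is an assumed new field, not part of this patch.
 */
static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
				     struct cftype *cft)
{
	return css_tg(css)->latency_nice;
}

static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
				      struct cftype *cft, s64 nice)
{
	struct task_group *tg = css_tg(css);
	s64 latency_offset;
	int idx, ret;

	if (nice < MIN_LATENCY_NICE || nice > MAX_LATENCY_NICE)
		return -ERANGE;

	/* Same nice -> offset mapping as in the patch. */
	idx = array_index_nospec(NICE_TO_LATENCY(nice), LATENCY_NICE_WIDTH);
	latency_offset = (s64)sched_latency_to_weight[idx] * get_sleep_latency(false);
	latency_offset = div_s64(latency_offset, NICE_LATENCY_WEIGHT_MAX);

	ret = sched_group_set_latency(tg, latency_offset);
	if (!ret)
		tg->latency_nice = nice;	/* raw value for read-back */
	return ret;
}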
  
Vincent Guittot Feb. 21, 2023, 3:32 p.m. UTC | #2
On Tue, 21 Feb 2023 at 16:01, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Jan 13, 2023 at 03:12:31PM +0100, Vincent Guittot wrote:
>
> > +static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
> > +                                 struct cftype *cft)
> > +{
> > +     int prio, delta, last_delta = INT_MAX;
> > +     s64 weight;
> > +
> > +     weight = css_tg(css)->latency_offset * NICE_LATENCY_WEIGHT_MAX;
> > +     weight = div_s64(weight, get_sleep_latency(false));
> > +
> > +     /* Find the closest nice value to the current weight */
>
> This comment isn't entirely accurate: since we only have the nice_write
> interface below, this will be an exact match. The thing with weight is
> that we first had the raw weight value interface, and then the nice
> interface had to map random values back to a 'nice' value.

Yes, there was a long discussion about the interface and, without any
simple raw value to share, we decided to only use latency_nice until we
find a generic metric.
>
> Arguably we can simply store the raw nice value in write and print it
> out again here.

Probably. I just wanted to avoid latency.nice being the main value
saved in the cgroup. But I suppose it could be OK to save it directly.

  

Patch

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 1b3ed1c3b3f1..c08424593e4a 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1121,6 +1121,16 @@  All time durations are in microseconds.
         values similar to the sched_setattr(2). This maximum utilization
         value is used to clamp the task specific maximum utilization clamp.
 
+  cpu.latency.nice
+	A read-write single value file which exists on non-root
+	cgroups.  The default is "0".
+
+	The nice value is in the range [-20, 19].
+
+	This interface file allows reading and setting latency using the
+	same values used by sched_setattr(2). The latency_nice of a group is
+	used to limit the impact of the latency_nice of a task outside the
+	group.
 
 
 Memory
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 402c8d622b76..6798c9a297d6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11007,6 +11007,47 @@  static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
 {
 	return sched_group_set_idle(css_tg(css), idle);
 }
+
+static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
+				    struct cftype *cft)
+{
+	int prio, delta, last_delta = INT_MAX;
+	s64 weight;
+
+	weight = css_tg(css)->latency_offset * NICE_LATENCY_WEIGHT_MAX;
+	weight = div_s64(weight, get_sleep_latency(false));
+
+	/* Find the closest nice value to the current weight */
+	for (prio = 0; prio < ARRAY_SIZE(sched_latency_to_weight); prio++) {
+		delta = abs(sched_latency_to_weight[prio] - weight);
+		if (delta >= last_delta)
+			break;
+		last_delta = delta;
+	}
+
+	return LATENCY_TO_NICE(prio-1);
+}
+
+static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
+				     struct cftype *cft, s64 nice)
+{
+	s64 latency_offset;
+	long weight;
+	int idx;
+
+	if (nice < MIN_LATENCY_NICE || nice > MAX_LATENCY_NICE)
+		return -ERANGE;
+
+	idx = NICE_TO_LATENCY(nice);
+	idx = array_index_nospec(idx, LATENCY_NICE_WIDTH);
+	weight = sched_latency_to_weight[idx];
+
+	latency_offset = weight * get_sleep_latency(false);
+	latency_offset = div_s64(latency_offset, NICE_LATENCY_WEIGHT_MAX);
+
+	return sched_group_set_latency(css_tg(css), latency_offset);
+}
+
 #endif
 
 static struct cftype cpu_legacy_files[] = {
@@ -11021,6 +11062,11 @@  static struct cftype cpu_legacy_files[] = {
 		.read_s64 = cpu_idle_read_s64,
 		.write_s64 = cpu_idle_write_s64,
 	},
+	{
+		.name = "latency.nice",
+		.read_s64 = cpu_latency_nice_read_s64,
+		.write_s64 = cpu_latency_nice_write_s64,
+	},
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	{
@@ -11238,6 +11284,12 @@  static struct cftype cpu_files[] = {
 		.read_s64 = cpu_idle_read_s64,
 		.write_s64 = cpu_idle_write_s64,
 	},
+	{
+		.name = "latency.nice",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_s64 = cpu_latency_nice_read_s64,
+		.write_s64 = cpu_latency_nice_write_s64,
+	},
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	{
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e87a863a2aa6..80ad27ddb4a1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12296,6 +12296,7 @@  int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 		goto err;
 
 	tg->shares = NICE_0_LOAD;
+	tg->latency_offset = 0;
 
 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
 
@@ -12394,6 +12395,9 @@  void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 	}
 
 	se->my_q = cfs_rq;
+
+	se->latency_offset = tg->latency_offset;
+
 	/* guarantee group entities always have weight */
 	update_load_set(&se->load, NICE_0_LOAD);
 	se->parent = parent;
@@ -12524,6 +12528,35 @@  int sched_group_set_idle(struct task_group *tg, long idle)
 	return 0;
 }
 
+int sched_group_set_latency(struct task_group *tg, s64 latency)
+{
+	int i;
+
+	if (tg == &root_task_group)
+		return -EINVAL;
+
+	if (abs(latency) > sysctl_sched_latency)
+		return -EINVAL;
+
+	mutex_lock(&shares_mutex);
+
+	if (tg->latency_offset == latency) {
+		mutex_unlock(&shares_mutex);
+		return 0;
+	}
+
+	tg->latency_offset = latency;
+
+	for_each_possible_cpu(i) {
+		struct sched_entity *se = tg->se[i];
+
+		WRITE_ONCE(se->latency_offset, latency);
+	}
+
+	mutex_unlock(&shares_mutex);
+	return 0;
+}
+
 #else /* CONFIG_FAIR_GROUP_SCHED */
 
 void free_fair_sched_group(struct task_group *tg) { }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fd099f3961e4..5a4ce8e61f47 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -383,6 +383,8 @@  struct task_group {
 
 	/* A positive value indicates that this is a SCHED_IDLE group. */
 	int			idle;
+	/* latency constraint of the group. */
+	int			latency_offset;
 
 #ifdef	CONFIG_SMP
 	/*
@@ -493,6 +495,8 @@  extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 
 extern int sched_group_set_idle(struct task_group *tg, long idle);
 
+extern int sched_group_set_latency(struct task_group *tg, s64 latency);
+
 #ifdef CONFIG_SMP
 extern void set_task_rq_fair(struct sched_entity *se,
 			     struct cfs_rq *prev, struct cfs_rq *next);
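
As a closing illustration of the review point above that the read side is
an exact inverse for values set through this file, a hedged userspace
round-trip check (the cgroup path is an assumption):

/*
 * Illustrative round-trip check, not part of the patch: every nice
 * value written to cpu.latency.nice should read back unchanged,
 * because the read side reverse-maps the stored offset through the
 * same sched_latency_to_weight[] table.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static long rw_latency_nice(const char *path, long nice)
{
	char buf[32];
	ssize_t n;
	int fd;

	fd = open(path, O_WRONLY);
	if (fd < 0) {
		perror("open for write");
		exit(1);
	}
	snprintf(buf, sizeof(buf), "%ld", nice);
	if (write(fd, buf, strlen(buf)) < 0) {
		perror("write");
		exit(1);
	}
	close(fd);

	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror("open for read");
		exit(1);
	}
	n = read(fd, buf, sizeof(buf) - 1);
	if (n < 0) {
		perror("read");
		exit(1);
	}
	buf[n] = '\0';
	close(fd);

	return strtol(buf, NULL, 10);
}

int main(void)
{
	const char *path = "/sys/fs/cgroup/mygroup/cpu.latency.nice";
	long nice;

	for (nice = -20; nice <= 19; nice++) {
		long got = rw_latency_nice(path, nice);

		if (got != nice)
			printf("mismatch: wrote %ld, read %ld\n", nice, got);
	}
	return 0;
}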