[RFC,13/15] sched/fair: Implement latency-nice

Message ID 20230531124604.477939524@infradead.org
State New
Series sched: EEVDF and latency-nice and/or slice-attr

Commit Message

Peter Zijlstra May 31, 2023, 11:58 a.m. UTC
  Implement latency-nice as a modulation of the EEVDF r_i parameter,
specifically apply the inverse sched_prio_to_weight[] relation on
base_slice.

Given a base slice of 3 [ms], this gives a range of:

  latency-nice  19: 3*1024 / 15    ~= 204.8 [ms]
  latency-nice -20: 3*1024 / 88761 ~= 0.034 [ms]

(which might not make sense)

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
---
 kernel/sched/core.c  |   14 ++++++++++----
 kernel/sched/fair.c  |   22 +++++++++++++++-------
 kernel/sched/sched.h |    2 ++
 3 files changed, 27 insertions(+), 11 deletions(-)
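
As a quick sanity check of the range quoted above (not part of the
patch), the computation done by set_latency_fair() can be reproduced in
userspace. The sketch below assumes the end points of the kernel's
sched_prio_to_weight[] table (88761 for nice -20, 15 for nice 19) and
SCHED_FIXEDPOINT_SHIFT == 10:

	#include <stdio.h>
	#include <stdint.h>

	#define SCHED_FIXEDPOINT_SHIFT	10

	int main(void)
	{
		uint64_t base_ns    = 3000000ULL;	/* 3 [ms] base slice */
		uint32_t w_nice_p19 = 15;		/* sched_prio_to_weight[39] */
		uint32_t w_nice_m20 = 88761;		/* sched_prio_to_weight[0]  */

		/* mirrors: se->slice = div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight) */
		printf("latency-nice  19: %.3f ms\n",
		       (double)(base_ns << SCHED_FIXEDPOINT_SHIFT) / w_nice_p19 / 1e6);
		printf("latency-nice -20: %.3f ms\n",
		       (double)(base_ns << SCHED_FIXEDPOINT_SHIFT) / w_nice_m20 / 1e6);
		return 0;
	}

This reproduces the two extremes quoted above (the -20 case prints as
0.035 ms at three decimals).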
  

Comments

Vincent Guittot June 6, 2023, 2:54 p.m. UTC | #1
On Wed, 31 May 2023 at 14:47, Peter Zijlstra <peterz@infradead.org> wrote:
>
> Implement latency-nice as a modulation of the EEVDF r_i parameter,
> specifically apply the inverse sched_prio_to_weight[] relation on
> base_slice.
>
> Given a base slice of 3 [ms], this gives a range of:
>
>   latency-nice  19: 3*1024 / 15    ~= 204.8 [ms]
>   latency-nice -20: 3*1024 / 88761 ~= 0.034 [ms]

I have reread the publication. I have a question about

Theorem 1: The lag of any active client k in a steady system is
bounded as follows:
    -r_max < lag_k(d) < max(r_max, q)

and

Corollary 2: Consider a steady system and a client k such that no
request of client k is larger than a time quantum. Then at any time t,
the lag of client k is bounded as follows:
    -q < lag_k(t) < q

q being the time quantum a task can run for,
and r_max the maximum slice of an active task.

I wonder how this applies to us. What is our time quantum q? I guess
it's the tick, because the algorithm is assumed to re-evaluate which
task should run next every q interval in order to provide the fairness
bounds, IIUC. So I don't think we can assume a q shorter than the tick
(at least with the current implementation) unless we trigger some
additional interrupts.

Then, asking for a request shorter than the tick also means that the
scheduler must enqueue a new request (on behalf of the task) during
the tick and re-evaluate whether the task is still the one to be
scheduled now. So, similarly to q, the request size r should be at
least a tick in order to re-evaluate which task will run next after
the end of a request. In fact, the real limit is:

    r_i / w_i >= tick / Sum(w_j)

On an Arm64 system the tick is 4 ms long, and on arm32 it rises to 10 ms.

We can of course choose not to follow the assumptions made in the
publication, but then I wonder how we can rely on its theorems and
corollaries.
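
For a concrete feel of that limit (illustrative numbers, not from the
thread): take two runnable nice-0 tasks, so w_i = 1024 and
Sum(w_j) = 2048, on a 4 ms tick. Then

    r_i >= w_i * tick / Sum(w_j) = 1024 * 4 ms / 2048 = 2 ms

so at tick resolution a much shorter request, such as the 0.034 ms
extreme from the commit message, cannot actually be re-evaluated at the
end of each request.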

>
> (which might not make sense)
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
> ---
>  kernel/sched/core.c  |   14 ++++++++++----
>  kernel/sched/fair.c  |   22 +++++++++++++++-------
>  kernel/sched/sched.h |    2 ++
>  3 files changed, 27 insertions(+), 11 deletions(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1305,6 +1305,12 @@ static void set_load_weight(struct task_
>         }
>  }
>
> +static inline void set_latency_prio(struct task_struct *p, int prio)
> +{
> +       p->latency_prio = prio;
> +       set_latency_fair(&p->se, prio - MAX_RT_PRIO);
> +}
> +
>  #ifdef CONFIG_UCLAMP_TASK
>  /*
>   * Serializes updates of utilization clamp values
> @@ -4464,9 +4470,10 @@ static void __sched_fork(unsigned long c
>         p->se.nr_migrations             = 0;
>         p->se.vruntime                  = 0;
>         p->se.vlag                      = 0;
> -       p->se.slice                     = sysctl_sched_base_slice;
>         INIT_LIST_HEAD(&p->se.group_node);
>
> +       set_latency_prio(p, p->latency_prio);
> +
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>         p->se.cfs_rq                    = NULL;
>  #endif
> @@ -4718,8 +4725,7 @@ int sched_fork(unsigned long clone_flags
>
>                 p->prio = p->normal_prio = p->static_prio;
>                 set_load_weight(p, false);
> -
> -               p->latency_prio = NICE_TO_PRIO(0);
> +               set_latency_prio(p, NICE_TO_PRIO(0));
>
>                 /*
>                  * We don't need the reset flag anymore after the fork. It has
> @@ -7507,7 +7513,7 @@ static void __setscheduler_latency(struc
>                                    const struct sched_attr *attr)
>  {
>         if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE)
> -               p->latency_prio = NICE_TO_PRIO(attr->sched_latency_nice);
> +               set_latency_prio(p, NICE_TO_PRIO(attr->sched_latency_nice));
>  }
>
>  /*
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -952,6 +952,21 @@ int sched_update_scaling(void)
>  }
>  #endif
>
> +void set_latency_fair(struct sched_entity *se, int prio)
> +{
> +       u32 weight = sched_prio_to_weight[prio];
> +       u64 base = sysctl_sched_base_slice;
> +
> +       /*
> +        * For EEVDF the virtual time slope is determined by w_i (iow.
> +        * nice) while the request time r_i is determined by
> +        * latency-nice.
> +        *
> +        * Smaller request gets better latency.
> +        */
> +       se->slice = div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight);
> +}
> +
>  static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
>
>  /*
> @@ -964,13 +979,6 @@ static void update_deadline(struct cfs_r
>                 return;
>
>         /*
> -        * For EEVDF the virtual time slope is determined by w_i (iow.
> -        * nice) while the request time r_i is determined by
> -        * sysctl_sched_base_slice.
> -        */
> -       se->slice = sysctl_sched_base_slice;
> -
> -       /*
>          * EEVDF: vd_i = ve_i + r_i / w_i
>          */
>         se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2495,6 +2495,8 @@ extern unsigned int sysctl_numa_balancin
>  extern unsigned int sysctl_numa_balancing_hot_threshold;
>  #endif
>
> +extern void set_latency_fair(struct sched_entity *se, int prio);
> +
>  #ifdef CONFIG_SCHED_HRTICK
>
>  /*
>
>
  
Peter Zijlstra June 8, 2023, 10:34 a.m. UTC | #2
On Tue, Jun 06, 2023 at 04:54:13PM +0200, Vincent Guittot wrote:
> On Wed, 31 May 2023 at 14:47, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > Implement latency-nice as a modulation of the EEVDF r_i parameter,
> > specifically apply the inverse sched_prio_to_weight[] relation on
> > base_slice.
> >
> > Given a base slice of 3 [ms], this gives a range of:
> >
> >   latency-nice  19: 3*1024 / 15    ~= 204.8 [ms]
> >   latency-nice -20: 3*1024 / 88761 ~= 0.034 [ms]
> 
> I have reread the publication. I have a question about
> 
> Theorem 1: The lag of any active client k in a steady system is
> bounded as follows:
>     -r_max < lag_k(d) < max(r_max, q)
> 
> and
> 
> Corollary 2: Consider a steady system and a client k such that no
> request of client k is larger than a time quantum. Then at any time t,
> the lag of client k is bounded as follows:
>     -q < lag_k(t) < q
> 
> q being the time quantum a task can run for,
> and r_max the maximum slice of an active task.
> 
> I wonder how this applies to us. What is our time quantum q?

> I guess it's the tick, because the algorithm is assumed to re-evaluate
> which task should run next every q interval in order to provide the
> fairness bounds, IIUC. So I don't think we can assume a q shorter than
> the tick (at least with the current implementation) unless we trigger
> some additional interrupts.

Indeed, TICK_NSEC is our unit of accounting (unless HRTICK, but I've not
looked at that, it might not DTRT -- also, I still need to rewrite that
whole thing to not be so damn expensive).

> Then, asking for a request shorter than the tick also means that the
> scheduler must enqueue a new request (on behalf of the task) during
> the tick and re-evaluate whether the task is still the one to be
> scheduled now.

If there is no 'interrupt', we won't update time and the scheduler can't
do anything -- as you well know. The paper only requires (and we
slightly violate this) that the deadline be pushed forward. See the
comment in update_deadline().

Much like pure EDF without a combined CBS.
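
For reference, the "push forward the deadline" mentioned here is the
mechanism visible in the update_deadline() hunk of the patch: once an
entity has used up its current request, a new request of se->slice is
opened and the virtual deadline moves forward by r_i/w_i. Roughly (a
paraphrase of the surrounding function, not the verbatim kernel code):

	/* nothing to do while the current request is not exhausted */
	if ((s64)(se->vruntime - se->deadline) < 0)
		return;

	/* EEVDF: vd_i = ve_i + r_i / w_i -- open the next request */
	se->deadline = se->vruntime + calc_delta_fair(se->slice, se);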

> So, similarly to q, the request size r should be at least a tick
> in order to re-evaluate which task will run next after the end of a
> request. In fact, the real limit is: r_i / w_i >= tick / Sum(w_j)

> We can of course choose not to follow the assumptions made in the
> publication, but then I wonder how we can rely on its theorems and
> corollaries.

Again, I'm not entirely following; the corollaries take r_i < q into
account, which is where the max(r_max, q) term comes from.

You're right that r_i < q does not behave 'right', but it doesn't
invalidate the results. Note that if a task overshoots, it will build up
significant negative lag (right side of the tree) and won't be eligible
for its next actual period. This 'hole' in the schedule is then used to
make up for the extra time it used previously.

The much bigger problem with those bounds is this little caveat: 'in a
steady state'. They conveniently fail to consider the impact of
leave/join operations on the whole thing -- and that's a much more
interesting case.
  
Peter Zijlstra June 8, 2023, 12:44 p.m. UTC | #3
On Thu, Jun 08, 2023 at 12:34:58PM +0200, Peter Zijlstra wrote:
> > Then, asking for a request shorter than the tick also means that the
> > scheduler must enqueue a new request (on behalf of the task) during
> > the tick and re-evaluate whether the task is still the one to be
> > scheduled now.
> 
> If there is no 'interrupt', we won't update time and the scheduler can't
> do anything -- as you well know. The paper only requires (and we
> slightly violate this) that the deadline be pushed forward. See the
> comment in update_deadline().
> 
> Much like pure EDF without a combined CBS.
> 
> > So, similarly to q, the request size r should be at least a tick
> > in order to re-evaluate which task will run next after the end of a
> > request. In fact, the real limit is: r_i / w_i >= tick / Sum(w_j)
> 
> > We can of course choose not to follow the assumptions made in the
> > publication, but then I wonder how we can rely on its theorems and
> > corollaries.
> 
> Again, I'm not entirely following; the corollaries take r_i < q into
> account, which is where the max(r_max, q) term comes from.
> 
> You're right that r_i < q does not behave 'right', but it doesn't
> invalidate the results. Note that if a task overshoots, it will build up
> significant negative lag (right side of the tree) and won't be eligible
> for its next actual period. This 'hole' in the schedule is then used to
> make up for the extra time it used previously.

So notably, if your task *does* behave correctly and does not consume
the full request, then it will not build up (large) negative lag and
wakeup-preemption can make it go quickly on the next period.

This is where that FUDGE hack comes in, except I got it wrong; I think
it needs to be something like:

	if (delta / W >= vslice) {
		se->vlag += vslice;
		if (se->vlag > 0)
			se->vlag = 0;
	}

to ensure it can't gain time. It's still a gruesome hack, but at least
it shouldn't be able to game the system.
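
Rendered as a standalone helper, the idea above might look like the
sketch below. This is an illustration only; the helper name is made up,
and, as in the snippet above, 'delta' stands for the wall time the
entity spent off the runqueue and 'W' for the runqueue weight:

	/*
	 * If the entity slept for at least one period's worth of virtual
	 * time, give it back one vslice of lag, but clamp at zero so
	 * sleeping can never earn the task positive lag.
	 */
	static void fudge_sleeper_vlag(struct sched_entity *se,
				       u64 delta, unsigned long W, u64 vslice)
	{
		if (div64_ul(delta, W) >= vslice) {
			se->vlag += vslice;
			if (se->vlag > 0)
				se->vlag = 0;
		}
	}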
  
Benjamin Segall Oct. 11, 2023, 11:24 p.m. UTC | #4
Peter Zijlstra <peterz@infradead.org> writes:

> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -952,6 +952,21 @@ int sched_update_scaling(void)
>  }
>  #endif
>  
> +void set_latency_fair(struct sched_entity *se, int prio)
> +{
> +	u32 weight = sched_prio_to_weight[prio];
> +	u64 base = sysctl_sched_base_slice;
> +
> +	/*
> +	 * For EEVDF the virtual time slope is determined by w_i (iow.
> +	 * nice) while the request time r_i is determined by
> +	 * latency-nice.
> +	 *
> +	 * Smaller request gets better latency.
> +	 */
> +	se->slice = div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight);
> +}
> +
>  static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
>  
>  /*


This seems questionable in combination with the earlier changes that
make things use se->slice by itself as the expected time slice:


> @@ -6396,13 +6629,12 @@ static inline void unthrottle_offline_cf
>  static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
>  {
>  	struct sched_entity *se = &p->se;
> -	struct cfs_rq *cfs_rq = cfs_rq_of(se);
>  
>  	SCHED_WARN_ON(task_rq(p) != rq);
>  
>  	if (rq->cfs.h_nr_running > 1) {
> -		u64 slice = sched_slice(cfs_rq, se);
>  		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
> +		u64 slice = se->slice;
>  		s64 delta = slice - ran;
>  
>  		if (delta < 0) {
> @@ -12136,8 +12382,8 @@ static void rq_offline_fair(struct rq *r
>  static inline bool
>  __entity_slice_used(struct sched_entity *se, int min_nr_tasks)
>  {
> -	u64 slice = sched_slice(cfs_rq_of(se), se);
>  	u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
> +	u64 slice = se->slice;
>  
>  	return (rtime * min_nr_tasks > slice);
>  }
> @@ -12832,7 +13078,7 @@ static unsigned int get_rr_interval_fair
>  	 * idle runqueue:
>  	 */
>  	if (rq->cfs.load.weight)
> -		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
> +		rr_interval = NS_TO_JIFFIES(se->slice);
>  
>  	return rr_interval;
>  }

We probably do not want a task with normal weight and a low latency
weight (i.e. a high latency / latency-nice value) to end up with a very
high slice value for some of these. get_rr_interval_fair hardly matters,
since it's not really a number that exists, and CONFIG_SCHED_CORE isn't
updated for EEVDF at all, but HRTICK at least probably should be
updated. Having such a task run for 68 times the normal slice seems
likely to have far worse latency effects than any gains from the other
parts.
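
As a quick check of the "68 times" figure (my arithmetic, not from the
thread): the weight ratio between nice 0 and latency-nice 19 is
1024 / 15 ~= 68.3, so with a 3 ms base slice such a task gets
se->slice ~= 204.8 ms while a latency-nice 0 task keeps 3 ms.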
  

Patch

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1305,6 +1305,12 @@  static void set_load_weight(struct task_
 	}
 }
 
+static inline void set_latency_prio(struct task_struct *p, int prio)
+{
+	p->latency_prio = prio;
+	set_latency_fair(&p->se, prio - MAX_RT_PRIO);
+}
+
 #ifdef CONFIG_UCLAMP_TASK
 /*
  * Serializes updates of utilization clamp values
@@ -4464,9 +4470,10 @@  static void __sched_fork(unsigned long c
 	p->se.nr_migrations		= 0;
 	p->se.vruntime			= 0;
 	p->se.vlag			= 0;
-	p->se.slice			= sysctl_sched_base_slice;
 	INIT_LIST_HEAD(&p->se.group_node);
 
+	set_latency_prio(p, p->latency_prio);
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	p->se.cfs_rq			= NULL;
 #endif
@@ -4718,8 +4725,7 @@  int sched_fork(unsigned long clone_flags
 
 		p->prio = p->normal_prio = p->static_prio;
 		set_load_weight(p, false);
-
-		p->latency_prio = NICE_TO_PRIO(0);
+		set_latency_prio(p, NICE_TO_PRIO(0));
 
 		/*
 		 * We don't need the reset flag anymore after the fork. It has
@@ -7507,7 +7513,7 @@  static void __setscheduler_latency(struc
 				   const struct sched_attr *attr)
 {
 	if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE)
-		p->latency_prio = NICE_TO_PRIO(attr->sched_latency_nice);
+		set_latency_prio(p, NICE_TO_PRIO(attr->sched_latency_nice));
 }
 
 /*
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -952,6 +952,21 @@  int sched_update_scaling(void)
 }
 #endif
 
+void set_latency_fair(struct sched_entity *se, int prio)
+{
+	u32 weight = sched_prio_to_weight[prio];
+	u64 base = sysctl_sched_base_slice;
+
+	/*
+	 * For EEVDF the virtual time slope is determined by w_i (iow.
+	 * nice) while the request time r_i is determined by
+	 * latency-nice.
+	 *
+	 * Smaller request gets better latency.
+	 */
+	se->slice = div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight);
+}
+
 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
 
 /*
@@ -964,13 +979,6 @@  static void update_deadline(struct cfs_r
 		return;
 
 	/*
-	 * For EEVDF the virtual time slope is determined by w_i (iow.
-	 * nice) while the request time r_i is determined by
-	 * sysctl_sched_base_slice.
-	 */
-	se->slice = sysctl_sched_base_slice;
-
-	/*
 	 * EEVDF: vd_i = ve_i + r_i / w_i
 	 */
 	se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2495,6 +2495,8 @@  extern unsigned int sysctl_numa_balancin
 extern unsigned int sysctl_numa_balancing_hot_threshold;
 #endif
 
+extern void set_latency_fair(struct sched_entity *se, int prio);
+
 #ifdef CONFIG_SCHED_HRTICK
 
 /*