diff mbox series

[11/15] sched/eevdf: Better handle mixed slice length

Message ID	20230531124604.341527144@infradead.org
State	New
Headers	Received-SPF: pass (google.com: domain of linux-kernel-owner@vger.kernel.org designates 2620:137:e000::1:20 as permitted sender) client-ip=2620:137:e000::1:20; Message-ID: <20230531124604.341527144@infradead.org> User-Agent: quilt/0.66 Date: Wed, 31 May 2023 13:58:50 +0200 From: Peter Zijlstra <peterz@infradead.org> To: mingo@kernel.org, vincent.guittot@linaro.org Cc: linux-kernel@vger.kernel.org, peterz@infradead.org, juri.lelli@redhat.com, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, corbet@lwn.net, qyousef@layalina.io, chris.hyser@oracle.com, patrick.bellasi@matbug.net, pjt@google.com, pavel@ucw.cz, qperret@google.com, tim.c.chen@linux.intel.com, joshdon@google.com, timj@gnu.org, kprateek.nayak@amd.com, yu.c.chen@intel.com, youssefesmat@chromium.org, joel@joelfernandes.org, efault@gmx.de, tglx@linutronix.de Subject: [PATCH 11/15] sched/eevdf: Better handle mixed slice length References: <20230531115839.089944915@infradead.org> MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Precedence: bulk
Series	sched: EEVDF and latency-nice and/or slice-attr \| [00/15] sched: EEVDF and latency-nice and/or slice-attr [01/15] sched/fair: Add avg_vruntime [02/15] sched/fair: Remove START_DEBIT [03/15] sched/fair: Add lag based placement [04/15] rbtree: Add rb_add_augmented_cached() helper [05/15] sched/fair: Implement an EEVDF like policy [06/15] sched: Commit to lag based placement [07/15] sched/smp: Use lag to simplify cross-runqueue placement [08/15] sched: Commit to EEVDF [09/15] sched/debug: Rename min_granularity to base_slice [10/15] sched/fair: Propagate enqueue flags into place_entity() [11/15] sched/eevdf: Better handle mixed slice length [RFC,12/15] sched: Introduce latency-nice as a per-task attribute [RFC,13/15] sched/fair: Implement latency-nice [RFC,14/15] sched/fair: Add sched group latency support [RFC,15/15] sched/eevdf: Use sched_attr::sched_runtime to set request/slice

Commit Message

Peter Zijlstra May 31, 2023, 11:58 a.m. UTC

  In the case where (due to latency-nice) there are different request
sizes in the tree, the smaller requests tend to be dominated by the
larger. Also note how the EEVDF lag limits are based on r_max.

Therefore; add a heuristic that for the mixed request size case, moves
smaller requests to placement strategy #2 which ensures they're
immidiately eligible and and due to their smaller (virtual) deadline
will cause preemption.

NOTE: this relies on update_entity_lag() to impose lag limits above
a single slice.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/fair.c     |   30 ++++++++++++++++++++++++++++++
 kernel/sched/features.h |    1 +
 kernel/sched/sched.h    |    1 +
 3 files changed, 32 insertions(+)

Comments

Vincent Guittot June 2, 2023, 1:45 p.m. UTC | #1

On Wed, 31 May 2023 at 14:47, Peter Zijlstra <peterz@infradead.org> wrote:
>
> In the case where (due to latency-nice) there are different request
> sizes in the tree, the smaller requests tend to be dominated by the
> larger. Also note how the EEVDF lag limits are based on r_max.
>
> Therefore; add a heuristic that for the mixed request size case, moves
> smaller requests to placement strategy #2 which ensures they're
> immidiately eligible and and due to their smaller (virtual) deadline
> will cause preemption.
>
> NOTE: this relies on update_entity_lag() to impose lag limits above
> a single slice.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  kernel/sched/fair.c     |   30 ++++++++++++++++++++++++++++++
>  kernel/sched/features.h |    1 +
>  kernel/sched/sched.h    |    1 +
>  3 files changed, 32 insertions(+)
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -642,6 +642,7 @@ avg_vruntime_add(struct cfs_rq *cfs_rq,
>         s64 key = entity_key(cfs_rq, se);
>
>         cfs_rq->avg_vruntime += key * weight;
> +       cfs_rq->avg_slice += se->slice * weight;
>         cfs_rq->avg_load += weight;
>  }
>
> @@ -652,6 +653,7 @@ avg_vruntime_sub(struct cfs_rq *cfs_rq,
>         s64 key = entity_key(cfs_rq, se);
>
>         cfs_rq->avg_vruntime -= key * weight;
> +       cfs_rq->avg_slice -= se->slice * weight;
>         cfs_rq->avg_load -= weight;
>  }
>
> @@ -4908,6 +4910,21 @@ static inline void update_misfit_status(
>
>  #endif /* CONFIG_SMP */
>
> +static inline bool
> +entity_has_slept(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> +{
> +       u64 now;
> +
> +       if (!(flags & ENQUEUE_WAKEUP))
> +               return false;
> +
> +       if (flags & ENQUEUE_MIGRATED)
> +               return true;
> +
> +       now = rq_clock_task(rq_of(cfs_rq));
> +       return (s64)(se->exec_start - now) >= se->slice;
> +}
> +
>  static void
>  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  {
> @@ -4930,6 +4947,19 @@ place_entity(struct cfs_rq *cfs_rq, stru
>                 lag = se->vlag;
>
>                 /*
> +                * For latency sensitive tasks; those that have a shorter than
> +                * average slice and do not fully consume the slice, transition
> +                * to EEVDF placement strategy #2.
> +                */
> +               if (sched_feat(PLACE_FUDGE) &&
> +                   (cfs_rq->avg_slice > se->slice * cfs_rq->avg_load) &&
> +                   entity_has_slept(cfs_rq, se, flags)) {
> +                       lag += vslice;
> +                       if (lag > 0)
> +                               lag = 0;

This PLACE_FUDGE looks quite not a good heuristic because it breaks
the better fair sharing of cpu bandwidth that EEVDF is supposed to
bring. Furthermore, it breaks the isolation between cpu bandwidth and
latency because playing with latency_nice will impact your cpu
bandwidth

> +               }
> +
> +               /*
>                  * If we want to place a task and preserve lag, we have to
>                  * consider the effect of the new entity on the weighted
>                  * average and compensate for this, otherwise lag can quickly
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -5,6 +5,7 @@
>   * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
>   */
>  SCHED_FEAT(PLACE_LAG, true)
> +SCHED_FEAT(PLACE_FUDGE, true)
>  SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
>
>  /*
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -555,6 +555,7 @@ struct cfs_rq {
>         unsigned int            idle_h_nr_running; /* SCHED_IDLE */
>
>         s64                     avg_vruntime;
> +       u64                     avg_slice;
>         u64                     avg_load;
>
>         u64                     exec_clock;
>
>

Peter Zijlstra June 2, 2023, 3:06 p.m. UTC | #2

On Fri, Jun 02, 2023 at 03:45:18PM +0200, Vincent Guittot wrote:
> On Wed, 31 May 2023 at 14:47, Peter Zijlstra <peterz@infradead.org> wrote:

> > +static inline bool
> > +entity_has_slept(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> > +{
> > +       u64 now;
> > +
> > +       if (!(flags & ENQUEUE_WAKEUP))
> > +               return false;
> > +
> > +       if (flags & ENQUEUE_MIGRATED)
> > +               return true;
> > +
> > +       now = rq_clock_task(rq_of(cfs_rq));
> > +       return (s64)(se->exec_start - now) >= se->slice;
> > +}
> > +
> >  static void
> >  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> >  {
> > @@ -4930,6 +4947,19 @@ place_entity(struct cfs_rq *cfs_rq, stru
> >                 lag = se->vlag;
> >
> >                 /*
> > +                * For latency sensitive tasks; those that have a shorter than
> > +                * average slice and do not fully consume the slice, transition
> > +                * to EEVDF placement strategy #2.
> > +                */
> > +               if (sched_feat(PLACE_FUDGE) &&
> > +                   (cfs_rq->avg_slice > se->slice * cfs_rq->avg_load) &&
> > +                   entity_has_slept(cfs_rq, se, flags)) {
> > +                       lag += vslice;
> > +                       if (lag > 0)
> > +                               lag = 0;
> 
> This PLACE_FUDGE looks quite not a good heuristic because it breaks
> the better fair sharing of cpu bandwidth that EEVDF is supposed to
> bring. Furthermore, it breaks the isolation between cpu bandwidth and
> latency because playing with latency_nice will impact your cpu
> bandwidth

Yeah, probably :/ Even though entity_has_slept() ensures the task slept
for at least one slice, that's probably not enough to preserve the
bandwidth contraints.

The fairness analysis in the paper conveniently avoids all 'interesting'
cases, including their own placement policies.

I'll sit on this one longer and think a bit more about it.

Chen Yu June 10, 2023, 6:34 a.m. UTC | #3

On 2023-05-31 at 13:58:50 +0200, Peter Zijlstra wrote:
> In the case where (due to latency-nice) there are different request
> sizes in the tree, the smaller requests tend to be dominated by the
> larger. Also note how the EEVDF lag limits are based on r_max.
> 
> Therefore; add a heuristic that for the mixed request size case, moves
> smaller requests to placement strategy #2 which ensures they're
> immidiately eligible and and due to their smaller (virtual) deadline
> will cause preemption.
> 
> NOTE: this relies on update_entity_lag() to impose lag limits above
> a single slice.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  kernel/sched/fair.c     |   30 ++++++++++++++++++++++++++++++
>  kernel/sched/features.h |    1 +
>  kernel/sched/sched.h    |    1 +
>  3 files changed, 32 insertions(+)
> 
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -642,6 +642,7 @@ avg_vruntime_add(struct cfs_rq *cfs_rq,
>  	s64 key = entity_key(cfs_rq, se);
>  
>  	cfs_rq->avg_vruntime += key * weight;
> +	cfs_rq->avg_slice += se->slice * weight;
>  	cfs_rq->avg_load += weight;
>  }
>  
> @@ -652,6 +653,7 @@ avg_vruntime_sub(struct cfs_rq *cfs_rq,
>  	s64 key = entity_key(cfs_rq, se);
>  
>  	cfs_rq->avg_vruntime -= key * weight;
> +	cfs_rq->avg_slice -= se->slice * weight;
>  	cfs_rq->avg_load -= weight;
>  }
>  
> @@ -4908,6 +4910,21 @@ static inline void update_misfit_status(
>  
>  #endif /* CONFIG_SMP */
>  
> +static inline bool
> +entity_has_slept(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> +{
> +	u64 now;
> +
> +	if (!(flags & ENQUEUE_WAKEUP))
> +		return false;
> +
> +	if (flags & ENQUEUE_MIGRATED)
> +		return true;
> +
> +	now = rq_clock_task(rq_of(cfs_rq));
> +	return (s64)(se->exec_start - now) >= se->slice;
> +}
A minor question, should it be now - se->exec_start ?
(se->exec_start - now) is always negetive on local wakeup?

thanks,
Chenyu

Peter Zijlstra June 10, 2023, 11:22 a.m. UTC | #4

On Sat, Jun 10, 2023 at 02:34:04PM +0800, Chen Yu wrote:

> > +static inline bool
> > +entity_has_slept(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> > +{
> > +	u64 now;
> > +
> > +	if (!(flags & ENQUEUE_WAKEUP))
> > +		return false;
> > +
> > +	if (flags & ENQUEUE_MIGRATED)
> > +		return true;
> > +
> > +	now = rq_clock_task(rq_of(cfs_rq));
> > +	return (s64)(se->exec_start - now) >= se->slice;
> > +}
> A minor question, should it be now - se->exec_start ?
> (se->exec_start - now) is always negetive on local wakeup?

Yeah, also:

https://lkml.kernel.org/r/20230608124440.GB1002251@hirez.programming.kicks-ass.net

That is, it should be something along the lines of:

	delta = new - se->exec->start
	rerturn delta/W > vslice

diff mbox series

Patch

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -642,6 +642,7 @@  avg_vruntime_add(struct cfs_rq *cfs_rq,
 	s64 key = entity_key(cfs_rq, se);
 
 	cfs_rq->avg_vruntime += key * weight;
+	cfs_rq->avg_slice += se->slice * weight;
 	cfs_rq->avg_load += weight;
 }
 
@@ -652,6 +653,7 @@  avg_vruntime_sub(struct cfs_rq *cfs_rq,
 	s64 key = entity_key(cfs_rq, se);
 
 	cfs_rq->avg_vruntime -= key * weight;
+	cfs_rq->avg_slice -= se->slice * weight;
 	cfs_rq->avg_load -= weight;
 }
 
@@ -4908,6 +4910,21 @@  static inline void update_misfit_status(
 
 #endif /* CONFIG_SMP */
 
+static inline bool
+entity_has_slept(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+	u64 now;
+
+	if (!(flags & ENQUEUE_WAKEUP))
+		return false;
+
+	if (flags & ENQUEUE_MIGRATED)
+		return true;
+
+	now = rq_clock_task(rq_of(cfs_rq));
+	return (s64)(se->exec_start - now) >= se->slice;
+}
+
 static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
@@ -4930,6 +4947,19 @@  place_entity(struct cfs_rq *cfs_rq, stru
 		lag = se->vlag;
 
 		/*
+		 * For latency sensitive tasks; those that have a shorter than
+		 * average slice and do not fully consume the slice, transition
+		 * to EEVDF placement strategy #2.
+		 */
+		if (sched_feat(PLACE_FUDGE) &&
+		    (cfs_rq->avg_slice > se->slice * cfs_rq->avg_load) &&
+		    entity_has_slept(cfs_rq, se, flags)) {
+			lag += vslice;
+			if (lag > 0)
+				lag = 0;
+		}
+
+		/*
 		 * If we want to place a task and preserve lag, we have to
 		 * consider the effect of the new entity on the weighted
 		 * average and compensate for this, otherwise lag can quickly
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -5,6 +5,7 @@ 
  * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
  */
 SCHED_FEAT(PLACE_LAG, true)
+SCHED_FEAT(PLACE_FUDGE, true)
 SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
 
 /*
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -555,6 +555,7 @@  struct cfs_rq {
 	unsigned int		idle_h_nr_running; /* SCHED_IDLE */
 
 	s64			avg_vruntime;
+	u64			avg_slice;
 	u64			avg_load;
 
 	u64			exec_clock;