[RFC,V3,4/6] sched/deadline: Introduce deadline servers

Message ID e08a76013ff2b6a83028e8fdf3b8a6a0aa677d58.1686239016.git.bristot@kernel.org
State New
Series SCHED_DEADLINE server infrastructure

Commit Message

Daniel Bristot de Oliveira June 8, 2023, 3:58 p.m. UTC
  From: Peter Zijlstra <peterz@infradead.org>

Low priority tasks (e.g., SCHED_OTHER) can suffer starvation if tasks
with higher priority (e.g., SCHED_FIFO) monopolize CPU(s).

RT Throttling was introduced a while ago as a (mostly debug)
countermeasure one can use to reserve some CPU time for low priority
tasks (usually background type of work, e.g. workqueues, timers, etc.).
It has its own problems, however (see documentation), and the undesired
effect of unconditionally throttling FIFO tasks even when no lower
priority activity needs to run (there are mechanisms to fix this issue
as well, but, again, with their own problems).

Introduce deadline servers to service the needs of low priority tasks under
starvation conditions. Deadline servers are built by extending the
SCHED_DEADLINE implementation to allow 2-level scheduling (a sched_deadline
entity becomes a container for lower priority scheduling entities).

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
---
 include/linux/sched.h   |  22 ++-
 kernel/sched/core.c     |  17 ++
 kernel/sched/deadline.c | 350 +++++++++++++++++++++++++++-------------
 kernel/sched/fair.c     |   4 +
 kernel/sched/sched.h    |  29 ++++
 5 files changed, 309 insertions(+), 113 deletions(-)
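
A minimal sketch of how a server is meant to be wired up with the interface
introduced here. The names below (fair_server, the rq field, pick_task_fair
as the nested pick helper) are assumptions for illustration only and not
part of this patch; the CFS side is wired up elsewhere in the series:

	/* Callbacks the served class provides to its server. */
	static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
	{
		/* Does the served class still have runnable work? */
		return !!dl_se->rq->cfs.h_nr_running;
	}

	static struct task_struct *fair_server_pick(struct sched_dl_entity *dl_se)
	{
		/*
		 * Nested pick_next_task(): return a runnable task of the
		 * served class, or NULL to yield the rest of the period.
		 */
		return pick_task_fair(dl_se->rq);	/* assumed helper */
	}

	static void fair_server_setup(struct rq *rq)
	{
		/* rq->fair_server would be a sched_dl_entity embedded in struct rq. */
		dl_server_init(&rq->fair_server, rq,
			       fair_server_has_tasks, fair_server_pick);
	}

The server is then started when the served class gets runnable tasks and
stopped when it runs out of them (dl_server_start()/dl_server_stop()), and
the runtime consumed by a task picked on behalf of the server is charged
back to it via dl_server_update(), called from the fair.c update_curr paths
below whenever p->server is set.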
  

Comments

Phil Auld June 13, 2023, 1:21 p.m. UTC | #1
Hi,

On Thu, Jun 08, 2023 at 05:58:16PM +0200 Daniel Bristot de Oliveira wrote:
> From: Peter Zijlstra <peterz@infradead.org>
> 
> Low priority tasks (e.g., SCHED_OTHER) can suffer starvation if tasks
> with higher priority (e.g., SCHED_FIFO) monopolize CPU(s).
> 
> RT Throttling has been introduced a while ago as a (mostly debug)
> countermeasure one can utilize to reserve some CPU time for low priority
> tasks (usually background type of work, e.g. workqueues, timers, etc.).
> It however has its own problems (see documentation) and the undesired
> effect of unconditionally throttling FIFO tasks even when no lower
> priority activity needs to run (there are mechanisms to fix this issue
> as well, but, again, with their own problems).
> 
> Introduce deadline servers to service low priority tasks needs under
> starvation conditions. Deadline servers are built extending SCHED_DEADLINE
> implementation to allow 2-level scheduling (a sched_deadline entity
> becomes a container for lower priority scheduling entities).
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
> ---
>  include/linux/sched.h   |  22 ++-
>  kernel/sched/core.c     |  17 ++
>  kernel/sched/deadline.c | 350 +++++++++++++++++++++++++++-------------
>  kernel/sched/fair.c     |   4 +
>  kernel/sched/sched.h    |  29 ++++
>  5 files changed, 309 insertions(+), 113 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 26b1925a702a..4c90d7693a75 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -64,12 +64,14 @@ struct robust_list_head;
>  struct root_domain;
>  struct rq;
>  struct sched_attr;
> +struct sched_dl_entity;
>  struct sched_param;
>  struct seq_file;
>  struct sighand_struct;
>  struct signal_struct;
>  struct task_delay_info;
>  struct task_group;
> +struct task_struct;
>  struct user_event_mm;
>  
>  /*
> @@ -600,6 +602,9 @@ struct sched_rt_entity {
>  #endif
>  } __randomize_layout;
>  
> +typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
> +typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);
> +
>  struct sched_dl_entity {
>  	struct rb_node			rb_node;
>  
> @@ -647,6 +652,7 @@ struct sched_dl_entity {
>  	unsigned int			dl_yielded        : 1;
>  	unsigned int			dl_non_contending : 1;
>  	unsigned int			dl_overrun	  : 1;
> +	unsigned int			dl_server         : 1;
>  
>  	/*
>  	 * Bandwidth enforcement timer. Each -deadline task has its
> @@ -661,7 +667,20 @@ struct sched_dl_entity {
>  	 * timer is needed to decrease the active utilization at the correct
>  	 * time.
>  	 */
> -	struct hrtimer inactive_timer;
> +	struct hrtimer			inactive_timer;
> +
> +	/*
> +	 * Bits for DL-server functionality. Also see the comment near
> +	 * dl_server_update().
> +	 *
> +	 * @rq the runqueue this server is for
> +	 *
> +	 * @server_has_tasks() returns true if @server_pick return a
> +	 * runnable task.
> +	 */
> +	struct rq			*rq;
> +	dl_server_has_tasks_f		server_has_tasks;
> +	dl_server_pick_f		server_pick;
>  
>  #ifdef CONFIG_RT_MUTEXES
>  	/*
> @@ -790,6 +809,7 @@ struct task_struct {
>  	struct sched_entity		se;
>  	struct sched_rt_entity		rt;
>  	struct sched_dl_entity		dl;
> +	struct sched_dl_entity		*server;
>  	const struct sched_class	*sched_class;


I like the cleanups in the first 3 patches a lot. I'm still wrapping my head
around the rest.  But "server" is a very nondescript, generic term for
something in task_struct.  Maybe "sched_server" or something?

Cheers,
Phil

>  
>  #ifdef CONFIG_SCHED_CORE
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index e34b02cbe41f..5b88b822ec89 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3803,6 +3803,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
>  		rq->idle_stamp = 0;
>  	}
>  #endif
> +
> +	p->server = NULL;
>  }
>  
>  /*
> @@ -6013,12 +6015,27 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>  			p = pick_next_task_idle(rq);
>  		}
>  
> +		/*
> +		 * This is the fast path; it cannot be a DL server pick;
> +		 * therefore even if @p == @prev, ->server must be NULL.
> +		 */
> +		if (p->server)
> +			p->server = NULL;
> +
>  		return p;
>  	}
>  
>  restart:
>  	put_prev_task_balance(rq, prev, rf);
>  
> +	/*
> +	 * We've updated @prev and no longer need the server link, clear it.
> +	 * Must be done before ->pick_next_task() because that can (re)set
> +	 * ->server.
> +	 */
> +	if (prev->server)
> +		prev->server = NULL;
> +
>  	for_each_class(class) {
>  		p = class->pick_next_task(rq);
>  		if (p)
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 869734eecb2c..c67056ff5749 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -52,8 +52,14 @@ static int __init sched_dl_sysctl_init(void)
>  late_initcall(sched_dl_sysctl_init);
>  #endif
>  
> +static bool dl_server(struct sched_dl_entity *dl_se)
> +{
> +	return dl_se->dl_server;
> +}
> +
>  static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
>  {
> +	BUG_ON(dl_server(dl_se));
>  	return container_of(dl_se, struct task_struct, dl);
>  }
>  
> @@ -62,14 +68,22 @@ static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
>  	return container_of(dl_rq, struct rq, dl);
>  }
>  
> -static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
> +static inline struct rq *rq_of_dl_se(struct sched_dl_entity *dl_se)
>  {
> -	struct task_struct *p = dl_task_of(dl_se);
> -	struct rq *rq = task_rq(p);
> +	struct rq *rq = dl_se->rq;
> +
> +	if (!dl_server(dl_se))
> +		rq = task_rq(dl_task_of(dl_se));
>  
> -	return &rq->dl;
> +	return rq;
>  }
>  
> +static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
> +{
> +	return &rq_of_dl_se(dl_se)->dl;
> +}
> +
> +
>  static inline int on_dl_rq(struct sched_dl_entity *dl_se)
>  {
>  	return !RB_EMPTY_NODE(&dl_se->rb_node);
> @@ -392,9 +406,8 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se);
>  static void task_non_contending(struct sched_dl_entity *dl_se)
>  {
>  	struct hrtimer *timer = &dl_se->inactive_timer;
> -	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> -	struct rq *rq = rq_of_dl_rq(dl_rq);
> -	struct task_struct *p = dl_task_of(dl_se);
> +	struct rq *rq = rq_of_dl_se(dl_se);
> +	struct dl_rq *dl_rq = &rq->dl;
>  	s64 zerolag_time;
>  
>  	/*
> @@ -424,25 +437,33 @@ static void task_non_contending(struct sched_dl_entity *dl_se)
>  	 * utilization now, instead of starting a timer
>  	 */
>  	if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
> -		if (dl_task(p))
> +		if (dl_server(dl_se)) {
>  			sub_running_bw(dl_se, dl_rq);
> +		} else {
> +			struct task_struct *p = dl_task_of(dl_se);
> +
> +			if (dl_task(p))
> +				sub_running_bw(dl_se, dl_rq);
>  
> -		if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
> -			struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
> +			if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
> +				struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
>  
> -			if (READ_ONCE(p->__state) == TASK_DEAD)
> -				sub_rq_bw(dl_se, &rq->dl);
> -			raw_spin_lock(&dl_b->lock);
> -			__dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p)));
> -			raw_spin_unlock(&dl_b->lock);
> -			__dl_clear_params(dl_se);
> +				if (READ_ONCE(p->__state) == TASK_DEAD)
> +					sub_rq_bw(dl_se, &rq->dl);
> +				raw_spin_lock(&dl_b->lock);
> +				__dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p)));
> +				raw_spin_unlock(&dl_b->lock);
> +				__dl_clear_params(dl_se);
> +			}
>  		}
>  
>  		return;
>  	}
>  
>  	dl_se->dl_non_contending = 1;
> -	get_task_struct(p);
> +	if (!dl_server(dl_se))
> +		get_task_struct(dl_task_of(dl_se));
> +
>  	hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD);
>  }
>  
> @@ -469,8 +490,10 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
>  		 * will not touch the rq's active utilization,
>  		 * so we are still safe.
>  		 */
> -		if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1)
> -			put_task_struct(dl_task_of(dl_se));
> +		if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) {
> +			if (!dl_server(dl_se))
> +				put_task_struct(dl_task_of(dl_se));
> +		}
>  	} else {
>  		/*
>  		 * Since "dl_non_contending" is not set, the
> @@ -483,10 +506,8 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
>  	}
>  }
>  
> -static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
> +static inline int is_leftmost(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
>  {
> -	struct sched_dl_entity *dl_se = &p->dl;
> -
>  	return rb_first_cached(&dl_rq->root) == &dl_se->rb_node;
>  }
>  
> @@ -573,8 +594,6 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
>  
>  	if (p->nr_cpus_allowed > 1)
>  		dl_rq->dl_nr_migratory++;
> -
> -	update_dl_migration(dl_rq);
>  }
>  
>  static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
> @@ -583,8 +602,6 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
>  
>  	if (p->nr_cpus_allowed > 1)
>  		dl_rq->dl_nr_migratory--;
> -
> -	update_dl_migration(dl_rq);
>  }
>  
>  #define __node_2_pdl(node) \
> @@ -762,8 +779,10 @@ static inline void deadline_queue_pull_task(struct rq *rq)
>  }
>  #endif /* CONFIG_SMP */
>  
> +static void
> +enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags);
>  static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
> -static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
> +static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags);
>  static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
>  
>  static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
> @@ -1011,8 +1030,7 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
>   */
>  static void update_dl_entity(struct sched_dl_entity *dl_se)
>  {
> -	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> -	struct rq *rq = rq_of_dl_rq(dl_rq);
> +	struct rq *rq = rq_of_dl_se(dl_se);
>  
>  	if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
>  	    dl_entity_overflow(dl_se, rq_clock(rq))) {
> @@ -1043,11 +1061,11 @@ static inline u64 dl_next_period(struct sched_dl_entity *dl_se)
>   * actually started or not (i.e., the replenishment instant is in
>   * the future or in the past).
>   */
> -static int start_dl_timer(struct task_struct *p)
> +static int start_dl_timer(struct sched_dl_entity *dl_se)
>  {
> -	struct sched_dl_entity *dl_se = &p->dl;
>  	struct hrtimer *timer = &dl_se->dl_timer;
> -	struct rq *rq = task_rq(p);
> +	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> +	struct rq *rq = rq_of_dl_rq(dl_rq);
>  	ktime_t now, act;
>  	s64 delta;
>  
> @@ -1081,13 +1099,33 @@ static int start_dl_timer(struct task_struct *p)
>  	 * and observe our state.
>  	 */
>  	if (!hrtimer_is_queued(timer)) {
> -		get_task_struct(p);
> +		if (!dl_server(dl_se))
> +			get_task_struct(dl_task_of(dl_se));
>  		hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD);
>  	}
>  
>  	return 1;
>  }
>  
> +static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
> +{
> +#ifdef CONFIG_SMP
> +	/*
> +	 * Queueing this task back might have overloaded rq, check if we need
> +	 * to kick someone away.
> +	 */
> +	if (has_pushable_dl_tasks(rq)) {
> +		/*
> +		 * Nothing relies on rq->lock after this, so its safe to drop
> +		 * rq->lock.
> +		 */
> +		rq_unpin_lock(rq, rf);
> +		push_dl_task(rq);
> +		rq_repin_lock(rq, rf);
> +	}
> +#endif
> +}
> +
>  /*
>   * This is the bandwidth enforcement timer callback. If here, we know
>   * a task is not on its dl_rq, since the fact that the timer was running
> @@ -1106,10 +1144,34 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
>  	struct sched_dl_entity *dl_se = container_of(timer,
>  						     struct sched_dl_entity,
>  						     dl_timer);
> -	struct task_struct *p = dl_task_of(dl_se);
> +	struct task_struct *p;
>  	struct rq_flags rf;
>  	struct rq *rq;
>  
> +	if (dl_server(dl_se)) {
> +		struct rq *rq = rq_of_dl_se(dl_se);
> +		struct rq_flags rf;
> +
> +		rq_lock(rq, &rf);
> +		if (dl_se->dl_throttled) {
> +			sched_clock_tick();
> +			update_rq_clock(rq);
> +
> +			if (dl_se->server_has_tasks(dl_se)) {
> +				enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);
> +				resched_curr(rq);
> +				__push_dl_task(rq, &rf);
> +			} else {
> +				replenish_dl_entity(dl_se);
> +			}
> +
> +		}
> +		rq_unlock(rq, &rf);
> +
> +		return HRTIMER_NORESTART;
> +	}
> +
> +	p = dl_task_of(dl_se);
>  	rq = task_rq_lock(p, &rf);
>  
>  	/*
> @@ -1180,21 +1242,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
>  	else
>  		resched_curr(rq);
>  
> -#ifdef CONFIG_SMP
> -	/*
> -	 * Queueing this task back might have overloaded rq, check if we need
> -	 * to kick someone away.
> -	 */
> -	if (has_pushable_dl_tasks(rq)) {
> -		/*
> -		 * Nothing relies on rq->lock after this, so its safe to drop
> -		 * rq->lock.
> -		 */
> -		rq_unpin_lock(rq, &rf);
> -		push_dl_task(rq);
> -		rq_repin_lock(rq, &rf);
> -	}
> -#endif
> +	__push_dl_task(rq, &rf);
>  
>  unlock:
>  	task_rq_unlock(rq, p, &rf);
> @@ -1236,12 +1284,11 @@ static void init_dl_task_timer(struct sched_dl_entity *dl_se)
>   */
>  static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
>  {
> -	struct task_struct *p = dl_task_of(dl_se);
> -	struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se));
> +	struct rq *rq = rq_of_dl_se(dl_se);
>  
>  	if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
>  	    dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
> -		if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p)))
> +		if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se)))
>  			return;
>  		dl_se->dl_throttled = 1;
>  		if (dl_se->runtime > 0)
> @@ -1296,29 +1343,13 @@ static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
>  	return (delta * u_act) >> BW_SHIFT;
>  }
>  
> -/*
> - * Update the current task's runtime statistics (provided it is still
> - * a -deadline task and has not been removed from the dl_rq).
> - */
> -static void update_curr_dl(struct rq *rq)
> +static inline void
> +update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
> +                        int flags);
> +static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec)
>  {
> -	struct task_struct *curr = rq->curr;
> -	struct sched_dl_entity *dl_se = &curr->dl;
> -	s64 delta_exec, scaled_delta_exec;
> -	int cpu = cpu_of(rq);
> -
> -	if (!dl_task(curr) || !on_dl_rq(dl_se))
> -		return;
> +	s64 scaled_delta_exec;
>  
> -	/*
> -	 * Consumed budget is computed considering the time as
> -	 * observed by schedulable tasks (excluding time spent
> -	 * in hardirq context, etc.). Deadlines are instead
> -	 * computed using hard walltime. This seems to be the more
> -	 * natural solution, but the full ramifications of this
> -	 * approach need further study.
> -	 */
> -	delta_exec = update_curr_common(rq);
>  	if (unlikely(delta_exec <= 0)) {
>  		if (unlikely(dl_se->dl_yielded))
>  			goto throttle;
> @@ -1336,10 +1367,9 @@ static void update_curr_dl(struct rq *rq)
>  	 * according to current frequency and CPU maximum capacity.
>  	 */
>  	if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) {
> -		scaled_delta_exec = grub_reclaim(delta_exec,
> -						 rq,
> -						 &curr->dl);
> +		scaled_delta_exec = grub_reclaim(delta_exec, rq, dl_se);
>  	} else {
> +		int cpu = cpu_of(rq);
>  		unsigned long scale_freq = arch_scale_freq_capacity(cpu);
>  		unsigned long scale_cpu = arch_scale_cpu_capacity(cpu);
>  
> @@ -1358,11 +1388,21 @@ static void update_curr_dl(struct rq *rq)
>  		    (dl_se->flags & SCHED_FLAG_DL_OVERRUN))
>  			dl_se->dl_overrun = 1;
>  
> -		__dequeue_task_dl(rq, curr, 0);
> -		if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr)))
> -			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
> +		dequeue_dl_entity(dl_se, 0);
> +		if (!dl_server(dl_se)) {
> +			/* XXX: After v2, from __dequeue_task_dl() */
> +			update_stats_dequeue_dl(&rq->dl, dl_se, 0);
> +			dequeue_pushable_dl_task(rq, dl_task_of(dl_se));
> +		}
> +
> +		if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) {
> +			if (dl_server(dl_se))
> +				enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);
> +			else
> +				enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH);
> +		}
>  
> -		if (!is_leftmost(curr, &rq->dl))
> +		if (!is_leftmost(dl_se, &rq->dl))
>  			resched_curr(rq);
>  	}
>  
> @@ -1392,20 +1432,82 @@ static void update_curr_dl(struct rq *rq)
>  	}
>  }
>  
> +void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
> +{
> +	update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
> +}
> +
> +void dl_server_start(struct sched_dl_entity *dl_se)
> +{
> +	if (!dl_server(dl_se)) {
> +		dl_se->dl_server = 1;
> +		setup_new_dl_entity(dl_se);
> +	}
> +	enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP);
> +}
> +
> +void dl_server_stop(struct sched_dl_entity *dl_se)
> +{
> +	dequeue_dl_entity(dl_se, DEQUEUE_SLEEP);
> +}
> +
> +void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
> +		    dl_server_has_tasks_f has_tasks,
> +		    dl_server_pick_f pick)
> +{
> +	dl_se->rq = rq;
> +	dl_se->server_has_tasks = has_tasks;
> +	dl_se->server_pick = pick;
> +}
> +
> +/*
> + * Update the current task's runtime statistics (provided it is still
> + * a -deadline task and has not been removed from the dl_rq).
> + */
> +static void update_curr_dl(struct rq *rq)
> +{
> +	struct task_struct *curr = rq->curr;
> +	struct sched_dl_entity *dl_se = &curr->dl;
> +	s64 delta_exec;
> +
> +	if (!dl_task(curr) || !on_dl_rq(dl_se))
> +		return;
> +
> +	/*
> +	 * Consumed budget is computed considering the time as
> +	 * observed by schedulable tasks (excluding time spent
> +	 * in hardirq context, etc.). Deadlines are instead
> +	 * computed using hard walltime. This seems to be the more
> +	 * natural solution, but the full ramifications of this
> +	 * approach need further study.
> +	 */
> +	delta_exec = update_curr_common(rq);
> +	update_curr_dl_se(rq, dl_se, delta_exec);
> +}
> +
>  static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
>  {
>  	struct sched_dl_entity *dl_se = container_of(timer,
>  						     struct sched_dl_entity,
>  						     inactive_timer);
> -	struct task_struct *p = dl_task_of(dl_se);
> +	struct task_struct *p = NULL;
>  	struct rq_flags rf;
>  	struct rq *rq;
>  
> -	rq = task_rq_lock(p, &rf);
> +	if (!dl_server(dl_se)) {
> +		p = dl_task_of(dl_se);
> +		rq = task_rq_lock(p, &rf);
> +	} else {
> +		rq = dl_se->rq;
> +		rq_lock(rq, &rf);
> +	}
>  
>  	sched_clock_tick();
>  	update_rq_clock(rq);
>  
> +	if (dl_server(dl_se))
> +		goto no_task;
> +
>  	if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
>  		struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
>  
> @@ -1422,14 +1524,21 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
>  
>  		goto unlock;
>  	}
> +
> +no_task:
>  	if (dl_se->dl_non_contending == 0)
>  		goto unlock;
>  
>  	sub_running_bw(dl_se, &rq->dl);
>  	dl_se->dl_non_contending = 0;
>  unlock:
> -	task_rq_unlock(rq, p, &rf);
> -	put_task_struct(p);
> +
> +	if (!dl_server(dl_se)) {
> +		task_rq_unlock(rq, p, &rf);
> +		put_task_struct(p);
> +	} else {
> +		rq_unlock(rq, &rf);
> +	}
>  
>  	return HRTIMER_NORESTART;
>  }
> @@ -1487,34 +1596,35 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
>  static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
>  static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
>  
> +static inline void update_dl_migration(struct dl_rq *dl_rq) {}
> +
>  #endif /* CONFIG_SMP */
>  
>  static inline
>  void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
>  {
> -	int prio = dl_task_of(dl_se)->prio;
>  	u64 deadline = dl_se->deadline;
>  
> -	WARN_ON(!dl_prio(prio));
>  	dl_rq->dl_nr_running++;
>  	add_nr_running(rq_of_dl_rq(dl_rq), 1);
>  
>  	inc_dl_deadline(dl_rq, deadline);
> -	inc_dl_migration(dl_se, dl_rq);
> +	if (!dl_server(dl_se))
> +		inc_dl_migration(dl_se, dl_rq);
> +	update_dl_migration(dl_rq);
>  }
>  
>  static inline
>  void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
>  {
> -	int prio = dl_task_of(dl_se)->prio;
> -
> -	WARN_ON(!dl_prio(prio));
>  	WARN_ON(!dl_rq->dl_nr_running);
>  	dl_rq->dl_nr_running--;
>  	sub_nr_running(rq_of_dl_rq(dl_rq), 1);
>  
>  	dec_dl_deadline(dl_rq, dl_se->deadline);
> -	dec_dl_migration(dl_se, dl_rq);
> +	if (!dl_server(dl_se))
> +		dec_dl_migration(dl_se, dl_rq);
> +	update_dl_migration(dl_rq);
>  }
>  
>  static inline bool __dl_less(struct rb_node *a, const struct rb_node *b)
> @@ -1676,8 +1786,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
>  	} else if (flags & ENQUEUE_REPLENISH) {
>  		replenish_dl_entity(dl_se);
>  	} else if ((flags & ENQUEUE_RESTORE) &&
> -		  dl_time_before(dl_se->deadline,
> -				 rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
> +		   dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
>  		setup_new_dl_entity(dl_se);
>  	}
>  
> @@ -1762,14 +1871,6 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
>  		enqueue_pushable_dl_task(rq, p);
>  }
>  
> -static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
> -{
> -	update_stats_dequeue_dl(&rq->dl, &p->dl, flags);
> -	dequeue_dl_entity(&p->dl, flags);
> -
> -	if (!p->dl.dl_throttled)
> -		dequeue_pushable_dl_task(rq, p);
> -}
>  
>  static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
>  {
> @@ -1778,7 +1879,9 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
>  	if (p->on_rq == TASK_ON_RQ_MIGRATING)
>  		flags |= DEQUEUE_MIGRATING;
>  
> -	__dequeue_task_dl(rq, p, flags);
> +	dequeue_dl_entity(&p->dl, flags);
> +	if (!p->dl.dl_throttled)
> +		dequeue_pushable_dl_task(rq, p);
>  }
>  
>  /*
> @@ -1968,12 +2071,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
>  }
>  
>  #ifdef CONFIG_SCHED_HRTICK
> -static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
> +static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
>  {
> -	hrtick_start(rq, p->dl.runtime);
> +	hrtick_start(rq, dl_se->runtime);
>  }
>  #else /* !CONFIG_SCHED_HRTICK */
> -static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
> +static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
>  {
>  }
>  #endif
> @@ -1993,9 +2096,6 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
>  	if (!first)
>  		return;
>  
> -	if (hrtick_enabled_dl(rq))
> -		start_hrtick_dl(rq, p);
> -
>  	if (rq->curr->sched_class != &dl_sched_class)
>  		update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
>  
> @@ -2018,12 +2118,26 @@ static struct task_struct *pick_task_dl(struct rq *rq)
>  	struct dl_rq *dl_rq = &rq->dl;
>  	struct task_struct *p;
>  
> +again:
>  	if (!sched_dl_runnable(rq))
>  		return NULL;
>  
>  	dl_se = pick_next_dl_entity(dl_rq);
>  	WARN_ON_ONCE(!dl_se);
> -	p = dl_task_of(dl_se);
> +
> +
> +	if (dl_server(dl_se)) {
> +		p = dl_se->server_pick(dl_se);
> +		if (!p) {
> +			// XXX should not happen, warn?!
> +			dl_se->dl_yielded = 1;
> +			update_curr_dl_se(rq, dl_se, 0);
> +			goto again;
> +		}
> +		p->server = dl_se;
> +	} else {
> +		p = dl_task_of(dl_se);
> +	}
>  
>  	return p;
>  }
> @@ -2033,9 +2147,20 @@ static struct task_struct *pick_next_task_dl(struct rq *rq)
>  	struct task_struct *p;
>  
>  	p = pick_task_dl(rq);
> -	if (p)
> +	if (!p)
> +		return p;
> +
> +	/*
> +	 * XXX: re-check !dl_server, changed from v2 because of
> +	 * pick_next_task_dl change
> +	 */
> +	if (!dl_server(&p->dl))
>  		set_next_task_dl(rq, p, true);
>  
> +	/* XXX not quite right */
> +	if (hrtick_enabled(rq))
> +		start_hrtick_dl(rq, &p->dl);
> +
>  	return p;
>  }
>  
> @@ -2073,8 +2198,8 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
>  	 * be set and schedule() will start a new hrtick for the next task.
>  	 */
>  	if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 &&
> -	    is_leftmost(p, &rq->dl))
> -		start_hrtick_dl(rq, p);
> +	    is_leftmost(&p->dl, &rq->dl))
> +		start_hrtick_dl(rq, &p->dl);
>  }
>  
>  static void task_fork_dl(struct task_struct *p)
> @@ -3003,6 +3128,7 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se)
>  	dl_se->dl_yielded		= 0;
>  	dl_se->dl_non_contending	= 0;
>  	dl_se->dl_overrun		= 0;
> +	dl_se->dl_server		= 0;
>  
>  #ifdef CONFIG_RT_MUTEXES
>  	dl_se->pi_se			= dl_se;
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index fda67f05190d..0c58d8e55b69 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -930,6 +930,8 @@ s64 update_curr_common(struct rq *rq)
>  
>  	account_group_exec_runtime(curr, delta_exec);
>  	cgroup_account_cputime(curr, delta_exec);
> +	if (curr->server)
> +		dl_server_update(curr->server, delta_exec);
>  
>  	return delta_exec;
>  }
> @@ -958,6 +960,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
>  		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
>  		cgroup_account_cputime(curtask, delta_exec);
>  		account_group_exec_runtime(curtask, delta_exec);
> +		if (curtask->server)
> +			dl_server_update(curtask->server, delta_exec);
>  	}
>  
>  	account_cfs_rq_runtime(cfs_rq, delta_exec);
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index aaf163695c2e..390c99e2f8a8 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -324,6 +324,35 @@ extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *att
>  extern int  dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
>  extern int  dl_cpu_busy(int cpu, struct task_struct *p);
>  
> +/*
> + * SCHED_DEADLINE supports servers (nested scheduling) with the following
> + * interface:
> + *
> + *   dl_se::rq -- runqueue we belong to.
> + *
> + *   dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the
> + *                                server when it runs out of tasks to run.
> + *
> + *   dl_se::server_pick() -- nested pick_next_task(); we yield the period if this
> + *                           returns NULL.
> + *
> + *   dl_server_update() -- called from update_curr_common(), propagates runtime
> + *                         to the server.
> + *
> + *   dl_server_start()
> + *   dl_server_stop()  -- start/stop the server when it has (no) tasks
> + *
> + *   dl_server_init()
> + *
> + * XXX
> + */
> +extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec);
> +extern void dl_server_start(struct sched_dl_entity *dl_se);
> +extern void dl_server_stop(struct sched_dl_entity *dl_se);
> +extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
> +		    dl_server_has_tasks_f has_tasks,
> +		    dl_server_pick_f pick);
> +
>  #ifdef CONFIG_CGROUP_SCHED
>  
>  struct cfs_rq;
> -- 
> 2.40.1
> 

--
  
Valentin Schneider June 23, 2023, 4:47 p.m. UTC | #2
On 08/06/23 17:58, Daniel Bristot de Oliveira wrote:
> @@ -2033,9 +2147,20 @@ static struct task_struct *pick_next_task_dl(struct rq *rq)
>  	struct task_struct *p;
>  
>  	p = pick_task_dl(rq);
> -	if (p)
> +	if (!p)
> +		return p;
> +
> +	/*
> +	 * XXX: re-check !dl_server, changed from v2 because of
> +	 * pick_next_task_dl change
> +	 */
> +	if (!dl_server(&p->dl))
>  		set_next_task_dl(rq, p, true);
>

Should this be

       if (!p->server)

instead? AFAICT dl_server(&p->dl) can never be true since there's no
pi_se-like link to the server via the dl_se, only via the task_struct, and
the server pick cannot return the server itself (as it's a pure sched_entity).
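
For illustration, the distinction at play here, as seen from inside
deadline.c (sketch only, assuming a 'fair_server' entity embedded in
struct rq; the field name is an assumption, not part of this patch):

	static void dl_server_check_example(struct rq *rq)
	{
		struct sched_dl_entity *srv = &rq->fair_server;	/* assumed name */
		struct task_struct *p;

		/* Once dl_server_start() has run, the server's own entity is marked. */
		WARN_ON(!dl_server(srv));

		p = pick_task_dl(rq);
		if (p && p->server) {
			/*
			 * A server pick returns a plain (e.g. CFS) task: its
			 * own p->dl is never a server; the only link back to
			 * the server is p->server, installed by pick_task_dl().
			 */
			WARN_ON(dl_server(&p->dl));
			WARN_ON(p->server != srv);
		}
	}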

> +	/* XXX not quite right */
> +	if (hrtick_enabled(rq))
> +		start_hrtick_dl(rq, &p->dl);
> +

IIUC that got hauled out of set_next_task_dl() to cover the case where we
pick the server (+ the server pick) and want to more thoroughly enforce the
server's bandwidth. If so, what's the issue with starting the hrtick here?

>  	return p;
>  }
>
  
Daniel Bristot de Oliveira July 4, 2023, 3:52 p.m. UTC | #3
Back from EOSS...

On 6/23/23 18:47, Valentin Schneider wrote:
> On 08/06/23 17:58, Daniel Bristot de Oliveira wrote:
>> @@ -2033,9 +2147,20 @@ static struct task_struct *pick_next_task_dl(struct rq *rq)
>>  	struct task_struct *p;
>>  
>>  	p = pick_task_dl(rq);
>> -	if (p)
>> +	if (!p)
>> +		return p;
>> +
>> +	/*
>> +	 * XXX: re-check !dl_server, changed from v2 because of
>> +	 * pick_next_task_dl change
>> +	 */
>> +	if (!dl_server(&p->dl))
>>  		set_next_task_dl(rq, p, true);
>>
> 
> Should this be
> 
>        if (!p->server)
> 
> instead? AFAICT dl_server(&p->dl) can never be true since there's no
> pi_se-like link to the server via the dl_se, only via the task_struct, and
> the server pick cannot return the server itself (as it's a pure sched_entity).

makes sense... I will check that in the v4.

> 
>> +	/* XXX not quite right */
>> +	if (hrtick_enabled(rq))
>> +		start_hrtick_dl(rq, &p->dl);
>> +
> 
> IIUC that got hauled out of set_next_task_dl() to cover the case where we
> pick the server (+ the server pick) and want to more thoroughly enforce the
> server's bandwidth. If so, what's the issue with starting the hrtick here?

I think that the comment was added more as a check of whether it is correct... it seems it is.

Thanks Vale!
-- Daniel

> 
>>  	return p;
>>  }
>>  
>
  
Joel Fernandes July 4, 2023, 5:25 p.m. UTC | #4
On Tue, Jul 4, 2023 at 11:52 AM Daniel Bristot de Oliveira
<bristot@redhat.com> wrote:
>
>
> Back from EOSS...
>
> On 6/23/23 18:47, Valentin Schneider wrote:
> > On 08/06/23 17:58, Daniel Bristot de Oliveira wrote:
> >> @@ -2033,9 +2147,20 @@ static struct task_struct *pick_next_task_dl(struct rq *rq)
> >>      struct task_struct *p;
> >>
> >>      p = pick_task_dl(rq);
> >> -    if (p)
> >> +    if (!p)
> >> +            return p;
> >> +
> >> +    /*
> >> +     * XXX: re-check !dl_server, changed from v2 because of
> >> +     * pick_next_task_dl change
> >> +     */
> >> +    if (!dl_server(&p->dl))
> >>              set_next_task_dl(rq, p, true);
> >>
> >
> > Should this be
> >
> >        if (!p->server)
> >
> > instead? AFAICT dl_server(&p->dl) can never be true since there's no
> > pi_se-like link to the server via the dl_se, only via the task_struct, and
> > the server pick cannot return the server itself (as it's a pure sched_entity).
>
> makes sense... I will check that in the v4.

Makes sense to me too. Since p is either a real DL task or a CFS task,
"if (dl_server(&p->dl))" is incorrect.  "if (p->server)" is the right
check.

Optionally, a BUG_ON() could be added as well to make sure that, for the p
returned by pick_task_dl(), dl_server(&p->dl) is always false:
BUG_ON(dl_server(&p->dl));

thanks,

 - Joel



>
> >
> >> +    /* XXX not quite right */
> >> +    if (hrtick_enabled(rq))
> >> +            start_hrtick_dl(rq, &p->dl);
> >> +
> >
> > IIUC that got hauled out of set_next_task_dl() to cover the case where we
> > pick the server (+ the server pick) and want to more thoroughly enforce the
> > server's bandwidth. If so, what's the issue with starting the hrtick here?
>
> I think that the commend was added more as a check if it is correct... it seems it is.
>
> Thanks Vale!
> -- Daniel
>
> >
> >>      return p;
> >>  }
> >>
> >
>
  
Joel Fernandes July 4, 2023, 5:28 p.m. UTC | #5
On Tue, Jul 4, 2023 at 1:25 PM Joel Fernandes <joel@joelfernandes.org> wrote:
>
> On Tue, Jul 4, 2023 at 11:52 AM Daniel Bristot de Oliveira
> <bristot@redhat.com> wrote:
> >
> >
> > Back from EOSS...
> >
> > On 6/23/23 18:47, Valentin Schneider wrote:
> > > On 08/06/23 17:58, Daniel Bristot de Oliveira wrote:
> > >> @@ -2033,9 +2147,20 @@ static struct task_struct *pick_next_task_dl(struct rq *rq)
> > >>      struct task_struct *p;
> > >>
> > >>      p = pick_task_dl(rq);
> > >> -    if (p)
> > >> +    if (!p)
> > >> +            return p;
> > >> +
> > >> +    /*
> > >> +     * XXX: re-check !dl_server, changed from v2 because of
> > >> +     * pick_next_task_dl change
> > >> +     */
> > >> +    if (!dl_server(&p->dl))
> > >>              set_next_task_dl(rq, p, true);
> > >>
> > >
> > > Should this be
> > >
> > >        if (!p->server)
> > >
> > > instead? AFAICT dl_server(&p->dl) can never be true since there's no
> > > pi_se-like link to the server via the dl_se, only via the task_struct, and
> > > the server pick cannot return the server itself (as it's a pure sched_entity).
> >
> > makes sense... I will check that in the v4.
>
> Makes sense to me too. Since p is either a real DL task or a CFS task,
> "if (dl_server(&p->dl))" is incorrect.  "if (p->server)" is the right
> check.

Grr, "if (!p->server)" I mean. Which ensures that set_next_task_dl()
is not called on a non-DL task.

 - Joel
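
Putting the two suggestions together, the fixed-up hunk would presumably
look something like the sketch below (the reviewers' proposal spelled out,
not the posted patch; the hrtick part is kept as posted, XXX included):

	static struct task_struct *pick_next_task_dl(struct rq *rq)
	{
		struct task_struct *p;

		p = pick_task_dl(rq);
		if (!p)
			return p;

		/* A pick never returns the server's own dl entity. */
		BUG_ON(dl_server(&p->dl));

		/* p->server is only set for a server pick (a non-DL task). */
		if (!p->server)
			set_next_task_dl(rq, p, true);

		/* XXX not quite right */
		if (hrtick_enabled(rq))
			start_hrtick_dl(rq, &p->dl);

		return p;
	}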
  

Patch

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26b1925a702a..4c90d7693a75 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -64,12 +64,14 @@  struct robust_list_head;
 struct root_domain;
 struct rq;
 struct sched_attr;
+struct sched_dl_entity;
 struct sched_param;
 struct seq_file;
 struct sighand_struct;
 struct signal_struct;
 struct task_delay_info;
 struct task_group;
+struct task_struct;
 struct user_event_mm;
 
 /*
@@ -600,6 +602,9 @@  struct sched_rt_entity {
 #endif
 } __randomize_layout;
 
+typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
+typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);
+
 struct sched_dl_entity {
 	struct rb_node			rb_node;
 
@@ -647,6 +652,7 @@  struct sched_dl_entity {
 	unsigned int			dl_yielded        : 1;
 	unsigned int			dl_non_contending : 1;
 	unsigned int			dl_overrun	  : 1;
+	unsigned int			dl_server         : 1;
 
 	/*
 	 * Bandwidth enforcement timer. Each -deadline task has its
@@ -661,7 +667,20 @@  struct sched_dl_entity {
 	 * timer is needed to decrease the active utilization at the correct
 	 * time.
 	 */
-	struct hrtimer inactive_timer;
+	struct hrtimer			inactive_timer;
+
+	/*
+	 * Bits for DL-server functionality. Also see the comment near
+	 * dl_server_update().
+	 *
+	 * @rq the runqueue this server is for
+	 *
+	 * @server_has_tasks() returns true if @server_pick return a
+	 * runnable task.
+	 */
+	struct rq			*rq;
+	dl_server_has_tasks_f		server_has_tasks;
+	dl_server_pick_f		server_pick;
 
 #ifdef CONFIG_RT_MUTEXES
 	/*
@@ -790,6 +809,7 @@  struct task_struct {
 	struct sched_entity		se;
 	struct sched_rt_entity		rt;
 	struct sched_dl_entity		dl;
+	struct sched_dl_entity		*server;
 	const struct sched_class	*sched_class;
 
 #ifdef CONFIG_SCHED_CORE
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e34b02cbe41f..5b88b822ec89 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3803,6 +3803,8 @@  ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 		rq->idle_stamp = 0;
 	}
 #endif
+
+	p->server = NULL;
 }
 
 /*
@@ -6013,12 +6015,27 @@  __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 			p = pick_next_task_idle(rq);
 		}
 
+		/*
+		 * This is the fast path; it cannot be a DL server pick;
+		 * therefore even if @p == @prev, ->server must be NULL.
+		 */
+		if (p->server)
+			p->server = NULL;
+
 		return p;
 	}
 
 restart:
 	put_prev_task_balance(rq, prev, rf);
 
+	/*
+	 * We've updated @prev and no longer need the server link, clear it.
+	 * Must be done before ->pick_next_task() because that can (re)set
+	 * ->server.
+	 */
+	if (prev->server)
+		prev->server = NULL;
+
 	for_each_class(class) {
 		p = class->pick_next_task(rq);
 		if (p)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 869734eecb2c..c67056ff5749 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -52,8 +52,14 @@  static int __init sched_dl_sysctl_init(void)
 late_initcall(sched_dl_sysctl_init);
 #endif
 
+static bool dl_server(struct sched_dl_entity *dl_se)
+{
+	return dl_se->dl_server;
+}
+
 static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
 {
+	BUG_ON(dl_server(dl_se));
 	return container_of(dl_se, struct task_struct, dl);
 }
 
@@ -62,14 +68,22 @@  static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
 	return container_of(dl_rq, struct rq, dl);
 }
 
-static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
+static inline struct rq *rq_of_dl_se(struct sched_dl_entity *dl_se)
 {
-	struct task_struct *p = dl_task_of(dl_se);
-	struct rq *rq = task_rq(p);
+	struct rq *rq = dl_se->rq;
+
+	if (!dl_server(dl_se))
+		rq = task_rq(dl_task_of(dl_se));
 
-	return &rq->dl;
+	return rq;
 }
 
+static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
+{
+	return &rq_of_dl_se(dl_se)->dl;
+}
+
+
 static inline int on_dl_rq(struct sched_dl_entity *dl_se)
 {
 	return !RB_EMPTY_NODE(&dl_se->rb_node);
@@ -392,9 +406,8 @@  static void __dl_clear_params(struct sched_dl_entity *dl_se);
 static void task_non_contending(struct sched_dl_entity *dl_se)
 {
 	struct hrtimer *timer = &dl_se->inactive_timer;
-	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
-	struct rq *rq = rq_of_dl_rq(dl_rq);
-	struct task_struct *p = dl_task_of(dl_se);
+	struct rq *rq = rq_of_dl_se(dl_se);
+	struct dl_rq *dl_rq = &rq->dl;
 	s64 zerolag_time;
 
 	/*
@@ -424,25 +437,33 @@  static void task_non_contending(struct sched_dl_entity *dl_se)
 	 * utilization now, instead of starting a timer
 	 */
 	if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
-		if (dl_task(p))
+		if (dl_server(dl_se)) {
 			sub_running_bw(dl_se, dl_rq);
+		} else {
+			struct task_struct *p = dl_task_of(dl_se);
+
+			if (dl_task(p))
+				sub_running_bw(dl_se, dl_rq);
 
-		if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
-			struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+			if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
+				struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 
-			if (READ_ONCE(p->__state) == TASK_DEAD)
-				sub_rq_bw(dl_se, &rq->dl);
-			raw_spin_lock(&dl_b->lock);
-			__dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p)));
-			raw_spin_unlock(&dl_b->lock);
-			__dl_clear_params(dl_se);
+				if (READ_ONCE(p->__state) == TASK_DEAD)
+					sub_rq_bw(dl_se, &rq->dl);
+				raw_spin_lock(&dl_b->lock);
+				__dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p)));
+				raw_spin_unlock(&dl_b->lock);
+				__dl_clear_params(dl_se);
+			}
 		}
 
 		return;
 	}
 
 	dl_se->dl_non_contending = 1;
-	get_task_struct(p);
+	if (!dl_server(dl_se))
+		get_task_struct(dl_task_of(dl_se));
+
 	hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD);
 }
 
@@ -469,8 +490,10 @@  static void task_contending(struct sched_dl_entity *dl_se, int flags)
 		 * will not touch the rq's active utilization,
 		 * so we are still safe.
 		 */
-		if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1)
-			put_task_struct(dl_task_of(dl_se));
+		if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) {
+			if (!dl_server(dl_se))
+				put_task_struct(dl_task_of(dl_se));
+		}
 	} else {
 		/*
 		 * Since "dl_non_contending" is not set, the
@@ -483,10 +506,8 @@  static void task_contending(struct sched_dl_entity *dl_se, int flags)
 	}
 }
 
-static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
+static inline int is_leftmost(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
-	struct sched_dl_entity *dl_se = &p->dl;
-
 	return rb_first_cached(&dl_rq->root) == &dl_se->rb_node;
 }
 
@@ -573,8 +594,6 @@  static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 
 	if (p->nr_cpus_allowed > 1)
 		dl_rq->dl_nr_migratory++;
-
-	update_dl_migration(dl_rq);
 }
 
 static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
@@ -583,8 +602,6 @@  static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 
 	if (p->nr_cpus_allowed > 1)
 		dl_rq->dl_nr_migratory--;
-
-	update_dl_migration(dl_rq);
 }
 
 #define __node_2_pdl(node) \
@@ -762,8 +779,10 @@  static inline void deadline_queue_pull_task(struct rq *rq)
 }
 #endif /* CONFIG_SMP */
 
+static void
+enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags);
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
-static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
+static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags);
 static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
 
 static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
@@ -1011,8 +1030,7 @@  static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
  */
 static void update_dl_entity(struct sched_dl_entity *dl_se)
 {
-	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
-	struct rq *rq = rq_of_dl_rq(dl_rq);
+	struct rq *rq = rq_of_dl_se(dl_se);
 
 	if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
 	    dl_entity_overflow(dl_se, rq_clock(rq))) {
@@ -1043,11 +1061,11 @@  static inline u64 dl_next_period(struct sched_dl_entity *dl_se)
  * actually started or not (i.e., the replenishment instant is in
  * the future or in the past).
  */
-static int start_dl_timer(struct task_struct *p)
+static int start_dl_timer(struct sched_dl_entity *dl_se)
 {
-	struct sched_dl_entity *dl_se = &p->dl;
 	struct hrtimer *timer = &dl_se->dl_timer;
-	struct rq *rq = task_rq(p);
+	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+	struct rq *rq = rq_of_dl_rq(dl_rq);
 	ktime_t now, act;
 	s64 delta;
 
@@ -1081,13 +1099,33 @@  static int start_dl_timer(struct task_struct *p)
 	 * and observe our state.
 	 */
 	if (!hrtimer_is_queued(timer)) {
-		get_task_struct(p);
+		if (!dl_server(dl_se))
+			get_task_struct(dl_task_of(dl_se));
 		hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD);
 	}
 
 	return 1;
 }
 
+static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * Queueing this task back might have overloaded rq, check if we need
+	 * to kick someone away.
+	 */
+	if (has_pushable_dl_tasks(rq)) {
+		/*
+		 * Nothing relies on rq->lock after this, so its safe to drop
+		 * rq->lock.
+		 */
+		rq_unpin_lock(rq, rf);
+		push_dl_task(rq);
+		rq_repin_lock(rq, rf);
+	}
+#endif
+}
+
 /*
  * This is the bandwidth enforcement timer callback. If here, we know
  * a task is not on its dl_rq, since the fact that the timer was running
@@ -1106,10 +1144,34 @@  static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 	struct sched_dl_entity *dl_se = container_of(timer,
 						     struct sched_dl_entity,
 						     dl_timer);
-	struct task_struct *p = dl_task_of(dl_se);
+	struct task_struct *p;
 	struct rq_flags rf;
 	struct rq *rq;
 
+	if (dl_server(dl_se)) {
+		struct rq *rq = rq_of_dl_se(dl_se);
+		struct rq_flags rf;
+
+		rq_lock(rq, &rf);
+		if (dl_se->dl_throttled) {
+			sched_clock_tick();
+			update_rq_clock(rq);
+
+			if (dl_se->server_has_tasks(dl_se)) {
+				enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);
+				resched_curr(rq);
+				__push_dl_task(rq, &rf);
+			} else {
+				replenish_dl_entity(dl_se);
+			}
+
+		}
+		rq_unlock(rq, &rf);
+
+		return HRTIMER_NORESTART;
+	}
+
+	p = dl_task_of(dl_se);
 	rq = task_rq_lock(p, &rf);
 
 	/*
@@ -1180,21 +1242,7 @@  static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 	else
 		resched_curr(rq);
 
-#ifdef CONFIG_SMP
-	/*
-	 * Queueing this task back might have overloaded rq, check if we need
-	 * to kick someone away.
-	 */
-	if (has_pushable_dl_tasks(rq)) {
-		/*
-		 * Nothing relies on rq->lock after this, so its safe to drop
-		 * rq->lock.
-		 */
-		rq_unpin_lock(rq, &rf);
-		push_dl_task(rq);
-		rq_repin_lock(rq, &rf);
-	}
-#endif
+	__push_dl_task(rq, &rf);
 
 unlock:
 	task_rq_unlock(rq, p, &rf);
@@ -1236,12 +1284,11 @@  static void init_dl_task_timer(struct sched_dl_entity *dl_se)
  */
 static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
 {
-	struct task_struct *p = dl_task_of(dl_se);
-	struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se));
+	struct rq *rq = rq_of_dl_se(dl_se);
 
 	if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
 	    dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
-		if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p)))
+		if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se)))
 			return;
 		dl_se->dl_throttled = 1;
 		if (dl_se->runtime > 0)
@@ -1296,29 +1343,13 @@  static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
 	return (delta * u_act) >> BW_SHIFT;
 }
 
-/*
- * Update the current task's runtime statistics (provided it is still
- * a -deadline task and has not been removed from the dl_rq).
- */
-static void update_curr_dl(struct rq *rq)
+static inline void
+update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
+                        int flags);
+static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec)
 {
-	struct task_struct *curr = rq->curr;
-	struct sched_dl_entity *dl_se = &curr->dl;
-	s64 delta_exec, scaled_delta_exec;
-	int cpu = cpu_of(rq);
-
-	if (!dl_task(curr) || !on_dl_rq(dl_se))
-		return;
+	s64 scaled_delta_exec;
 
-	/*
-	 * Consumed budget is computed considering the time as
-	 * observed by schedulable tasks (excluding time spent
-	 * in hardirq context, etc.). Deadlines are instead
-	 * computed using hard walltime. This seems to be the more
-	 * natural solution, but the full ramifications of this
-	 * approach need further study.
-	 */
-	delta_exec = update_curr_common(rq);
 	if (unlikely(delta_exec <= 0)) {
 		if (unlikely(dl_se->dl_yielded))
 			goto throttle;
@@ -1336,10 +1367,9 @@  static void update_curr_dl(struct rq *rq)
 	 * according to current frequency and CPU maximum capacity.
 	 */
 	if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) {
-		scaled_delta_exec = grub_reclaim(delta_exec,
-						 rq,
-						 &curr->dl);
+		scaled_delta_exec = grub_reclaim(delta_exec, rq, dl_se);
 	} else {
+		int cpu = cpu_of(rq);
 		unsigned long scale_freq = arch_scale_freq_capacity(cpu);
 		unsigned long scale_cpu = arch_scale_cpu_capacity(cpu);
 
@@ -1358,11 +1388,21 @@  static void update_curr_dl(struct rq *rq)
 		    (dl_se->flags & SCHED_FLAG_DL_OVERRUN))
 			dl_se->dl_overrun = 1;
 
-		__dequeue_task_dl(rq, curr, 0);
-		if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr)))
-			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
+		dequeue_dl_entity(dl_se, 0);
+		if (!dl_server(dl_se)) {
+			/* XXX: After v2, from __dequeue_task_dl() */
+			update_stats_dequeue_dl(&rq->dl, dl_se, 0);
+			dequeue_pushable_dl_task(rq, dl_task_of(dl_se));
+		}
+
+		if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) {
+			if (dl_server(dl_se))
+				enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);
+			else
+				enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH);
+		}
 
-		if (!is_leftmost(curr, &rq->dl))
+		if (!is_leftmost(dl_se, &rq->dl))
 			resched_curr(rq);
 	}
 
@@ -1392,20 +1432,82 @@  static void update_curr_dl(struct rq *rq)
 	}
 }
 
+void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
+{
+	update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
+}
+
+void dl_server_start(struct sched_dl_entity *dl_se)
+{
+	if (!dl_server(dl_se)) {
+		dl_se->dl_server = 1;
+		setup_new_dl_entity(dl_se);
+	}
+	enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP);
+}
+
+void dl_server_stop(struct sched_dl_entity *dl_se)
+{
+	dequeue_dl_entity(dl_se, DEQUEUE_SLEEP);
+}
+
+void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
+		    dl_server_has_tasks_f has_tasks,
+		    dl_server_pick_f pick)
+{
+	dl_se->rq = rq;
+	dl_se->server_has_tasks = has_tasks;
+	dl_se->server_pick = pick;
+}
+
+/*
+ * Update the current task's runtime statistics (provided it is still
+ * a -deadline task and has not been removed from the dl_rq).
+ */
+static void update_curr_dl(struct rq *rq)
+{
+	struct task_struct *curr = rq->curr;
+	struct sched_dl_entity *dl_se = &curr->dl;
+	s64 delta_exec;
+
+	if (!dl_task(curr) || !on_dl_rq(dl_se))
+		return;
+
+	/*
+	 * Consumed budget is computed considering the time as
+	 * observed by schedulable tasks (excluding time spent
+	 * in hardirq context, etc.). Deadlines are instead
+	 * computed using hard walltime. This seems to be the more
+	 * natural solution, but the full ramifications of this
+	 * approach need further study.
+	 */
+	delta_exec = update_curr_common(rq);
+	update_curr_dl_se(rq, dl_se, delta_exec);
+}
+
 static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
 {
 	struct sched_dl_entity *dl_se = container_of(timer,
 						     struct sched_dl_entity,
 						     inactive_timer);
-	struct task_struct *p = dl_task_of(dl_se);
+	struct task_struct *p = NULL;
 	struct rq_flags rf;
 	struct rq *rq;
 
-	rq = task_rq_lock(p, &rf);
+	if (!dl_server(dl_se)) {
+		p = dl_task_of(dl_se);
+		rq = task_rq_lock(p, &rf);
+	} else {
+		rq = dl_se->rq;
+		rq_lock(rq, &rf);
+	}
 
 	sched_clock_tick();
 	update_rq_clock(rq);
 
+	if (dl_server(dl_se))
+		goto no_task;
+
 	if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
 		struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 
@@ -1422,14 +1524,21 @@  static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
 
 		goto unlock;
 	}
+
+no_task:
 	if (dl_se->dl_non_contending == 0)
 		goto unlock;
 
 	sub_running_bw(dl_se, &rq->dl);
 	dl_se->dl_non_contending = 0;
 unlock:
-	task_rq_unlock(rq, p, &rf);
-	put_task_struct(p);
+
+	if (!dl_server(dl_se)) {
+		task_rq_unlock(rq, p, &rf);
+		put_task_struct(p);
+	} else {
+		rq_unlock(rq, &rf);
+	}
 
 	return HRTIMER_NORESTART;
 }
@@ -1487,34 +1596,35 @@  static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
 static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
 static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
 
+static inline void update_dl_migration(struct dl_rq *dl_rq) {}
+
 #endif /* CONFIG_SMP */
 
 static inline
 void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
-	int prio = dl_task_of(dl_se)->prio;
 	u64 deadline = dl_se->deadline;
 
-	WARN_ON(!dl_prio(prio));
 	dl_rq->dl_nr_running++;
 	add_nr_running(rq_of_dl_rq(dl_rq), 1);
 
 	inc_dl_deadline(dl_rq, deadline);
-	inc_dl_migration(dl_se, dl_rq);
+	if (!dl_server(dl_se))
+		inc_dl_migration(dl_se, dl_rq);
+	update_dl_migration(dl_rq);
 }
 
 static inline
 void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
-	int prio = dl_task_of(dl_se)->prio;
-
-	WARN_ON(!dl_prio(prio));
 	WARN_ON(!dl_rq->dl_nr_running);
 	dl_rq->dl_nr_running--;
 	sub_nr_running(rq_of_dl_rq(dl_rq), 1);
 
 	dec_dl_deadline(dl_rq, dl_se->deadline);
-	dec_dl_migration(dl_se, dl_rq);
+	if (!dl_server(dl_se))
+		dec_dl_migration(dl_se, dl_rq);
+	update_dl_migration(dl_rq);
 }
 
 static inline bool __dl_less(struct rb_node *a, const struct rb_node *b)
@@ -1676,8 +1786,7 @@  enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
 	} else if (flags & ENQUEUE_REPLENISH) {
 		replenish_dl_entity(dl_se);
 	} else if ((flags & ENQUEUE_RESTORE) &&
-		  dl_time_before(dl_se->deadline,
-				 rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
+		   dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
 		setup_new_dl_entity(dl_se);
 	}
 
@@ -1762,14 +1871,6 @@  static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 		enqueue_pushable_dl_task(rq, p);
 }
 
-static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
-{
-	update_stats_dequeue_dl(&rq->dl, &p->dl, flags);
-	dequeue_dl_entity(&p->dl, flags);
-
-	if (!p->dl.dl_throttled)
-		dequeue_pushable_dl_task(rq, p);
-}
 
 static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 {
@@ -1778,7 +1879,9 @@  static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 	if (p->on_rq == TASK_ON_RQ_MIGRATING)
 		flags |= DEQUEUE_MIGRATING;
 
-	__dequeue_task_dl(rq, p, flags);
+	dequeue_dl_entity(&p->dl, flags);
+	if (!p->dl.dl_throttled)
+		dequeue_pushable_dl_task(rq, p);
 }
 
 /*
@@ -1968,12 +2071,12 @@  static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
 }
 
 #ifdef CONFIG_SCHED_HRTICK
-static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
 {
-	hrtick_start(rq, p->dl.runtime);
+	hrtick_start(rq, dl_se->runtime);
 }
 #else /* !CONFIG_SCHED_HRTICK */
-static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
 {
 }
 #endif
@@ -1993,9 +2096,6 @@  static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
 	if (!first)
 		return;
 
-	if (hrtick_enabled_dl(rq))
-		start_hrtick_dl(rq, p);
-
 	if (rq->curr->sched_class != &dl_sched_class)
 		update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
 
@@ -2018,12 +2118,26 @@  static struct task_struct *pick_task_dl(struct rq *rq)
 	struct dl_rq *dl_rq = &rq->dl;
 	struct task_struct *p;
 
+again:
 	if (!sched_dl_runnable(rq))
 		return NULL;
 
 	dl_se = pick_next_dl_entity(dl_rq);
 	WARN_ON_ONCE(!dl_se);
-	p = dl_task_of(dl_se);
+
+
+	if (dl_server(dl_se)) {
+		p = dl_se->server_pick(dl_se);
+		if (!p) {
+			// XXX should not happen, warn?!
+			dl_se->dl_yielded = 1;
+			update_curr_dl_se(rq, dl_se, 0);
+			goto again;
+		}
+		p->server = dl_se;
+	} else {
+		p = dl_task_of(dl_se);
+	}
 
 	return p;
 }
@@ -2033,9 +2147,20 @@  static struct task_struct *pick_next_task_dl(struct rq *rq)
 	struct task_struct *p;
 
 	p = pick_task_dl(rq);
-	if (p)
+	if (!p)
+		return p;
+
+	/*
+	 * XXX: re-check !dl_server, changed from v2 because of
+	 * pick_next_task_dl change
+	 */
+	if (!dl_server(&p->dl))
 		set_next_task_dl(rq, p, true);
 
+	/* XXX not quite right */
+	if (hrtick_enabled(rq))
+		start_hrtick_dl(rq, &p->dl);
+
 	return p;
 }
 
@@ -2073,8 +2198,8 @@  static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
 	 * be set and schedule() will start a new hrtick for the next task.
 	 */
 	if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 &&
-	    is_leftmost(p, &rq->dl))
-		start_hrtick_dl(rq, p);
+	    is_leftmost(&p->dl, &rq->dl))
+		start_hrtick_dl(rq, &p->dl);
 }
 
 static void task_fork_dl(struct task_struct *p)
@@ -3003,6 +3128,7 @@  static void __dl_clear_params(struct sched_dl_entity *dl_se)
 	dl_se->dl_yielded		= 0;
 	dl_se->dl_non_contending	= 0;
 	dl_se->dl_overrun		= 0;
+	dl_se->dl_server		= 0;
 
 #ifdef CONFIG_RT_MUTEXES
 	dl_se->pi_se			= dl_se;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fda67f05190d..0c58d8e55b69 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -930,6 +930,8 @@  s64 update_curr_common(struct rq *rq)
 
 	account_group_exec_runtime(curr, delta_exec);
 	cgroup_account_cputime(curr, delta_exec);
+	if (curr->server)
+		dl_server_update(curr->server, delta_exec);
 
 	return delta_exec;
 }
@@ -958,6 +960,8 @@  static void update_curr(struct cfs_rq *cfs_rq)
 		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
 		cgroup_account_cputime(curtask, delta_exec);
 		account_group_exec_runtime(curtask, delta_exec);
+		if (curtask->server)
+			dl_server_update(curtask->server, delta_exec);
 	}
 
 	account_cfs_rq_runtime(cfs_rq, delta_exec);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index aaf163695c2e..390c99e2f8a8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -324,6 +324,35 @@  extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *att
 extern int  dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
 extern int  dl_cpu_busy(int cpu, struct task_struct *p);
 
+/*
+ * SCHED_DEADLINE supports servers (nested scheduling) with the following
+ * interface:
+ *
+ *   dl_se::rq -- runqueue we belong to.
+ *
+ *   dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the
+ *                                server when it runs out of tasks to run.
+ *
+ *   dl_se::server_pick() -- nested pick_next_task(); we yield the period if this
+ *                           returns NULL.
+ *
+ *   dl_server_update() -- called from update_curr_common(), propagates runtime
+ *                         to the server.
+ *
+ *   dl_server_start()
+ *   dl_server_stop()  -- start/stop the server when it has (no) tasks
+ *
+ *   dl_server_init()
+ *
+ * XXX
+ */
+extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec);
+extern void dl_server_start(struct sched_dl_entity *dl_se);
+extern void dl_server_stop(struct sched_dl_entity *dl_se);
+extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
+		    dl_server_has_tasks_f has_tasks,
+		    dl_server_pick_f pick);
+
 #ifdef CONFIG_CGROUP_SCHED
 
 struct cfs_rq;