On 9/7/23 10:07, Peter Zijlstra wrote:
> On Wed, Sep 06, 2023 at 04:58:11PM +0200, Daniel Bristot de Oliveira wrote:
>
>>> Yeah, it's a wee hack to move it to the zero-laxity point. I was
>>> considering if it makes sense to push that down and make it available
>>> for all DL tasks, but I'm not sure..
>>
>> It might be useful in the future, like when DL dominates all other schedulers, so
>> we can have a way to schedule a deferred work, like kworkers... :-) But it might be
>> too early for that..
>
> So... that scheme I was pushing where we unconditionally decrement
> fair_server.dl_runtime from update_curr_fair(), that relies on it being
> a proper zero-laxity scheduler, and doesn't work with the proposed defer
> hack.
>
> That is, it relies on dl_runtime > 0 during throttle, and you explicitly
> set it 0.
>
> Now, I've not looked at all this code in detail in a minute, but would
> not something like the below work?
>
> AFAICT the regular dl_task_timer() callback works to make it go, because
> replenish will see positive runtime (or not, when already consumed) and
> DTRT.
>
>
> Index: linux-2.6/include/linux/sched.h
> ===================================================================
> --- linux-2.6.orig/include/linux/sched.h
> +++ linux-2.6/include/linux/sched.h
> @@ -657,6 +657,7 @@ struct sched_dl_entity {
> unsigned int dl_non_contending : 1;
> unsigned int dl_overrun : 1;
> unsigned int dl_server : 1;
> + unsigned int dl_zerolax : 1;
>
> /*
> * Bandwidth enforcement timer. Each -deadline task has its
> Index: linux-2.6/kernel/sched/deadline.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched/deadline.c
> +++ linux-2.6/kernel/sched/deadline.c
> @@ -895,6 +895,16 @@ static void replenish_dl_entity(struct s
> dl_se->dl_yielded = 0;
> if (dl_se->dl_throttled)
> dl_se->dl_throttled = 0;
> +
> + /*
> + * If this is a zero-laxity task, and we're before the zero-laxity
> + * point, throttle it.
> + */
> + if (dl_se->dl_zerolax &&
> + dl_time_before(dl_se->deadline - dl_se->runtime, rq_clock(rq))) {
> + if (!is_dl_boosted(dl_se) && start_dl_timer(dl_se))
> + dl_se->dl_throttled = 1;
> + }
> }
>
> /*
> @@ -1078,7 +1088,12 @@ static int start_dl_timer(struct sched_d
> * that it is actually coming from rq->clock and not from
> * hrtimer's time base reading.
> */
> - act = ns_to_ktime(dl_next_period(dl_se));
> + if (dl_se->dl_zerolax && !dl_se->dl_throttled) {
> + act = ns_to_ktime(dl_se->deadline - dl_se->runtime);
> + } else {
> + act = ns_to_ktime(dl_next_period(dl_se));
> + }
> +
> now = hrtimer_cb_get_time(timer);
> delta = ktime_to_ns(now) - rq_clock(rq);
> act = ktime_add_ns(act, delta);
> @@ -1794,6 +1809,13 @@ enqueue_dl_entity(struct sched_dl_entity
> setup_new_dl_entity(dl_se);
> }
>
> + /*
> + * If we are still throttled, eg. we got replenished but are a
> + * zero-laxity task and still got to wait, don't enqueue.
> + */
> + if (dl_se->dl_throttled)
> + return;
> +
> __enqueue_dl_entity(dl_se);
> }
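
Just to check that I am reading the start_dl_timer() change right, here is the
timer target with made-up numbers (50 ms runtime, 1000 ms period = deadline;
plain userspace arithmetic, not kernel code):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_MSEC 1000000ULL

int main(void)
{
	/* made-up server: replenished at t = 0, nothing consumed yet */
	uint64_t deadline = 1000 * NSEC_PER_MSEC;
	uint64_t runtime  =   50 * NSEC_PER_MSEC;

	/*
	 * Without dl_zerolax the timer is armed at dl_next_period(), which
	 * for an implicit-deadline server (dl_deadline == dl_period) is the
	 * absolute deadline itself: 1000 ms.
	 *
	 * With dl_zerolax && !dl_throttled: act = deadline - runtime.
	 */
	printf("zero-laxity point: %llu ms\n",
	       (unsigned long long)((deadline - runtime) / NSEC_PER_MSEC)); /* 950 */

	/* if 20 ms were already consumed, only 30 ms are left to protect */
	runtime = 30 * NSEC_PER_MSEC;
	printf("with 30 ms left:   %llu ms\n",
	       (unsigned long long)((deadline - runtime) / NSEC_PER_MSEC)); /* 970 */

	return 0;
}

So the timer fires at the latest point where the server can still consume its
remaining runtime before the deadline, instead of waiting for the next period.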
Let me see if I got it:
- Always start the server, but throttled with full runtime...
- Unconditionally decrement fair_server.dl_runtime from update_curr_fair()
  (checking that it is not decremented twice while the server itself runs);
  see the sketch below.
- When the dl timer fires, replenish or throttle for the next period?

Is that the base for it?
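
For the update_curr_fair() part, something along these lines is what I have in
mind -- only a sketch, not a tested patch; the helper name and the exact call
site inside update_curr_fair() are my guess:

/*
 * Sketch only: charge the fair server from the fair update path so that
 * its budget drains while fair tasks run, and the timer only has to
 * enforce whatever is left.
 */
static void fair_server_charge(struct rq *rq, s64 delta_exec)
{
	struct sched_dl_entity *dl_se = &rq->fair_server;

	if (!dl_server(dl_se))
		return;

	/*
	 * This is where the "not decremented twice" point above matters:
	 * while the server itself is running, its runtime is already
	 * accounted on the dl side, so this charge would need a guard.
	 */
	dl_server_update(dl_se, delta_exec);
}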
-- Daniel
@@ -609,6 +609,12 @@ struct sched_rt_entity {
typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);
+enum dl_server_state {
+ DL_SERVER_STOPPED = 0,
+ DL_SERVER_DEFER,
+ DL_SERVER_RUNNING
+};
+
struct sched_dl_entity {
struct rb_node rb_node;
@@ -685,6 +691,7 @@ struct sched_dl_entity {
struct rq *rq;
dl_server_has_tasks_f server_has_tasks;
dl_server_pick_f server_pick;
+ enum dl_server_state server_state;
#ifdef CONFIG_RT_MUTEXES
/*
@@ -422,7 +422,7 @@ static void task_non_contending(struct sched_dl_entity *dl_se)
if (dl_entity_is_special(dl_se))
return;
- WARN_ON(dl_se->dl_non_contending);
+ WARN_ON_ONCE(dl_se->dl_non_contending);
zerolag_time = dl_se->deadline -
div64_long((dl_se->runtime * dl_se->dl_period),
@@ -1155,6 +1155,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
struct rq_flags rf;
rq_lock(rq, &rf);
+
if (dl_se->dl_throttled) {
sched_clock_tick();
update_rq_clock(rq);
@@ -1165,9 +1166,12 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
__push_dl_task(rq, &rf);
} else {
replenish_dl_entity(dl_se);
+ task_non_contending(dl_se);
}
}
+
+ dl_se->server_state = DL_SERVER_RUNNING;
rq_unlock(rq, &rf);
return HRTIMER_NORESTART;
@@ -1441,18 +1445,65 @@ void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
}
-void dl_server_start(struct sched_dl_entity *dl_se)
+void dl_server_start(struct sched_dl_entity *dl_se, int defer)
{
+ if (dl_se->server_state != DL_SERVER_STOPPED) {
+ WARN_ON_ONCE(!(on_dl_rq(dl_se) || dl_se->dl_throttled));
+ return;
+ }
+
+ if (defer) {
+ /*
+ * Postpone the replenishment to (now + period - runtime): the latest
+ * point at which the server can still receive its full runtime before
+ * (now + period).
+ *
+ * With this in place, we have two cases:
+ *
+ * In the absence of DL tasks:
+ * The server will start at the replenishment time, getting
+ * its runtime before now + period. This is the expected
+ * throttling behavior.
+ *
+ * In the presence of DL tasks:
+ * The server will be replenished, and then it will be
+ * scheduled according to EDF, not breaking SCHED_DEADLINE.
+ *
+ * In the first cycle, the server will be postponed by at most
+ * period + (period - runtime). But then the
+ * server will receive its runtime/period.
+ *
+ * The server will, however, run on top of any RT task, which
+ * is the expected throttling behavior.
+ */
+ dl_se->deadline = rq_clock(dl_se->rq) + dl_se->dl_period - dl_se->dl_runtime;
+ /* Zero the runtime */
+ dl_se->runtime = 0;
+ /* throttle the server */
+ dl_se->dl_throttled = 1;
+
+ dl_se->server_state = DL_SERVER_DEFER;
+ start_dl_timer(dl_se);
+ return;
+ }
+
if (!dl_server(dl_se)) {
dl_se->dl_server = 1;
setup_new_dl_entity(dl_se);
}
+
+ dl_se->server_state = DL_SERVER_RUNNING;
enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP);
}
void dl_server_stop(struct sched_dl_entity *dl_se)
{
+ if (dl_se->server_state == DL_SERVER_STOPPED)
+ return;
+
+ hrtimer_try_to_cancel(&dl_se->dl_timer);
dequeue_dl_entity(dl_se, DEQUEUE_SLEEP);
+
+ dl_se->dl_throttled = 0;
+ dl_se->server_state = DL_SERVER_STOPPED;
}
void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
@@ -1462,6 +1513,8 @@ void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_se->rq = rq;
dl_se->server_has_tasks = has_tasks;
dl_se->server_pick = pick;
+ dl_se->server_state = DL_SERVER_STOPPED;
+ dl_se->dl_server = 1;
}
/*
@@ -1817,8 +1870,9 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags)
* (the task moves from "active contending" to "active non contending"
* or "inactive")
*/
- if (flags & DEQUEUE_SLEEP)
+ if (flags & DEQUEUE_SLEEP && !dl_server(dl_se))
task_non_contending(dl_se);
+
}
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
@@ -1875,7 +1929,6 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
enqueue_pushable_dl_task(rq, p);
}
-
static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
update_curr_dl(rq);
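
(Side note on the defer branch of dl_server_start() above, with made-up
numbers so it is easy to check: runtime = 50 ms, period = deadline = 1000 ms.
Plain userspace arithmetic, not kernel code:)

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_MSEC 1000000ULL

int main(void)
{
	uint64_t now     = 0;
	uint64_t runtime = 50 * NSEC_PER_MSEC;
	uint64_t period  = 1000 * NSEC_PER_MSEC;

	/* dl_server_start(defer): dl_se->deadline = rq_clock() + period - runtime */
	uint64_t defer_point = now + period - runtime;

	printf("replenishment timer at: %llu ms\n",
	       (unsigned long long)(defer_point / NSEC_PER_MSEC));	/* 950 ms */

	/*
	 * Worst case for the first cycle, as in the comment above:
	 * postponed by at most period + (period - runtime).
	 */
	printf("first cycle worst case: %llu ms\n",
	       (unsigned long long)((period + (period - runtime)) / NSEC_PER_MSEC)); /* 1950 ms */

	return 0;
}

So with no DL tasks around, the server is replenished at 950 ms and gets its
50 ms before now + period, which is the intended throttling behavior.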
@@ -6499,9 +6499,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*/
util_est_enqueue(&rq->cfs, p);
- if (!rq->cfs.h_nr_running)
- dl_server_start(&rq->fair_server);
-
/*
* If in_iowait is set, the code below may not trigger any cpufreq
* utilization updates, so do it here explicitly with the IOWAIT flag
@@ -6568,6 +6565,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_overutilized_status(rq);
enqueue_throttle:
+ if (sched_fair_server_needed(rq))
+ dl_server_start(&rq->fair_server, rq->fair_server_defer);
+
assert_list_leaf_cfs_rq(rq);
hrtick_update(rq);
@@ -6646,7 +6646,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
rq->next_balance = jiffies;
dequeue_throttle:
- if (!rq->cfs.h_nr_running)
+ if (!sched_fair_server_needed(rq))
dl_server_stop(&rq->fair_server);
util_est_update(&rq->cfs, p, task_sleep);
@@ -8317,6 +8317,8 @@ void fair_server_init(struct rq *rq)
dl_se->dl_deadline = 1000 * NSEC_PER_MSEC;
dl_se->dl_period = 1000 * NSEC_PER_MSEC;
+ rq->fair_server_defer = 1;
+
dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick);
}
@@ -1537,6 +1537,9 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
+
+ if (sched_fair_server_needed(rq))
+ dl_server_start(&rq->fair_server, rq->fair_server_defer);
}
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -1547,6 +1550,9 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
dequeue_rt_entity(rt_se, flags);
dequeue_pushable_task(rq, p);
+
+ if (!sched_fair_server_needed(rq))
+ dl_server_stop(&rq->fair_server);
}
/*
@@ -345,7 +345,7 @@ extern int dl_bw_check_overflow(int cpu);
* dl_server_init() -- initializes the server.
*/
extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec);
-extern void dl_server_start(struct sched_dl_entity *dl_se);
+extern void dl_server_start(struct sched_dl_entity *dl_se, int defer);
extern void dl_server_stop(struct sched_dl_entity *dl_se);
extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_server_has_tasks_f has_tasks,
@@ -1027,6 +1027,7 @@ struct rq {
struct dl_rq dl;
struct sched_dl_entity fair_server;
+ int fair_server_defer;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this CPU: */
@@ -2394,6 +2395,15 @@ static inline bool sched_fair_runnable(struct rq *rq)
return rq->cfs.nr_running > 0;
}
+static inline bool sched_fair_server_needed(struct rq *rq)
+{
+ /*
+ * The fair server is activated whenever a fair task can starve
+ * because of real-time tasks.
+ */
+ return (sched_rt_runnable(rq) && sched_fair_runnable(rq));
+}
+
extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
extern struct task_struct *pick_next_task_idle(struct rq *rq);
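
For completeness, this is how I read the server_state lifecycle across the
hunks above; the patch does not spell it out in one place, so take it as my
summary rather than something taken from the code:

/*
 *  DL_SERVER_STOPPED -> DL_SERVER_DEFER     dl_server_start(defer = 1):
 *                                           runtime = 0, dl_throttled = 1,
 *                                           timer armed at now + period - runtime
 *
 *  DL_SERVER_STOPPED -> DL_SERVER_RUNNING   dl_server_start(defer = 0):
 *                                           enqueue (setup_new_dl_entity()
 *                                           on first use)
 *
 *  DL_SERVER_DEFER   -> DL_SERVER_RUNNING   dl_task_timer(): replenish and
 *                                           mark the server running
 *
 *  DEFER / RUNNING   -> DL_SERVER_STOPPED   dl_server_stop(): cancel the
 *                                           timer, dequeue, clear throttled
 */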