@@ -100,6 +100,7 @@ struct thread_info {
#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
#define TIF_ADDR32 29 /* 32-bit address space on 64 bits */
+#define TIF_NEED_THROTTLED	30	/* task should be throttled on return to user mode */
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
@@ -122,6 +123,7 @@ struct thread_info {
#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP)
#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
#define _TIF_ADDR32 (1 << TIF_ADDR32)
+#define _TIF_NEED_THROTTLED (1 << TIF_NEED_THROTTLED)
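A quick illustration of how the new pair is meant to be used (a sketch, not part of the patch): tests go through the _TIF_ mask, since read_task_thread_flags() returns the raw flags word, while the set/clear helpers take the bit number.

/* Hypothetical helpers, for illustration only */
static inline bool task_needs_throttle(struct task_struct *p)
{
	/* tests use the mask... */
	return read_task_thread_flags(p) & _TIF_NEED_THROTTLED;
}

static inline void mark_needs_throttle(struct task_struct *p)
{
	/* ...while set/clear take the bit number */
	set_tsk_thread_flag(p, TIF_NEED_THROTTLED);
}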
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW_BASE \
@@ -5246,10 +5246,15 @@ static inline int throttled_lb_pair(struct task_group *tg,
throttled_hierarchy(dest_cfs_rq);
}
+static void
+enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
+
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ struct sched_entity *se, *tmp;
+ struct task_struct *p;
cfs_rq->throttle_count--;
if (!cfs_rq->throttle_count) {
@@ -5259,6 +5264,12 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
/* Add cfs_rq with load or one or more already running entities to the list */
if (!cfs_rq_is_decayed(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
+		/* Re-enqueue every task that was dequeued while throttled */
+		list_for_each_entry_safe(se, tmp, &cfs_rq->throttled_tasks, group_node) {
+			list_del_init(&se->group_node);
+			p = task_of(se);
+			enqueue_task_fair(rq, p, ENQUEUE_WAKEUP);
+		}
}
return 0;
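Aside on the iterator choice above: the loop removes the node it is standing on, so the _safe variant, which caches the successor in tmp before the body runs, is required. A minimal sketch of the failure mode (assumes <linux/list.h> semantics):

/* BROKEN: list_del_init() points the node back at itself, so the
 * plain iterator re-reads it as its own successor and never advances. */
list_for_each_entry(se, &cfs_rq->throttled_tasks, group_node)
	list_del_init(&se->group_node);

/* OK: the successor was saved in tmp before the body ran. */
list_for_each_entry_safe(se, tmp, &cfs_rq->throttled_tasks, group_node)
	list_del_init(&se->group_node);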
@@ -5284,7 +5295,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, idle_task_delta, dequeue = 1;
+ long dequeue = 1;
raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
@@ -5314,49 +5325,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
- task_delta = cfs_rq->h_nr_running;
- idle_task_delta = cfs_rq->idle_h_nr_running;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- goto done;
-
- dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_task_delta = cfs_rq->h_nr_running;
-
- qcfs_rq->h_nr_running -= task_delta;
- qcfs_rq->idle_h_nr_running -= idle_task_delta;
-
- if (qcfs_rq->load.weight) {
- /* Avoid re-evaluating load for this entity: */
- se = parent_entity(se);
- break;
- }
- }
-
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- goto done;
-
- update_load_avg(qcfs_rq, se, 0);
- se_update_runnable(se);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_task_delta = cfs_rq->h_nr_running;
-
- qcfs_rq->h_nr_running -= task_delta;
- qcfs_rq->idle_h_nr_running -= idle_task_delta;
- }
-
- /* At this point se is NULL and we are at root level*/
- sub_nr_running(rq, task_delta);
-
-done:
/*
* Note: distribution will already see us throttled via the
* throttled-list. rq->lock protects completion.
@@ -5371,7 +5339,6 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, idle_task_delta;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -5401,46 +5368,6 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
goto unthrottle_throttle;
}
- task_delta = cfs_rq->h_nr_running;
- idle_task_delta = cfs_rq->idle_h_nr_running;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- if (se->on_rq)
- break;
- enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_task_delta = cfs_rq->h_nr_running;
-
- qcfs_rq->h_nr_running += task_delta;
- qcfs_rq->idle_h_nr_running += idle_task_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
- }
-
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- update_load_avg(qcfs_rq, se, UPDATE_TG);
- se_update_runnable(se);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_task_delta = cfs_rq->h_nr_running;
-
- qcfs_rq->h_nr_running += task_delta;
- qcfs_rq->idle_h_nr_running += idle_task_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
- }
-
- /* At this point se is NULL and we are at root level*/
- add_nr_running(rq, task_delta);
-
unthrottle_throttle:
assert_list_leaf_cfs_rq(rq);
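For orientation, the net effect of the throttle/unthrottle hunks above (a summary sketch, not code from the patch):

/*
 * Before: throttle_cfs_rq() dequeued the whole subtree at once and
 *         unthrottle_cfs_rq() re-enqueued it, with both walking every
 *         level to fix up h_nr_running / idle_h_nr_running and finally
 *         rq->nr_running at the root.
 *
 * After:  throttle_cfs_rq() only marks the hierarchy throttled; each
 *         task keeps running until its next return to user mode, where
 *         it is dequeued onto cfs_rq->throttled_tasks. Unthrottle then
 *         re-enqueues exactly those parked tasks via tg_unthrottle_up().
 */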
@@ -6200,10 +6127,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
-
flags = ENQUEUE_WAKEUP;
}
@@ -6219,10 +6142,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
}
/* At this point se is NULL and we are at root level*/
@@ -6278,10 +6197,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto dequeue_throttle;
-
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
/* Avoid re-evaluating load for this entity: */
@@ -6309,11 +6224,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto dequeue_throttle;
-
}
/* At this point se is NULL and we are at root level*/
@@ -6323,7 +6233,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;
-dequeue_throttle:
util_est_update(&rq->cfs, p, task_sleep);
hrtick_update(rq);
}
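With the throttled-cfs_rq early exits removed, the enqueue/dequeue bookkeeping always propagates to the root. A simplified sketch of what both loops now do unconditionally at every level (the real code also updates load averages and handles the idle and load-weight cases shown in the surrounding hunks):

for_each_sched_entity(se) {
	struct cfs_rq *qcfs_rq = cfs_rq_of(se);

	qcfs_rq->h_nr_running += 1;		/* -= 1 on the dequeue side */
	qcfs_rq->idle_h_nr_running += idle_h_nr_running;
}
/* At this point se is NULL and we are at root level */
add_nr_running(rq, 1);				/* sub_nr_running() on dequeue */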
@@ -7811,6 +7720,21 @@ static struct task_struct *pick_task_fair(struct rq *rq)
}
#endif
+/*
+ * Registered as task_work: it runs when the task is returning to user
+ * mode, sets TIF_NEED_THROTTLED and reschedules, so that the next
+ * pick_next_task_fair() can dequeue the task if it must be throttled.
+ */
+static void throttle_work(struct callback_head *arg)
+{
+	set_tsk_thread_flag(current, TIF_NEED_THROTTLED);
+	schedule();
+}
+
+static struct callback_head throttle_work_head = {
+	.next	= NULL,
+	.func	= throttle_work,
+};
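Putting the pieces together, the intended sequence around a throttle (generic-entry function names shown for orientation; a sketch, not part of the patch):

/*
 *   pick_next_task_fair()        picked task is in a throttled hierarchy
 *                                -> task_work_add(p, ..., TWA_RESUME)
 *   exit_to_user_mode_loop()
 *     resume_user_mode_work()
 *       task_work_run()
 *         throttle_work()        set TIF_NEED_THROTTLED; schedule()
 *   pick_next_task_fair()        sees the flag -> dequeue_task_fair(),
 *                                park se->group_node on throttled_tasks,
 *                                pick again from the root
 *   tg_unthrottle_up()           enqueue_task_fair() for every parked task
 */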
struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
@@ -7856,14 +7780,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
* Therefore the nr_running test will indeed
* be correct.
*/
- if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
- cfs_rq = &rq->cfs;
-
- if (!cfs_rq->nr_running)
- goto idle;
-
- goto simple;
- }
+ check_cfs_rq_runtime(cfs_rq);
}
se = pick_next_entity(cfs_rq, curr);
@@ -7872,6 +7789,19 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
p = task_of(se);
+	/*
+	 * If TIF_NEED_THROTTLED is set and the picked task sits in a
+	 * throttled hierarchy, dequeue it onto its cfs_rq's
+	 * throttled_tasks list and pick again from the root.
+	 */
+	if (read_task_thread_flags(p) & _TIF_NEED_THROTTLED) {
+		clear_tsk_thread_flag(p, TIF_NEED_THROTTLED);
+		if (throttled_hierarchy(cfs_rq_of(se))) {
+			dequeue_task_fair(rq, p, DEQUEUE_SLEEP);
+			/* group_node is free here: dequeue dropped us from cfs_tasks */
+			list_add(&se->group_node, &cfs_rq_of(se)->throttled_tasks);
+			cfs_rq = &rq->cfs;
+			goto again;
+		}
+	}
/*
* Since we haven't yet done put_prev_entity and if the selected task
* is a different task than we started out with, try and touch the
@@ -7927,6 +7857,13 @@ done: __maybe_unused;
update_misfit_status(p, rq);
+	/*
+	 * If the picked task should be throttled, register task_work so
+	 * that it reschedules (and is throttled) on its next return to
+	 * user mode.
+	 */
+	if (throttled_hierarchy(cfs_rq_of(se)))
+		SCHED_WARN_ON(task_work_add(p, &throttle_work_head, TWA_RESUME));
+
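One caveat worth flagging: task_work_add() links the node into the task's worklist through head->next, so a single file-scope throttle_work_head can only be pending on one task at a time; two CPUs picking two throttled tasks would race on it. A hedged sketch of a per-task variant (the throttle_work field in task_struct is hypothetical):

/* Hypothetical: struct task_struct gains
 *	struct callback_head	throttle_work;
 */
static void arm_throttle_work(struct task_struct *p)
{
	init_task_work(&p->throttle_work, throttle_work);
	SCHED_WARN_ON(task_work_add(p, &p->throttle_work, TWA_RESUME));
}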
return p;
idle:
@@ -12183,6 +12120,10 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
#ifdef CONFIG_SMP
raw_spin_lock_init(&cfs_rq->removed.lock);
#endif
+
+#ifdef CONFIG_CFS_BANDWIDTH
+ INIT_LIST_HEAD(&cfs_rq->throttled_tasks);
+#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -645,6 +645,7 @@ struct cfs_rq {
int throttled;
int throttle_count;
struct list_head throttled_list;
+	struct list_head	throttled_tasks; /* tasks dequeued while throttled */
#ifdef CONFIG_SMP
struct list_head throttled_csd_list;
#endif
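Finally, the new field is an intrusive list head: it links the se->group_node of tasks that were dequeued while throttled, and task_of() recovers the owning task via container_of(). A self-contained userspace demonstration of the same pattern (toy types, standalone C):

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

static void list_add_tail_(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

struct sched_entity { struct list_head group_node; };
struct task_struct { int pid; struct sched_entity se; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define task_of(sep) container_of(sep, struct task_struct, se)

int main(void)
{
	struct list_head throttled_tasks = { &throttled_tasks, &throttled_tasks };
	struct task_struct a = { .pid = 1 }, b = { .pid = 2 };

	/* park both tasks, as pick_next_task_fair() does on throttle */
	list_add_tail_(&a.se.group_node, &throttled_tasks);
	list_add_tail_(&b.se.group_node, &throttled_tasks);

	/* walk the list and recover each task, as tg_unthrottle_up() does */
	for (struct list_head *p = throttled_tasks.next; p != &throttled_tasks; p = p->next) {
		struct sched_entity *se = container_of(p, struct sched_entity, group_node);
		printf("would re-enqueue pid %d\n", task_of(se)->pid);
	}
	return 0;
}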