[2/2] sched/core: split iowait state into two states
Commit Message
iowait is a bogus metric, but it's helpful in the sense that it allows
short waits to not enter sleep states that have a higher exit latency
than we would've picked for iowait'ing tasks. However, it's harmless in
that lots of applications and monitoring assumes that iowait is busy
time, or otherwise use it as a health metric. Particularly for async
IO it's entirely nonsensical.
Split the iowait part into two parts - one that tracks whether we need
boosting for short waits, and one that says we need to account the task
as such. ->in_iowait_acct nests inside of ->in_iowait, both for
efficiency reasons, but also so that the relationship between the two
is clear. A waiter may set ->in_wait alone and not care about the
accounting.
Existing users of nr_iowait() for accounting purposes are switched to
use nr_iowait_acct(), which leaves the governor using nr_iowait() as it
only cares about iowaiters, not the accounting side.
io_schedule_prepare() and io_schedule_finish() are changed to return
a simple mask of two state bits, as we now have more than one state to
manage. Outside of that, no further changes are needed to suppor this
generically.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
arch/s390/appldata/appldata_base.c | 2 +-
arch/s390/appldata/appldata_os.c | 2 +-
fs/proc/stat.c | 2 +-
include/linux/sched.h | 6 ++++
include/linux/sched/stat.h | 10 +++++--
kernel/sched/core.c | 45 ++++++++++++++++++++++++------
kernel/sched/sched.h | 2 ++
kernel/time/tick-sched.c | 6 ++--
8 files changed, 59 insertions(+), 16 deletions(-)
Comments
On Wed, Feb 28 2024 at 12:16, Jens Axboe wrote:
> iowait is a bogus metric, but it's helpful in the sense that it allows
> short waits to not enter sleep states that have a higher exit latency
> than we would've picked for iowait'ing tasks. However, it's harmless in
> that lots of applications and monitoring assumes that iowait is busy
> time, or otherwise use it as a health metric. Particularly for async
> IO it's entirely nonsensical.
>
> Split the iowait part into two parts - one that tracks whether we need
> boosting for short waits, and one that says we need to account the
> task
We :)
> as such. ->in_iowait_acct nests inside of ->in_iowait, both for
> efficiency reasons, but also so that the relationship between the two
> is clear. A waiter may set ->in_wait alone and not care about the
> accounting.
> +/*
> + * Returns a token which is comprised of the two bits of iowait wait state -
> + * one is whether we're making ourselves as in iowait for cpufreq reasons,
> + * and the other is if the task should be accounted as such.
> + */
> int io_schedule_prepare(void)
> {
> - int old_iowait = current->in_iowait;
> + int old_wait_flags = 0;
> +
> + if (current->in_iowait)
> + old_wait_flags |= TASK_IOWAIT;
> + if (current->in_iowait_acct)
> + old_wait_flags |= TASK_IOWAIT_ACCT;
>
> current->in_iowait = 1;
> + current->in_iowait_acct = 1;
> blk_flush_plug(current->plug, true);
> - return old_iowait;
> + return old_wait_flags;
> }
>
> -void io_schedule_finish(int token)
> +void io_schedule_finish(int old_wait_flags)
> {
> - current->in_iowait = token;
> + if (!(old_wait_flags & TASK_IOWAIT))
> + current->in_iowait = 0;
> + if (!(old_wait_flags & TASK_IOWAIT_ACCT))
> + current->in_iowait_acct = 0;
Why? TASK_IOWAIT_ACCT requires TASK_IOWAIT, right? So if TASK_IOWAIT was
not set then TASK_IOWAIT_ACCT must have been clear too, no?
Thanks,
tglx
On 2/29/24 10:31 AM, Thomas Gleixner wrote:
> On Wed, Feb 28 2024 at 12:16, Jens Axboe wrote:
>> iowait is a bogus metric, but it's helpful in the sense that it allows
>> short waits to not enter sleep states that have a higher exit latency
>> than we would've picked for iowait'ing tasks. However, it's harmless in
>> that lots of applications and monitoring assumes that iowait is busy
>> time, or otherwise use it as a health metric. Particularly for async
>> IO it's entirely nonsensical.
>>
>> Split the iowait part into two parts - one that tracks whether we need
>> boosting for short waits, and one that says we need to account the
>> task
>
> We :)
I appreciate the commit message police :-)
I'll rewrite it.
>> +/*
>> + * Returns a token which is comprised of the two bits of iowait wait state -
>> + * one is whether we're making ourselves as in iowait for cpufreq reasons,
>> + * and the other is if the task should be accounted as such.
>> + */
>> int io_schedule_prepare(void)
>> {
>> - int old_iowait = current->in_iowait;
>> + int old_wait_flags = 0;
>> +
>> + if (current->in_iowait)
>> + old_wait_flags |= TASK_IOWAIT;
>> + if (current->in_iowait_acct)
>> + old_wait_flags |= TASK_IOWAIT_ACCT;
>>
>> current->in_iowait = 1;
>> + current->in_iowait_acct = 1;
>> blk_flush_plug(current->plug, true);
>> - return old_iowait;
>> + return old_wait_flags;
>> }
>>
>> -void io_schedule_finish(int token)
>> +void io_schedule_finish(int old_wait_flags)
>> {
>> - current->in_iowait = token;
>> + if (!(old_wait_flags & TASK_IOWAIT))
>> + current->in_iowait = 0;
>> + if (!(old_wait_flags & TASK_IOWAIT_ACCT))
>> + current->in_iowait_acct = 0;
>
> Why? TASK_IOWAIT_ACCT requires TASK_IOWAIT, right? So if TASK_IOWAIT was
> not set then TASK_IOWAIT_ACCT must have been clear too, no?
It does, IOWAIT_ACCT always nests inside IOWAIT. I guess it would be
more explanatory as:
/*
* If TASK_IOWAIT isn't set, then TASK_IOWAIT_ACCT cannot have
* been set either as it nests inside TASK_IOWAIT.
*/
if (!(old_wait_flags & TASK_IOWAIT))
current->in_iowait = 0;
else if (!(old_wait_flags & TASK_IOWAIT_ACCT))
current->in_iowait_acct = 0;
?
@@ -423,4 +423,4 @@ EXPORT_SYMBOL_GPL(si_swapinfo);
#endif
EXPORT_SYMBOL_GPL(nr_threads);
EXPORT_SYMBOL_GPL(nr_running);
-EXPORT_SYMBOL_GPL(nr_iowait);
+EXPORT_SYMBOL_GPL(nr_iowait_acct);
@@ -100,7 +100,7 @@ static void appldata_get_os_data(void *data)
os_data->nr_threads = nr_threads;
os_data->nr_running = nr_running();
- os_data->nr_iowait = nr_iowait();
+ os_data->nr_iowait = nr_iowait_acct();
os_data->avenrun[0] = avenrun[0] + (FIXED_1/200);
os_data->avenrun[1] = avenrun[1] + (FIXED_1/200);
os_data->avenrun[2] = avenrun[2] + (FIXED_1/200);
@@ -180,7 +180,7 @@ static int show_stat(struct seq_file *p, void *v)
(unsigned long long)boottime.tv_sec,
total_forks,
nr_running(),
- nr_iowait());
+ nr_iowait_acct());
seq_put_decimal_ull(p, "softirq ", (unsigned long long)sum_softirq);
@@ -922,7 +922,13 @@ struct task_struct {
/* Bit to tell TOMOYO we're in execve(): */
unsigned in_execve:1;
+ /* task is in iowait */
unsigned in_iowait:1;
+ /*
+ * task is in iowait and should be accounted as such. can only be set
+ * if ->in_iowait is also set.
+ */
+ unsigned in_iowait_acct:1;
#ifndef TIF_RESTORE_SIGMASK
unsigned restore_sigmask:1;
#endif
@@ -19,8 +19,14 @@ DECLARE_PER_CPU(unsigned long, process_counts);
extern int nr_processes(void);
extern unsigned int nr_running(void);
extern bool single_task_running(void);
-extern unsigned int nr_iowait(void);
-extern unsigned int nr_iowait_cpu(int cpu);
+extern unsigned int nr_iowait_acct(void);
+extern unsigned int nr_iowait_acct_cpu(int cpu);
+unsigned int nr_iowait_cpu(int cpu);
+
+enum {
+ TASK_IOWAIT = 1,
+ TASK_IOWAIT_ACCT = 2,
+};
static inline int sched_info_on(void)
{
@@ -3790,6 +3790,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
if (p->in_iowait) {
delayacct_blkio_end(p);
task_rq(p)->nr_iowait--;
+ if (p->in_iowait_acct)
+ task_rq(p)->nr_iowait_acct--;
}
activate_task(rq, p, en_flags);
@@ -4358,6 +4360,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
delayacct_blkio_end(p);
atomic_inc(&__rq->nr_iowait_remote);
+ if (p->in_iowait_acct)
+ atomic_inc(&__rq->nr_iowait_acct_remote);
}
wake_flags |= WF_MIGRATED;
@@ -5463,11 +5467,11 @@ unsigned long long nr_context_switches(void)
* it does become runnable.
*/
-unsigned int nr_iowait_cpu(int cpu)
+unsigned int nr_iowait_acct_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- return rq->nr_iowait - atomic_read(&rq->nr_iowait_remote);
+ return rq->nr_iowait_acct - atomic_read(&rq->nr_iowait_acct_remote);
}
/*
@@ -5500,16 +5504,23 @@ unsigned int nr_iowait_cpu(int cpu)
* Task CPU affinities can make all that even more 'interesting'.
*/
-unsigned int nr_iowait(void)
+unsigned int nr_iowait_acct(void)
{
unsigned int i, sum = 0;
for_each_possible_cpu(i)
- sum += nr_iowait_cpu(i);
+ sum += nr_iowait_acct_cpu(i);
return sum;
}
+unsigned int nr_iowait_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ return rq->nr_iowait - atomic_read(&rq->nr_iowait_remote);
+}
+
#ifdef CONFIG_SMP
/*
@@ -6686,6 +6697,8 @@ static void __sched notrace __schedule(unsigned int sched_mode)
if (prev->in_iowait) {
rq->nr_iowait++;
+ if (prev->in_iowait_acct)
+ rq->nr_iowait_acct++;
delayacct_blkio_start();
}
}
@@ -8988,18 +9001,32 @@ int __sched yield_to(struct task_struct *p, bool preempt)
}
EXPORT_SYMBOL_GPL(yield_to);
+/*
+ * Returns a token which is comprised of the two bits of iowait wait state -
+ * one is whether we're making ourselves as in iowait for cpufreq reasons,
+ * and the other is if the task should be accounted as such.
+ */
int io_schedule_prepare(void)
{
- int old_iowait = current->in_iowait;
+ int old_wait_flags = 0;
+
+ if (current->in_iowait)
+ old_wait_flags |= TASK_IOWAIT;
+ if (current->in_iowait_acct)
+ old_wait_flags |= TASK_IOWAIT_ACCT;
current->in_iowait = 1;
+ current->in_iowait_acct = 1;
blk_flush_plug(current->plug, true);
- return old_iowait;
+ return old_wait_flags;
}
-void io_schedule_finish(int token)
+void io_schedule_finish(int old_wait_flags)
{
- current->in_iowait = token;
+ if (!(old_wait_flags & TASK_IOWAIT))
+ current->in_iowait = 0;
+ if (!(old_wait_flags & TASK_IOWAIT_ACCT))
+ current->in_iowait_acct = 0;
}
/*
@@ -10033,6 +10060,8 @@ void __init sched_init(void)
#endif
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
+ rq->nr_iowait_acct = 0;
+ atomic_set(&rq->nr_iowait_acct_remote, 0);
rq->nr_iowait = 0;
atomic_set(&rq->nr_iowait_remote, 0);
@@ -1054,6 +1054,8 @@ struct rq {
* modified under the rq lock (nr_iowait), and if we don't have the rq
* lock, then nr_iowait_remote is used.
*/
+ unsigned int nr_iowait_acct;
+ atomic_t nr_iowait_acct_remote;
unsigned int nr_iowait;
atomic_t nr_iowait_remote;
@@ -669,7 +669,7 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
delta = ktime_sub(now, ts->idle_entrytime);
write_seqcount_begin(&ts->idle_sleeptime_seq);
- if (nr_iowait_cpu(smp_processor_id()) > 0)
+ if (nr_iowait_acct_cpu(smp_processor_id()) > 0)
ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
else
ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
@@ -742,7 +742,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime,
- !nr_iowait_cpu(cpu), last_update_time);
+ !nr_iowait_acct_cpu(cpu), last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
@@ -768,7 +768,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime,
- nr_iowait_cpu(cpu), last_update_time);
+ nr_iowait_acct_cpu(cpu), last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);