[1/2] sched/core: switch struct rq->nr_iowait to an unsigned int

Message ID: 20240227211152.1099534-2-axboe@kernel.dk
State: New
Series: Split iowait into two states

Commit Message

Jens Axboe Feb. 27, 2024, 9:06 p.m. UTC
In 3 of the 4 spots where we modify rq->nr_iowait, we already hold the
rq lock, and in the 4th one we can just grab it. This avoids an atomic
operation in the scheduler fast path if we're in iowait, with the
tradeoff that we'll grab the rq lock for the case where a task is
scheduled out in iowait mode on one CPU and scheduled in on another CPU.

This obviously leaves the reading side potentially racy, but that should
be OK. iowait states change all the time, and can change while they are
being read or summed up as well.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/sched/core.c    | 15 ++++++++++-----
 kernel/sched/cputime.c |  2 +-
 kernel/sched/sched.h   |  2 +-
 3 files changed, 12 insertions(+), 7 deletions(-)
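
For context on the "summed up" read side mentioned above: nr_iowait() in
kernel/sched/core.c (not touched by this patch) simply walks the per-CPU
counters. A minimal sketch of that loop, assuming the post-patch plain
integer reads:

unsigned int nr_iowait(void)
{
	unsigned int i, sum = 0;

	/*
	 * Each nr_iowait_cpu() is now a plain load of rq->nr_iowait, so
	 * the total is a best-effort snapshot: individual counters may
	 * change while the sum is being taken.
	 */
	for_each_possible_cpu(i)
		sum += nr_iowait_cpu(i);

	return sum;
}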
  

Comments

David Wei Feb. 27, 2024, 11:05 p.m. UTC | #1
On 2024-02-27 21:06, Jens Axboe wrote:
> In 3 of the 4 spots where we modify rq->nr_iowait, we already hold the
> rq lock, and in the 4th one we can just grab it. This avoids an atomic
> operation in the scheduler fast path if we're in iowait, with the
> tradeoff that we'll grab the rq lock for the case where a task is
> scheduled out in iowait mode on one CPU and scheduled in on another CPU.
> 
> This obviously leaves the reading side potentially racy, but that should
> be OK. iowait states change all the time, and can change while they are
> being read or summed up as well.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  kernel/sched/core.c    | 15 ++++++++++-----
>  kernel/sched/cputime.c |  2 +-
>  kernel/sched/sched.h   |  2 +-
>  3 files changed, 12 insertions(+), 7 deletions(-)
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 9116bcc90346..ecc6c26096e5 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3789,7 +3789,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
>  #endif
>  	if (p->in_iowait) {
>  		delayacct_blkio_end(p);
> -		atomic_dec(&task_rq(p)->nr_iowait);
> +		task_rq(p)->nr_iowait--;
>  	}
>  
>  	activate_task(rq, p, en_flags);
> @@ -4354,8 +4354,13 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
>  		cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
>  		if (task_cpu(p) != cpu) {
>  			if (p->in_iowait) {
> +				struct rq *rq = task_rq(p);
> +				struct rq_flags rf;
> +
> +				rq_lock(rq, &rf);
> +				task_rq(p)->nr_iowait--;

Could this use rq directly, or does it not matter?

> +				rq_unlock(rq, &rf);
>  				delayacct_blkio_end(p);
> -				atomic_dec(&task_rq(p)->nr_iowait);
>  			}
>  
>  			wake_flags |= WF_MIGRATED;
> @@ -5463,7 +5468,7 @@ unsigned long long nr_context_switches(void)
>  
>  unsigned int nr_iowait_cpu(int cpu)
>  {
> -	return atomic_read(&cpu_rq(cpu)->nr_iowait);
> +	return cpu_rq(cpu)->nr_iowait;
>  }
>  
>  /*
> @@ -6681,7 +6686,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
>  			deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
>  
>  			if (prev->in_iowait) {
> -				atomic_inc(&rq->nr_iowait);
> +				rq->nr_iowait++;
>  				delayacct_blkio_start();
>  			}
>  		}
> @@ -10029,7 +10034,7 @@ void __init sched_init(void)
>  #endif
>  #endif /* CONFIG_SMP */
>  		hrtick_rq_init(rq);
> -		atomic_set(&rq->nr_iowait, 0);
> +		rq->nr_iowait = 0;

I checked that both ttwu_do_activate() and __schedule() have the rq lock
held, but I couldn't find it for this. Is it under the assumption that
the rq is in a pre-init state (maybe because scheduler_running = 0?) so
no lock is needed?

>  
>  #ifdef CONFIG_SCHED_CORE
>  		rq->core = rq;
> diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
> index af7952f12e6c..b970b6c6151e 100644
> --- a/kernel/sched/cputime.c
> +++ b/kernel/sched/cputime.c
> @@ -224,7 +224,7 @@ void account_idle_time(u64 cputime)
>  	u64 *cpustat = kcpustat_this_cpu->cpustat;
>  	struct rq *rq = this_rq();
>  
> -	if (atomic_read(&rq->nr_iowait) > 0)
> +	if (rq->nr_iowait)
>  		cpustat[CPUTIME_IOWAIT] += cputime;
>  	else
>  		cpustat[CPUTIME_IDLE] += cputime;
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 001fe047bd5d..a1222a4bdc7b 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1049,7 +1049,7 @@ struct rq {
>  	u64			clock_idle_copy;
>  #endif
>  
> -	atomic_t		nr_iowait;
> +	unsigned int		nr_iowait;
>  
>  #ifdef CONFIG_SCHED_DEBUG
>  	u64 last_seen_need_resched_ns;
  
Jens Axboe Feb. 27, 2024, 11:17 p.m. UTC | #2
On 2/27/24 4:05 PM, David Wei wrote:
>> @@ -4354,8 +4354,13 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
>>  		cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
>>  		if (task_cpu(p) != cpu) {
>>  			if (p->in_iowait) {
>> +				struct rq *rq = task_rq(p);
>> +				struct rq_flags rf;
>> +
>> +				rq_lock(rq, &rf);
>> +				task_rq(p)->nr_iowait--;
> 
> Could this use rq directly, or does it not matter?

It certainly could; I'll make that edit. Same thing, but we may as well
use the variable as defined. It also makes it clear that we're modifying
the rq we've locked.
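
A sketch of the hunk with just that one change applied (nothing else
moved):

				struct rq *rq = task_rq(p);
				struct rq_flags rf;

				rq_lock(rq, &rf);
				/* operate on the rq we just locked */
				rq->nr_iowait--;
				rq_unlock(rq, &rf);
				delayacct_blkio_end(p);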

>> @@ -10029,7 +10034,7 @@ void __init sched_init(void)
>>  #endif
>>  #endif /* CONFIG_SMP */
>>  		hrtick_rq_init(rq);
>> -		atomic_set(&rq->nr_iowait, 0);
>> +		rq->nr_iowait = 0;
> 
> I checked that both ttwu_do_activate() and __schedule() have the rq lock
> held, but I couldn't find it for this. Is it under the assumption that
> the rq is in a pre-init state (maybe because scheduler_running = 0?) so
> no lock is needed?

This is run at boot time (it's __init), before anything else is running.
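
For reference, sched_init() is called from start_kernel() in init/main.c
before interrupts are enabled and before any other task or CPU can run,
which is why no lock is needed. An abridged sketch (comment paraphrased):

asmlinkage __visible void __init start_kernel(void)
{
	...
	/*
	 * Set up the scheduler prior to starting any interrupts (such
	 * as the timer interrupt). Only the boot CPU is active here,
	 * so nothing can observe or modify rq->nr_iowait concurrently.
	 */
	sched_init();
	...
}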
  

Patch

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9116bcc90346..ecc6c26096e5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3789,7 +3789,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 #endif
 	if (p->in_iowait) {
 		delayacct_blkio_end(p);
-		atomic_dec(&task_rq(p)->nr_iowait);
+		task_rq(p)->nr_iowait--;
 	}
 
 	activate_task(rq, p, en_flags);
@@ -4354,8 +4354,13 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 		cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
 		if (task_cpu(p) != cpu) {
 			if (p->in_iowait) {
+				struct rq *rq = task_rq(p);
+				struct rq_flags rf;
+
+				rq_lock(rq, &rf);
+				task_rq(p)->nr_iowait--;
+				rq_unlock(rq, &rf);
 				delayacct_blkio_end(p);
-				atomic_dec(&task_rq(p)->nr_iowait);
 			}
 
 			wake_flags |= WF_MIGRATED;
@@ -5463,7 +5468,7 @@ unsigned long long nr_context_switches(void)
 
 unsigned int nr_iowait_cpu(int cpu)
 {
-	return atomic_read(&cpu_rq(cpu)->nr_iowait);
+	return cpu_rq(cpu)->nr_iowait;
 }
 
 /*
@@ -6681,7 +6686,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 			deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
 
 			if (prev->in_iowait) {
-				atomic_inc(&rq->nr_iowait);
+				rq->nr_iowait++;
 				delayacct_blkio_start();
 			}
 		}
@@ -10029,7 +10034,7 @@ void __init sched_init(void)
 #endif
 #endif /* CONFIG_SMP */
 		hrtick_rq_init(rq);
-		atomic_set(&rq->nr_iowait, 0);
+		rq->nr_iowait = 0;
 
 #ifdef CONFIG_SCHED_CORE
 		rq->core = rq;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index af7952f12e6c..b970b6c6151e 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -224,7 +224,7 @@ void account_idle_time(u64 cputime)
 	u64 *cpustat = kcpustat_this_cpu->cpustat;
 	struct rq *rq = this_rq();
 
-	if (atomic_read(&rq->nr_iowait) > 0)
+	if (rq->nr_iowait)
 		cpustat[CPUTIME_IOWAIT] += cputime;
 	else
 		cpustat[CPUTIME_IDLE] += cputime;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 001fe047bd5d..a1222a4bdc7b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1049,7 +1049,7 @@ struct rq {
 	u64			clock_idle_copy;
 #endif
 
-	atomic_t		nr_iowait;
+	unsigned int		nr_iowait;
 
 #ifdef CONFIG_SCHED_DEBUG
 	u64 last_seen_need_resched_ns;