[5/6] locking/rwsem: Unify wait loop

Message ID 20230223123319.608133045@infradead.org
State New
Headers
Series locking/rwsem: Rework writer wakeup and handoff |

Commit Message

Peter Zijlstra Feb. 23, 2023, 12:26 p.m. UTC
  Now that the reader and writer wait loops are identical, share the
code.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/locking/rwsem.c |  117 +++++++++++++++++++------------------------------
 1 file changed, 47 insertions(+), 70 deletions(-)
  

Comments

Boqun Feng Feb. 23, 2023, 7:31 p.m. UTC | #1
On Thu, Feb 23, 2023 at 01:26:47PM +0100, Peter Zijlstra wrote:
> Now that the reader and writer wait loops are identical, share the
> code.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  kernel/locking/rwsem.c |  117 +++++++++++++++++++------------------------------
>  1 file changed, 47 insertions(+), 70 deletions(-)
> 
> --- a/kernel/locking/rwsem.c
> +++ b/kernel/locking/rwsem.c
> @@ -650,13 +650,11 @@ static void rwsem_mark_wake(struct rw_se
>   * optionally wake up waiters before it returns.
>   */
>  static inline void
> -rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
> -		      struct wake_q_head *wake_q)
> +rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
>  		      __releases(&sem->wait_lock)
>  {
>  	bool first = rwsem_first_waiter(sem) == waiter;
> -
> -	wake_q_init(wake_q);
> +	DEFINE_WAKE_Q(wake_q);
>  
>  	/*
>  	 * If the wait_list isn't empty and the waiter to be deleted is
> @@ -664,10 +662,10 @@ rwsem_del_wake_waiter(struct rw_semaphor
>  	 * be eligible to acquire or spin on the lock.
>  	 */
>  	if (rwsem_del_waiter(sem, waiter) && first)
> -		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q);
> +		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
>  	raw_spin_unlock_irq(&sem->wait_lock);
> -	if (!wake_q_empty(wake_q))
> -		wake_up_q(wake_q);
> +	if (!wake_q_empty(&wake_q))
> +		wake_up_q(&wake_q);
>  }
>  
>  /*
> @@ -993,6 +991,46 @@ static inline void rwsem_cond_wake_waite
>  	rwsem_mark_wake(sem, wake_type, wake_q);
>  }
>  
> +#define waiter_type(_waiter, _r, _w)	\
> +	((_waiter)->type == RWSEM_WAITING_FOR_READ ? (_r) : (_w))
> +
> +static __always_inline struct rw_semaphore *
> +rwsem_waiter_wait(struct rw_semaphore *sem, struct rwsem_waiter *waiter, int state)
> +{
> +	trace_contention_begin(sem, waiter_type(waiter, LCB_F_READ, LCB_F_WRITE));
> +
> +	/* wait to be given the lock */
> +	for (;;) {
> +		set_current_state(state);
> +		if (!smp_load_acquire(&waiter->task)) {
> +			/* Matches rwsem_waiter_wake()'s smp_store_release(). */
> +			break;
> +		}
> +		if (signal_pending_state(state, current)) {
> +			raw_spin_lock_irq(&sem->wait_lock);

Move the below __set_current_state(TASK_RUNNING)s up here? I think we
need the preemption protection when changing the task state here.

> +			if (waiter->task)
> +				goto out_nolock;

I originally wanted to suggest renaming the label to "out_locked", but I
think we can just move the labeled code up here? And even open-code
rwsem_del_wake_waiter() since it only has one usage.


Regards,
Boqun

> +			raw_spin_unlock_irq(&sem->wait_lock);
> +			/* Ordered by sem->wait_lock against rwsem_mark_wake(). */
> +			break;
> +		}
> +		schedule_preempt_disabled();
> +		lockevent_inc(waiter_type(waiter, rwsem_sleep_reader, rwsem_sleep_writer));
> +	}
> +
> +	__set_current_state(TASK_RUNNING);
> +	lockevent_inc(waiter_type(waiter, rwsem_rlock, rwsem_wlock));
> +	trace_contention_end(sem, 0);
> +	return sem;
> +
> +out_nolock:
> +	rwsem_del_wake_waiter(sem, waiter);
> +	__set_current_state(TASK_RUNNING);
> +	lockevent_inc(waiter_type(waiter, rwsem_rlock_fail, rwsem_wlock_fail));
> +	trace_contention_end(sem, -EINTR);
> +	return ERR_PTR(-EINTR);
> +}
> +
>  /*
>   * Wait for the read lock to be granted
>   */
> @@ -1071,38 +1109,7 @@ rwsem_down_read_slowpath(struct rw_semap
>  	if (!wake_q_empty(&wake_q))
>  		wake_up_q(&wake_q);
>  
> -	trace_contention_begin(sem, LCB_F_READ);
> -
> -	/* wait to be given the lock */
> -	for (;;) {
> -		set_current_state(state);
> -		if (!smp_load_acquire(&waiter.task)) {
> -			/* Matches rwsem_waiter_wake()'s smp_store_release(). */
> -			break;
> -		}
> -		if (signal_pending_state(state, current)) {
> -			raw_spin_lock_irq(&sem->wait_lock);
> -			if (waiter.task)
> -				goto out_nolock;
> -			raw_spin_unlock_irq(&sem->wait_lock);
> -			/* Ordered by sem->wait_lock against rwsem_mark_wake(). */
> -			break;
> -		}
> -		schedule_preempt_disabled();
> -		lockevent_inc(rwsem_sleep_reader);
> -	}
> -
> -	__set_current_state(TASK_RUNNING);
> -	lockevent_inc(rwsem_rlock);
> -	trace_contention_end(sem, 0);
> -	return sem;
> -
> -out_nolock:
> -	rwsem_del_wake_waiter(sem, &waiter, &wake_q);
> -	__set_current_state(TASK_RUNNING);
> -	lockevent_inc(rwsem_rlock_fail);
> -	trace_contention_end(sem, -EINTR);
> -	return ERR_PTR(-EINTR);
> +	return rwsem_waiter_wait(sem, &waiter, state);
>  }
>  
>  /*
> @@ -1150,37 +1157,7 @@ rwsem_down_write_slowpath(struct rw_sema
>  	}
>  	raw_spin_unlock_irq(&sem->wait_lock);
>  
> -	/* wait until we successfully acquire the lock */
> -	trace_contention_begin(sem, LCB_F_WRITE);
> -
> -	for (;;) {
> -		set_current_state(state);
> -		if (!smp_load_acquire(&waiter.task)) {
> -			/* Matches rwsem_waiter_wake()'s smp_store_release(). */
> -			break;
> -		}
> -		if (signal_pending_state(state, current)) {
> -			raw_spin_lock_irq(&sem->wait_lock);
> -			if (waiter.task)
> -				goto out_nolock;
> -			raw_spin_unlock_irq(&sem->wait_lock);
> -			/* Ordered by sem->wait_lock against rwsem_mark_wake(). */
> -			break;
> -		}
> -		schedule_preempt_disabled();
> -		lockevent_inc(rwsem_sleep_writer);
> -	}
> -	__set_current_state(TASK_RUNNING);
> -	lockevent_inc(rwsem_wlock);
> -	trace_contention_end(sem, 0);
> -	return sem;
> -
> -out_nolock:
> -	rwsem_del_wake_waiter(sem, &waiter, &wake_q);
> -	__set_current_state(TASK_RUNNING);
> -	lockevent_inc(rwsem_wlock_fail);
> -	trace_contention_end(sem, -EINTR);
> -	return ERR_PTR(-EINTR);
> +	return rwsem_waiter_wait(sem, &waiter, state);
>  }
>  
>  /*
> 
>
  
Waiman Long Feb. 23, 2023, 10:45 p.m. UTC | #2
On 2/23/23 07:26, Peter Zijlstra wrote:
> Now that the reader and writer wait loops are identical, share the
> code.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>   kernel/locking/rwsem.c |  117 +++++++++++++++++++------------------------------
>   1 file changed, 47 insertions(+), 70 deletions(-)
>
> --- a/kernel/locking/rwsem.c
> +++ b/kernel/locking/rwsem.c
> @@ -650,13 +650,11 @@ static void rwsem_mark_wake(struct rw_se
>    * optionally wake up waiters before it returns.
>    */
>   static inline void
> -rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
> -		      struct wake_q_head *wake_q)
> +rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
>   		      __releases(&sem->wait_lock)
>   {
>   	bool first = rwsem_first_waiter(sem) == waiter;
> -
> -	wake_q_init(wake_q);
> +	DEFINE_WAKE_Q(wake_q);
>   
>   	/*
>   	 * If the wait_list isn't empty and the waiter to be deleted is
> @@ -664,10 +662,10 @@ rwsem_del_wake_waiter(struct rw_semaphor
>   	 * be eligible to acquire or spin on the lock.
>   	 */
>   	if (rwsem_del_waiter(sem, waiter) && first)
> -		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q);
> +		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
>   	raw_spin_unlock_irq(&sem->wait_lock);
> -	if (!wake_q_empty(wake_q))
> -		wake_up_q(wake_q);
> +	if (!wake_q_empty(&wake_q))
> +		wake_up_q(&wake_q);
>   }
>   
>   /*
> @@ -993,6 +991,46 @@ static inline void rwsem_cond_wake_waite
>   	rwsem_mark_wake(sem, wake_type, wake_q);
>   }
>   
> +#define waiter_type(_waiter, _r, _w)	\
> +	((_waiter)->type == RWSEM_WAITING_FOR_READ ? (_r) : (_w))
> +
> +static __always_inline struct rw_semaphore *
> +rwsem_waiter_wait(struct rw_semaphore *sem, struct rwsem_waiter *waiter, int state)
> +{
> +	trace_contention_begin(sem, waiter_type(waiter, LCB_F_READ, LCB_F_WRITE));
> +
> +	/* wait to be given the lock */
> +	for (;;) {
> +		set_current_state(state);
> +		if (!smp_load_acquire(&waiter->task)) {
> +			/* Matches rwsem_waiter_wake()'s smp_store_release(). */
> +			break;
> +		}
> +		if (signal_pending_state(state, current)) {
> +			raw_spin_lock_irq(&sem->wait_lock);
> +			if (waiter->task)
> +				goto out_nolock;
> +			raw_spin_unlock_irq(&sem->wait_lock);
> +			/* Ordered by sem->wait_lock against rwsem_mark_wake(). */
> +			break;
> +		}
> +		schedule_preempt_disabled();
> +		lockevent_inc(waiter_type(waiter, rwsem_sleep_reader, rwsem_sleep_writer));
> +	}
> +
> +	__set_current_state(TASK_RUNNING);
> +	lockevent_inc(waiter_type(waiter, rwsem_rlock, rwsem_wlock));
> +	trace_contention_end(sem, 0);
> +	return sem;
> +
> +out_nolock:
> +	rwsem_del_wake_waiter(sem, waiter);
> +	__set_current_state(TASK_RUNNING);

Similar to boqun's comment, we should move __set_current_state() before 
rwsem_del_wake_waiter().

Unfortunately, lockevent_inc() doesn't work with waiter_type() like that 
as the compilation will fail if CONFIG_LOCK_EVENT_COUNTS is enabled.  
Could you include the attached patch in your series and make the 
following changes?

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index deb0d016a6ce..5b14b0d076fd 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1021,13 +1021,14 @@ static inline void rwsem_cond_wake_waiter(struct rw_semaphore *sem, long count,
         rwsem_mark_wake(sem, wake_type, wake_q);
  }

-#define waiter_type(_waiter, _r, _w)   \
-       ((_waiter)->type == RWSEM_WAITING_FOR_READ ? (_r) : (_w))
+#define waiter_type(_reader, _r, _w)   ((_reader) ? (_r) : (_w))

  static __always_inline struct rw_semaphore *
  rwsem_waiter_wait(struct rw_semaphore *sem, struct rwsem_waiter *waiter, int state)
  {
-       trace_contention_begin(sem, waiter_type(waiter, LCB_F_READ, LCB_F_WRITE));
+       bool reader = waiter->type == RWSEM_WAITING_FOR_READ;
+
+       trace_contention_begin(sem, waiter_type(reader, LCB_F_READ, LCB_F_WRITE));

         /* wait to be given the lock */
         for (;;) {
@@ -1045,18 +1046,18 @@ rwsem_waiter_wait(struct rw_semaphore *sem, struct rwsem_waiter *waiter, int sta
                         break;
                 }
                 schedule_preempt_disabled();
-               lockevent_inc(waiter_type(waiter, rwsem_sleep_reader, rwsem_sleep_writer));
+               lockevent_cond_inc2(reader, rwsem_sleep_reader, rwsem_sleep_writer);
         }

         __set_current_state(TASK_RUNNING);
-       lockevent_inc(waiter_type(waiter, rwsem_rlock, rwsem_wlock));
+       lockevent_cond_inc2(reader, rwsem_rlock, rwsem_wlock);
         trace_contention_end(sem, 0);
         return sem;

  out_nolock:
         rwsem_del_wake_waiter(sem, waiter);
         __set_current_state(TASK_RUNNING);
-       lockevent_inc(waiter_type(waiter, rwsem_rlock_fail, rwsem_wlock_fail));
+       lockevent_cond_inc2(reader, rwsem_rlock_fail, rwsem_wlock_fail);
         trace_contention_end(sem, -EINTR);
         return ERR_PTR(-EINTR);
  }

Thanks,
Longman

lockevent_inc
  
Boqun Feng Feb. 24, 2023, 1:33 a.m. UTC | #3
On Thu, Feb 23, 2023 at 11:31:47AM -0800, Boqun Feng wrote:
[..]
> > +#define waiter_type(_waiter, _r, _w)	\
> > +	((_waiter)->type == RWSEM_WAITING_FOR_READ ? (_r) : (_w))
> > +
> > +static __always_inline struct rw_semaphore *
> > +rwsem_waiter_wait(struct rw_semaphore *sem, struct rwsem_waiter *waiter, int state)
> > +{
> > +	trace_contention_begin(sem, waiter_type(waiter, LCB_F_READ, LCB_F_WRITE));
> > +
> > +	/* wait to be given the lock */
> > +	for (;;) {
> > +		set_current_state(state);
> > +		if (!smp_load_acquire(&waiter->task)) {
> > +			/* Matches rwsem_waiter_wake()'s smp_store_release(). */
> > +			break;
> > +		}
> > +		if (signal_pending_state(state, current)) {
> > +			raw_spin_lock_irq(&sem->wait_lock);
> 
> Move the below __set_current_state(TASK_RUNNING)s up here? I think we
> need the preemption protection when changing the task state here.
> 

Nevermind since we have the preemption protection for the whole
function... but merging two __set_current_state()s into one still looks
good.

Regards,
Boqun

> > +			if (waiter->task)
> > +				goto out_nolock;
> 
[...]
> > +
> > +	__set_current_state(TASK_RUNNING);
> > +	lockevent_inc(waiter_type(waiter, rwsem_rlock, rwsem_wlock));
> > +	trace_contention_end(sem, 0);
> > +	return sem;
> > +
> > +out_nolock:
> > +	rwsem_del_wake_waiter(sem, waiter);
> > +	__set_current_state(TASK_RUNNING);
> > +	lockevent_inc(waiter_type(waiter, rwsem_rlock_fail, rwsem_wlock_fail));
> > +	trace_contention_end(sem, -EINTR);
> > +	return ERR_PTR(-EINTR);
> > +}
> > +
> >  /*
> >   * Wait for the read lock to be granted
> >   */
> > @@ -1071,38 +1109,7 @@ rwsem_down_read_slowpath(struct rw_semap
> >  	if (!wake_q_empty(&wake_q))
> >  		wake_up_q(&wake_q);
> >  
> > -	trace_contention_begin(sem, LCB_F_READ);
> > -
> > -	/* wait to be given the lock */
> > -	for (;;) {
> > -		set_current_state(state);
> > -		if (!smp_load_acquire(&waiter.task)) {
> > -			/* Matches rwsem_waiter_wake()'s smp_store_release(). */
> > -			break;
> > -		}
> > -		if (signal_pending_state(state, current)) {
> > -			raw_spin_lock_irq(&sem->wait_lock);
> > -			if (waiter.task)
> > -				goto out_nolock;
> > -			raw_spin_unlock_irq(&sem->wait_lock);
> > -			/* Ordered by sem->wait_lock against rwsem_mark_wake(). */
> > -			break;
> > -		}
> > -		schedule_preempt_disabled();
> > -		lockevent_inc(rwsem_sleep_reader);
> > -	}
> > -
> > -	__set_current_state(TASK_RUNNING);
> > -	lockevent_inc(rwsem_rlock);
> > -	trace_contention_end(sem, 0);
> > -	return sem;
> > -
> > -out_nolock:
> > -	rwsem_del_wake_waiter(sem, &waiter, &wake_q);
> > -	__set_current_state(TASK_RUNNING);
> > -	lockevent_inc(rwsem_rlock_fail);
> > -	trace_contention_end(sem, -EINTR);
> > -	return ERR_PTR(-EINTR);
> > +	return rwsem_waiter_wait(sem, &waiter, state);
> >  }
> >  
> >  /*
> > @@ -1150,37 +1157,7 @@ rwsem_down_write_slowpath(struct rw_sema
> >  	}
> >  	raw_spin_unlock_irq(&sem->wait_lock);
> >  
> > -	/* wait until we successfully acquire the lock */
> > -	trace_contention_begin(sem, LCB_F_WRITE);
> > -
> > -	for (;;) {
> > -		set_current_state(state);
> > -		if (!smp_load_acquire(&waiter.task)) {
> > -			/* Matches rwsem_waiter_wake()'s smp_store_release(). */
> > -			break;
> > -		}
> > -		if (signal_pending_state(state, current)) {
> > -			raw_spin_lock_irq(&sem->wait_lock);
> > -			if (waiter.task)
> > -				goto out_nolock;
> > -			raw_spin_unlock_irq(&sem->wait_lock);
> > -			/* Ordered by sem->wait_lock against rwsem_mark_wake(). */
> > -			break;
> > -		}
> > -		schedule_preempt_disabled();
> > -		lockevent_inc(rwsem_sleep_writer);
> > -	}
> > -	__set_current_state(TASK_RUNNING);
> > -	lockevent_inc(rwsem_wlock);
> > -	trace_contention_end(sem, 0);
> > -	return sem;
> > -
> > -out_nolock:
> > -	rwsem_del_wake_waiter(sem, &waiter, &wake_q);
> > -	__set_current_state(TASK_RUNNING);
> > -	lockevent_inc(rwsem_wlock_fail);
> > -	trace_contention_end(sem, -EINTR);
> > -	return ERR_PTR(-EINTR);
> > +	return rwsem_waiter_wait(sem, &waiter, state);
> >  }
> >  
> >  /*
> > 
> >
  
Peter Zijlstra Feb. 26, 2023, 12:01 p.m. UTC | #4
On Thu, Feb 23, 2023 at 05:33:53PM -0800, Boqun Feng wrote:
> On Thu, Feb 23, 2023 at 11:31:47AM -0800, Boqun Feng wrote:
> [..]
> > > +#define waiter_type(_waiter, _r, _w)	\
> > > +	((_waiter)->type == RWSEM_WAITING_FOR_READ ? (_r) : (_w))
> > > +
> > > +static __always_inline struct rw_semaphore *
> > > +rwsem_waiter_wait(struct rw_semaphore *sem, struct rwsem_waiter *waiter, int state)
> > > +{
> > > +	trace_contention_begin(sem, waiter_type(waiter, LCB_F_READ, LCB_F_WRITE));
> > > +
> > > +	/* wait to be given the lock */
> > > +	for (;;) {
> > > +		set_current_state(state);
> > > +		if (!smp_load_acquire(&waiter->task)) {
> > > +			/* Matches rwsem_waiter_wake()'s smp_store_release(). */
> > > +			break;
> > > +		}
> > > +		if (signal_pending_state(state, current)) {
> > > +			raw_spin_lock_irq(&sem->wait_lock);
> > 
> > Move the below __set_current_state(TASK_RUNNING)s up here? I think we
> > need the preemption protection when changing the task state here.
> > 
> 
> Nevermind since we have the preemption protection for the whole
> function... but merging two __set_current_state()s into one still looks
> good.

Even if it were not; I still don't understand the concern. Preemption
ignores task state.
  
Peter Zijlstra Feb. 26, 2023, 4:15 p.m. UTC | #5
On Thu, Feb 23, 2023 at 05:45:56PM -0500, Waiman Long wrote:

> Unfortunately, lockevent_inc() doesn't work with waiter_type() like that as
> the compilation will fail if CONFIG_LOCK_EVENT_COUNTS is enabled.  Could you
> include the attached patch in your series and make the following changes?

Yeah, robot told me; fixed it like so:

--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -995,13 +995,16 @@ static inline void rwsem_cond_wake_waite
 	rwsem_mark_wake(sem, wake_type, wake_q);
 }
 
-#define waiter_type(_waiter, _r, _w)	\
-	((_waiter)->type == RWSEM_WAITING_FOR_READ ? (_r) : (_w))
+#define lockevent_rw_inc(rd, evr, evw)	do {	\
+	lockevent_cond_inc(evr, (rd));		\
+	lockevent_cond_inc(evw, !(rd));		\
+} while (0)
 
 static __always_inline struct rw_semaphore *
-rwsem_waiter_wait(struct rw_semaphore *sem, struct rwsem_waiter *waiter, int state)
+rwsem_waiter_wait(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
+		  int state, bool reader)
 {
-	trace_contention_begin(sem, waiter_type(waiter, LCB_F_READ, LCB_F_WRITE));
+	trace_contention_begin(sem, reader ? LCB_F_READ : LCB_F_WRITE);
 
 	/* wait to be given the lock */
 	for (;;) {
@@ -1019,18 +1022,19 @@ rwsem_waiter_wait(struct rw_semaphore *s
 			break;
 		}
 		schedule_preempt_disabled();
-		lockevent_inc(waiter_type(waiter, rwsem_sleep_reader, rwsem_sleep_writer));
+		lockevent_rw_inc(reader, rwsem_sleep_reader, rwsem_sleep_writer);
 	}
 
 	__set_current_state(TASK_RUNNING);
-	lockevent_inc(waiter_type(waiter, rwsem_rlock, rwsem_wlock));
+
+	lockevent_rw_inc(reader, rwsem_rlock, rwsem_wlock);
 	trace_contention_end(sem, 0);
 	return sem;
 
 out_nolock:
 	rwsem_del_wake_waiter(sem, waiter);
 	__set_current_state(TASK_RUNNING);
-	lockevent_inc(waiter_type(waiter, rwsem_rlock_fail, rwsem_wlock_fail));
+	lockevent_rw_inc(reader, rwsem_rlock_fail, rwsem_wlock_fail);
 	trace_contention_end(sem, -EINTR);
 	return ERR_PTR(-EINTR);
 }
@@ -1112,7 +1116,7 @@ rwsem_down_read_slowpath(struct rw_semap
 	if (!wake_q_empty(&wake_q))
 		wake_up_q(&wake_q);
 
-	return rwsem_waiter_wait(sem, &waiter, state);
+	return rwsem_waiter_wait(sem, &waiter, state, true);
 }
 
 /*
@@ -1162,7 +1166,7 @@ rwsem_down_write_slowpath(struct rw_sema
 	}
 	raw_spin_unlock_irq(&sem->wait_lock);
 
-	return rwsem_waiter_wait(sem, &waiter, state);
+	return rwsem_waiter_wait(sem, &waiter, state, false);
 }
 
 /*
  
Boqun Feng Feb. 26, 2023, 6:22 p.m. UTC | #6
On Sun, Feb 26, 2023 at 01:01:10PM +0100, Peter Zijlstra wrote:
> On Thu, Feb 23, 2023 at 05:33:53PM -0800, Boqun Feng wrote:
> > On Thu, Feb 23, 2023 at 11:31:47AM -0800, Boqun Feng wrote:
> > [..]
> > > > +#define waiter_type(_waiter, _r, _w)	\
> > > > +	((_waiter)->type == RWSEM_WAITING_FOR_READ ? (_r) : (_w))
> > > > +
> > > > +static __always_inline struct rw_semaphore *
> > > > +rwsem_waiter_wait(struct rw_semaphore *sem, struct rwsem_waiter *waiter, int state)
> > > > +{
> > > > +	trace_contention_begin(sem, waiter_type(waiter, LCB_F_READ, LCB_F_WRITE));
> > > > +
> > > > +	/* wait to be given the lock */
> > > > +	for (;;) {
> > > > +		set_current_state(state);
> > > > +		if (!smp_load_acquire(&waiter->task)) {
> > > > +			/* Matches rwsem_waiter_wake()'s smp_store_release(). */
> > > > +			break;
> > > > +		}
> > > > +		if (signal_pending_state(state, current)) {
> > > > +			raw_spin_lock_irq(&sem->wait_lock);
> > > 
> > > Move the below __set_current_state(TASK_RUNNING)s up here? I think we
> > > need the preemption protection when changing the task state here.
> > > 
> > 
> > Nevermind since we have the preemption protection for the whole
> > function... but merging two __set_current_state()s into one still looks
> > good.
> 
> Even if it were not; I still don't understand the concern. Preemption
> ignores task state.

Because I missed the exact thing you just mentioned... ;-)

I was worried about the following case:

							ttwu();
	set_current_state(TASK_UNINTERRUPTIBLE);
	....
	<preemption enable>
	<preempted>
	  preempt_schedule_irq():
	    __schedule(...):
	      deactivate_task(); // Wakeup missed.

However this is not true, since __schedule() in preempt_schedule_irq()
is a SM_PREEMPT one.

Sorry for the noise then. But good for me to revisit these stuffs ;-)

Regards,
Boqun
  

Patch

--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -650,13 +650,11 @@  static void rwsem_mark_wake(struct rw_se
  * optionally wake up waiters before it returns.
  */
 static inline void
-rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
-		      struct wake_q_head *wake_q)
+rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
 		      __releases(&sem->wait_lock)
 {
 	bool first = rwsem_first_waiter(sem) == waiter;
-
-	wake_q_init(wake_q);
+	DEFINE_WAKE_Q(wake_q);
 
 	/*
 	 * If the wait_list isn't empty and the waiter to be deleted is
@@ -664,10 +662,10 @@  rwsem_del_wake_waiter(struct rw_semaphor
 	 * be eligible to acquire or spin on the lock.
 	 */
 	if (rwsem_del_waiter(sem, waiter) && first)
-		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q);
+		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
 	raw_spin_unlock_irq(&sem->wait_lock);
-	if (!wake_q_empty(wake_q))
-		wake_up_q(wake_q);
+	if (!wake_q_empty(&wake_q))
+		wake_up_q(&wake_q);
 }
 
 /*
@@ -993,6 +991,46 @@  static inline void rwsem_cond_wake_waite
 	rwsem_mark_wake(sem, wake_type, wake_q);
 }
 
+#define waiter_type(_waiter, _r, _w)	\
+	((_waiter)->type == RWSEM_WAITING_FOR_READ ? (_r) : (_w))
+
+static __always_inline struct rw_semaphore *
+rwsem_waiter_wait(struct rw_semaphore *sem, struct rwsem_waiter *waiter, int state)
+{
+	trace_contention_begin(sem, waiter_type(waiter, LCB_F_READ, LCB_F_WRITE));
+
+	/* wait to be given the lock */
+	for (;;) {
+		set_current_state(state);
+		if (!smp_load_acquire(&waiter->task)) {
+			/* Matches rwsem_waiter_wake()'s smp_store_release(). */
+			break;
+		}
+		if (signal_pending_state(state, current)) {
+			raw_spin_lock_irq(&sem->wait_lock);
+			if (waiter->task)
+				goto out_nolock;
+			raw_spin_unlock_irq(&sem->wait_lock);
+			/* Ordered by sem->wait_lock against rwsem_mark_wake(). */
+			break;
+		}
+		schedule_preempt_disabled();
+		lockevent_inc(waiter_type(waiter, rwsem_sleep_reader, rwsem_sleep_writer));
+	}
+
+	__set_current_state(TASK_RUNNING);
+	lockevent_inc(waiter_type(waiter, rwsem_rlock, rwsem_wlock));
+	trace_contention_end(sem, 0);
+	return sem;
+
+out_nolock:
+	rwsem_del_wake_waiter(sem, waiter);
+	__set_current_state(TASK_RUNNING);
+	lockevent_inc(waiter_type(waiter, rwsem_rlock_fail, rwsem_wlock_fail));
+	trace_contention_end(sem, -EINTR);
+	return ERR_PTR(-EINTR);
+}
+
 /*
  * Wait for the read lock to be granted
  */
@@ -1071,38 +1109,7 @@  rwsem_down_read_slowpath(struct rw_semap
 	if (!wake_q_empty(&wake_q))
 		wake_up_q(&wake_q);
 
-	trace_contention_begin(sem, LCB_F_READ);
-
-	/* wait to be given the lock */
-	for (;;) {
-		set_current_state(state);
-		if (!smp_load_acquire(&waiter.task)) {
-			/* Matches rwsem_waiter_wake()'s smp_store_release(). */
-			break;
-		}
-		if (signal_pending_state(state, current)) {
-			raw_spin_lock_irq(&sem->wait_lock);
-			if (waiter.task)
-				goto out_nolock;
-			raw_spin_unlock_irq(&sem->wait_lock);
-			/* Ordered by sem->wait_lock against rwsem_mark_wake(). */
-			break;
-		}
-		schedule_preempt_disabled();
-		lockevent_inc(rwsem_sleep_reader);
-	}
-
-	__set_current_state(TASK_RUNNING);
-	lockevent_inc(rwsem_rlock);
-	trace_contention_end(sem, 0);
-	return sem;
-
-out_nolock:
-	rwsem_del_wake_waiter(sem, &waiter, &wake_q);
-	__set_current_state(TASK_RUNNING);
-	lockevent_inc(rwsem_rlock_fail);
-	trace_contention_end(sem, -EINTR);
-	return ERR_PTR(-EINTR);
+	return rwsem_waiter_wait(sem, &waiter, state);
 }
 
 /*
@@ -1150,37 +1157,7 @@  rwsem_down_write_slowpath(struct rw_sema
 	}
 	raw_spin_unlock_irq(&sem->wait_lock);
 
-	/* wait until we successfully acquire the lock */
-	trace_contention_begin(sem, LCB_F_WRITE);
-
-	for (;;) {
-		set_current_state(state);
-		if (!smp_load_acquire(&waiter.task)) {
-			/* Matches rwsem_waiter_wake()'s smp_store_release(). */
-			break;
-		}
-		if (signal_pending_state(state, current)) {
-			raw_spin_lock_irq(&sem->wait_lock);
-			if (waiter.task)
-				goto out_nolock;
-			raw_spin_unlock_irq(&sem->wait_lock);
-			/* Ordered by sem->wait_lock against rwsem_mark_wake(). */
-			break;
-		}
-		schedule_preempt_disabled();
-		lockevent_inc(rwsem_sleep_writer);
-	}
-	__set_current_state(TASK_RUNNING);
-	lockevent_inc(rwsem_wlock);
-	trace_contention_end(sem, 0);
-	return sem;
-
-out_nolock:
-	rwsem_del_wake_waiter(sem, &waiter, &wake_q);
-	__set_current_state(TASK_RUNNING);
-	lockevent_inc(rwsem_wlock_fail);
-	trace_contention_end(sem, -EINTR);
-	return ERR_PTR(-EINTR);
+	return rwsem_waiter_wait(sem, &waiter, state);
 }
 
 /*