KVM: VMX: fix lockdep warning on posted intr wakeup

Message ID 20230310155955.29652-1-yan.y.zhao@intel.com
State New
Headers
Series KVM: VMX: fix lockdep warning on posted intr wakeup |

Commit Message

Yan Zhao March 10, 2023, 3:59 p.m. UTC
  Use rcu list to break the possible circular locking dependency reported
by lockdep.

path 1, ``sysvec_kvm_posted_intr_wakeup_ipi()`` --> ``pi_wakeup_handler()``
         -->  ``kvm_vcpu_wake_up()`` --> ``try_to_wake_up()``,
         the lock sequence is
         &per_cpu(wakeup_vcpus_on_cpu_lock, cpu) --> &p->pi_lock.

path 2, ``schedule()`` --> ``kvm_sched_out()`` --> ``vmx_vcpu_put()`` -->
        ``vmx_vcpu_pi_put()`` --> ``pi_enable_wakeup_handler()``,
         the lock sequence is
         &rq->__lock --> &per_cpu(wakeup_vcpus_on_cpu_lock, cpu).

path 3, ``task_rq_lock()``,
        the lock sequence is &p->pi_lock --> &rq->__lock

lockdep report:
 Chain exists of:
   &p->pi_lock --> &rq->__lock --> &per_cpu(wakeup_vcpus_on_cpu_lock, cpu)

  Possible unsafe locking scenario:

        CPU0                CPU1
        ----                ----
   lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
                            lock(&rq->__lock);
                            lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
   lock(&p->pi_lock);

  *** DEADLOCK ***

Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
 arch/x86/kvm/vmx/posted_intr.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)


base-commit: 89400df96a7570b651404bbc3b7afe627c52a192
  

Comments

Sean Christopherson March 10, 2023, 5 p.m. UTC | #1
On Fri, Mar 10, 2023, Yan Zhao wrote:
> Use rcu list to break the possible circular locking dependency reported
> by lockdep.
> 
> path 1, ``sysvec_kvm_posted_intr_wakeup_ipi()`` --> ``pi_wakeup_handler()``
>          -->  ``kvm_vcpu_wake_up()`` --> ``try_to_wake_up()``,
>          the lock sequence is
>          &per_cpu(wakeup_vcpus_on_cpu_lock, cpu) --> &p->pi_lock.

Heh, that's an unfortunate naming collision.  It took me a bit of staring to
realize pi_lock is a scheduler lock, not a posted interrupt lock.

> path 2, ``schedule()`` --> ``kvm_sched_out()`` --> ``vmx_vcpu_put()`` -->
>         ``vmx_vcpu_pi_put()`` --> ``pi_enable_wakeup_handler()``,
>          the lock sequence is
>          &rq->__lock --> &per_cpu(wakeup_vcpus_on_cpu_lock, cpu).
> 
> path 3, ``task_rq_lock()``,
>         the lock sequence is &p->pi_lock --> &rq->__lock
> 
> lockdep report:
>  Chain exists of:
>    &p->pi_lock --> &rq->__lock --> &per_cpu(wakeup_vcpus_on_cpu_lock, cpu)
> 
>   Possible unsafe locking scenario:
> 
>         CPU0                CPU1
>         ----                ----
>    lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
>                             lock(&rq->__lock);
>                             lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
>    lock(&p->pi_lock);
> 
>   *** DEADLOCK ***

I don't think there's a deadlock here.  pi_wakeup_handler() is called from IRQ
context, pi_enable_wakeup_handler() disable IRQs before acquiring
wakeup_vcpus_on_cpu_lock, and "cpu" in pi_enable_wakeup_handler() is guaranteed
to be the current CPU, i.e. the same CPU.  So CPU0 and CPU1 can't be contending
for the same wakeup_vcpus_on_cpu_lock in this scenario.

vmx_vcpu_pi_load() does do cross-CPU locking, but finish_task_switch() drops
rq->__lock before invoking the sched_in notifiers.

> Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
> ---
>  arch/x86/kvm/vmx/posted_intr.c | 12 +++++-------
>  1 file changed, 5 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
> index 94c38bea60e7..e3ffc45c0a7b 100644
> --- a/arch/x86/kvm/vmx/posted_intr.c
> +++ b/arch/x86/kvm/vmx/posted_intr.c
> @@ -90,7 +90,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
>  	 */
>  	if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
>  		raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
> -		list_del(&vmx->pi_wakeup_list);
> +		list_del_rcu(&vmx->pi_wakeup_list);
>  		raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));

_If_ there is indeed a possible deadlock, there technically needs to be an explicit 
synchonize_rcu() before freeing the vCPU.  In practice, there are probably multiple
synchonize_rcu() calls in the destruction path, not to mention that it would take a
minor miracle for pi_wakeup_handler() to get stalled long enough to achieve a
use-after-free.

>  	}
>  
> @@ -153,7 +153,7 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
>  	local_irq_save(flags);
>  
>  	raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
> -	list_add_tail(&vmx->pi_wakeup_list,
> +	list_add_tail_rcu(&vmx->pi_wakeup_list,
>  		      &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
>  	raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));


> @@ -219,16 +219,14 @@ void pi_wakeup_handler(void)
>  {
>  	int cpu = smp_processor_id();
>  	struct list_head *wakeup_list = &per_cpu(wakeup_vcpus_on_cpu, cpu);
> -	raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, cpu);
>  	struct vcpu_vmx *vmx;
>  
> -	raw_spin_lock(spinlock);
> -	list_for_each_entry(vmx, wakeup_list, pi_wakeup_list) {
> -
> +	rcu_read_lock();

This isn't strictly necessary, IRQs are disabled.

> +	list_for_each_entry_rcu(vmx, wakeup_list, pi_wakeup_list) {
>  		if (pi_test_on(&vmx->pi_desc))
>  			kvm_vcpu_wake_up(&vmx->vcpu);
>  	}
> -	raw_spin_unlock(spinlock);
> +	rcu_read_unlock();
>  }
>  
>  void __init pi_init_cpu(int cpu)
> 
> base-commit: 89400df96a7570b651404bbc3b7afe627c52a192
> -- 
> 2.17.1
>
  
Yan Zhao March 13, 2023, 8:15 a.m. UTC | #2
On Fri, Mar 10, 2023 at 09:00:00AM -0800, Sean Christopherson wrote:
> On Fri, Mar 10, 2023, Yan Zhao wrote:
> > Use rcu list to break the possible circular locking dependency reported
> > by lockdep.
> > 
> > path 1, ``sysvec_kvm_posted_intr_wakeup_ipi()`` --> ``pi_wakeup_handler()``
> >          -->  ``kvm_vcpu_wake_up()`` --> ``try_to_wake_up()``,
> >          the lock sequence is
> >          &per_cpu(wakeup_vcpus_on_cpu_lock, cpu) --> &p->pi_lock.
> 
> Heh, that's an unfortunate naming collision.  It took me a bit of staring to
> realize pi_lock is a scheduler lock, not a posted interrupt lock.
me too :)

> 
> > path 2, ``schedule()`` --> ``kvm_sched_out()`` --> ``vmx_vcpu_put()`` -->
> >         ``vmx_vcpu_pi_put()`` --> ``pi_enable_wakeup_handler()``,
> >          the lock sequence is
> >          &rq->__lock --> &per_cpu(wakeup_vcpus_on_cpu_lock, cpu).
> > 
> > path 3, ``task_rq_lock()``,
> >         the lock sequence is &p->pi_lock --> &rq->__lock
> > 
> > lockdep report:
> >  Chain exists of:
> >    &p->pi_lock --> &rq->__lock --> &per_cpu(wakeup_vcpus_on_cpu_lock, cpu)
> > 
> >   Possible unsafe locking scenario:
> > 
> >         CPU0                CPU1
> >         ----                ----
> >    lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
> >                             lock(&rq->__lock);
> >                             lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
> >    lock(&p->pi_lock);
> > 
> >   *** DEADLOCK ***
> 
> I don't think there's a deadlock here.  pi_wakeup_handler() is called from IRQ
> context, pi_enable_wakeup_handler() disable IRQs before acquiring
> wakeup_vcpus_on_cpu_lock, and "cpu" in pi_enable_wakeup_handler() is guaranteed
> to be the current CPU, i.e. the same CPU.  So CPU0 and CPU1 can't be contending
> for the same wakeup_vcpus_on_cpu_lock in this scenario.
> 
> vmx_vcpu_pi_load() does do cross-CPU locking, but finish_task_switch() drops
> rq->__lock before invoking the sched_in notifiers.
Right. Thanks for this analysis!
But the path of pi_wakeup_handler() tells lockdep that the lock ordering
is &p->pi_lock --> &rq->__lock --> &per_cpu(wakeup_vcpus_on_cpu_lock,
cpu), so the lockdep just complains about it.

> 
> > Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
> > ---
> >  arch/x86/kvm/vmx/posted_intr.c | 12 +++++-------
> >  1 file changed, 5 insertions(+), 7 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
> > index 94c38bea60e7..e3ffc45c0a7b 100644
> > --- a/arch/x86/kvm/vmx/posted_intr.c
> > +++ b/arch/x86/kvm/vmx/posted_intr.c
> > @@ -90,7 +90,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
> >  	 */
> >  	if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
> >  		raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
> > -		list_del(&vmx->pi_wakeup_list);
> > +		list_del_rcu(&vmx->pi_wakeup_list);
> >  		raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
> 
> _If_ there is indeed a possible deadlock, there technically needs to be an explicit 
> synchonize_rcu() before freeing the vCPU.  In practice, there are probably multiple
> synchonize_rcu() calls in the destruction path, not to mention that it would take a
> minor miracle for pi_wakeup_handler() to get stalled long enough to achieve a
> use-after-free.
>
Yes, I neglected it.

Thanks for the quick and detailed review!
I will post v2 to fix it.

Yan
  

Patch

diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 94c38bea60e7..e3ffc45c0a7b 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -90,7 +90,7 @@  void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 	 */
 	if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
 		raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
-		list_del(&vmx->pi_wakeup_list);
+		list_del_rcu(&vmx->pi_wakeup_list);
 		raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
 	}
 
@@ -153,7 +153,7 @@  static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
 	local_irq_save(flags);
 
 	raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
-	list_add_tail(&vmx->pi_wakeup_list,
+	list_add_tail_rcu(&vmx->pi_wakeup_list,
 		      &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
 	raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
 
@@ -219,16 +219,14 @@  void pi_wakeup_handler(void)
 {
 	int cpu = smp_processor_id();
 	struct list_head *wakeup_list = &per_cpu(wakeup_vcpus_on_cpu, cpu);
-	raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, cpu);
 	struct vcpu_vmx *vmx;
 
-	raw_spin_lock(spinlock);
-	list_for_each_entry(vmx, wakeup_list, pi_wakeup_list) {
-
+	rcu_read_lock();
+	list_for_each_entry_rcu(vmx, wakeup_list, pi_wakeup_list) {
 		if (pi_test_on(&vmx->pi_desc))
 			kvm_vcpu_wake_up(&vmx->vcpu);
 	}
-	raw_spin_unlock(spinlock);
+	rcu_read_unlock();
 }
 
 void __init pi_init_cpu(int cpu)