xhci: fix null pointer deref for xhci_urb_enqueue

Message ID 20231117072131.2886406-1-khtsai@google.com
State New
Headers
Series xhci: fix null pointer deref for xhci_urb_enqueue |

Commit Message

Kuen-Han Tsai Nov. 17, 2023, 7:21 a.m. UTC
  The null pointer dereference happens when xhci_free_dev() frees the
xhci->devs[slot_id] virtual device while xhci_urb_enqueue() is
processing a urb and checking the max packet size.

[106913.850735][ T2068] usb 2-1: USB disconnect, device number 2
[106913.856999][ T4618] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000010
[106913.857488][ T4618] Call trace:
[106913.857491][ T4618]  xhci_check_maxpacket+0x30/0x2dc
[106913.857494][ T4618]  xhci_urb_enqueue+0x24c/0x47c
[106913.857498][ T4618]  usb_hcd_submit_urb+0x1f4/0xf34
[106913.857501][ T4618]  usb_submit_urb+0x4b8/0x4fc
[106913.857503][ T4618]  usb_control_msg+0x144/0x238
[106913.857507][ T4618]  do_proc_control+0x1f0/0x5bc
[106913.857509][ T4618]  usbdev_ioctl+0xdd8/0x15a8

This patch adds a spinlock to the xhci_urb_enqueue function to make sure
xhci_free_dev() and xhci_urb_enqueue() do not race and cause a null
pointer dereference.

Signed-off-by: Kuen-Han Tsai <khtsai@google.com>
---
 drivers/usb/host/xhci.c | 38 ++++++++++++++++++++++++--------------
 1 file changed, 24 insertions(+), 14 deletions(-)
  

Comments

Mathias Nyman Nov. 17, 2023, 1:32 p.m. UTC | #1
On 17.11.2023 9.21, Kuen-Han Tsai wrote:
> The null pointer dereference happens when xhci_free_dev() frees the
> xhci->devs[slot_id] virtual device while xhci_urb_enqueue() is
> processing a urb and checking the max packet size.
> 
> [106913.850735][ T2068] usb 2-1: USB disconnect, device number 2
> [106913.856999][ T4618] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000010
> [106913.857488][ T4618] Call trace:
> [106913.857491][ T4618]  xhci_check_maxpacket+0x30/0x2dc
> [106913.857494][ T4618]  xhci_urb_enqueue+0x24c/0x47c
> [106913.857498][ T4618]  usb_hcd_submit_urb+0x1f4/0xf34
> [106913.857501][ T4618]  usb_submit_urb+0x4b8/0x4fc
> [106913.857503][ T4618]  usb_control_msg+0x144/0x238
> [106913.857507][ T4618]  do_proc_control+0x1f0/0x5bc
> [106913.857509][ T4618]  usbdev_ioctl+0xdd8/0x15a8
> 
> This patch adds a spinlock to the xhci_urb_enqueue function to make sure
> xhci_free_dev() and xhci_urb_enqueue() do not race and cause null
> pointer dereference.

Thanks, nice catch

This patch does however need some additional tuning

> 
> Signed-off-by: Kuen-Han Tsai <khtsai@google.com>
> ---
>   drivers/usb/host/xhci.c | 38 ++++++++++++++++++++++++--------------
>   1 file changed, 24 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
> index 884b0898d9c9..e0766ebeff0e 100644
> --- a/drivers/usb/host/xhci.c
> +++ b/drivers/usb/host/xhci.c
> @@ -1522,23 +1522,32 @@ static int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag
>   	struct urb_priv	*urb_priv;
>   	int num_tds;
>   
> -	if (!urb)
> -		return -EINVAL;
> -	ret = xhci_check_args(hcd, urb->dev, urb->ep,
> -					true, true, __func__);
> -	if (ret <= 0)
> -		return ret ? ret : -EINVAL;
> +	spin_lock_irqsave(&xhci->lock, flags);
> +
> +	if (!urb) {
> +		ret = -EINVAL;
> +		goto done;
> +	}
> +
> +	ret = xhci_check_args(hcd, urb->dev, urb->ep, true, true, __func__);
> +	if (ret <= 0) {
> +		ret = ret ? ret : -EINVAL;
> +		goto done;
> +	}
>   
>   	slot_id = urb->dev->slot_id;
>   	ep_index = xhci_get_endpoint_index(&urb->ep->desc);
>   	ep_state = &xhci->devs[slot_id]->eps[ep_index].ep_state;
>   
> -	if (!HCD_HW_ACCESSIBLE(hcd))
> -		return -ESHUTDOWN;
> +	if (!HCD_HW_ACCESSIBLE(hcd)) {
> +		ret = -ESHUTDOWN;
> +		goto done;
> +	}
>   
>   	if (xhci->devs[slot_id]->flags & VDEV_PORT_ERROR) {
>   		xhci_dbg(xhci, "Can't queue urb, port error, link inactive\n");
> -		return -ENODEV;
> +		ret = -ENODEV;
> +		goto done;
>   	}
>   
>   	if (usb_endpoint_xfer_isoc(&urb->ep->desc))
> @@ -1552,8 +1561,10 @@ static int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag
>   		num_tds = 1;
>   
>   	urb_priv = kzalloc(struct_size(urb_priv, td, num_tds), mem_flags);

kzalloc with spinlock held, should preferably be moved outside lock, otherwise should use GFP_ATOMIC

> -	if (!urb_priv)
> -		return -ENOMEM;
> +	if (!urb_priv) {
> +		ret = -ENOMEM;
> +		goto done;
> +	}
>   
>   	urb_priv->num_tds = num_tds;
>   	urb_priv->num_tds_done = 0;
> @@ -1571,13 +1582,11 @@ static int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag

xhci_check_maxpacket() called here can't be called with spinlock held

>   			if (ret < 0) {
>   				xhci_urb_free_priv(urb_priv);
>   				urb->hcpriv = NULL;
> -				return ret;
> +				goto done;

Thanks
Mathias
  
Greg KH Nov. 17, 2023, 1:53 p.m. UTC | #2
On Fri, Nov 17, 2023 at 03:21:28PM +0800, Kuen-Han Tsai wrote:
> The null pointer dereference happens when xhci_free_dev() frees the
> xhci->devs[slot_id] virtual device while xhci_urb_enqueue() is
> processing a urb and checking the max packet size.
> 
> [106913.850735][ T2068] usb 2-1: USB disconnect, device number 2
> [106913.856999][ T4618] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000010
> [106913.857488][ T4618] Call trace:
> [106913.857491][ T4618]  xhci_check_maxpacket+0x30/0x2dc
> [106913.857494][ T4618]  xhci_urb_enqueue+0x24c/0x47c
> [106913.857498][ T4618]  usb_hcd_submit_urb+0x1f4/0xf34
> [106913.857501][ T4618]  usb_submit_urb+0x4b8/0x4fc
> [106913.857503][ T4618]  usb_control_msg+0x144/0x238
> [106913.857507][ T4618]  do_proc_control+0x1f0/0x5bc
> [106913.857509][ T4618]  usbdev_ioctl+0xdd8/0x15a8
> 
> This patch adds a spinlock to the xhci_urb_enqueue function to make sure
> xhci_free_dev() and xhci_urb_enqueue() do not race and cause null
> pointer dereference.

I thought we had a lock for this already, what changed to cause this to
start triggering now, all these years later?

> 
> Signed-off-by: Kuen-Han Tsai <khtsai@google.com>

What commit id does this fix?


> ---
>  drivers/usb/host/xhci.c | 38 ++++++++++++++++++++++++--------------
>  1 file changed, 24 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
> index 884b0898d9c9..e0766ebeff0e 100644
> --- a/drivers/usb/host/xhci.c
> +++ b/drivers/usb/host/xhci.c
> @@ -1522,23 +1522,32 @@ static int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag
>  	struct urb_priv	*urb_priv;
>  	int num_tds;
>  
> -	if (!urb)
> -		return -EINVAL;
> -	ret = xhci_check_args(hcd, urb->dev, urb->ep,
> -					true, true, __func__);
> -	if (ret <= 0)
> -		return ret ? ret : -EINVAL;
> +	spin_lock_irqsave(&xhci->lock, flags);
> +
> +	if (!urb) {
> +		ret = -EINVAL;
> +		goto done;
> +	}

Why does this have to be inside the lock?  The urb can't change here,
can it?

> +
> +	ret = xhci_check_args(hcd, urb->dev, urb->ep, true, true, __func__);
> +	if (ret <= 0) {
> +		ret = ret ? ret : -EINVAL;
> +		goto done;
> +	}
>  
>  	slot_id = urb->dev->slot_id;
>  	ep_index = xhci_get_endpoint_index(&urb->ep->desc);
>  	ep_state = &xhci->devs[slot_id]->eps[ep_index].ep_state;
>  
> -	if (!HCD_HW_ACCESSIBLE(hcd))
> -		return -ESHUTDOWN;
> +	if (!HCD_HW_ACCESSIBLE(hcd)) {
> +		ret = -ESHUTDOWN;
> +		goto done;

Note, we now have completions, so all of this "goto done" doesn't need
to happen anymore.  Not a complaint, just a suggestion for future
changes or this one, your choice.

thanks,

greg k-h
  
Kuen-Han Tsai Nov. 18, 2023, 10:19 a.m. UTC | #3
Hi Mathias

>>       if (usb_endpoint_xfer_isoc(&urb->ep->desc))
>> @@ -1552,8 +1561,10 @@ static int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag
>>               num_tds = 1;
>>
>>       urb_priv = kzalloc(struct_size(urb_priv, td, num_tds), mem_flags);
> kzalloc with spinlock held, should preferably be moved outside lock, otherwise should use GFP_ATOMIC

Thanks for pointing this out. I realize this patch is incorrect, and it
is not ideal to hold the lock around so much code that is unrelated to
xhci->devs[slot_id].

> xhci_check_maxpacket() called here can't be called with spinlock held

It appears that xhci_check_maxpacket() might potentially lead to a
deadlock later if a spinlock is held. Is this the concern you were
referring to? If not, please let me know if there are any other
potential issues that I may have missed, thanks!


On Fri, Nov 17, 2023 at 9:31 PM Mathias Nyman
<mathias.nyman@linux.intel.com> wrote:
>
> On 17.11.2023 9.21, Kuen-Han Tsai wrote:
> > The null pointer dereference happens when xhci_free_dev() frees the
> > xhci->devs[slot_id] virtual device while xhci_urb_enqueue() is
> > processing a urb and checking the max packet size.
> >
> > [106913.850735][ T2068] usb 2-1: USB disconnect, device number 2
> > [106913.856999][ T4618] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000010
> > [106913.857488][ T4618] Call trace:
> > [106913.857491][ T4618]  xhci_check_maxpacket+0x30/0x2dc
> > [106913.857494][ T4618]  xhci_urb_enqueue+0x24c/0x47c
> > [106913.857498][ T4618]  usb_hcd_submit_urb+0x1f4/0xf34
> > [106913.857501][ T4618]  usb_submit_urb+0x4b8/0x4fc
> > [106913.857503][ T4618]  usb_control_msg+0x144/0x238
> > [106913.857507][ T4618]  do_proc_control+0x1f0/0x5bc
> > [106913.857509][ T4618]  usbdev_ioctl+0xdd8/0x15a8
> >
> > This patch adds a spinlock to the xhci_urb_enqueue function to make sure
> > xhci_free_dev() and xhci_urb_enqueue() do not race and cause null
> > pointer dereference.
>
> Thanks, nice catch
>
> This patch does however need some additional tuning
>
> >
> > Signed-off-by: Kuen-Han Tsai <khtsai@google.com>
> > ---
> >   drivers/usb/host/xhci.c | 38 ++++++++++++++++++++++++--------------
> >   1 file changed, 24 insertions(+), 14 deletions(-)
> >
> > diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
> > index 884b0898d9c9..e0766ebeff0e 100644
> > --- a/drivers/usb/host/xhci.c
> > +++ b/drivers/usb/host/xhci.c
> > @@ -1522,23 +1522,32 @@ static int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag
> >       struct urb_priv *urb_priv;
> >       int num_tds;
> >
> > -     if (!urb)
> > -             return -EINVAL;
> > -     ret = xhci_check_args(hcd, urb->dev, urb->ep,
> > -                                     true, true, __func__);
> > -     if (ret <= 0)
> > -             return ret ? ret : -EINVAL;
> > +     spin_lock_irqsave(&xhci->lock, flags);
> > +
> > +     if (!urb) {
> > +             ret = -EINVAL;
> > +             goto done;
> > +     }
> > +
> > +     ret = xhci_check_args(hcd, urb->dev, urb->ep, true, true, __func__);
> > +     if (ret <= 0) {
> > +             ret = ret ? ret : -EINVAL;
> > +             goto done;
> > +     }
> >
> >       slot_id = urb->dev->slot_id;
> >       ep_index = xhci_get_endpoint_index(&urb->ep->desc);
> >       ep_state = &xhci->devs[slot_id]->eps[ep_index].ep_state;
> >
> > -     if (!HCD_HW_ACCESSIBLE(hcd))
> > -             return -ESHUTDOWN;
> > +     if (!HCD_HW_ACCESSIBLE(hcd)) {
> > +             ret = -ESHUTDOWN;
> > +             goto done;
> > +     }
> >
> >       if (xhci->devs[slot_id]->flags & VDEV_PORT_ERROR) {
> >               xhci_dbg(xhci, "Can't queue urb, port error, link inactive\n");
> > -             return -ENODEV;
> > +             ret = -ENODEV;
> > +             goto done;
> >       }
> >
> >       if (usb_endpoint_xfer_isoc(&urb->ep->desc))
> > @@ -1552,8 +1561,10 @@ static int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag
> >               num_tds = 1;
> >
> >       urb_priv = kzalloc(struct_size(urb_priv, td, num_tds), mem_flags);
>
> kzalloc with spinlock held, should preferably be moved outside lock, otherwise should use GFP_ATOMIC
>
> > -     if (!urb_priv)
> > -             return -ENOMEM;
> > +     if (!urb_priv) {
> > +             ret = -ENOMEM;
> > +             goto done;
> > +     }
> >
> >       urb_priv->num_tds = num_tds;
> >       urb_priv->num_tds_done = 0;
> > @@ -1571,13 +1582,11 @@ static int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag
>
> xhci_check_maxpacket() called here can't be called with spinlock held
>
> >                       if (ret < 0) {
> >                               xhci_urb_free_priv(urb_priv);
> >                               urb->hcpriv = NULL;
> > -                             return ret;
> > +                             goto done;
>
> Thanks
> Mathias
  
Kuen-Han Tsai Nov. 18, 2023, 11:19 a.m. UTC | #4
Hi Greg

On Fri, Nov 17, 2023 at 9:53 PM Greg KH <gregkh@linuxfoundation.org> wrote:
>
> On Fri, Nov 17, 2023 at 03:21:28PM +0800, Kuen-Han Tsai wrote:
> > The null pointer dereference happens when xhci_free_dev() frees the
> > xhci->devs[slot_id] virtual device while xhci_urb_enqueue() is
> > processing a urb and checking the max packet size.
> >
> > [106913.850735][ T2068] usb 2-1: USB disconnect, device number 2
> > [106913.856999][ T4618] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000010
> > [106913.857488][ T4618] Call trace:
> > [106913.857491][ T4618]  xhci_check_maxpacket+0x30/0x2dc
> > [106913.857494][ T4618]  xhci_urb_enqueue+0x24c/0x47c
> > [106913.857498][ T4618]  usb_hcd_submit_urb+0x1f4/0xf34
> > [106913.857501][ T4618]  usb_submit_urb+0x4b8/0x4fc
> > [106913.857503][ T4618]  usb_control_msg+0x144/0x238
> > [106913.857507][ T4618]  do_proc_control+0x1f0/0x5bc
> > [106913.857509][ T4618]  usbdev_ioctl+0xdd8/0x15a8
> >
> > This patch adds a spinlock to the xhci_urb_enqueue function to make sure
> > xhci_free_dev() and xhci_urb_enqueue() do not race and cause null
> > pointer dereference.
>
> I thought we had a lock for this already, what changed to cause this to
> start triggering now, all these years later?

Right, there is a lock in place for xhci_urb_enqueue(), but it doesn't
protect all code segments that use xhci->devs[slot_id] within the
function. I couldn't identify any specific changes that might have
introduced this issue. It's likely a long-standing potential problem
that's difficult to trigger under normal situations.

This issue happens when the USB enumeration process is complete, and a
user space program submits a control request to the peripheral, but
then the device is rapidly disconnected. I was able to reproduce this
issue by introducing a 3-second delay within xhci_check_maxpacket()
and disconnecting the peripheral while observing that the control
request is being processed by xhci_check_maxpacket().

>
> >
> > Signed-off-by: Kuen-Han Tsai <khtsai@google.com>
>
> What commit id does this fix?

Should I include a "Fixes:" header even if this patch doesn't address
a bug from a specific commit?

>
>
> > ---
> >  drivers/usb/host/xhci.c | 38 ++++++++++++++++++++++++--------------
> >  1 file changed, 24 insertions(+), 14 deletions(-)
> >
> > diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
> > index 884b0898d9c9..e0766ebeff0e 100644
> > --- a/drivers/usb/host/xhci.c
> > +++ b/drivers/usb/host/xhci.c
> > @@ -1522,23 +1522,32 @@ static int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag
> >       struct urb_priv *urb_priv;
> >       int num_tds;
> >
> > -     if (!urb)
> > -             return -EINVAL;
> > -     ret = xhci_check_args(hcd, urb->dev, urb->ep,
> > -                                     true, true, __func__);
> > -     if (ret <= 0)
> > -             return ret ? ret : -EINVAL;
> > +     spin_lock_irqsave(&xhci->lock, flags);
> > +
> > +     if (!urb) {
> > +             ret = -EINVAL;
> > +             goto done;
> > +     }
>
> Why does this have to be inside the lock?  The urb can't change here,
> can it?

You're right, no need to place those inside the lock. I will move them
out of the protection.

>
> > +
> > +     ret = xhci_check_args(hcd, urb->dev, urb->ep, true, true, __func__);
> > +     if (ret <= 0) {
> > +             ret = ret ? ret : -EINVAL;
> > +             goto done;
> > +     }
> >
> >       slot_id = urb->dev->slot_id;
> >       ep_index = xhci_get_endpoint_index(&urb->ep->desc);
> >       ep_state = &xhci->devs[slot_id]->eps[ep_index].ep_state;
> >
> > -     if (!HCD_HW_ACCESSIBLE(hcd))
> > -             return -ESHUTDOWN;
> > +     if (!HCD_HW_ACCESSIBLE(hcd)) {
> > +             ret = -ESHUTDOWN;
> > +             goto done;
>
> Note, we now have completions, so all of this "goto done" doesn't need
> to happen anymore.  Not a complaint, just a suggestion for future
> changes or this one, your choice.
>

I'm not familiar with the concept of 'completions'. Can you please
provide some links or explanations to help me understand it? I use a
'goto done' statement because I follow this pattern seen in many
previous commits. However, I'm willing to modify this approach if
there's a more suitable alternative.

Please forgive me if any of my questions seem overly basic. I'm still
in the process of learning how to contribute to the kernel community.

Thanks,
Kuen-Han

> thanks,
>
> greg k-h
  
Mathias Nyman Nov. 20, 2023, 3:33 p.m. UTC | #5
On 18.11.2023 12.19, Kuen-Han Tsai wrote:
> Hi Mathias
> 
>>>        if (usb_endpoint_xfer_isoc(&urb->ep->desc))
>>> @@ -1552,8 +1561,10 @@ static int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag
>>>                num_tds = 1;
>>>
>>>        urb_priv = kzalloc(struct_size(urb_priv, td, num_tds), mem_flags);
>> kzalloc with spinlock held, should preferably be moved outside lock, otherwise should use GFP_ATOMIC
> 
> Thanks for pointing this out. I realize this patch is incorrect and it
> is non-ideal to include many codes unrelated to xhci->devs[slot_id]
> within the lock.
> 
>> xhci_check_maxpacket() called here can't be called with spinlock held
> 
> It appears that xhci_check_maxpacket() might potentially lead to a
> deadlock later if a spinlock is held. Is this the concern you were
> referring to? If not, please let me know if there are any other
> potential issues that I may have missed, thanks!

xhci_check_maxpacket() will allocate memory, wait for completion, and use the same lock,
so there are several issues here.

I actually think we shouldn't call xhci_check_maxpacket() at all while queuing urbs.

usb core knows when there was max packet size mismatch during enumeration.
I think we should add a hook to the hcd that usb core can call in these cases

Thanks
Mathias
  
kernel test robot Nov. 23, 2023, 1:54 a.m. UTC | #6
Hello,

kernel test robot noticed "WARNING:HARDIRQ-safe->HARDIRQ-unsafe_lock_order_detected" on:

commit: 90703e106b4214512828bff96df3df2ecff5c7b7 ("[PATCH] xhci: fix null pointer deref for xhci_urb_enqueue")
url: https://github.com/intel-lab-lkp/linux/commits/Kuen-Han-Tsai/xhci-fix-null-pointer-deref-for-xhci_urb_enqueue/20231117-152346
base: https://git.kernel.org/cgit/linux/kernel/git/gregkh/usb.git usb-testing
patch link: https://lore.kernel.org/all/20231117072131.2886406-1-khtsai@google.com/
patch subject: [PATCH] xhci: fix null pointer deref for xhci_urb_enqueue

in testcase: kernel-selftests
version: kernel-selftests-x86_64-60acb023-1_20230329
with following parameters:

	group: net
	test: fcnal-test.sh
	atomic_test: ipv4_ping



compiler: gcc-12
test machine: 36 threads 1 sockets Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz (Cascade Lake) with 32G memory

(please refer to attached dmesg/kmsg for entire log/backtrace)



If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202311222304.1a72c7d4-oliver.sang@intel.com


[   18.016498][    T9] WARNING: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected
[   18.016498][    T9] 6.7.0-rc1-00001-g90703e106b42 #1 Not tainted
[   18.016498][    T9] -----------------------------------------------------
[   18.016498][    T9] kworker/0:1/9 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire:
[   18.019119][    T1] iTCO_vendor_support: vendor-support=0
[   18.016498][    T9] ffffffff84f96760 (
[   18.019656][    T1] intel_pstate: HWP enabled by BIOS
[   18.016498][    T9] mmu_notifier_invalidate_range_start
[   18.020037][    T1] intel_pstate: Intel P-state driver initializing
[ 18.016498][ T9] ){+.+.}-{0:0}, at: fs_reclaim_acquire (mm/page_alloc.c:3710 mm/page_alloc.c:3701) 
[   18.016498][    T9]
[   18.016498][    T9] and this task is already holding:
[ 18.016498][ T9] ffff8881e0b12428 (&xhci->lock){-.-.}-{2:2}, at: xhci_urb_enqueue (drivers/usb/host/xhci.c:1525) 
[   18.016498][    T9] which would create a new lock dependency:
[   18.016498][    T9]  (&xhci->lock){-.-.}-{2:2} -> (mmu_notifier_invalidate_range_start){+.+.}-{0:0}
[   18.016498][    T9]
[   18.016498][    T9] but this new dependency connects a HARDIRQ-irq-safe lock:
[   18.016498][    T9]  (&xhci->lock){-.-.}-{2:2}
[   18.016498][    T9]
[   18.016498][    T9] ... which became HARDIRQ-irq-safe at:
[ 18.016498][ T9] __lock_acquire (kernel/locking/lockdep.c:5090) 
[ 18.016498][ T9] lock_acquire (kernel/locking/lockdep.c:467 kernel/locking/lockdep.c:5755 kernel/locking/lockdep.c:5718) 
[ 18.016498][ T9] _raw_spin_lock (include/linux/spinlock_api_smp.h:134 kernel/locking/spinlock.c:154) 
[ 18.016498][ T9] xhci_irq (drivers/usb/host/xhci-ring.c:3032) 
[ 18.016498][ T9] __handle_irq_event_percpu (kernel/irq/handle.c:158) 



The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20231122/202311222304.1a72c7d4-oliver.sang@intel.com
  
Mathias Nyman Nov. 28, 2023, 1:57 p.m. UTC | #7
On 20.11.2023 17.33, Mathias Nyman wrote:
> On 18.11.2023 12.19, Kuen-Han Tsai wrote:
>> Hi Mathias
>>
>>>>        if (usb_endpoint_xfer_isoc(&urb->ep->desc))
>>>> @@ -1552,8 +1561,10 @@ static int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag
>>>>                num_tds = 1;
>>>>
>>>>        urb_priv = kzalloc(struct_size(urb_priv, td, num_tds), mem_flags);
>>> kzalloc with spinlock held, should preferably be moved outside lock, otherwise should use GFP_ATOMIC
>>
>> Thanks for pointing this out. I realize this patch is incorrect and it
>> is non-ideal to include many codes unrelated to xhci->devs[slot_id]
>> within the lock.
>>
>>> xhci_check_maxpacket() called here can't be called with spinlock held
>>
>> It appears that xhci_check_maxpacket() might potentially lead to a
>> deadlock later if a spinlock is held. Is this the concern you were
>> referring to? If not, please let me know if there are any other
>> potential issues that I may have missed, thanks!
> 
> xhci_check_maxpacket() will allocate memory, wait for completion, and use the same lock,
> so there are several issues here.
> 
> I actually think we shouldn't call xhci_check_maxpacket() at all while queuing urbs.
> 
> usb core knows when there was max packet size mismatch during enumeration.
> I think we should add a hook to the hcd that usb core can call in these cases

I moved the max packet checks away from xhci_urb_enqueue() and fixed up the locking.

I can't trigger the original issue, but I tested it by setting incorrect initial max packet
size values.

If you have the chance to test this with your setup I'd appreciate it.

patches found here:
git://git.kernel.org/pub/scm/linux/kernel/git/mnyman/xhci.git fix_urb_enqueue_locking
https://git.kernel.org/pub/scm/linux/kernel/git/mnyman/xhci.git/log/?h=fix_urb_enqueue_locking

I'll add them to this thread as well

thanks
Mathias
  
Kuen-Han Tsai Nov. 28, 2023, 3:01 p.m. UTC | #8
Thank you so much for fixing the issue, Mathias!

> I moved the max packet checks away from xhci_urb_enqueue() and fixed up the locking.
> I can't trigger the original issue, but I tested it by setting incorrect initial max packet
> size values.

I added a 3-second delay within xhci_check_maxpacket(). When I saw
that the max packet size was being checked, I removed the USB device to
trigger the race.

[  172.392813][ T1960] [khtsai] xhci_check_maxpacket, before, slot_id=2, devs[slot_id]=000000003cb76fec
[  174.290601][   T20] usb 2-1: USB disconnect, device number 2
[  174.290608][   T20] usb 2-1.2: USB disconnect, device number 3
[  174.297180][   T20] [khtsai] xhci_free_dev, ret=1
[  174.305010][  T133] usb usb3: USB disconnect, device number 1
[  174.316346][   T20] [khtsai] xhci_free_dev, ret=1
[  175.458962][ T1960] [khtsai] xhci_check_maxpacket, after, slot_id=2, devs[slot_id]=0000000000000000
[  175.460835][ T1960] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000010

> If you have the chance to test this with your setup I'd appreciate it.

Sure, I will definitely help verify it. However, I believe the race
problem won't happen as your patch already removes max packet checks
from xhci_urb_enqueue() and also protects sections using the
xhci->devs[slot_id] virtual device.

> patches found here:
> git://git.kernel.org/pub/scm/linux/kernel/git/mnyman/xhci.git fix_urb_enqueue_locking
> https://git.kernel.org/pub/scm/linux/kernel/git/mnyman/xhci.git/log/?h=fix_urb_enqueue_locking

> I'll add them to this thread as well
  

Patch

diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index 884b0898d9c9..e0766ebeff0e 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -1522,23 +1522,32 @@  static int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag
 	struct urb_priv	*urb_priv;
 	int num_tds;
 
-	if (!urb)
-		return -EINVAL;
-	ret = xhci_check_args(hcd, urb->dev, urb->ep,
-					true, true, __func__);
-	if (ret <= 0)
-		return ret ? ret : -EINVAL;
+	spin_lock_irqsave(&xhci->lock, flags);
+
+	if (!urb) {
+		ret = -EINVAL;
+		goto done;
+	}
+
+	ret = xhci_check_args(hcd, urb->dev, urb->ep, true, true, __func__);
+	if (ret <= 0) {
+		ret = ret ? ret : -EINVAL;
+		goto done;
+	}
 
 	slot_id = urb->dev->slot_id;
 	ep_index = xhci_get_endpoint_index(&urb->ep->desc);
 	ep_state = &xhci->devs[slot_id]->eps[ep_index].ep_state;
 
-	if (!HCD_HW_ACCESSIBLE(hcd))
-		return -ESHUTDOWN;
+	if (!HCD_HW_ACCESSIBLE(hcd)) {
+		ret = -ESHUTDOWN;
+		goto done;
+	}
 
 	if (xhci->devs[slot_id]->flags & VDEV_PORT_ERROR) {
 		xhci_dbg(xhci, "Can't queue urb, port error, link inactive\n");
-		return -ENODEV;
+		ret = -ENODEV;
+		goto done;
 	}
 
 	if (usb_endpoint_xfer_isoc(&urb->ep->desc))
@@ -1552,8 +1561,10 @@  static int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag
 		num_tds = 1;
 
 	urb_priv = kzalloc(struct_size(urb_priv, td, num_tds), mem_flags);
-	if (!urb_priv)
-		return -ENOMEM;
+	if (!urb_priv) {
+		ret = -ENOMEM;
+		goto done;
+	}
 
 	urb_priv->num_tds = num_tds;
 	urb_priv->num_tds_done = 0;
@@ -1571,13 +1582,11 @@  static int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag
 			if (ret < 0) {
 				xhci_urb_free_priv(urb_priv);
 				urb->hcpriv = NULL;
-				return ret;
+				goto done;
 			}
 		}
 	}
 
-	spin_lock_irqsave(&xhci->lock, flags);
-
 	if (xhci->xhc_state & XHCI_STATE_DYING) {
 		xhci_dbg(xhci, "Ep 0x%x: URB %p submitted for non-responsive xHCI host.\n",
 			 urb->ep->desc.bEndpointAddress, urb);
@@ -1620,6 +1629,7 @@  static int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag
 		xhci_urb_free_priv(urb_priv);
 		urb->hcpriv = NULL;
 	}
+done:
 	spin_unlock_irqrestore(&xhci->lock, flags);
 	return ret;
 }