[15/17] vfio/pci: Let enable and disable of interrupt types use same signature

Message ID bf87e46c249941ebbfacb20ee9ff92e8efd2a595.1706849424.git.reinette.chatre@intel.com
State New
Headers
Series vfio/pci: Remove duplicate code and logic from VFIO PCI interrupt management |

Commit Message

Reinette Chatre Feb. 2, 2024, 4:57 a.m. UTC
  vfio_pci_set_intx_trigger() and vfio_pci_set_msi_trigger() have
flows that can be shared.

For INTx, MSI, and MSI-X interrupt management to share the
same enable/disable flow the interrupt specific enable and
disable functions should have the same signatures.

Let vfio_intx_enable() and vfio_msi_enable() use the same
parameters by passing "start" and "count" to these functions
instead of letting the (what will eventually be) common code
interpret these values.

Providing "start" and "count" to vfio_intx_enable()
enables the INTx specific check of these parameters to move into
the INTx specific vfio_intx_enable(). Similarly, providing "start"
and "count" to vfio_msi_enable() enables the MSI/MSI-X specific
code to initialize number of vectors needed.

The shared MSI/MSI-X code needs the interrupt index. Provide
the interrupt index (clearly marked as unused) to the INTx code
to use the same signatures.

With interrupt type specific code using the same parameters it
is possible to have common code that calls the enable/disable
code for different interrupt types.

Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
---
 drivers/vfio/pci/vfio_pci_intrs.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)
  

Comments

Alex Williamson Feb. 5, 2024, 10:35 p.m. UTC | #1
On Thu,  1 Feb 2024 20:57:09 -0800
Reinette Chatre <reinette.chatre@intel.com> wrote:

> vfio_pci_set_intx_trigger() and vfio_pci_set_msi_trigger() have
> flows that can be shared.
> 
> For INTx, MSI, and MSI-X interrupt management to share the
> same enable/disable flow the interrupt specific enable and
> disable functions should have the same signatures.
> 
> Let vfio_intx_enable() and vfio_msi_enable() use the same
> parameters by passing "start" and "count" to these functions
> instead of letting the (what will eventually be) common code
> interpret these values.
> 
> Providing "start" and "count" to vfio_intx_enable()
> enables the INTx specific check of these parameters to move into
> the INTx specific vfio_intx_enable(). Similarly, providing "start"
> and "count" to vfio_msi_enable() enables the MSI/MSI-X specific
> code to initialize number of vectors needed.
> 
> The shared MSI/MSI-X code needs the interrupt index. Provide
> the interrupt index (clearly marked as unused) to the INTx code
> to use the same signatures.
> 
> With interrupt type specific code using the same parameters it
> is possible to have common code that calls the enable/disable
> code for different interrupt types.
> 
> Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
> ---
>  drivers/vfio/pci/vfio_pci_intrs.c | 27 ++++++++++++++++++---------
>  1 file changed, 18 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
> index 37065623d286..9217fea3f636 100644
> --- a/drivers/vfio/pci/vfio_pci_intrs.c
> +++ b/drivers/vfio/pci/vfio_pci_intrs.c
> @@ -257,13 +257,18 @@ static irqreturn_t vfio_intx_handler(int irq, void *dev_id)
>  	return ret;
>  }
>  
> -static int vfio_intx_enable(struct vfio_pci_core_device *vdev)
> +static int vfio_intx_enable(struct vfio_pci_core_device *vdev,
> +			    unsigned int start, unsigned int count,
> +			    unsigned int __always_unused index)
>  {
>  	struct vfio_pci_irq_ctx *ctx;
>  
>  	if (!is_irq_none(vdev))
>  		return -EINVAL;
>  
> +	if (start != 0 || count != 1)
> +		return -EINVAL;
> +
>  	if (!vdev->pdev->irq)
>  		return -ENODEV;
>  
> @@ -332,7 +337,8 @@ static char *vfio_intx_device_name(struct vfio_pci_core_device *vdev,
>  	return kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)", pci_name(pdev));
>  }
>  
> -static void vfio_intx_disable(struct vfio_pci_core_device *vdev)
> +static void vfio_intx_disable(struct vfio_pci_core_device *vdev,
> +			      unsigned int __always_unused index)
>  {
>  	struct vfio_pci_irq_ctx *ctx;
>  
> @@ -358,17 +364,20 @@ static irqreturn_t vfio_msihandler(int irq, void *arg)
>  	return IRQ_HANDLED;
>  }
>  
> -static int vfio_msi_enable(struct vfio_pci_core_device *vdev, int nvec,
> +static int vfio_msi_enable(struct vfio_pci_core_device *vdev,
> +			   unsigned int start, unsigned int count,
>  			   unsigned int index)
>  {
>  	struct pci_dev *pdev = vdev->pdev;
>  	unsigned int flag;
> -	int ret;
> +	int ret, nvec;
>  	u16 cmd;
>  
>  	if (!is_irq_none(vdev))
>  		return -EINVAL;
>  
> +	nvec = start + count;
> +
>  	flag = (index == VFIO_PCI_MSIX_IRQ_INDEX) ? PCI_IRQ_MSIX : PCI_IRQ_MSI;
>  
>  	/* return the number of supported vectors if we can't get all: */
> @@ -701,11 +710,11 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev,
>  	unsigned int i;
>  
>  	if (is_intx(vdev) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) {
> -		vfio_intx_disable(vdev);
> +		vfio_intx_disable(vdev, index);
>  		return 0;
>  	}
>  
> -	if (!(is_intx(vdev) || is_irq_none(vdev)) || start != 0 || count != 1)
> +	if (!(is_intx(vdev) || is_irq_none(vdev)))
>  		return -EINVAL;
>  
>  	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
> @@ -715,13 +724,13 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev,
>  		if (is_intx(vdev))
>  			return vfio_irq_set_block(vdev, start, count, fds, index);
>  
> -		ret = vfio_intx_enable(vdev);
> +		ret = vfio_intx_enable(vdev, start, count, index);

Please trace what happens when a user calls SET_IRQS to setup a trigger
eventfd with start = 0, count = 1, followed by any other combination of
start and count values once is_intx() is true.  vfio_intx_enable()
cannot be the only place we bounds check the user, all of the INTx
callbacks should be an error or nop if vector != 0.  Thanks,

Alex

>  		if (ret)
>  			return ret;
>  
>  		ret = vfio_irq_set_block(vdev, start, count, fds, index);
>  		if (ret)
> -			vfio_intx_disable(vdev);
> +			vfio_intx_disable(vdev, index);
>  
>  		return ret;
>  	}
> @@ -771,7 +780,7 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev,
>  			return vfio_irq_set_block(vdev, start, count,
>  						  fds, index);
>  
> -		ret = vfio_msi_enable(vdev, start + count, index);
> +		ret = vfio_msi_enable(vdev, start, count, index);
>  		if (ret)
>  			return ret;
>
  
Reinette Chatre Feb. 6, 2024, 9:46 p.m. UTC | #2
Hi Alex,

On 2/5/2024 2:35 PM, Alex Williamson wrote:
> On Thu,  1 Feb 2024 20:57:09 -0800
> Reinette Chatre <reinette.chatre@intel.com> wrote:

..

>> @@ -715,13 +724,13 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev,
>>  		if (is_intx(vdev))
>>  			return vfio_irq_set_block(vdev, start, count, fds, index);
>>  
>> -		ret = vfio_intx_enable(vdev);
>> +		ret = vfio_intx_enable(vdev, start, count, index);
> 
> Please trace what happens when a user calls SET_IRQS to setup a trigger
> eventfd with start = 0, count = 1, followed by any other combination of
> start and count values once is_intx() is true.  vfio_intx_enable()
> cannot be the only place we bounds check the user, all of the INTx
> callbacks should be an error or nop if vector != 0.  Thanks,
> 

Thank you very much for catching this. I plan to add the vector
check to the device_name() and request_interrupt() callbacks. I do
not think it is necessary to add the vector check to disable() since
it does not operate on a range and from what I can tell it depends on
a successful enable() that already contains the vector check. Similar,
free_interrupt() requires a successful request_interrupt() (that will
have vector check in next version).
send_eventfd() requires a valid interrupt context that is only
possible if enable() or request_interrupt() succeeded.

If user space creates an eventfd with start = 0 and count = 1
and then attempts to trigger the eventfd using another combination then
the changes in this series will result in a nop while the current
implementation will result in -EINVAL. Is this acceptable?

Reinette
  
Alex Williamson Feb. 6, 2024, 10:03 p.m. UTC | #3
On Tue, 6 Feb 2024 13:46:37 -0800
Reinette Chatre <reinette.chatre@intel.com> wrote:

> Hi Alex,
> 
> On 2/5/2024 2:35 PM, Alex Williamson wrote:
> > On Thu,  1 Feb 2024 20:57:09 -0800
> > Reinette Chatre <reinette.chatre@intel.com> wrote:  
> 
> ..
> 
> >> @@ -715,13 +724,13 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev,
> >>  		if (is_intx(vdev))
> >>  			return vfio_irq_set_block(vdev, start, count, fds, index);
> >>  
> >> -		ret = vfio_intx_enable(vdev);
> >> +		ret = vfio_intx_enable(vdev, start, count, index);  
> > 
> > Please trace what happens when a user calls SET_IRQS to setup a trigger
> > eventfd with start = 0, count = 1, followed by any other combination of
> > start and count values once is_intx() is true.  vfio_intx_enable()
> > cannot be the only place we bounds check the user, all of the INTx
> > callbacks should be an error or nop if vector != 0.  Thanks,
> >   
> 
> Thank you very much for catching this. I plan to add the vector
> check to the device_name() and request_interrupt() callbacks. I do
> not think it is necessary to add the vector check to disable() since
> it does not operate on a range and from what I can tell it depends on
> a successful enable() that already contains the vector check. Similar,
> free_interrupt() requires a successful request_interrupt() (that will
> have vector check in next version).
> send_eventfd() requires a valid interrupt context that is only
> possible if enable() or request_interrupt() succeeded.

Sounds reasonable.

> If user space creates an eventfd with start = 0 and count = 1
> and then attempts to trigger the eventfd using another combination then
> the changes in this series will result in a nop while the current
> implementation will result in -EINVAL. Is this acceptable?

I think by nop, you mean the ioctl returns success.  Was the call a
success?  Thanks,

Alex
  
Reinette Chatre Feb. 6, 2024, 10:22 p.m. UTC | #4
Hi Alex,

On 2/6/2024 2:03 PM, Alex Williamson wrote:
> On Tue, 6 Feb 2024 13:46:37 -0800
> Reinette Chatre <reinette.chatre@intel.com> wrote:
> 
>> Hi Alex,
>>
>> On 2/5/2024 2:35 PM, Alex Williamson wrote:
>>> On Thu,  1 Feb 2024 20:57:09 -0800
>>> Reinette Chatre <reinette.chatre@intel.com> wrote:  
>>
>> ..
>>
>>>> @@ -715,13 +724,13 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev,
>>>>  		if (is_intx(vdev))
>>>>  			return vfio_irq_set_block(vdev, start, count, fds, index);
>>>>  
>>>> -		ret = vfio_intx_enable(vdev);
>>>> +		ret = vfio_intx_enable(vdev, start, count, index);  
>>>
>>> Please trace what happens when a user calls SET_IRQS to setup a trigger
>>> eventfd with start = 0, count = 1, followed by any other combination of
>>> start and count values once is_intx() is true.  vfio_intx_enable()
>>> cannot be the only place we bounds check the user, all of the INTx
>>> callbacks should be an error or nop if vector != 0.  Thanks,
>>>   
>>
>> Thank you very much for catching this. I plan to add the vector
>> check to the device_name() and request_interrupt() callbacks. I do
>> not think it is necessary to add the vector check to disable() since
>> it does not operate on a range and from what I can tell it depends on
>> a successful enable() that already contains the vector check. Similar,
>> free_interrupt() requires a successful request_interrupt() (that will
>> have vector check in next version).
>> send_eventfd() requires a valid interrupt context that is only
>> possible if enable() or request_interrupt() succeeded.
> 
> Sounds reasonable.
> 
>> If user space creates an eventfd with start = 0 and count = 1
>> and then attempts to trigger the eventfd using another combination then
>> the changes in this series will result in a nop while the current
>> implementation will result in -EINVAL. Is this acceptable?
> 
> I think by nop, you mean the ioctl returns success.  Was the call a
> success?  Thanks,

Yes, I mean the ioctl returns success without taking any
action (nop).

It is not obvious to me how to interpret "success" because from what I
understand current INTx and MSI/MSI-x are behaving differently when
considering this flow. If I understand correctly, INTx will return
an error if user space attempts to trigger an eventfd that has not
been set up while MSI and MSI-x will return 0.

I can restore existing INTx behavior by adding more logic and a return
code to the send_eventfd() callback so that the different interrupt types
can maintain their existing behavior.

Reinette
  
Alex Williamson Feb. 6, 2024, 11:19 p.m. UTC | #5
On Tue, 6 Feb 2024 14:22:04 -0800
Reinette Chatre <reinette.chatre@intel.com> wrote:

> Hi Alex,
> 
> On 2/6/2024 2:03 PM, Alex Williamson wrote:
> > On Tue, 6 Feb 2024 13:46:37 -0800
> > Reinette Chatre <reinette.chatre@intel.com> wrote:
> >   
> >> Hi Alex,
> >>
> >> On 2/5/2024 2:35 PM, Alex Williamson wrote:  
> >>> On Thu,  1 Feb 2024 20:57:09 -0800
> >>> Reinette Chatre <reinette.chatre@intel.com> wrote:    
> >>
> >> ..
> >>  
> >>>> @@ -715,13 +724,13 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev,
> >>>>  		if (is_intx(vdev))
> >>>>  			return vfio_irq_set_block(vdev, start, count, fds, index);
> >>>>  
> >>>> -		ret = vfio_intx_enable(vdev);
> >>>> +		ret = vfio_intx_enable(vdev, start, count, index);    
> >>>
> >>> Please trace what happens when a user calls SET_IRQS to setup a trigger
> >>> eventfd with start = 0, count = 1, followed by any other combination of
> >>> start and count values once is_intx() is true.  vfio_intx_enable()
> >>> cannot be the only place we bounds check the user, all of the INTx
> >>> callbacks should be an error or nop if vector != 0.  Thanks,
> >>>     
> >>
> >> Thank you very much for catching this. I plan to add the vector
> >> check to the device_name() and request_interrupt() callbacks. I do
> >> not think it is necessary to add the vector check to disable() since
> >> it does not operate on a range and from what I can tell it depends on
> >> a successful enable() that already contains the vector check. Similar,
> >> free_interrupt() requires a successful request_interrupt() (that will
> >> have vector check in next version).
> >> send_eventfd() requires a valid interrupt context that is only
> >> possible if enable() or request_interrupt() succeeded.  
> > 
> > Sounds reasonable.
> >   
> >> If user space creates an eventfd with start = 0 and count = 1
> >> and then attempts to trigger the eventfd using another combination then
> >> the changes in this series will result in a nop while the current
> >> implementation will result in -EINVAL. Is this acceptable?  
> > 
> > I think by nop, you mean the ioctl returns success.  Was the call a
> > success?  Thanks,  
> 
> Yes, I mean the ioctl returns success without taking any
> action (nop).
> 
> It is not obvious to me how to interpret "success" because from what I
> understand current INTx and MSI/MSI-x are behaving differently when
> considering this flow. If I understand correctly, INTx will return
> an error if user space attempts to trigger an eventfd that has not
> been set up while MSI and MSI-x will return 0.
> 
> I can restore existing INTx behavior by adding more logic and a return
> code to the send_eventfd() callback so that the different interrupt types
> can maintain their existing behavior.

Ah yes, I see the dilemma now.  INTx always checked start/count were
valid but MSI/X plowed through regardless, and with this series we've
standardized the loop around the MSI/X flow.

Tricky, but probably doesn't really matter.  Unless we break someone.

I can ignore that INTx can be masked and signaling a masked vector
doesn't do anything, but signaling an unconfigured vector feels like an
error condition and trying to create verbiage in the uAPI header to
weasel out of that error and unconditionally return success makes me
cringe.

What if we did this:

        uint8_t *bools = data;
	...
        for (i = start; i < start + count; i++) {
                if ((flags & VFIO_IRQ_SET_DATA_NONE) ||
                    ((flags & VFIO_IRQ_SET_DATA_BOOL) && bools[i - start])) {
                        ctx = vfio_irq_ctx_get(vdev, i);
                        if (!ctx || !ctx->trigger)
                                return -EINVAL;
                        intr_ops[index].send_eventfd(vdev, ctx);
                }
        }

And we note the behavior change for MSI/X in the commit log and if
someone shouts that we broke them, we can make that an -errno or
continue based on is_intx().  Sound ok?  Thanks,

Alex
  
Reinette Chatre Feb. 7, 2024, 11:30 p.m. UTC | #6
Hi Alex,

On 2/6/2024 3:19 PM, Alex Williamson wrote:
> On Tue, 6 Feb 2024 14:22:04 -0800
> Reinette Chatre <reinette.chatre@intel.com> wrote:
>> On 2/6/2024 2:03 PM, Alex Williamson wrote:
>>> On Tue, 6 Feb 2024 13:46:37 -0800
>>> Reinette Chatre <reinette.chatre@intel.com> wrote:
>>>> On 2/5/2024 2:35 PM, Alex Williamson wrote:  
>>>>> On Thu,  1 Feb 2024 20:57:09 -0800
>>>>> Reinette Chatre <reinette.chatre@intel.com> wrote:    
>>>>
>>>> ..
>>>>  
>>>>>> @@ -715,13 +724,13 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev,
>>>>>>  		if (is_intx(vdev))
>>>>>>  			return vfio_irq_set_block(vdev, start, count, fds, index);
>>>>>>  
>>>>>> -		ret = vfio_intx_enable(vdev);
>>>>>> +		ret = vfio_intx_enable(vdev, start, count, index);    
>>>>>
>>>>> Please trace what happens when a user calls SET_IRQS to setup a trigger
>>>>> eventfd with start = 0, count = 1, followed by any other combination of
>>>>> start and count values once is_intx() is true.  vfio_intx_enable()
>>>>> cannot be the only place we bounds check the user, all of the INTx
>>>>> callbacks should be an error or nop if vector != 0.  Thanks,
>>>>>     
>>>>
>>>> Thank you very much for catching this. I plan to add the vector
>>>> check to the device_name() and request_interrupt() callbacks. I do
>>>> not think it is necessary to add the vector check to disable() since
>>>> it does not operate on a range and from what I can tell it depends on
>>>> a successful enable() that already contains the vector check. Similar,
>>>> free_interrupt() requires a successful request_interrupt() (that will
>>>> have vector check in next version).
>>>> send_eventfd() requires a valid interrupt context that is only
>>>> possible if enable() or request_interrupt() succeeded.  
>>>
>>> Sounds reasonable.
>>>   
>>>> If user space creates an eventfd with start = 0 and count = 1
>>>> and then attempts to trigger the eventfd using another combination then
>>>> the changes in this series will result in a nop while the current
>>>> implementation will result in -EINVAL. Is this acceptable?  
>>>
>>> I think by nop, you mean the ioctl returns success.  Was the call a
>>> success?  Thanks,  
>>
>> Yes, I mean the ioctl returns success without taking any
>> action (nop).
>>
>> It is not obvious to me how to interpret "success" because from what I
>> understand current INTx and MSI/MSI-x are behaving differently when
>> considering this flow. If I understand correctly, INTx will return
>> an error if user space attempts to trigger an eventfd that has not
>> been set up while MSI and MSI-x will return 0.
>>
>> I can restore existing INTx behavior by adding more logic and a return
>> code to the send_eventfd() callback so that the different interrupt types
>> can maintain their existing behavior.
> 
> Ah yes, I see the dilemma now.  INTx always checked start/count were
> valid but MSI/X plowed through regardless, and with this series we've
> standardized the loop around the MSI/X flow.
> 
> Tricky, but probably doesn't really matter.  Unless we break someone.
> 
> I can ignore that INTx can be masked and signaling a masked vector
> doesn't do anything, but signaling an unconfigured vector feels like an
> error condition and trying to create verbiage in the uAPI header to
> weasel out of that error and unconditionally return success makes me
> cringe.
> 
> What if we did this:
> 
>         uint8_t *bools = data;
> 	...
>         for (i = start; i < start + count; i++) {
>                 if ((flags & VFIO_IRQ_SET_DATA_NONE) ||
>                     ((flags & VFIO_IRQ_SET_DATA_BOOL) && bools[i - start])) {
>                         ctx = vfio_irq_ctx_get(vdev, i);
>                         if (!ctx || !ctx->trigger)
>                                 return -EINVAL;
>                         intr_ops[index].send_eventfd(vdev, ctx);
>                 }
>         }
> 

This looks good. Thank you very much. Will do.

I studied the code more and have one more observation related to this portion
of the flow:
From what I can tell this change makes the INTx code more robust. If I
understand current implementation correctly it seems possible to enable
INTx but not have interrupt allocated. In this case the interrupt context
(ctx) will exist but ctx->trigger will be NULL. Current
vfio_pci_set_intx_trigger()->vfio_send_intx_eventfd() only checks if
ctx is valid. It looks like it may call eventfd_signal(NULL) where
pointer is dereferenced.

If this is correct then I think a separate fix that can easily be
backported may be needed. Something like:

diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 237beac83809..17ec46d8ab29 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -92,7 +92,7 @@ static void vfio_send_intx_eventfd(void *opaque, void *unused)
 		struct vfio_pci_irq_ctx *ctx;
 
 		ctx = vfio_irq_ctx_get(vdev, 0);
-		if (WARN_ON_ONCE(!ctx))
+		if (WARN_ON_ONCE(!ctx || !ctx->trigger))
 			return;
 		eventfd_signal(ctx->trigger);
 	}

> And we note the behavior change for MSI/X in the commit log and if
> someone shouts that we broke them, we can make that an -errno or
> continue based on is_intx().  Sound ok?  Thanks,

I'll be sure to highlight the impact on MSI/MSI-x. Please do expect this
in the final patch "vfio/pci: Remove duplicate interrupt management flow"
though since that is where the different flows are merged.

I am not familiar with how all user space interacts with this flow and if/how
this may break things. I did look at Qemu code and I was not able to find
where it intentionally triggers MSI/MSI-x interrupts, I could only find it
for INTx.

If this does break things I would like to also consider moving the
different behavior into the interrupt type's respective send_eventfd()
callback instead of adding interrupt type specific code (like
is_intx()) into the shared flow.

Thank you.

Reinette
  
Alex Williamson Feb. 8, 2024, 9:08 p.m. UTC | #7
On Wed, 7 Feb 2024 15:30:15 -0800
Reinette Chatre <reinette.chatre@intel.com> wrote:

> Hi Alex,
> 
> On 2/6/2024 3:19 PM, Alex Williamson wrote:
> > On Tue, 6 Feb 2024 14:22:04 -0800
> > Reinette Chatre <reinette.chatre@intel.com> wrote:  
> >> On 2/6/2024 2:03 PM, Alex Williamson wrote:  
> >>> On Tue, 6 Feb 2024 13:46:37 -0800
> >>> Reinette Chatre <reinette.chatre@intel.com> wrote:  
> >>>> On 2/5/2024 2:35 PM, Alex Williamson wrote:    
> >>>>> On Thu,  1 Feb 2024 20:57:09 -0800
> >>>>> Reinette Chatre <reinette.chatre@intel.com> wrote:      
> >>>>
> >>>> ..
> >>>>    
> >>>>>> @@ -715,13 +724,13 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev,
> >>>>>>  		if (is_intx(vdev))
> >>>>>>  			return vfio_irq_set_block(vdev, start, count, fds, index);
> >>>>>>  
> >>>>>> -		ret = vfio_intx_enable(vdev);
> >>>>>> +		ret = vfio_intx_enable(vdev, start, count, index);      
> >>>>>
> >>>>> Please trace what happens when a user calls SET_IRQS to setup a trigger
> >>>>> eventfd with start = 0, count = 1, followed by any other combination of
> >>>>> start and count values once is_intx() is true.  vfio_intx_enable()
> >>>>> cannot be the only place we bounds check the user, all of the INTx
> >>>>> callbacks should be an error or nop if vector != 0.  Thanks,
> >>>>>       
> >>>>
> >>>> Thank you very much for catching this. I plan to add the vector
> >>>> check to the device_name() and request_interrupt() callbacks. I do
> >>>> not think it is necessary to add the vector check to disable() since
> >>>> it does not operate on a range and from what I can tell it depends on
> >>>> a successful enable() that already contains the vector check. Similar,
> >>>> free_interrupt() requires a successful request_interrupt() (that will
> >>>> have vector check in next version).
> >>>> send_eventfd() requires a valid interrupt context that is only
> >>>> possible if enable() or request_interrupt() succeeded.    
> >>>
> >>> Sounds reasonable.
> >>>     
> >>>> If user space creates an eventfd with start = 0 and count = 1
> >>>> and then attempts to trigger the eventfd using another combination then
> >>>> the changes in this series will result in a nop while the current
> >>>> implementation will result in -EINVAL. Is this acceptable?    
> >>>
> >>> I think by nop, you mean the ioctl returns success.  Was the call a
> >>> success?  Thanks,    
> >>
> >> Yes, I mean the ioctl returns success without taking any
> >> action (nop).
> >>
> >> It is not obvious to me how to interpret "success" because from what I
> >> understand current INTx and MSI/MSI-x are behaving differently when
> >> considering this flow. If I understand correctly, INTx will return
> >> an error if user space attempts to trigger an eventfd that has not
> >> been set up while MSI and MSI-x will return 0.
> >>
> >> I can restore existing INTx behavior by adding more logic and a return
> >> code to the send_eventfd() callback so that the different interrupt types
> >> can maintain their existing behavior.  
> > 
> > Ah yes, I see the dilemma now.  INTx always checked start/count were
> > valid but MSI/X plowed through regardless, and with this series we've
> > standardized the loop around the MSI/X flow.
> > 
> > Tricky, but probably doesn't really matter.  Unless we break someone.
> > 
> > I can ignore that INTx can be masked and signaling a masked vector
> > doesn't do anything, but signaling an unconfigured vector feels like an
> > error condition and trying to create verbiage in the uAPI header to
> > weasel out of that error and unconditionally return success makes me
> > cringe.
> > 
> > What if we did this:
> > 
> >         uint8_t *bools = data;
> > 	...
> >         for (i = start; i < start + count; i++) {
> >                 if ((flags & VFIO_IRQ_SET_DATA_NONE) ||
> >                     ((flags & VFIO_IRQ_SET_DATA_BOOL) && bools[i - start])) {
> >                         ctx = vfio_irq_ctx_get(vdev, i);
> >                         if (!ctx || !ctx->trigger)
> >                                 return -EINVAL;
> >                         intr_ops[index].send_eventfd(vdev, ctx);
> >                 }
> >         }
> >   
> 
> This looks good. Thank you very much. Will do.
> 
> I studied the code more and have one more observation related to this portion
> of the flow:
> From what I can tell this change makes the INTx code more robust. If I
> understand current implementation correctly it seems possible to enable
> INTx but not have interrupt allocated. In this case the interrupt context
> (ctx) will exist but ctx->trigger will be NULL. Current
> vfio_pci_set_intx_trigger()->vfio_send_intx_eventfd() only checks if
> ctx is valid. It looks like it may call eventfd_signal(NULL) where
> pointer is dereferenced.
> 
> If this is correct then I think a separate fix that can easily be
> backported may be needed. Something like:

Good find.  I think it's a bit more complicated though.  There are
several paths to vfio_send_intx_eventfd:

 - vfio_intx_handler

	This can only be called between request_irq() and free_irq()
	where trigger is always valid.  igate is not held.

 - vfio_pci_intx_unmask

	Callers hold igate, additional test of ctx->trigger makes this
	safe.

 - vfio_pci_set_intx_trigger

	Same as above.

 - Through unmask eventfd (virqfd)

	Here be dragons.

In the virqfd case, a write to the eventfd calls virqfd_wakeup() where
we'll call the handler, vfio_pci_intx_unmask_handler(), and based on
the result schedule the thread, vfio_send_intx_eventfd().  Both of
these look suspicious.  They're not called under igate, so testing
ctx->trigger doesn't resolve the race.

I think an option is to wrap the virqfd entry points in igate where we
can then do something similar to your suggestion.  I don't think we
want to WARN_ON(!ctx->trigger) because that's then a user reachable
condition.  Instead we can just quietly follow the same exit paths.

I think that means we end up with something like this:

diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 237beac83809..ace7e1dbc607 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -92,12 +92,21 @@ static void vfio_send_intx_eventfd(void *opaque, void *unused)
 		struct vfio_pci_irq_ctx *ctx;
 
 		ctx = vfio_irq_ctx_get(vdev, 0);
-		if (WARN_ON_ONCE(!ctx))
+		if (WARN_ON_ONCE(!ctx) || !ctx->trigger)
 			return;
 		eventfd_signal(ctx->trigger);
 	}
 }
 
+static void vfio_send_intx_eventfd_virqfd(void *opaque, void *unused)
+{
+	struct vfio_pci_core_device *vdev = opaque;
+
+	mutex_lock(&vdev->igate);
+	vfio_send_intx_eventfd(opaque, unused);
+	mutex_unlock(&vdev->igate);
+}
+
 /* Returns true if the INTx vfio_pci_irq_ctx.masked value is changed. */
 bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
 {
@@ -170,7 +179,7 @@ static int vfio_pci_intx_unmask_handler(void *opaque, void *unused)
 	}
 
 	ctx = vfio_irq_ctx_get(vdev, 0);
-	if (WARN_ON_ONCE(!ctx))
+	if (WARN_ON_ONCE(!ctx) || !ctx->trigger)
 		goto out_unlock;
 
 	if (ctx->masked && !vdev->virq_disabled) {
@@ -194,6 +203,18 @@ static int vfio_pci_intx_unmask_handler(void *opaque, void *unused)
 	return ret;
 }
 
+static int vfio_pci_intx_unmask_handler_virqfd(void *opaque, void *unused)
+{
+	struct vfio_pci_core_device *vdev = opaque;
+	int ret;
+
+	mutex_lock(&vdev->igate);
+	ret = vfio_pci_intx_unmask_handler(opaque, unused);
+	mutex_unlock(&vdev->igate);
+
+	return ret;
+}
+
 void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev)
 {
 	if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0)
@@ -572,10 +593,10 @@ static int vfio_pci_set_intx_unmask(struct vfio_pci_core_device *vdev,
 		if (WARN_ON_ONCE(!ctx))
 			return -EINVAL;
 		if (fd >= 0)
-			return vfio_virqfd_enable((void *) vdev,
-						  vfio_pci_intx_unmask_handler,
-						  vfio_send_intx_eventfd, NULL,
-						  &ctx->unmask, fd);
+			return vfio_virqfd_enable((void *)vdev,
+					vfio_pci_intx_unmask_handler_virqfd,
+					vfio_send_intx_eventfd_virqfd, NULL,
+					&ctx->unmask, fd);
 
 		vfio_virqfd_disable(&ctx->unmask);
 	}


WDYT?
 
> > And we note the behavior change for MSI/X in the commit log and if
> > someone shouts that we broke them, we can make that an -errno or
> > continue based on is_intx().  Sound ok?  Thanks,  
> 
> I'll be sure to highlight the impact on MSI/MSI-x. Please do expect this
> in the final patch "vfio/pci: Remove duplicate interrupt management flow"
> though since that is where the different flows are merged.
> 
> I am not familiar with how all user space interacts with this flow and if/how
> this may break things. I did look at Qemu code and I was not able to find
> where it intentionally triggers MSI/MSI-x interrupts, I could only find it
> for INTx.

Being able to trigger the interrupt via ioctl is more of a diagnostic
feature, not typically used in production.
 
> If this does break things I would like to also consider moving the
> different behavior into the interrupt type's respective send_eventfd()
> callback instead of adding interrupt type specific code (like
> is_intx()) into the shared flow.

Sure, we can pick the best option in the slim (imo) chance the change
affects anyone.  Thanks,

Alex
  
Reinette Chatre Feb. 14, 2024, 7:37 p.m. UTC | #8
Hi Alex,

Apologies for the delay. This was due to two parts:
* I studied these code paths more and I think there may be one more
  flow that can be made more robust (more later),
* I spent a lot of time trying to trigger all the affected code paths but
  here I did not have much luck. I would prefer to run tests that trigger
  these flows so that I can get some help from the kernel lock debugging.
  From what I can tell the irqfd flows can be triggered with the help of the
  kernel-irqchip option to Qemu but on the hardware I have access to I
  could only try kernel-irqchip=auto and that did not trigger the flows
  (the irqfd is set up but the eventfd is never signaled).
  I am not familiar with this area, do you perhaps have guidance on how
  I can test these flows or do you perhaps have access to needed environments
  to test this?

On 2/8/2024 1:08 PM, Alex Williamson wrote:
> On Wed, 7 Feb 2024 15:30:15 -0800
> Reinette Chatre <reinette.chatre@intel.com> wrote:

>> I studied the code more and have one more observation related to this portion
>> of the flow:
>> From what I can tell this change makes the INTx code more robust. If I
>> understand current implementation correctly it seems possible to enable
>> INTx but not have interrupt allocated. In this case the interrupt context
>> (ctx) will exist but ctx->trigger will be NULL. Current
>> vfio_pci_set_intx_trigger()->vfio_send_intx_eventfd() only checks if
>> ctx is valid. It looks like it may call eventfd_signal(NULL) where
>> pointer is dereferenced.
>>
>> If this is correct then I think a separate fix that can easily be
>> backported may be needed. Something like:
> 
> Good find.  I think it's a bit more complicated though.  There are
> several paths to vfio_send_intx_eventfd:
> 
>  - vfio_intx_handler
> 
> 	This can only be called between request_irq() and free_irq()
> 	where trigger is always valid.  igate is not held.
> 
>  - vfio_pci_intx_unmask
> 
> 	Callers hold igate, additional test of ctx->trigger makes this
> 	safe.

Two callers of vfio_pci_intx_unmask() do not seem to hold igate:
vfio_basic_config_write() and vfio_pci_core_runtime_resume().

Considering this I wonder if we could add something like below to the
solution you propose. On a high level the outside callers (VFIO PCI core)
will keep using vfio_pci_intx_unmask() that will now take igate while
the interrupt management code gets a new internal __vfio_pci_intx_unmask()
that should be called with igate held. This results in:

@@ -215,12 +223,20 @@ static int vfio_pci_intx_unmask_handler_virqfd(void *opaque, void *unused)
 	return ret;
 }
 
-void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev)
+static void __vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev)
 {
+	lockdep_assert_held(&vdev->igate);
 	if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0)
 		vfio_send_intx_eventfd(vdev, NULL);
 }
 
+void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev)
+{
+	mutex_lock(&vdev->igate);
+	__vfio_pci_intx_unmask(vdev);
+	mutex_unlock(&vdev->igate);
+}
+
 static irqreturn_t vfio_intx_handler(int irq, void *dev_id)
 {
 	struct vfio_pci_core_device *vdev = dev_id;
@@ -581,11 +597,11 @@ static int vfio_pci_set_intx_unmask(struct vfio_pci_core_device *vdev,
 		return -EINVAL;
 
 	if (flags & VFIO_IRQ_SET_DATA_NONE) {
-		vfio_pci_intx_unmask(vdev);
+		__vfio_pci_intx_unmask(vdev);
 	} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
 		uint8_t unmask = *(uint8_t *)data;
 		if (unmask)
-			vfio_pci_intx_unmask(vdev);
+			__vfio_pci_intx_unmask(vdev);
 	} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
 		struct vfio_pci_irq_ctx *ctx = vfio_irq_ctx_get(vdev, 0);
 		int32_t fd = *(int32_t *)data;

> 
>  - vfio_pci_set_intx_trigger
> 
> 	Same as above.
> 
>  - Through unmask eventfd (virqfd)
> 
> 	Here be dragons.
> 
> In the virqfd case, a write to the eventfd calls virqfd_wakeup() where
> we'll call the handler, vfio_pci_intx_unmask_handler(), and based on
> the result schedule the thread, vfio_send_intx_eventfd().  Both of
> these look suspicious.  They're not called under igate, so testing
> ctx->trigger doesn't resolve the race.
> 
> I think an option is to wrap the virqfd entry points in igate where we
> can then do something similar to your suggestion.  I don't think we
> want to WARN_ON(!ctx->trigger) because that's then a user reachable
> condition.  Instead we can just quietly follow the same exit paths.
> 
> I think that means we end up with something like this:
> 
> diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
> index 237beac83809..ace7e1dbc607 100644
> --- a/drivers/vfio/pci/vfio_pci_intrs.c
> +++ b/drivers/vfio/pci/vfio_pci_intrs.c
> @@ -92,12 +92,21 @@ static void vfio_send_intx_eventfd(void *opaque, void *unused)
>  		struct vfio_pci_irq_ctx *ctx;
>  
>  		ctx = vfio_irq_ctx_get(vdev, 0);
> -		if (WARN_ON_ONCE(!ctx))
> +		if (WARN_ON_ONCE(!ctx) || !ctx->trigger)
>  			return;
>  		eventfd_signal(ctx->trigger);
>  	}
>  }
>  
> +static void vfio_send_intx_eventfd_virqfd(void *opaque, void *unused)
> +{
> +	struct vfio_pci_core_device *vdev = opaque;
> +
> +	mutex_lock(&vdev->igate);
> +	vfio_send_intx_eventfd(opaque, unused);
> +	mutex_unlock(&vdev->igate);
> +}
> +
>  /* Returns true if the INTx vfio_pci_irq_ctx.masked value is changed. */
>  bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
>  {
> @@ -170,7 +179,7 @@ static int vfio_pci_intx_unmask_handler(void *opaque, void *unused)
>  	}
>  
>  	ctx = vfio_irq_ctx_get(vdev, 0);
> -	if (WARN_ON_ONCE(!ctx))
> +	if (WARN_ON_ONCE(!ctx) || !ctx->trigger)
>  		goto out_unlock;
>  
>  	if (ctx->masked && !vdev->virq_disabled) {
> @@ -194,6 +203,18 @@ static int vfio_pci_intx_unmask_handler(void *opaque, void *unused)
>  	return ret;
>  }
>  
> +static int vfio_pci_intx_unmask_handler_virqfd(void *opaque, void *unused)
> +{
> +	struct vfio_pci_core_device *vdev = opaque;
> +	int ret;
> +
> +	mutex_lock(&vdev->igate);
> +	ret = vfio_pci_intx_unmask_handler(opaque, unused);
> +	mutex_unlock(&vdev->igate);
> +
> +	return ret;
> +}
> +
>  void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev)
>  {
>  	if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0)
> @@ -572,10 +593,10 @@ static int vfio_pci_set_intx_unmask(struct vfio_pci_core_device *vdev,
>  		if (WARN_ON_ONCE(!ctx))
>  			return -EINVAL;
>  		if (fd >= 0)
> -			return vfio_virqfd_enable((void *) vdev,
> -						  vfio_pci_intx_unmask_handler,
> -						  vfio_send_intx_eventfd, NULL,
> -						  &ctx->unmask, fd);
> +			return vfio_virqfd_enable((void *)vdev,
> +					vfio_pci_intx_unmask_handler_virqfd,
> +					vfio_send_intx_eventfd_virqfd, NULL,
> +					&ctx->unmask, fd);
>  
>  		vfio_virqfd_disable(&ctx->unmask);
>  	}
> 
> 
> WDYT?

This looks good to me. Thank you very much for taking the time to
write it.

Reinette
  

Patch

diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 37065623d286..9217fea3f636 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -257,13 +257,18 @@  static irqreturn_t vfio_intx_handler(int irq, void *dev_id)
 	return ret;
 }
 
-static int vfio_intx_enable(struct vfio_pci_core_device *vdev)
+static int vfio_intx_enable(struct vfio_pci_core_device *vdev,
+			    unsigned int start, unsigned int count,
+			    unsigned int __always_unused index)
 {
 	struct vfio_pci_irq_ctx *ctx;
 
 	if (!is_irq_none(vdev))
 		return -EINVAL;
 
+	if (start != 0 || count != 1)
+		return -EINVAL;
+
 	if (!vdev->pdev->irq)
 		return -ENODEV;
 
@@ -332,7 +337,8 @@  static char *vfio_intx_device_name(struct vfio_pci_core_device *vdev,
 	return kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)", pci_name(pdev));
 }
 
-static void vfio_intx_disable(struct vfio_pci_core_device *vdev)
+static void vfio_intx_disable(struct vfio_pci_core_device *vdev,
+			      unsigned int __always_unused index)
 {
 	struct vfio_pci_irq_ctx *ctx;
 
@@ -358,17 +364,20 @@  static irqreturn_t vfio_msihandler(int irq, void *arg)
 	return IRQ_HANDLED;
 }
 
-static int vfio_msi_enable(struct vfio_pci_core_device *vdev, int nvec,
+static int vfio_msi_enable(struct vfio_pci_core_device *vdev,
+			   unsigned int start, unsigned int count,
 			   unsigned int index)
 {
 	struct pci_dev *pdev = vdev->pdev;
 	unsigned int flag;
-	int ret;
+	int ret, nvec;
 	u16 cmd;
 
 	if (!is_irq_none(vdev))
 		return -EINVAL;
 
+	nvec = start + count;
+
 	flag = (index == VFIO_PCI_MSIX_IRQ_INDEX) ? PCI_IRQ_MSIX : PCI_IRQ_MSI;
 
 	/* return the number of supported vectors if we can't get all: */
@@ -701,11 +710,11 @@  static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev,
 	unsigned int i;
 
 	if (is_intx(vdev) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) {
-		vfio_intx_disable(vdev);
+		vfio_intx_disable(vdev, index);
 		return 0;
 	}
 
-	if (!(is_intx(vdev) || is_irq_none(vdev)) || start != 0 || count != 1)
+	if (!(is_intx(vdev) || is_irq_none(vdev)))
 		return -EINVAL;
 
 	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
@@ -715,13 +724,13 @@  static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev,
 		if (is_intx(vdev))
 			return vfio_irq_set_block(vdev, start, count, fds, index);
 
-		ret = vfio_intx_enable(vdev);
+		ret = vfio_intx_enable(vdev, start, count, index);
 		if (ret)
 			return ret;
 
 		ret = vfio_irq_set_block(vdev, start, count, fds, index);
 		if (ret)
-			vfio_intx_disable(vdev);
+			vfio_intx_disable(vdev, index);
 
 		return ret;
 	}
@@ -771,7 +780,7 @@  static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev,
 			return vfio_irq_set_block(vdev, start, count,
 						  fds, index);
 
-		ret = vfio_msi_enable(vdev, start + count, index);
+		ret = vfio_msi_enable(vdev, start, count, index);
 		if (ret)
 			return ret;