[2/2] iommu/vt-d: Use device rbtree in iopf reporting path

Message ID 20240215072249.4465-3-baolu.lu@linux.intel.com
State New
Headers
Series iommu/vt-d: Introduce rbtree for probed devices |

Commit Message

Baolu Lu Feb. 15, 2024, 7:22 a.m. UTC
  The existing IO page fault handler currently locates the PCI device by
calling pci_get_domain_bus_and_slot(). This function searches the list
of all PCI devices until the desired device is found. To improve lookup
efficiency, a helper function named device_rbtree_find() is introduced
to search for the device within the rbtree. Replace
pci_get_domain_bus_and_slot() in the IO page fault handling path.

Co-developed-by: Huang Jiaqing <jiaqing.huang@intel.com>
Signed-off-by: Huang Jiaqing <jiaqing.huang@intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
 drivers/iommu/intel/iommu.h |  1 +
 drivers/iommu/intel/iommu.c | 29 +++++++++++++++++++++++++++++
 drivers/iommu/intel/svm.c   | 14 ++++++--------
 3 files changed, 36 insertions(+), 8 deletions(-)
  

Comments

Jason Gunthorpe Feb. 15, 2024, 5:55 p.m. UTC | #1
On Thu, Feb 15, 2024 at 03:22:49PM +0800, Lu Baolu wrote:
> The existing IO page fault handler currently locates the PCI device by
> calling pci_get_domain_bus_and_slot(). This function searches the list
> of all PCI devices until the desired device is found. To improve lookup
> efficiency, a helper function named device_rbtree_find() is introduced
> to search for the device within the rbtree. Replace
> pci_get_domain_bus_and_slot() in the IO page fault handling path.
> 
> Co-developed-by: Huang Jiaqing <jiaqing.huang@intel.com>
> Signed-off-by: Huang Jiaqing <jiaqing.huang@intel.com>
> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
> ---
>  drivers/iommu/intel/iommu.h |  1 +
>  drivers/iommu/intel/iommu.c | 29 +++++++++++++++++++++++++++++
>  drivers/iommu/intel/svm.c   | 14 ++++++--------
>  3 files changed, 36 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
> index 54eeaa8e35a9..f13c228924f8 100644
> --- a/drivers/iommu/intel/iommu.h
> +++ b/drivers/iommu/intel/iommu.h
> @@ -1081,6 +1081,7 @@ void free_pgtable_page(void *vaddr);
>  void iommu_flush_write_buffer(struct intel_iommu *iommu);
>  struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
>  					       const struct iommu_user_data *user_data);
> +struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid);
>  
>  #ifdef CONFIG_INTEL_IOMMU_SVM
>  void intel_svm_check(struct intel_iommu *iommu);
> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> index 09009d96e553..d92c680bcc96 100644
> --- a/drivers/iommu/intel/iommu.c
> +++ b/drivers/iommu/intel/iommu.c
> @@ -120,6 +120,35 @@ static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
>  	return device_rid_cmp_key(&key, rhs);
>  }
>  
> +/*
> + * Looks up an IOMMU-probed device using its source ID.
> + *
> + * If the device is found:
> + *  - Increments its reference count.
> + *  - Returns a pointer to the device.
> + *  - The caller must call put_device() after using the pointer.
> + *
> + * If the device is not found, returns NULL.
> + */
> +struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
> +{
> +	struct device_domain_info *info;
> +	struct device *dev = NULL;
> +	struct rb_node *node;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
> +	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
> +	if (node) {
> +		info = rb_entry(node, struct device_domain_info, node);
> +		dev = info->dev;
> +		get_device(dev);

This get_device() is a bit troubling. It eventually calls into
iommu_report_device_fault() which does:

	struct dev_iommu *param = dev->iommu;

Which is going to explode if the iomm driver release has already
happened, which is a precondition to getting to a unref'd struct
device.

The driver needs to do something to fence these events during it's
release function.

If we are already doing that then I'd suggest to drop the get_device
and add a big fat comment explaining the special rules about lifetime
that are in effect here.

Otherwise you need to do that barrier rethink the way the locking
works..

Aside from that this looks like a great improvement to me

Thanks,
Jason
  
Baolu Lu Feb. 18, 2024, 7:02 a.m. UTC | #2
On 2024/2/16 1:55, Jason Gunthorpe wrote:
> On Thu, Feb 15, 2024 at 03:22:49PM +0800, Lu Baolu wrote:
>> The existing IO page fault handler currently locates the PCI device by
>> calling pci_get_domain_bus_and_slot(). This function searches the list
>> of all PCI devices until the desired device is found. To improve lookup
>> efficiency, a helper function named device_rbtree_find() is introduced
>> to search for the device within the rbtree. Replace
>> pci_get_domain_bus_and_slot() in the IO page fault handling path.
>>
>> Co-developed-by: Huang Jiaqing <jiaqing.huang@intel.com>
>> Signed-off-by: Huang Jiaqing <jiaqing.huang@intel.com>
>> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
>> ---
>>   drivers/iommu/intel/iommu.h |  1 +
>>   drivers/iommu/intel/iommu.c | 29 +++++++++++++++++++++++++++++
>>   drivers/iommu/intel/svm.c   | 14 ++++++--------
>>   3 files changed, 36 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
>> index 54eeaa8e35a9..f13c228924f8 100644
>> --- a/drivers/iommu/intel/iommu.h
>> +++ b/drivers/iommu/intel/iommu.h
>> @@ -1081,6 +1081,7 @@ void free_pgtable_page(void *vaddr);
>>   void iommu_flush_write_buffer(struct intel_iommu *iommu);
>>   struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
>>   					       const struct iommu_user_data *user_data);
>> +struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid);
>>   
>>   #ifdef CONFIG_INTEL_IOMMU_SVM
>>   void intel_svm_check(struct intel_iommu *iommu);
>> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
>> index 09009d96e553..d92c680bcc96 100644
>> --- a/drivers/iommu/intel/iommu.c
>> +++ b/drivers/iommu/intel/iommu.c
>> @@ -120,6 +120,35 @@ static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
>>   	return device_rid_cmp_key(&key, rhs);
>>   }
>>   
>> +/*
>> + * Looks up an IOMMU-probed device using its source ID.
>> + *
>> + * If the device is found:
>> + *  - Increments its reference count.
>> + *  - Returns a pointer to the device.
>> + *  - The caller must call put_device() after using the pointer.
>> + *
>> + * If the device is not found, returns NULL.
>> + */
>> +struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
>> +{
>> +	struct device_domain_info *info;
>> +	struct device *dev = NULL;
>> +	struct rb_node *node;
>> +	unsigned long flags;
>> +
>> +	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
>> +	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
>> +	if (node) {
>> +		info = rb_entry(node, struct device_domain_info, node);
>> +		dev = info->dev;
>> +		get_device(dev);
> 
> This get_device() is a bit troubling. It eventually calls into
> iommu_report_device_fault() which does:
> 
> 	struct dev_iommu *param = dev->iommu;
> 
> Which is going to explode if the iomm driver release has already
> happened, which is a precondition to getting to a unref'd struct
> device.
> 
> The driver needs to do something to fence these events during it's
> release function.

Yes, theoretically the dev->iommu should be protected in the
iommu_report_device_fault() path.

> 
> If we are already doing that then I'd suggest to drop the get_device
> and add a big fat comment explaining the special rules about lifetime
> that are in effect here.
> 
> Otherwise you need to do that barrier rethink the way the locking
> works..

A device hot removing goes through at least the following steps:

- Disable PRI.
- Drain all outstanding I/O page faults.
- Stop DMA.
- Unload the device driver.
- Call iommu_release_device() upon the BUS_NOTIFY_REMOVED_DEVICE event.

This sequence ensures that a device cannot generate an I/O page fault
after PRI has been disabled. So in reality it's impossible for a device
to generate an I/O page fault before disabling PRI and then go through
the long journey to reach iommu_release_device() before
iopf_get_dev_fault_param() is called in page fault interrupt handling
thread.

Considering this behavior, adding a comment to the code explaining the
sequence and removing put_device() may be a simpler solution?

> 
> Aside from that this looks like a great improvement to me
> 
> Thanks,
> Jason

Best regards,
baolu
  
Ethan Zhao Feb. 19, 2024, 6:54 a.m. UTC | #3
On 2/15/2024 3:22 PM, Lu Baolu wrote:
> The existing IO page fault handler currently locates the PCI device by
> calling pci_get_domain_bus_and_slot(). This function searches the list
> of all PCI devices until the desired device is found. To improve lookup
> efficiency, a helper function named device_rbtree_find() is introduced
> to search for the device within the rbtree. Replace
> pci_get_domain_bus_and_slot() in the IO page fault handling path.
>
> Co-developed-by: Huang Jiaqing <jiaqing.huang@intel.com>
> Signed-off-by: Huang Jiaqing <jiaqing.huang@intel.com>
> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
> ---
>   drivers/iommu/intel/iommu.h |  1 +
>   drivers/iommu/intel/iommu.c | 29 +++++++++++++++++++++++++++++
>   drivers/iommu/intel/svm.c   | 14 ++++++--------
>   3 files changed, 36 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
> index 54eeaa8e35a9..f13c228924f8 100644
> --- a/drivers/iommu/intel/iommu.h
> +++ b/drivers/iommu/intel/iommu.h
> @@ -1081,6 +1081,7 @@ void free_pgtable_page(void *vaddr);
>   void iommu_flush_write_buffer(struct intel_iommu *iommu);
>   struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
>   					       const struct iommu_user_data *user_data);
> +struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid);
>   
>   #ifdef CONFIG_INTEL_IOMMU_SVM
>   void intel_svm_check(struct intel_iommu *iommu);
> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> index 09009d96e553..d92c680bcc96 100644
> --- a/drivers/iommu/intel/iommu.c
> +++ b/drivers/iommu/intel/iommu.c
> @@ -120,6 +120,35 @@ static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
>   	return device_rid_cmp_key(&key, rhs);
>   }
>   
> +/*
> + * Looks up an IOMMU-probed device using its source ID.
> + *
> + * If the device is found:
> + *  - Increments its reference count.
> + *  - Returns a pointer to the device.
> + *  - The caller must call put_device() after using the pointer.
> + *
> + * If the device is not found, returns NULL.
> + */
> +struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
> +{
> +	struct device_domain_info *info;
> +	struct device *dev = NULL;
> +	struct rb_node *node;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);

Though per iommu device rbtree isn't a big tree, given already holds spin_lock
why still needs irq off ?


Thanks,
Ethan

> +	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
> +	if (node) {
> +		info = rb_entry(node, struct device_domain_info, node);
> +		dev = info->dev;
> +		get_device(dev);
> +	}
> +	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
> +
> +	return dev;
> +}
> +
>   static int device_rbtree_insert(struct intel_iommu *iommu,
>   				struct device_domain_info *info)
>   {
> diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
> index b644d57da841..717b7041973c 100644
> --- a/drivers/iommu/intel/svm.c
> +++ b/drivers/iommu/intel/svm.c
> @@ -645,7 +645,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
>   	struct intel_iommu *iommu = d;
>   	struct page_req_dsc *req;
>   	int head, tail, handled;
> -	struct pci_dev *pdev;
> +	struct device *dev;
>   	u64 address;
>   
>   	/*
> @@ -691,21 +691,19 @@ static irqreturn_t prq_event_thread(int irq, void *d)
>   		if (unlikely(req->lpig && !req->rd_req && !req->wr_req))
>   			goto prq_advance;
>   
> -		pdev = pci_get_domain_bus_and_slot(iommu->segment,
> -						   PCI_BUS_NUM(req->rid),
> -						   req->rid & 0xff);
>   		/*
>   		 * If prq is to be handled outside iommu driver via receiver of
>   		 * the fault notifiers, we skip the page response here.
>   		 */
> -		if (!pdev)
> +		dev = device_rbtree_find(iommu, req->rid);
> +		if (!dev)
>   			goto bad_req;
>   
> -		intel_svm_prq_report(iommu, &pdev->dev, req);
> -		trace_prq_report(iommu, &pdev->dev, req->qw_0, req->qw_1,
> +		intel_svm_prq_report(iommu, dev, req);
> +		trace_prq_report(iommu, dev, req->qw_0, req->qw_1,
>   				 req->priv_data[0], req->priv_data[1],
>   				 iommu->prq_seq_number++);
> -		pci_dev_put(pdev);
> +		put_device(dev);
>   prq_advance:
>   		head = (head + sizeof(*req)) & PRQ_RING_MASK;
>   	}
  
Baolu Lu Feb. 19, 2024, 6:58 a.m. UTC | #4
On 2024/2/19 14:54, Ethan Zhao wrote:
> On 2/15/2024 3:22 PM, Lu Baolu wrote:
>> The existing IO page fault handler currently locates the PCI device by
>> calling pci_get_domain_bus_and_slot(). This function searches the list
>> of all PCI devices until the desired device is found. To improve lookup
>> efficiency, a helper function named device_rbtree_find() is introduced
>> to search for the device within the rbtree. Replace
>> pci_get_domain_bus_and_slot() in the IO page fault handling path.
>>
>> Co-developed-by: Huang Jiaqing <jiaqing.huang@intel.com>
>> Signed-off-by: Huang Jiaqing <jiaqing.huang@intel.com>
>> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
>> ---
>>   drivers/iommu/intel/iommu.h |  1 +
>>   drivers/iommu/intel/iommu.c | 29 +++++++++++++++++++++++++++++
>>   drivers/iommu/intel/svm.c   | 14 ++++++--------
>>   3 files changed, 36 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
>> index 54eeaa8e35a9..f13c228924f8 100644
>> --- a/drivers/iommu/intel/iommu.h
>> +++ b/drivers/iommu/intel/iommu.h
>> @@ -1081,6 +1081,7 @@ void free_pgtable_page(void *vaddr);
>>   void iommu_flush_write_buffer(struct intel_iommu *iommu);
>>   struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain 
>> *parent,
>>                              const struct iommu_user_data *user_data);
>> +struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid);
>>   #ifdef CONFIG_INTEL_IOMMU_SVM
>>   void intel_svm_check(struct intel_iommu *iommu);
>> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
>> index 09009d96e553..d92c680bcc96 100644
>> --- a/drivers/iommu/intel/iommu.c
>> +++ b/drivers/iommu/intel/iommu.c
>> @@ -120,6 +120,35 @@ static int device_rid_cmp(struct rb_node *lhs, 
>> const struct rb_node *rhs)
>>       return device_rid_cmp_key(&key, rhs);
>>   }
>> +/*
>> + * Looks up an IOMMU-probed device using its source ID.
>> + *
>> + * If the device is found:
>> + *  - Increments its reference count.
>> + *  - Returns a pointer to the device.
>> + *  - The caller must call put_device() after using the pointer.
>> + *
>> + * If the device is not found, returns NULL.
>> + */
>> +struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
>> +{
>> +    struct device_domain_info *info;
>> +    struct device *dev = NULL;
>> +    struct rb_node *node;
>> +    unsigned long flags;
>> +
>> +    spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
> 
> Though per iommu device rbtree isn't a big tree, given already holds 
> spin_lock
> why still needs irq off ?

I want it to be usable not only in the normal execution flow, but also
in the interrupt context, such as for the DMA remapping fault
(unrecoverable) reporting path.

Best regards,
baolu
  
Ethan Zhao Feb. 19, 2024, 7:06 a.m. UTC | #5
On 2/19/2024 2:58 PM, Baolu Lu wrote:
> On 2024/2/19 14:54, Ethan Zhao wrote:
>> On 2/15/2024 3:22 PM, Lu Baolu wrote:
>>> The existing IO page fault handler currently locates the PCI device by
>>> calling pci_get_domain_bus_and_slot(). This function searches the list
>>> of all PCI devices until the desired device is found. To improve lookup
>>> efficiency, a helper function named device_rbtree_find() is introduced
>>> to search for the device within the rbtree. Replace
>>> pci_get_domain_bus_and_slot() in the IO page fault handling path.
>>>
>>> Co-developed-by: Huang Jiaqing <jiaqing.huang@intel.com>
>>> Signed-off-by: Huang Jiaqing <jiaqing.huang@intel.com>
>>> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
>>> ---
>>>   drivers/iommu/intel/iommu.h |  1 +
>>>   drivers/iommu/intel/iommu.c | 29 +++++++++++++++++++++++++++++
>>>   drivers/iommu/intel/svm.c   | 14 ++++++--------
>>>   3 files changed, 36 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
>>> index 54eeaa8e35a9..f13c228924f8 100644
>>> --- a/drivers/iommu/intel/iommu.h
>>> +++ b/drivers/iommu/intel/iommu.h
>>> @@ -1081,6 +1081,7 @@ void free_pgtable_page(void *vaddr);
>>>   void iommu_flush_write_buffer(struct intel_iommu *iommu);
>>>   struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain 
>>> *parent,
>>>                              const struct iommu_user_data *user_data);
>>> +struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid);
>>>   #ifdef CONFIG_INTEL_IOMMU_SVM
>>>   void intel_svm_check(struct intel_iommu *iommu);
>>> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
>>> index 09009d96e553..d92c680bcc96 100644
>>> --- a/drivers/iommu/intel/iommu.c
>>> +++ b/drivers/iommu/intel/iommu.c
>>> @@ -120,6 +120,35 @@ static int device_rid_cmp(struct rb_node *lhs, 
>>> const struct rb_node *rhs)
>>>       return device_rid_cmp_key(&key, rhs);
>>>   }
>>> +/*
>>> + * Looks up an IOMMU-probed device using its source ID.
>>> + *
>>> + * If the device is found:
>>> + *  - Increments its reference count.
>>> + *  - Returns a pointer to the device.
>>> + *  - The caller must call put_device() after using the pointer.
>>> + *
>>> + * If the device is not found, returns NULL.
>>> + */
>>> +struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
>>> +{
>>> +    struct device_domain_info *info;
>>> +    struct device *dev = NULL;
>>> +    struct rb_node *node;
>>> +    unsigned long flags;
>>> +
>>> +    spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
>>
>> Though per iommu device rbtree isn't a big tree, given already holds 
>> spin_lock
>> why still needs irq off ?
>
> I want it to be usable not only in the normal execution flow, but also
> in the interrupt context, such as for the DMA remapping fault
> (unrecoverable) reporting path.

Holding rbtree_lock only should work in interrrupt context, I missed something ?
But with irq_off, as side effect, telling all the CPUs not to handle interrrupt
that moment.

Thanks,
Ethan

>
> Best regards,
> baolu
  
Baolu Lu Feb. 19, 2024, 7:22 a.m. UTC | #6
On 2024/2/19 15:06, Ethan Zhao wrote:
> On 2/19/2024 2:58 PM, Baolu Lu wrote:
>> On 2024/2/19 14:54, Ethan Zhao wrote:
>>> On 2/15/2024 3:22 PM, Lu Baolu wrote:
>>>> The existing IO page fault handler currently locates the PCI device by
>>>> calling pci_get_domain_bus_and_slot(). This function searches the list
>>>> of all PCI devices until the desired device is found. To improve lookup
>>>> efficiency, a helper function named device_rbtree_find() is introduced
>>>> to search for the device within the rbtree. Replace
>>>> pci_get_domain_bus_and_slot() in the IO page fault handling path.
>>>>
>>>> Co-developed-by: Huang Jiaqing <jiaqing.huang@intel.com>
>>>> Signed-off-by: Huang Jiaqing <jiaqing.huang@intel.com>
>>>> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
>>>> ---
>>>>   drivers/iommu/intel/iommu.h |  1 +
>>>>   drivers/iommu/intel/iommu.c | 29 +++++++++++++++++++++++++++++
>>>>   drivers/iommu/intel/svm.c   | 14 ++++++--------
>>>>   3 files changed, 36 insertions(+), 8 deletions(-)
>>>>
>>>> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
>>>> index 54eeaa8e35a9..f13c228924f8 100644
>>>> --- a/drivers/iommu/intel/iommu.h
>>>> +++ b/drivers/iommu/intel/iommu.h
>>>> @@ -1081,6 +1081,7 @@ void free_pgtable_page(void *vaddr);
>>>>   void iommu_flush_write_buffer(struct intel_iommu *iommu);
>>>>   struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain 
>>>> *parent,
>>>>                              const struct iommu_user_data *user_data);
>>>> +struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid);
>>>>   #ifdef CONFIG_INTEL_IOMMU_SVM
>>>>   void intel_svm_check(struct intel_iommu *iommu);
>>>> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
>>>> index 09009d96e553..d92c680bcc96 100644
>>>> --- a/drivers/iommu/intel/iommu.c
>>>> +++ b/drivers/iommu/intel/iommu.c
>>>> @@ -120,6 +120,35 @@ static int device_rid_cmp(struct rb_node *lhs, 
>>>> const struct rb_node *rhs)
>>>>       return device_rid_cmp_key(&key, rhs);
>>>>   }
>>>> +/*
>>>> + * Looks up an IOMMU-probed device using its source ID.
>>>> + *
>>>> + * If the device is found:
>>>> + *  - Increments its reference count.
>>>> + *  - Returns a pointer to the device.
>>>> + *  - The caller must call put_device() after using the pointer.
>>>> + *
>>>> + * If the device is not found, returns NULL.
>>>> + */
>>>> +struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
>>>> +{
>>>> +    struct device_domain_info *info;
>>>> +    struct device *dev = NULL;
>>>> +    struct rb_node *node;
>>>> +    unsigned long flags;
>>>> +
>>>> +    spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
>>>
>>> Though per iommu device rbtree isn't a big tree, given already holds 
>>> spin_lock
>>> why still needs irq off ?
>>
>> I want it to be usable not only in the normal execution flow, but also
>> in the interrupt context, such as for the DMA remapping fault
>> (unrecoverable) reporting path.
> 
> Holding rbtree_lock only should work in interrrupt context, I missed 
> something ?

No. That will possibly cause dead lock.

Best regards,
baolu
  
Ethan Zhao Feb. 21, 2024, 7:04 a.m. UTC | #7
On 2/16/2024 1:55 AM, Jason Gunthorpe wrote:
> On Thu, Feb 15, 2024 at 03:22:49PM +0800, Lu Baolu wrote:
>> The existing IO page fault handler currently locates the PCI device by
>> calling pci_get_domain_bus_and_slot(). This function searches the list
>> of all PCI devices until the desired device is found. To improve lookup
>> efficiency, a helper function named device_rbtree_find() is introduced
>> to search for the device within the rbtree. Replace
>> pci_get_domain_bus_and_slot() in the IO page fault handling path.
>>
>> Co-developed-by: Huang Jiaqing <jiaqing.huang@intel.com>
>> Signed-off-by: Huang Jiaqing <jiaqing.huang@intel.com>
>> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
>> ---
>>   drivers/iommu/intel/iommu.h |  1 +
>>   drivers/iommu/intel/iommu.c | 29 +++++++++++++++++++++++++++++
>>   drivers/iommu/intel/svm.c   | 14 ++++++--------
>>   3 files changed, 36 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
>> index 54eeaa8e35a9..f13c228924f8 100644
>> --- a/drivers/iommu/intel/iommu.h
>> +++ b/drivers/iommu/intel/iommu.h
>> @@ -1081,6 +1081,7 @@ void free_pgtable_page(void *vaddr);
>>   void iommu_flush_write_buffer(struct intel_iommu *iommu);
>>   struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
>>   					       const struct iommu_user_data *user_data);
>> +struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid);
>>   
>>   #ifdef CONFIG_INTEL_IOMMU_SVM
>>   void intel_svm_check(struct intel_iommu *iommu);
>> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
>> index 09009d96e553..d92c680bcc96 100644
>> --- a/drivers/iommu/intel/iommu.c
>> +++ b/drivers/iommu/intel/iommu.c
>> @@ -120,6 +120,35 @@ static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
>>   	return device_rid_cmp_key(&key, rhs);
>>   }
>>   
>> +/*
>> + * Looks up an IOMMU-probed device using its source ID.
>> + *
>> + * If the device is found:
>> + *  - Increments its reference count.
>> + *  - Returns a pointer to the device.
>> + *  - The caller must call put_device() after using the pointer.
>> + *
>> + * If the device is not found, returns NULL.
>> + */
>> +struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
>> +{
>> +	struct device_domain_info *info;
>> +	struct device *dev = NULL;
>> +	struct rb_node *node;
>> +	unsigned long flags;
>> +
>> +	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
>> +	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
>> +	if (node) {
>> +		info = rb_entry(node, struct device_domain_info, node);
>> +		dev = info->dev;
>> +		get_device(dev);
> This get_device() is a bit troubling. It eventually calls into
> iommu_report_device_fault() which does:
>
> 	struct dev_iommu *param = dev->iommu;

So far no protection to dev->iommu structure access in generic
iommu layer between different threads, such as hot removal interrupt &
iopf handling thread, so we should enhance that in generic iommu code ?

Thanks,
Ethan

>
> Which is going to explode if the iomm driver release has already
> happened, which is a precondition to getting to a unref'd struct
> device.
>
> The driver needs to do something to fence these events during it's
> release function.
>
> If we are already doing that then I'd suggest to drop the get_device
> and add a big fat comment explaining the special rules about lifetime
> that are in effect here.
>
> Otherwise you need to do that barrier rethink the way the locking
> works..
>
> Aside from that this looks like a great improvement to me
>
> Thanks,
> Jason
>
  
Baolu Lu Feb. 21, 2024, 7:37 a.m. UTC | #8
On 2024/2/21 15:04, Ethan Zhao wrote:
> On 2/16/2024 1:55 AM, Jason Gunthorpe wrote:
>> On Thu, Feb 15, 2024 at 03:22:49PM +0800, Lu Baolu wrote:
>>> The existing IO page fault handler currently locates the PCI device by
>>> calling pci_get_domain_bus_and_slot(). This function searches the list
>>> of all PCI devices until the desired device is found. To improve lookup
>>> efficiency, a helper function named device_rbtree_find() is introduced
>>> to search for the device within the rbtree. Replace
>>> pci_get_domain_bus_and_slot() in the IO page fault handling path.
>>>
>>> Co-developed-by: Huang Jiaqing <jiaqing.huang@intel.com>
>>> Signed-off-by: Huang Jiaqing <jiaqing.huang@intel.com>
>>> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
>>> ---
>>>   drivers/iommu/intel/iommu.h |  1 +
>>>   drivers/iommu/intel/iommu.c | 29 +++++++++++++++++++++++++++++
>>>   drivers/iommu/intel/svm.c   | 14 ++++++--------
>>>   3 files changed, 36 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
>>> index 54eeaa8e35a9..f13c228924f8 100644
>>> --- a/drivers/iommu/intel/iommu.h
>>> +++ b/drivers/iommu/intel/iommu.h
>>> @@ -1081,6 +1081,7 @@ void free_pgtable_page(void *vaddr);
>>>   void iommu_flush_write_buffer(struct intel_iommu *iommu);
>>>   struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain 
>>> *parent,
>>>                              const struct iommu_user_data *user_data);
>>> +struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid);
>>>   #ifdef CONFIG_INTEL_IOMMU_SVM
>>>   void intel_svm_check(struct intel_iommu *iommu);
>>> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
>>> index 09009d96e553..d92c680bcc96 100644
>>> --- a/drivers/iommu/intel/iommu.c
>>> +++ b/drivers/iommu/intel/iommu.c
>>> @@ -120,6 +120,35 @@ static int device_rid_cmp(struct rb_node *lhs, 
>>> const struct rb_node *rhs)
>>>       return device_rid_cmp_key(&key, rhs);
>>>   }
>>> +/*
>>> + * Looks up an IOMMU-probed device using its source ID.
>>> + *
>>> + * If the device is found:
>>> + *  - Increments its reference count.
>>> + *  - Returns a pointer to the device.
>>> + *  - The caller must call put_device() after using the pointer.
>>> + *
>>> + * If the device is not found, returns NULL.
>>> + */
>>> +struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
>>> +{
>>> +    struct device_domain_info *info;
>>> +    struct device *dev = NULL;
>>> +    struct rb_node *node;
>>> +    unsigned long flags;
>>> +
>>> +    spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
>>> +    node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
>>> +    if (node) {
>>> +        info = rb_entry(node, struct device_domain_info, node);
>>> +        dev = info->dev;
>>> +        get_device(dev);
>> This get_device() is a bit troubling. It eventually calls into
>> iommu_report_device_fault() which does:
>>
>>     struct dev_iommu *param = dev->iommu;
> 
> So far no protection to dev->iommu structure access in generic
> iommu layer between different threads, such as hot removal interrupt &
> iopf handling thread, so we should enhance that in generic iommu code ?

The iommu core is sitting between the upper layers and individual iommu
drivers. All calls from the upper layers are made within the driver
context. The device driver framework and iommu subsystem guarantee that
dev->iommu is always valid in the driver context.

However, iommu_report_device_fault() is different. It's called in the
interrupt thread, not in any driver context. Hence, the individual iommu
driver should guarantee that dev->iommu is valid when calling
iommu_report_device_fault().

Best regards,
baolu
  
Jason Gunthorpe Feb. 21, 2024, 3:31 p.m. UTC | #9
On Sun, Feb 18, 2024 at 03:02:00PM +0800, Baolu Lu wrote:
> A device hot removing goes through at least the following steps:
> 
> - Disable PRI.
> - Drain all outstanding I/O page faults.
> - Stop DMA.
> - Unload the device driver.
> - Call iommu_release_device() upon the BUS_NOTIFY_REMOVED_DEVICE event.
> 
> This sequence ensures that a device cannot generate an I/O page fault
> after PRI has been disabled. So in reality it's impossible for a device
> to generate an I/O page fault before disabling PRI and then go through
> the long journey to reach iommu_release_device() before
> iopf_get_dev_fault_param() is called in page fault interrupt handling
> thread.

Why is this impossible? Seems like a classic race..

Flush the HW page fault queue as part of the above to ensure there is
no concurrent iopf_get_dev_fault_param() on the now PRI disabled BDF.

> Considering this behavior, adding a comment to the code explaining the
> sequence and removing put_device() may be a simpler solution?

A comment is definitely needed

Jason
  

Patch

diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index 54eeaa8e35a9..f13c228924f8 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -1081,6 +1081,7 @@  void free_pgtable_page(void *vaddr);
 void iommu_flush_write_buffer(struct intel_iommu *iommu);
 struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
 					       const struct iommu_user_data *user_data);
+struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid);
 
 #ifdef CONFIG_INTEL_IOMMU_SVM
 void intel_svm_check(struct intel_iommu *iommu);
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 09009d96e553..d92c680bcc96 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -120,6 +120,35 @@  static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
 	return device_rid_cmp_key(&key, rhs);
 }
 
+/*
+ * Looks up an IOMMU-probed device using its source ID.
+ *
+ * If the device is found:
+ *  - Increments its reference count.
+ *  - Returns a pointer to the device.
+ *  - The caller must call put_device() after using the pointer.
+ *
+ * If the device is not found, returns NULL.
+ */
+struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
+{
+	struct device_domain_info *info;
+	struct device *dev = NULL;
+	struct rb_node *node;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
+	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
+	if (node) {
+		info = rb_entry(node, struct device_domain_info, node);
+		dev = info->dev;
+		get_device(dev);
+	}
+	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
+
+	return dev;
+}
+
 static int device_rbtree_insert(struct intel_iommu *iommu,
 				struct device_domain_info *info)
 {
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index b644d57da841..717b7041973c 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -645,7 +645,7 @@  static irqreturn_t prq_event_thread(int irq, void *d)
 	struct intel_iommu *iommu = d;
 	struct page_req_dsc *req;
 	int head, tail, handled;
-	struct pci_dev *pdev;
+	struct device *dev;
 	u64 address;
 
 	/*
@@ -691,21 +691,19 @@  static irqreturn_t prq_event_thread(int irq, void *d)
 		if (unlikely(req->lpig && !req->rd_req && !req->wr_req))
 			goto prq_advance;
 
-		pdev = pci_get_domain_bus_and_slot(iommu->segment,
-						   PCI_BUS_NUM(req->rid),
-						   req->rid & 0xff);
 		/*
 		 * If prq is to be handled outside iommu driver via receiver of
 		 * the fault notifiers, we skip the page response here.
 		 */
-		if (!pdev)
+		dev = device_rbtree_find(iommu, req->rid);
+		if (!dev)
 			goto bad_req;
 
-		intel_svm_prq_report(iommu, &pdev->dev, req);
-		trace_prq_report(iommu, &pdev->dev, req->qw_0, req->qw_1,
+		intel_svm_prq_report(iommu, dev, req);
+		trace_prq_report(iommu, dev, req->qw_0, req->qw_1,
 				 req->priv_data[0], req->priv_data[1],
 				 iommu->prq_seq_number++);
-		pci_dev_put(pdev);
+		put_device(dev);
 prq_advance:
 		head = (head + sizeof(*req)) & PRQ_RING_MASK;
 	}