[v2,2/2] iommu/vt-d: Fix NULL domain on device release

Message ID 20240229094613.121575-3-baolu.lu@linux.intel.com
State New
Series iommu: Fix domain check on release (part 1/2)

Commit Message

Baolu Lu Feb. 29, 2024, 9:46 a.m. UTC
In the kdump kernel, the IOMMU operates in deferred attach mode, so
info->domain may not yet be assigned by the time the release_device
callback is invoked. This leads to the following crash in the crash kernel:

    BUG: kernel NULL pointer dereference, address: 000000000000003c
    ...
    RIP: 0010:do_raw_spin_lock+0xa/0xa0
    ...
    _raw_spin_lock_irqsave+0x1b/0x30
    intel_iommu_release_device+0x96/0x170
    iommu_deinit_device+0x39/0xf0
    __iommu_group_remove_device+0xa0/0xd0
    iommu_bus_notifier+0x55/0xb0
    notifier_call_chain+0x5a/0xd0
    blocking_notifier_call_chain+0x41/0x60
    bus_notify+0x34/0x50
    device_del+0x269/0x3d0
    pci_remove_bus_device+0x77/0x100
    p2sb_bar+0xae/0x1d0
    ...
    i801_probe+0x423/0x740

Fix it by using the release_domain mechanism. The scalable-mode context
entry, which is not covered by release_domain, is still cleared in
release_device().
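
For reference, the release_domain mechanism added in part 1/2 of this
series lets the core attach a blocking translation before calling
release_device(), so the driver no longer needs info->domain there. A
minimal sketch of the expected core flow (illustrative only; the exact
code lives in drivers/iommu/iommu.c and may differ):

static void iommu_deinit_device(struct device *dev)
{
        const struct iommu_ops *ops = dev_iommu_ops(dev);

        /* Park the device in a safe, blocking translation first... */
        if (ops->release_domain)
                ops->release_domain->ops->attach_dev(ops->release_domain, dev);

        /* ...so release_device() never has to touch info->domain. */
        if (ops->release_device)
                ops->release_device(dev);
}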

Fixes: 586081d3f6b1 ("iommu/vt-d: Remove DEFER_DEVICE_DOMAIN_INFO")
Reported-by: Eric Badger <ebadger@purestorage.com>
Closes: https://lore.kernel.org/r/20240113181713.1817855-1-ebadger@purestorage.com
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
 drivers/iommu/intel/pasid.h |  1 +
 drivers/iommu/intel/iommu.c | 31 +++-----------
 drivers/iommu/intel/pasid.c | 83 +++++++++++++++++++++++++++++++++++++
 3 files changed, 90 insertions(+), 25 deletions(-)
  

Comments

Tian, Kevin March 4, 2024, 7:36 a.m. UTC | #1
> From: Lu Baolu <baolu.lu@linux.intel.com>
> Sent: Thursday, February 29, 2024 5:46 PM
> 
> +
> +/*
> + * Cache invalidation for changes to a scalable-mode context table
> + * entry.
> + *
> + * Section 6.5.3.3 of the VT-d spec:
> + * - Device-selective context-cache invalidation;
> + * - Domain-selective PASID-cache invalidation to affected domains
> + *   (can be skipped if all PASID entries were not-present);
> + * - Domain-selective IOTLB invalidation to affected domains;

the spec talks about domain-selective but the code actually does
global invalidation.

> + * - Global Device-TLB invalidation to affected functions.
> + *
> + * Note that RWBF (Required Write-Buffer Flushing) capability has
> + * been deprecated for scalable mode. Section 11.4.2 of the VT-d spec:
> + *
> + * HRWBF: Hardware implementations reporting Scalable Mode Translation
> + * Support (SMTS) as Set also report this field as Clear.

RWBF info is a bit weird given existing code doesn't touch it

> + */
> +static void sm_context_flush_caches(struct device *dev)
> +{
> +	struct device_domain_info *info = dev_iommu_priv_get(dev);
> +	struct intel_iommu *iommu = info->iommu;
> +
> +	iommu->flush.flush_context(iommu, 0, PCI_DEVID(info->bus, info->devfn),
> +				   DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL);
> +	qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
> +	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
> +	devtlb_invalidation_with_pasid(iommu, dev, IOMMU_NO_PASID);
> +}
> +
> +static void context_entry_teardown_pasid_table(struct intel_iommu *iommu,
> +					       struct context_entry *context)
> +{
> +	context_clear_entry(context);
> +	if (!ecap_coherent(iommu->ecap))
> +		clflush_cache_range(context, sizeof(*context));

this is __iommu_flush_cache(). You can use it throughout this and
the 2nd series.
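
For reference, that existing helper is roughly (a sketch; see
drivers/iommu/intel/iommu.c for the exact form):

static void __iommu_flush_cache(struct intel_iommu *iommu, void *addr, int size)
{
        /* Flush CPU cache lines only when the IOMMU is not coherent. */
        if (!ecap_coherent(iommu->ecap))
                clflush_cache_range(addr, size);
}

so the teardown above could collapse to:

        context_clear_entry(context);
        __iommu_flush_cache(iommu, context, sizeof(*context));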

> +
> +void intel_pasid_teardown_sm_context(struct device *dev)
> +{

it's clearer to call it just intel_teardown_sm_context. pasid_table
is one field in the context entry. Having pasid leading is slightly
confusing.
  
Baolu Lu March 4, 2024, 8:07 a.m. UTC | #2
On 2024/3/4 15:36, Tian, Kevin wrote:
>> From: Lu Baolu <baolu.lu@linux.intel.com>
>> Sent: Thursday, February 29, 2024 5:46 PM
>>
>> +
>> +/*
>> + * Cache invalidation for changes to a scalable-mode context table
>> + * entry.
>> + *
>> + * Section 6.5.3.3 of the VT-d spec:
>> + * - Device-selective context-cache invalidation;
>> + * - Domain-selective PASID-cache invalidation to affected domains
>> + *   (can be skipped if all PASID entries were not-present);
>> + * - Domain-selective IOTLB invalidation to affected domains;
> 
> the spec talks about domain-selective but the code actually does
> global invalidation.

I should have included the following comment:

/* Given that we have no idea about which domain IDs and PASIDs were
 * used in the pasid table, upgrade them to global PASID and IOTLB
 * cache invalidation. This doesn't impact performance significantly,
 * as clearing a context entry is not on a critical path.
 */

> 
>> + * - Global Device-TLB invalidation to affected functions.
>> + *
>> + * Note that RWBF (Required Write-Buffer Flushing) capability has
>> + * been deprecated for scalable mode. Section 11.4.2 of the VT-d spec:
>> + *
>> + * HRWBF: Hardware implementations reporting Scalable Mode Translation
>> + * Support (SMTS) as Set also report this field as Clear.
> 
> RWBF info is a bit weird given existing code doesn't touch it

Yes. I will remove the above note.

> 
>> + */
>> +static void sm_context_flush_caches(struct device *dev)
>> +{
>> +	struct device_domain_info *info = dev_iommu_priv_get(dev);
>> +	struct intel_iommu *iommu = info->iommu;
>> +
>> +	iommu->flush.flush_context(iommu, 0, PCI_DEVID(info->bus, info->devfn),
>> +				   DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL);
>> +	qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
>> +	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
>> +	devtlb_invalidation_with_pasid(iommu, dev, IOMMU_NO_PASID);
>> +}
>> +
>> +static void context_entry_teardown_pasid_table(struct intel_iommu *iommu,
>> +					       struct context_entry *context)
>> +{
>> +	context_clear_entry(context);
>> +	if (!ecap_coherent(iommu->ecap))
>> +		clflush_cache_range(context, sizeof(*context));
> 
> this is __iommu_flush_cache(). You can use it throughout this and
> the 2nd series.

Yes.

> 
>> +
>> +void intel_pasid_teardown_sm_context(struct device *dev)
>> +{
> 
> it's clearer to call it just intel_teardown_sm_context. pasid_table
> is one field in the context entry. Having pasid leading is slightly
> confusing.

I used the intel_pasid prefix because this helper function is located in
the pasid.c file.

Best regards,
baolu
  
Tian, Kevin March 4, 2024, 8:59 a.m. UTC | #3
> From: Baolu Lu <baolu.lu@linux.intel.com>
> Sent: Monday, March 4, 2024 4:07 PM
> 
> On 2024/3/4 15:36, Tian, Kevin wrote:
> >> From: Lu Baolu <baolu.lu@linux.intel.com>
> >> Sent: Thursday, February 29, 2024 5:46 PM
> >>
> >> +
> >> +/*
> >> + * Cache invalidation for changes to a scalable-mode context table
> >> + * entry.
> >> + *
> >> + * Section 6.5.3.3 of the VT-d spec:
> >> + * - Device-selective context-cache invalidation;
> >> + * - Domain-selective PASID-cache invalidation to affected domains
> >> + *   (can be skipped if all PASID entries were not-present);
> >> + * - Domain-selective IOTLB invalidation to affected domains;
> >
> > the spec talks about domain-selective but the code actually does
> > global invalidation.
> 
> I should have included the following comment:
> 
> /* Given that we have no idea about which domain IDs and PASIDs were
>  * used in the pasid table, upgrade them to global PASID and IOTLB
>  * cache invalidation. This doesn't impact performance significantly,
>  * as clearing a context entry is not on a critical path.
>  */
> 

but then it affects all other perf-critical paths which rely on the cache
for other devices...

It's preferable to restrict the overhead to this release path only, e.g. by
walking the PASID table to identify the affected DIDs and PASIDs, instead
of expanding the impact system-wide.
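
Something along these lines, as a rough illustration only (it reuses
pasid_pte_is_present(), pasid_get_domain_id() and intel_pasid_get_entry()
from pasid.c/pasid.h, but the flat loop, the flush choices and the lack
of locking are simplifications):

static void flush_pasid_table_caches(struct device *dev)
{
        struct device_domain_info *info = dev_iommu_priv_get(dev);
        struct intel_iommu *iommu = info->iommu;
        struct pasid_entry *pte;
        u32 pasid;
        u16 did;

        for (pasid = 0; pasid < intel_pasid_max_id; pasid++) {
                /*
                 * Note: intel_pasid_get_entry() can allocate table
                 * pages; a real walk should inspect the PASID
                 * directory read-only and skip absent pages.
                 */
                pte = intel_pasid_get_entry(dev, pasid);
                if (!pte || !pasid_pte_is_present(pte))
                        continue;

                did = pasid_get_domain_id(pte);
                /* Selective invalidation instead of global flushes. */
                qi_flush_pasid_cache(iommu, did, QI_PC_PASID_SEL, pasid);
                qi_flush_piotlb(iommu, did, pasid, 0, -1, 0);
        }
}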
  
Baolu Lu March 4, 2024, 9:39 a.m. UTC | #4
On 2024/3/4 16:59, Tian, Kevin wrote:
>> From: Baolu Lu <baolu.lu@linux.intel.com>
>> Sent: Monday, March 4, 2024 4:07 PM
>>
>> On 2024/3/4 15:36, Tian, Kevin wrote:
>>>> From: Lu Baolu <baolu.lu@linux.intel.com>
>>>> Sent: Thursday, February 29, 2024 5:46 PM
>>>>
>>>> +
>>>> +/*
>>>> + * Cache invalidation for changes to a scalable-mode context table
>>>> + * entry.
>>>> + *
>>>> + * Section 6.5.3.3 of the VT-d spec:
>>>> + * - Device-selective context-cache invalidation;
>>>> + * - Domain-selective PASID-cache invalidation to affected domains
>>>> + *   (can be skipped if all PASID entries were not-present);
>>>> + * - Domain-selective IOTLB invalidation to affected domains;
>>>
>>> the spec talks about domain-selective but the code actually does
>>> global invalidation.
>>
>> I should have included the following comment:
>>
>> /* Given that we have no idea about which domain IDs and PASIDs were
>>  * used in the pasid table, upgrade them to global PASID and IOTLB
>>  * cache invalidation. This doesn't impact performance significantly,
>>  * as clearing a context entry is not on a critical path.
>>  */
>>
> 
> but then it affects all other perf-critical paths which rely on the cache
> for other devices...

You are right. Good consideration.

> 
> It's preferable to restrict the overhead to this release path only, e.g. by
> walking the PASID table to identify the affected DIDs and PASIDs, instead
> of expanding the impact system-wide.

The sm_context_flush_caches() helper could be used in two different paths:
- the deferred attach case;
- the normal device release path.

For the former case, we have to use global cache invalidation; for the
latter, it's fine to skip these cache invalidations. The new helper
would probably look like the one below.

/*
 * Cache invalidation for changes to a scalable-mode context table
 * entry.
 *
 * Section 6.5.3.3 of the VT-d spec:
 * - Device-selective context-cache invalidation;
 * - Domain-selective PASID-cache invalidation to affected domains
 *   (can be skipped if all PASID entries were not-present);
 * - Domain-selective IOTLB invalidation to affected domains;
 * - Global Device-TLB invalidation to affected functions.
 *
 * For kdump cases, old valid entries may be cached due to in-flight
 * DMA and the copied page tables, but there is no unmapping behaviour
 * for them, so we need explicit cache flushes for all the domain IDs
 * and PASIDs used in the copied PASID table. Given that we have no
 * idea which domain IDs and PASIDs were used there, upgrade them to
 * global PASID and IOTLB cache invalidation.
 *
 * For the normal case, the device has been parked in the blocking
 * state, so all PASID entries are now non-present and the PASID and
 * IOTLB cache invalidation can be skipped.
 */
static void sm_context_flush_caches(struct device *dev)
{
        struct device_domain_info *info = dev_iommu_priv_get(dev);
        struct intel_iommu *iommu = info->iommu;

        iommu->flush.flush_context(iommu, 0, PCI_DEVID(info->bus, info->devfn),
                                   DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL);
        if (context_copied(iommu, info->bus, info->devfn)) {
                qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
                iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
        }
        devtlb_invalidation_with_pasid(iommu, dev, IOMMU_NO_PASID);
}

Does it look good to you?

Best regards,
baolu
  

Patch

diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h
index 487ede039bdd..42fda97fd851 100644
--- a/drivers/iommu/intel/pasid.h
+++ b/drivers/iommu/intel/pasid.h
@@ -318,4 +318,5 @@  void intel_pasid_tear_down_entry(struct intel_iommu *iommu,
 				 bool fault_ignore);
 void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu,
 					  struct device *dev, u32 pasid);
+void intel_pasid_teardown_sm_context(struct device *dev);
 #endif /* __INTEL_PASID_H */
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index cc3994efd362..f74d42d3258f 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -3869,30 +3869,6 @@  static void domain_context_clear(struct device_domain_info *info)
 			       &domain_context_clear_one_cb, info);
 }
 
-static void dmar_remove_one_dev_info(struct device *dev)
-{
-	struct device_domain_info *info = dev_iommu_priv_get(dev);
-	struct dmar_domain *domain = info->domain;
-	struct intel_iommu *iommu = info->iommu;
-	unsigned long flags;
-
-	if (!dev_is_real_dma_subdevice(info->dev)) {
-		if (dev_is_pci(info->dev) && sm_supported(iommu))
-			intel_pasid_tear_down_entry(iommu, info->dev,
-					IOMMU_NO_PASID, false);
-
-		iommu_disable_pci_caps(info);
-		domain_context_clear(info);
-	}
-
-	spin_lock_irqsave(&domain->lock, flags);
-	list_del(&info->link);
-	spin_unlock_irqrestore(&domain->lock, flags);
-
-	domain_detach_iommu(domain, iommu);
-	info->domain = NULL;
-}
-
 /*
  * Clear the page table pointer in context or pasid table entries so that
  * all DMA requests without PASID from the device are blocked. If the page
@@ -4431,7 +4407,11 @@  static void intel_iommu_release_device(struct device *dev)
 	mutex_lock(&iommu->iopf_lock);
 	device_rbtree_remove(info);
 	mutex_unlock(&iommu->iopf_lock);
-	dmar_remove_one_dev_info(dev);
+
+	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
+	    !context_copied(iommu, info->bus, info->devfn))
+		intel_pasid_teardown_sm_context(dev);
+
 	intel_pasid_free_table(dev);
 	intel_iommu_debugfs_remove_dev(info);
 	kfree(info);
@@ -4922,6 +4902,7 @@  static const struct iommu_dirty_ops intel_dirty_ops = {
 
 const struct iommu_ops intel_iommu_ops = {
 	.blocked_domain		= &blocking_domain,
+	.release_domain		= &blocking_domain,
 	.capable		= intel_iommu_capable,
 	.hw_info		= intel_iommu_hw_info,
 	.domain_alloc		= intel_iommu_domain_alloc,
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 108158e2b907..52068cf52fe2 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -667,3 +667,86 @@  int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
 
 	return 0;
 }
+
+/*
+ * Interfaces to set up or tear down the PASID table bound to the
+ * scalable-mode context table entry:
+ */
+
+/*
+ * Cache invalidation for changes to a scalable-mode context table
+ * entry.
+ *
+ * Section 6.5.3.3 of the VT-d spec:
+ * - Device-selective context-cache invalidation;
+ * - Domain-selective PASID-cache invalidation to affected domains
+ *   (can be skipped if all PASID entries were not-present);
+ * - Domain-selective IOTLB invalidation to affected domains;
+ * - Global Device-TLB invalidation to affected functions.
+ *
+ * Note that RWBF (Required Write-Buffer Flushing) capability has
+ * been deprecated for scalable mode. Section 11.4.2 of the VT-d spec:
+ *
+ * HRWBF: Hardware implementations reporting Scalable Mode Translation
+ * Support (SMTS) as Set also report this field as Clear.
+ */
+static void sm_context_flush_caches(struct device *dev)
+{
+	struct device_domain_info *info = dev_iommu_priv_get(dev);
+	struct intel_iommu *iommu = info->iommu;
+
+	iommu->flush.flush_context(iommu, 0, PCI_DEVID(info->bus, info->devfn),
+				   DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL);
+	qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
+	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
+	devtlb_invalidation_with_pasid(iommu, dev, IOMMU_NO_PASID);
+}
+
+static void context_entry_teardown_pasid_table(struct intel_iommu *iommu,
+					       struct context_entry *context)
+{
+	context_clear_entry(context);
+	if (!ecap_coherent(iommu->ecap))
+		clflush_cache_range(context, sizeof(*context));
+}
+
+static void device_pasid_table_teardown(struct device *dev, u8 bus, u8 devfn)
+{
+	struct device_domain_info *info = dev_iommu_priv_get(dev);
+	struct intel_iommu *iommu = info->iommu;
+	struct context_entry *context;
+
+	spin_lock(&iommu->lock);
+	context = iommu_context_addr(iommu, bus, devfn, false);
+	if (!context) {
+		spin_unlock(&iommu->lock);
+		return;
+	}
+
+	context_entry_teardown_pasid_table(iommu, context);
+	spin_unlock(&iommu->lock);
+
+	sm_context_flush_caches(dev);
+}
+
+static int pci_pasid_table_teardown(struct pci_dev *pdev, u16 alias, void *data)
+{
+	struct device *dev = data;
+
+	if (dev == &pdev->dev)
+		device_pasid_table_teardown(dev, PCI_BUS_NUM(alias), alias & 0xff);
+
+	return 0;
+}
+
+void intel_pasid_teardown_sm_context(struct device *dev)
+{
+	struct device_domain_info *info = dev_iommu_priv_get(dev);
+
+	if (!dev_is_pci(dev)) {
+		device_pasid_table_teardown(dev, info->bus, info->devfn);
+		return;
+	}
+
+	pci_for_each_dma_alias(to_pci_dev(dev), pci_pasid_table_teardown, dev);
+}