[1/4] iommu: Add dev_iommu->ops_rwsem

Message ID 20230213074941.919324-2-baolu.lu@linux.intel.com
State New
Headers
Series iommu: Extend changing default domain to normal group |

Commit Message

Baolu Lu Feb. 13, 2023, 7:49 a.m. UTC
  Add an RW semaphore to make sure that iommu_ops of a device is consistent
in any non-driver-oriented path, such as a store operation on the iommu
group sysfs node.

Add a pair of helpers to freeze and unfreeze the iommu ops of all devices
in an iommu group, and use them in iommu_group_store_type().

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
 include/linux/iommu.h |  3 +++
 drivers/iommu/iommu.c | 53 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 55 insertions(+), 1 deletion(-)
  

Comments

Jason Gunthorpe Feb. 13, 2023, 2:16 p.m. UTC | #1
On Mon, Feb 13, 2023 at 03:49:38PM +0800, Lu Baolu wrote:

> +static int iommu_group_freeze_dev_ops(struct iommu_group *group)
> +{
> +	struct group_device *device;
> +	struct device *dev;
> +
> +	mutex_lock(&group->mutex);
> +	list_for_each_entry(device, &group->devices, list) {
> +		dev = device->dev;
> +		down_read(&dev->iommu->ops_rwsem);

This isn't allowed, you can't obtain locks in a loop like this, it
will deadlock.

You don't need more locks, we already have the group mutex, the
release path should be fixed to use it properly as I was trying to do here:

https://lore.kernel.org/kvm/4-v1-ef00ffecea52+2cb-iommu_group_lifetime_jgg@nvidia.com/
https://lore.kernel.org/kvm/YyyTxx0HnA3maxEk@nvidia.com/

Then what you'd do in a path like this is:

  mutex_lock(&group->mutex);
  dev = iommu_group_first_device(group)
  if (!dev)
     /* Racing with group cleanup */
     return -EINVAL;
  
  /* Now dev->ops is valid and must remain valid so long as
     group->mutex is held */
   

The only reason this doesn't work already is because of the above
stuff about release not holding the group mutex when manipulating the
devices in the group. This is simply mis-locked.

Robin explained it was done like this because
arm_iommu_detach_device() re-enters the iommu core during release_dev,
so I suggest fixing that by simply not doing that (see above)

Below is the latest attempt I had tried, I didn't have time to get back
to it and we fixed the bug another way. It needs a bit of adjusting to
also remove the device from the group after release is called within
the same mutex critical region.

Jason

diff --git a/arch/arm/include/asm/dma-iommu.h b/arch/arm/include/asm/dma-iommu.h
index fe9ef6f79e9cfe..ea7198a1786186 100644
--- a/arch/arm/include/asm/dma-iommu.h
+++ b/arch/arm/include/asm/dma-iommu.h
@@ -31,6 +31,7 @@ void arm_iommu_release_mapping(struct dma_iommu_mapping *mapping);
 int arm_iommu_attach_device(struct device *dev,
 					struct dma_iommu_mapping *mapping);
 void arm_iommu_detach_device(struct device *dev);
+void arm_iommu_release_device(struct device *dev);
 
 #endif /* __KERNEL__ */
 #endif
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index c135f6e37a00ca..3d7b385e3981ef 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -1680,13 +1680,15 @@ int arm_iommu_attach_device(struct device *dev,
 EXPORT_SYMBOL_GPL(arm_iommu_attach_device);
 
 /**
- * arm_iommu_detach_device
+ * arm_iommu_release_device
  * @dev: valid struct device pointer
  *
- * Detaches the provided device from a previously attached map.
- * This overwrites the dma_ops pointer with appropriate non-IOMMU ops.
+ * This is like arm_iommu_detach_device() except it handles the special
+ * case of being called under an iommu driver's release operation. In this
+ * case the driver must have already detached the device from any attached
+ * domain before calling this function.
  */
-void arm_iommu_detach_device(struct device *dev)
+void arm_iommu_release_device(struct device *dev)
 {
 	struct dma_iommu_mapping *mapping;
 
@@ -1696,13 +1698,34 @@ void arm_iommu_detach_device(struct device *dev)
 		return;
 	}
 
-	iommu_detach_device(mapping->domain, dev);
 	kref_put(&mapping->kref, release_iommu_mapping);
 	to_dma_iommu_mapping(dev) = NULL;
 	set_dma_ops(dev, NULL);
 
 	pr_debug("Detached IOMMU controller from %s device.\n", dev_name(dev));
 }
+EXPORT_SYMBOL_GPL(arm_iommu_release_device);
+
+/**
+ * arm_iommu_detach_device
+ * @dev: valid struct device pointer
+ *
+ * Detaches the provided device from a previously attached map.
+ * This overwrites the dma_ops pointer with appropriate non-IOMMU ops.
+ */
+void arm_iommu_detach_device(struct device *dev)
+{
+	struct dma_iommu_mapping *mapping;
+
+	mapping = to_dma_iommu_mapping(dev);
+	if (!mapping) {
+		dev_warn(dev, "Not attached\n");
+		return;
+	}
+
+	iommu_detach_device(mapping->domain, dev);
+	arm_iommu_release_device(dev);
+}
 EXPORT_SYMBOL_GPL(arm_iommu_detach_device);
 
 static void arm_setup_iommu_dma_ops(struct device *dev, u64 dma_base, u64 size,
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index de91dd88705bd3..f3dbd980133567 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -101,6 +101,10 @@ static int iommu_create_device_direct_mappings(struct iommu_group *group,
 static struct iommu_group *iommu_group_get_for_dev(struct device *dev);
 static ssize_t iommu_group_store_type(struct iommu_group *group,
 				      const char *buf, size_t count);
+static struct group_device *
+__iommu_group_remove_device(struct iommu_group *group, struct device *dev);
+static void __iommu_group_remove_device_sysfs(struct iommu_group *group,
+					      struct group_device *device);
 
 #define IOMMU_GROUP_ATTR(_name, _mode, _show, _store)		\
 struct iommu_group_attribute iommu_group_attr_##_name =		\
@@ -430,18 +434,50 @@ int iommu_probe_device(struct device *dev)
 
 void iommu_release_device(struct device *dev)
 {
+	struct iommu_group *group = dev->iommu_group;
 	const struct iommu_ops *ops;
+	struct group_device *device;
 
 	if (!dev->iommu)
 		return;
 
 	iommu_device_unlink(dev->iommu->iommu_dev, dev);
 
+	mutex_lock(&group->mutex);
+	device = __iommu_group_remove_device(group, dev);
 	ops = dev_iommu_ops(dev);
+
+	/*
+	 * If the group has become empty then ownership must have been released,
+	 * and the current domain must be set back to NULL or the default
+	 * domain.
+	 */
+	if (list_empty(&group->devices))
+		WARN_ON(group->owner_cnt ||
+			group->domain != group->default_domain);
+
+	/*
+	 * release_device() must stop using any attached domain on the device.
+	 * If there are still other devices in the group they are not effected
+	 * by this callback.
+	 *
+	 * The IOMMU driver must set the device to either an identity or
+	 * blocking translation and stop using any domain pointer, as it is
+	 * going to be freed.
+	 */
 	if (ops->release_device)
 		ops->release_device(dev);
+	mutex_unlock(&group->mutex);
+
+	__iommu_group_remove_device_sysfs(group, device);
+
+	/*
+	 * This will eventually call iommu_group_release() which will free the
+	 * iommu_domains.
+	 */
+	dev->iommu_group = NULL;
+	kobject_put(group->devices_kobj);
 
-	iommu_group_remove_device(dev);
 	module_put(ops->owner);
 	dev_iommu_free(dev);
 }
@@ -1039,44 +1075,71 @@ int iommu_group_add_device(struct iommu_group *group, struct device *dev)
 }
 EXPORT_SYMBOL_GPL(iommu_group_add_device);
 
-/**
- * iommu_group_remove_device - remove a device from it's current group
- * @dev: device to be removed
- *
- * This function is called by an iommu driver to remove the device from
- * it's current group.  This decrements the iommu group reference count.
- */
-void iommu_group_remove_device(struct device *dev)
+static struct group_device *
+__iommu_group_remove_device(struct iommu_group *group, struct device *dev)
 {
-	struct iommu_group *group = dev->iommu_group;
-	struct group_device *tmp_device, *device = NULL;
+	struct group_device *device;
+
+	lockdep_assert_held(&group->mutex);
 
 	if (!group)
-		return;
+		return NULL;
 
 	dev_info(dev, "Removing from iommu group %d\n", group->id);
 
-	mutex_lock(&group->mutex);
-	list_for_each_entry(tmp_device, &group->devices, list) {
-		if (tmp_device->dev == dev) {
-			device = tmp_device;
+	list_for_each_entry(device, &group->devices, list) {
+		if (device->dev == dev) {
 			list_del(&device->list);
-			break;
+			return device;
 		}
 	}
-	mutex_unlock(&group->mutex);
+	return NULL;
+}
 
+static void __iommu_group_remove_device_sysfs(struct iommu_group *group,
+					      struct group_device *device)
+{
 	if (!device)
 		return;
 
 	sysfs_remove_link(group->devices_kobj, device->name);
-	sysfs_remove_link(&dev->kobj, "iommu_group");
+	sysfs_remove_link(&device->dev->kobj, "iommu_group");
 
-	trace_remove_device_from_group(group->id, dev);
+	trace_remove_device_from_group(group->id, device->dev);
 
 	kfree(device->name);
 	kfree(device);
-	dev->iommu_group = NULL;
+}
+
+/**
+ * iommu_group_remove_device - remove a device from its current group
+ * @dev: device to be removed
+ *
+ * This function is used by non-iommu drivers to create non-iommu subsystem
+ * groups (e.g. VFIO mdev, SPAPR). Using this from inside an iommu driver is an
+ * abuse. Instead the driver should return the correct group from
+ * ops->device_group()
+ */
+void iommu_group_remove_device(struct device *dev)
+{
+	struct iommu_group *group = dev->iommu_group;
+	struct group_device *device;
+
+	/*
+	 * Since we don't do ops->release_device() there is no way for the
+	 * driver to stop using any attached domain before we free it. If a
+	 * domain is attached while this function is called it will eventually
+	 * UAF.
+	 *
+	 * Thus it is only useful for cases like VFIO/SPAPR that don't use an
+	 * iommu driver, or for cases like FSL that don't use default domains.
+	 */
+	WARN_ON(group->domain);
+
+	mutex_lock(&group->mutex);
+	device = __iommu_group_remove_device(group, dev);
+	mutex_unlock(&group->mutex);
+	__iommu_group_remove_device_sysfs(group, device);
 	kobject_put(group->devices_kobj);
 }
 EXPORT_SYMBOL_GPL(iommu_group_remove_device);
diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
index a003bd5fc65c13..703a6083172de1 100644
--- a/drivers/iommu/ipmmu-vmsa.c
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -302,11 +302,8 @@ static void ipmmu_utlb_enable(struct ipmmu_vmsa_domain *domain,
 /*
  * Disable MMU translation for the microTLB.
  */
-static void ipmmu_utlb_disable(struct ipmmu_vmsa_domain *domain,
-			       unsigned int utlb)
+static void ipmmu_utlb_disable(struct ipmmu_vmsa_device *mmu, unsigned int utlb)
 {
-	struct ipmmu_vmsa_device *mmu = domain->mmu;
-
 	ipmmu_imuctr_write(mmu, utlb, 0);
 	mmu->utlb_ctx[utlb] = IPMMU_CTX_INVALID;
 }
@@ -647,11 +644,11 @@ static void ipmmu_detach_device(struct iommu_domain *io_domain,
 				struct device *dev)
 {
 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
-	struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
+	struct ipmmu_vmsa_device *mmu = to_ipmmu(dev);
 	unsigned int i;
 
 	for (i = 0; i < fwspec->num_ids; ++i)
-		ipmmu_utlb_disable(domain, fwspec->ids[i]);
+		ipmmu_utlb_disable(mmu, fwspec->ids[i]);
 
 	/*
 	 * TODO: Optimize by disabling the context when no device is attached.
@@ -847,7 +844,8 @@ static void ipmmu_probe_finalize(struct device *dev)
 
 static void ipmmu_release_device(struct device *dev)
 {
-	arm_iommu_detach_device(dev);
+	ipmmu_detach_device(NULL, dev);
+	arm_iommu_release_device(dev);
 }
 
 static struct iommu_group *ipmmu_find_group(struct device *dev)
  
Baolu Lu Feb. 15, 2023, 5:34 a.m. UTC | #2
On 2/13/23 10:16 PM, Jason Gunthorpe wrote:
> On Mon, Feb 13, 2023 at 03:49:38PM +0800, Lu Baolu wrote:
> 
>> +static int iommu_group_freeze_dev_ops(struct iommu_group *group)
>> +{
>> +	struct group_device *device;
>> +	struct device *dev;
>> +
>> +	mutex_lock(&group->mutex);
>> +	list_for_each_entry(device, &group->devices, list) {
>> +		dev = device->dev;
>> +		down_read(&dev->iommu->ops_rwsem);
> 
> This isn't allowed, you can't obtain locks in a loop like this, it
> will deadlock.
> 
> You don't need more locks, we already have the group mutex, the
> release path should be fixed to use it properly as I was trying to do here:
> 
> https://lore.kernel.org/kvm/4-v1-ef00ffecea52+2cb-iommu_group_lifetime_jgg@nvidia.com/
> https://lore.kernel.org/kvm/YyyTxx0HnA3maxEk@nvidia.com/
> 
> Then what you'd do in a path like this is:
> 
>    mutex_lock(&group->mutex);
>    dev = iommu_group_first_device(group)
>    if (!dev)
>       /* Racing with group cleanup */
>       return -EINVAL;
>    
>    /* Now dev->ops is valid and must remain valid so long as
>       group->mutex is held */
>     
> 
> The only reason this doesn't work already is because of the above
> stuff about release not holding the group mutex when manipulating the
> devices in the group. This is simply mis-locked.
> 
> Robin explained it was done like this because
> arm_iommu_detach_device() re-enters the iommu core during release_dev,
> so I suggest fixing that by simply not doing that (see above)
> 
> Below is the latest attempt I had tried, I didn't have time to get back
> to it and we fixed the bug another way. It needs a bit of adjusting to
> also remove the device from the group after release is called within
> the same mutex critical region.

Yes. If we can make remove device from list and device release in the
same mutex critical region, we don't need any other lock mechanism
anymore.

The ipmmu driver supports default domain. When code comes to release
device, the device driver has already been unbound. The default domain
should have been attached to the device. Then iommu_detach_device() does
nothing because what it really does is just attaching default domain.

How about removing iommu_detach_device() from ipmmu's release path like
below?

diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
index bdf1a4e5eae0..0bc29009703e 100644
--- a/drivers/iommu/ipmmu-vmsa.c
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -820,7 +820,16 @@ static void ipmmu_probe_finalize(struct device *dev)

  static void ipmmu_release_device(struct device *dev)
  {
-       arm_iommu_detach_device(dev);
+       struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(dev);
+
+       if (!mapping) {
+               dev_warn(dev, "Not attached\n");
+               return;
+       }
+
+       arm_iommu_release_mapping(mapping);
+       to_dma_iommu_mapping(dev) = NULL;
+       set_dma_ops(dev, NULL);
  }

After fixing this in ipmmu driver, we can safely put removing device and
release_device in a group->mutex critical region in two steps:

Step 1: Refactor iommu_group_remove_device() w/o functionality change:

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 05522eace439..17b2e358a6fd 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1065,6 +1065,46 @@ int iommu_group_add_device(struct iommu_group 
*group, struct device *dev)
  }
  EXPORT_SYMBOL_GPL(iommu_group_add_device);

+/*
+ * Remove a device from a group's device list and return the group device
+ * if successful.
+ */
+static struct group_device *
+__iommu_group_remove_device(struct iommu_group *group, struct device *dev)
+{
+	struct group_device *device;
+
+	lockdep_assert_held(&group->mutex);
+	list_for_each_entry(device, &group->devices, list) {
+		if (device->dev == dev) {
+			list_del(&device->list);
+			return device;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Release a device from its group and decrement the iommu group reference
+ * count.
+ */
+static void __iommu_group_release_device(struct iommu_group *group,
+					 struct group_device *grp_dev)
+{
+	struct device *dev = grp_dev->dev;
+
+	sysfs_remove_link(group->devices_kobj, grp_dev->name);
+	sysfs_remove_link(&dev->kobj, "iommu_group");
+
+	trace_remove_device_from_group(group->id, dev);
+
+	kfree(grp_dev->name);
+	kfree(grp_dev);
+	dev->iommu_group = NULL;
+	kobject_put(group->devices_kobj);
+}
+
  /**
   * iommu_group_remove_device - remove a device from it's current group
   * @dev: device to be removed
@@ -1075,7 +1115,7 @@ EXPORT_SYMBOL_GPL(iommu_group_add_device);
  void iommu_group_remove_device(struct device *dev)
  {
  	struct iommu_group *group = dev->iommu_group;
-	struct group_device *tmp_device, *device = NULL;
+	struct group_device *device;

  	if (!group)
  		return;
@@ -1083,27 +1123,11 @@ void iommu_group_remove_device(struct device *dev)
  	dev_info(dev, "Removing from iommu group %d\n", group->id);

  	mutex_lock(&group->mutex);
-	list_for_each_entry(tmp_device, &group->devices, list) {
-		if (tmp_device->dev == dev) {
-			device = tmp_device;
-			list_del(&device->list);
-			break;
-		}
-	}
+	device = __iommu_group_remove_device(group, dev);
  	mutex_unlock(&group->mutex);

-	if (!device)
-		return;
-
-	sysfs_remove_link(group->devices_kobj, device->name);
-	sysfs_remove_link(&dev->kobj, "iommu_group");
-
-	trace_remove_device_from_group(group->id, dev);
-
-	kfree(device->name);
-	kfree(device);
-	dev->iommu_group = NULL;
-	kobject_put(group->devices_kobj);
+	if (device)
+		__iommu_group_release_device(group, device);
  }
  EXPORT_SYMBOL_GPL(iommu_group_remove_device);


Step 2: Put removing group and release_device in a same critical region:

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 17b2e358a6fd..eeb2907472bc 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -101,6 +101,10 @@ static int 
iommu_create_device_direct_mappings(struct iommu_group *group,
  static struct iommu_group *iommu_group_get_for_dev(struct device *dev);
  static ssize_t iommu_group_store_type(struct iommu_group *group,
  				      const char *buf, size_t count);
+static struct group_device *
+__iommu_group_remove_device(struct iommu_group *group, struct device *dev);
+static void __iommu_group_release_device(struct iommu_group *group,
+					 struct group_device *grp_dev);

  #define IOMMU_GROUP_ATTR(_name, _mode, _show, _store)		\
  struct iommu_group_attribute iommu_group_attr_##_name =		\
@@ -466,18 +470,25 @@ int iommu_probe_device(struct device *dev)

  void iommu_release_device(struct device *dev)
  {
+	struct iommu_group *group = dev->iommu_group;
+	struct group_device *device;
  	const struct iommu_ops *ops;

-	if (!dev->iommu)
+	if (!dev->iommu || !group)
  		return;

  	iommu_device_unlink(dev->iommu->iommu_dev, dev);

+	mutex_lock(&group->mutex);
  	ops = dev_iommu_ops(dev);
  	if (ops->release_device)
  		ops->release_device(dev);
+	device = __iommu_group_remove_device(group, dev);
+	mutex_unlock(&group->mutex);
+
+	if (device)
+		__iommu_group_release_device(group, device);

-	iommu_group_remove_device(dev);
  	module_put(ops->owner);
  	dev_iommu_free(dev);
  }

Best regards,
baolu
  
Robin Murphy Feb. 15, 2023, 11:24 a.m. UTC | #3
On 2023-02-15 05:34, Baolu Lu wrote:
> On 2/13/23 10:16 PM, Jason Gunthorpe wrote:
>> On Mon, Feb 13, 2023 at 03:49:38PM +0800, Lu Baolu wrote:
>>
>>> +static int iommu_group_freeze_dev_ops(struct iommu_group *group)
>>> +{
>>> +    struct group_device *device;
>>> +    struct device *dev;
>>> +
>>> +    mutex_lock(&group->mutex);
>>> +    list_for_each_entry(device, &group->devices, list) {
>>> +        dev = device->dev;
>>> +        down_read(&dev->iommu->ops_rwsem);
>>
>> This isn't allowed, you can't obtain locks in a loop like this, it
>> will deadlock.
>>
>> You don't need more locks, we already have the group mutex, the
>> release path should be fixed to use it properly as I was trying to do 
>> here:
>>
>> https://lore.kernel.org/kvm/4-v1-ef00ffecea52+2cb-iommu_group_lifetime_jgg@nvidia.com/
>> https://lore.kernel.org/kvm/YyyTxx0HnA3maxEk@nvidia.com/
>>
>> Then what you'd do in a path like this is:
>>
>>    mutex_lock(&group->mutex);
>>    dev = iommu_group_first_device(group)
>>    if (!dev)
>>       /* Racing with group cleanup */
>>       return -EINVAL;
>>    /* Now dev->ops is valid and must remain valid so long as
>>       group->mutex is held */
>>
>> The only reason this doesn't work already is because of the above
>> stuff about release not holding the group mutex when manipulating the
>> devices in the group. This is simply mis-locked.
>>
>> Robin explained it was done like this because
>> arm_iommu_detach_device() re-enters the iommu core during release_dev,
>> so I suggest fixing that by simply not doing that (see above)
>>
>> Below is the latest attempt I had tried, I didn't have time to get back
>> to it and we fixed the bug another way. It needs a bit of adjusting to
>> also remove the device from the group after release is called within
>> the same mutex critical region.
> 
> Yes. If we can make remove device from list and device release in the
> same mutex critical region, we don't need any other lock mechanism
> anymore.
> 
> The ipmmu driver supports default domain.

It supports default domains *on arm64*. Nothing on 32-bit ARM uses 
default domains, they won't even exist since iommu-dma is not enabled, 
and either way the ARM DMA ops don't understand groups. I don't see an 
obvious satisfactory solution while the arm_iommu_* APIs still exist, 
but if we need bodges in the interim could we please try to concentrate 
them in ARM-specific code?

Thanks,
Robin.

> When code comes to release
> device, the device driver has already been unbound. The default domain
> should have been attached to the device. Then iommu_detach_device() does
> nothing because what it really does is just attaching default domain.
> 
> How about removing iommu_detach_device() from ipmmu's release path like
> below?
> 
> diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
> index bdf1a4e5eae0..0bc29009703e 100644
> --- a/drivers/iommu/ipmmu-vmsa.c
> +++ b/drivers/iommu/ipmmu-vmsa.c
> @@ -820,7 +820,16 @@ static void ipmmu_probe_finalize(struct device *dev)
> 
>   static void ipmmu_release_device(struct device *dev)
>   {
> -       arm_iommu_detach_device(dev);
> +       struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(dev);
> +
> +       if (!mapping) {
> +               dev_warn(dev, "Not attached\n");
> +               return;
> +       }
> +
> +       arm_iommu_release_mapping(mapping);
> +       to_dma_iommu_mapping(dev) = NULL;
> +       set_dma_ops(dev, NULL);
>   }
> 
> After fixing this in ipmmu driver, we can safely put removing device and
> release_device in a group->mutex critical region in two steps:
> 
> Step 1: Refactor iommu_group_remove_device() w/o functionality change:
> 
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 05522eace439..17b2e358a6fd 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -1065,6 +1065,46 @@ int iommu_group_add_device(struct iommu_group 
> *group, struct device *dev)
>   }
>   EXPORT_SYMBOL_GPL(iommu_group_add_device);
> 
> +/*
> + * Remove a device from a group's device list and return the group device
> + * if successful.
> + */
> +static struct group_device *
> +__iommu_group_remove_device(struct iommu_group *group, struct device *dev)
> +{
> +    struct group_device *device;
> +
> +    lockdep_assert_held(&group->mutex);
> +    list_for_each_entry(device, &group->devices, list) {
> +        if (device->dev == dev) {
> +            list_del(&device->list);
> +            return device;
> +        }
> +    }
> +
> +    return NULL;
> +}
> +
> +/*
> + * Release a device from its group and decrement the iommu group 
> reference
> + * count.
> + */
> +static void __iommu_group_release_device(struct iommu_group *group,
> +                     struct group_device *grp_dev)
> +{
> +    struct device *dev = grp_dev->dev;
> +
> +    sysfs_remove_link(group->devices_kobj, grp_dev->name);
> +    sysfs_remove_link(&dev->kobj, "iommu_group");
> +
> +    trace_remove_device_from_group(group->id, dev);
> +
> +    kfree(grp_dev->name);
> +    kfree(grp_dev);
> +    dev->iommu_group = NULL;
> +    kobject_put(group->devices_kobj);
> +}
> +
>   /**
>    * iommu_group_remove_device - remove a device from it's current group
>    * @dev: device to be removed
> @@ -1075,7 +1115,7 @@ EXPORT_SYMBOL_GPL(iommu_group_add_device);
>   void iommu_group_remove_device(struct device *dev)
>   {
>       struct iommu_group *group = dev->iommu_group;
> -    struct group_device *tmp_device, *device = NULL;
> +    struct group_device *device;
> 
>       if (!group)
>           return;
> @@ -1083,27 +1123,11 @@ void iommu_group_remove_device(struct device *dev)
>       dev_info(dev, "Removing from iommu group %d\n", group->id);
> 
>       mutex_lock(&group->mutex);
> -    list_for_each_entry(tmp_device, &group->devices, list) {
> -        if (tmp_device->dev == dev) {
> -            device = tmp_device;
> -            list_del(&device->list);
> -            break;
> -        }
> -    }
> +    device = __iommu_group_remove_device(group, dev);
>       mutex_unlock(&group->mutex);
> 
> -    if (!device)
> -        return;
> -
> -    sysfs_remove_link(group->devices_kobj, device->name);
> -    sysfs_remove_link(&dev->kobj, "iommu_group");
> -
> -    trace_remove_device_from_group(group->id, dev);
> -
> -    kfree(device->name);
> -    kfree(device);
> -    dev->iommu_group = NULL;
> -    kobject_put(group->devices_kobj);
> +    if (device)
> +        __iommu_group_release_device(group, device);
>   }
>   EXPORT_SYMBOL_GPL(iommu_group_remove_device);
> 
> 
> Step 2: Put removing group and release_device in a same critical region:
> 
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 17b2e358a6fd..eeb2907472bc 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -101,6 +101,10 @@ static int 
> iommu_create_device_direct_mappings(struct iommu_group *group,
>   static struct iommu_group *iommu_group_get_for_dev(struct device *dev);
>   static ssize_t iommu_group_store_type(struct iommu_group *group,
>                         const char *buf, size_t count);
> +static struct group_device *
> +__iommu_group_remove_device(struct iommu_group *group, struct device 
> *dev);
> +static void __iommu_group_release_device(struct iommu_group *group,
> +                     struct group_device *grp_dev);
> 
>   #define IOMMU_GROUP_ATTR(_name, _mode, _show, _store)        \
>   struct iommu_group_attribute iommu_group_attr_##_name =        \
> @@ -466,18 +470,25 @@ int iommu_probe_device(struct device *dev)
> 
>   void iommu_release_device(struct device *dev)
>   {
> +    struct iommu_group *group = dev->iommu_group;
> +    struct group_device *device;
>       const struct iommu_ops *ops;
> 
> -    if (!dev->iommu)
> +    if (!dev->iommu || !group)
>           return;
> 
>       iommu_device_unlink(dev->iommu->iommu_dev, dev);
> 
> +    mutex_lock(&group->mutex);
>       ops = dev_iommu_ops(dev);
>       if (ops->release_device)
>           ops->release_device(dev);
> +    device = __iommu_group_remove_device(group, dev);
> +    mutex_unlock(&group->mutex);
> +
> +    if (device)
> +        __iommu_group_release_device(group, device);
> 
> -    iommu_group_remove_device(dev);
>       module_put(ops->owner);
>       dev_iommu_free(dev);
>   }
> 
> Best regards,
> baolu
  
Baolu Lu Feb. 16, 2023, 12:40 a.m. UTC | #4
On 2/15/23 7:24 PM, Robin Murphy wrote:
> On 2023-02-15 05:34, Baolu Lu wrote:
>> On 2/13/23 10:16 PM, Jason Gunthorpe wrote:
>>> On Mon, Feb 13, 2023 at 03:49:38PM +0800, Lu Baolu wrote:
>>>
>>>> +static int iommu_group_freeze_dev_ops(struct iommu_group *group)
>>>> +{
>>>> +    struct group_device *device;
>>>> +    struct device *dev;
>>>> +
>>>> +    mutex_lock(&group->mutex);
>>>> +    list_for_each_entry(device, &group->devices, list) {
>>>> +        dev = device->dev;
>>>> +        down_read(&dev->iommu->ops_rwsem);
>>>
>>> This isn't allowed, you can't obtain locks in a loop like this, it
>>> will deadlock.
>>>
>>> You don't need more locks, we already have the group mutex, the
>>> release path should be fixed to use it properly as I was trying to do 
>>> here:
>>>
>>> https://lore.kernel.org/kvm/4-v1-ef00ffecea52+2cb-iommu_group_lifetime_jgg@nvidia.com/
>>> https://lore.kernel.org/kvm/YyyTxx0HnA3maxEk@nvidia.com/
>>>
>>> Then what you'd do in a path like this is:
>>>
>>>    mutex_lock(&group->mutex);
>>>    dev = iommu_group_first_device(group)
>>>    if (!dev)
>>>       /* Racing with group cleanup */
>>>       return -EINVAL;
>>>    /* Now dev->ops is valid and must remain valid so long as
>>>       group->mutex is held */
>>>
>>> The only reason this doesn't work already is because of the above
>>> stuff about release not holding the group mutex when manipulating the
>>> devices in the group. This is simply mis-locked.
>>>
>>> Robin explained it was done like this because
>>> arm_iommu_detach_device() re-enters the iommu core during release_dev,
>>> so I suggest fixing that by simply not doing that (see above)
>>>
>>> Below is the latest attempt I had tried, I didn't have time to get back
>>> to it and we fixed the bug another way. It needs a bit of adjusting to
>>> also remove the device from the group after release is called within
>>> the same mutex critical region.
>>
>> Yes. If we can make remove device from list and device release in the
>> same mutex critical region, we don't need any other lock mechanism
>> anymore.
>>
>> The ipmmu driver supports default domain.
> 
> It supports default domains *on arm64*. Nothing on 32-bit ARM uses 
> default domains, they won't even exist since iommu-dma is not enabled, 
> and either way the ARM DMA ops don't understand groups. I don't see an 
> obvious satisfactory solution while the arm_iommu_* APIs still exist, 
> but if we need bodges in the interim could we please try to concentrate 
> them in ARM-specific code?

Yes, sure. Thanks for the guide.

Best regards,
baolu
  

Patch

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 3589d1b8f922..a4204e1bfef3 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -402,6 +402,8 @@  struct iommu_fault_param {
  * @fwspec:	 IOMMU fwspec data
  * @iommu_dev:	 IOMMU device this device is linked to
  * @priv:	 IOMMU Driver private data
+ * @ops_rwsem:	 RW semaphore to synchronize between device release
+ *		 path and the sysfs interfaces.
  * @max_pasids:  number of PASIDs this device can consume
  * @attach_deferred: the dma domain attachment is deferred
  *
@@ -415,6 +417,7 @@  struct dev_iommu {
 	struct iommu_fwspec		*fwspec;
 	struct iommu_device		*iommu_dev;
 	void				*priv;
+	struct rw_semaphore		ops_rwsem;
 	u32				max_pasids;
 	u32				attach_deferred:1;
 };
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 5f1dc9aaba52..4f71dcd2621b 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -267,6 +267,7 @@  static struct dev_iommu *dev_iommu_get(struct device *dev)
 		return NULL;
 
 	mutex_init(&param->lock);
+	init_rwsem(&param->ops_rwsem);
 	dev->iommu = param;
 	return param;
 }
@@ -461,12 +462,19 @@  void iommu_release_device(struct device *dev)
 
 	iommu_device_unlink(dev->iommu->iommu_dev, dev);
 
+	/*
+	 * The device's iommu_ops will be released in .release_device
+	 * callback. Hold ops_rwsem to avoid use after release.
+	 */
+	down_write(&dev->iommu->ops_rwsem);
 	ops = dev_iommu_ops(dev);
 	if (ops->release_device)
 		ops->release_device(dev);
+	module_put(ops->owner);
+	dev->iommu->iommu_dev = NULL;
+	up_write(&dev->iommu->ops_rwsem);
 
 	iommu_group_remove_device(dev);
-	module_put(ops->owner);
 	dev_iommu_free(dev);
 }
 
@@ -2911,6 +2919,46 @@  static int iommu_change_dev_def_domain(struct iommu_group *group,
 	return ret;
 }
 
+static int iommu_group_freeze_dev_ops(struct iommu_group *group)
+{
+	struct group_device *device;
+	struct device *dev;
+
+	mutex_lock(&group->mutex);
+	list_for_each_entry(device, &group->devices, list) {
+		dev = device->dev;
+		down_read(&dev->iommu->ops_rwsem);
+		/* .release_device has been called. */
+		if (!dev->iommu->iommu_dev) {
+			up_read(&dev->iommu->ops_rwsem);
+			goto restore_out;
+		}
+	}
+	mutex_unlock(&group->mutex);
+
+	return 0;
+
+restore_out:
+	list_for_each_entry(device, &group->devices, list) {
+		if (device->dev == dev)
+			break;
+		up_read(&device->dev->iommu->ops_rwsem);
+	}
+	mutex_unlock(&group->mutex);
+
+	return -EINVAL;
+}
+
+static void iommu_group_unfreeze_dev_ops(struct iommu_group *group)
+{
+	struct group_device *device;
+
+	mutex_lock(&group->mutex);
+	list_for_each_entry(device, &group->devices, list)
+		up_read(&device->dev->iommu->ops_rwsem);
+	mutex_unlock(&group->mutex);
+}
+
 /*
  * Changing the default domain through sysfs requires the users to unbind the
  * drivers from the devices in the iommu group, except for a DMA -> DMA-FQ
@@ -2988,6 +3036,8 @@  static ssize_t iommu_group_store_type(struct iommu_group *group,
 	 */
 	mutex_unlock(&group->mutex);
 
+	iommu_group_freeze_dev_ops(group);
+
 	/* Check if the device in the group still has a driver bound to it */
 	device_lock(dev);
 	if (device_is_bound(dev) && !(req_type == IOMMU_DOMAIN_DMA_FQ &&
@@ -3002,6 +3052,7 @@  static ssize_t iommu_group_store_type(struct iommu_group *group,
 
 out:
 	device_unlock(dev);
+	iommu_group_unfreeze_dev_ops(group);
 	put_device(dev);
 
 	return ret;