[v2,4/8] iommu: Factor out some helpers

Message ID 959a1e8d598c0a82f94123e017cafb273784f848.1674753627.git.robin.murphy@arm.com
State New
Series iommu: The early demise of bus ops

Commit Message

Robin Murphy Jan. 26, 2023, 6:26 p.m. UTC
  The pattern for picking the first device out of the group list is
repeated a few times now, so it's clearly worth factoring out to hide
the group_device detail from places that don't need to know. Similarly,
the safety check for dev_iommu_ops() at public interfaces starts looking
a bit repetitive, and might not be completely obvious at first glance,
so let's factor that out for clarity as well, in preparation for more
uses of both.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---

v2: - Add dev_iommu_ops_valid() helper
    - Add lockdep assertion [Jason]

 drivers/iommu/iommu.c | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)
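
For reference, the repeated open-coded pattern and the resulting helper
(both taken from the patch at the bottom of this page) look like this:

	/* Open-coded at several call sites today: */
	grp_dev = list_first_entry(&group->devices, struct group_device, list);
	dev = grp_dev->dev;

	/* Factored-out helper: */
	static struct device *iommu_group_first_dev(struct iommu_group *group)
	{
		lockdep_assert_held(&group->mutex);
		return list_first_entry(&group->devices, struct group_device, list)->dev;
	}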
  

Comments

Baolu Lu Jan. 28, 2023, 8:12 a.m. UTC | #1
On 2023/1/27 2:26, Robin Murphy wrote:
> The pattern for picking the first device out of the group list is
> repeated a few times now, so it's clearly worth factoring out to hide
> the group_device detail from places that don't need to know. Similarly,
> the safety check for dev_iommu_ops() at public interfaces starts looking
> a bit repetitive, and might not be completely obvious at first glance,
> so let's factor that out for clarity as well, in preparation for more
> uses of both.
> 
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>

Best regards,
baolu
  
Jason Gunthorpe Jan. 30, 2023, 4:38 p.m. UTC | #2
On Thu, Jan 26, 2023 at 06:26:19PM +0000, Robin Murphy wrote:
> The pattern for picking the first device out of the group list is
> repeated a few times now, so it's clearly worth factoring out to hide
> the group_device detail from places that don't need to know. Similarly,
> the safety check for dev_iommu_ops() at public interfaces starts looking
> a bit repetitive, and might not be completely obvious at first glance,
> so let's factor that out for clarity as well, in preparation for more
> uses of both.
> 
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
> 
> v2: - Add dev_iommu_ops_valid() helper
>     - Add lockdep assertion [Jason]
> 
>  drivers/iommu/iommu.c | 39 ++++++++++++++++++++++-----------------
>  1 file changed, 22 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 77f076030995..440bb3b7bded 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -284,6 +284,12 @@ static void dev_iommu_free(struct device *dev)
>  	kfree(param);
>  }
>  
> +/* Only needed in public APIs which allow unchecked devices for convenience */
> +static bool dev_iommu_ops_valid(struct device *dev)
> +{
> +	return dev->iommu && dev->iommu->iommu_dev;
> +}

I did an audit and more than half the callers need this test, and a
few places are missing it already.

We've kind of made a systematic error that assumes being in a group is
sufficient to prove there are non-NULL ops.

So I suggest that we make dev_iommu_ops() return NULL in all cases
where there is no driver and have a new API to get the ops in cases
where the call chain knows that it is safe, and there are only 5 of
those cases.

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index de91dd88705bd3..4b04ad50de8d6b 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -98,7 +98,8 @@ static int __iommu_group_set_domain(struct iommu_group *group,
 				    struct iommu_domain *new_domain);
 static int iommu_create_device_direct_mappings(struct iommu_group *group,
 					       struct device *dev);
-static struct iommu_group *iommu_group_get_for_dev(struct device *dev);
+static struct iommu_group *iommu_group_get_for_dev(struct device *dev,
+						   const struct iommu_ops *ops);
 static ssize_t iommu_group_store_type(struct iommu_group *group,
 				      const char *buf, size_t count);
 
@@ -130,6 +131,19 @@ static struct bus_type * const iommu_buses[] = {
 #endif
 };
 
+/*
+ * This may only be called if it is already clear from the calling context that
+ * the device has an ops. Seeing the device is part of an iommu_group is not
+ * enough as VFIO and POWER put devices in iommu_groups and do not attach
+ * drivers. Thus the only places that are safe have either already called
+ * dev_iommu_ops() on their call chain, or were responsible for assigning ops in
+ * the first place.
+ */
+static inline const struct iommu_ops *dev_iommu_ops_safe(struct device *dev)
+{
+	return dev->iommu->iommu_dev->ops;
+}
+
 /*
  * Use a function instead of an array here because the domain-type is a
  * bit-field, so an array would waste memory.
@@ -338,7 +352,7 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list
 	dev->iommu->iommu_dev = iommu_dev;
 	dev->iommu->max_pasids = dev_iommu_get_max_pasids(dev);
 
-	group = iommu_group_get_for_dev(dev);
+	group = iommu_group_get_for_dev(dev, ops);
 	if (IS_ERR(group)) {
 		ret = PTR_ERR(group);
 		goto out_release;
@@ -414,7 +428,8 @@ int iommu_probe_device(struct device *dev)
 	mutex_unlock(&group->mutex);
 	iommu_group_put(group);
 
-	ops = dev_iommu_ops(dev);
+	/* __iommu_probe_device() set the ops */
+	ops = dev_iommu_ops_safe(dev);
 	if (ops->probe_finalize)
 		ops->probe_finalize(dev);
 
@@ -430,14 +445,13 @@ int iommu_probe_device(struct device *dev)
 
 void iommu_release_device(struct device *dev)
 {
-	const struct iommu_ops *ops;
+	const struct iommu_ops *ops = dev_iommu_ops(dev);
 
-	if (!dev->iommu)
+	if (!ops)
 		return;
 
 	iommu_device_unlink(dev->iommu->iommu_dev, dev);
 
-	ops = dev_iommu_ops(dev);
 	if (ops->release_device)
 		ops->release_device(dev);
 
@@ -614,13 +628,6 @@ int iommu_get_group_resv_regions(struct iommu_group *group,
 	list_for_each_entry(device, &group->devices, list) {
 		struct list_head dev_resv_regions;
 
-		/*
-		 * Non-API groups still expose reserved_regions in sysfs,
-		 * so filter out calls that get here that way.
-		 */
-		if (!device->dev->iommu)
-			break;
-
 		INIT_LIST_HEAD(&dev_resv_regions);
 		iommu_get_resv_regions(device->dev, &dev_resv_regions);
 		ret = iommu_insert_device_resv_regions(&dev_resv_regions, head);
@@ -1332,7 +1339,8 @@ int iommu_page_response(struct device *dev,
 	struct iommu_fault_event *evt;
 	struct iommu_fault_page_request *prm;
 	struct dev_iommu *param = dev->iommu;
-	const struct iommu_ops *ops = dev_iommu_ops(dev);
+	/* This API should only be called from an IOMMU driver */
+	const struct iommu_ops *ops = dev_iommu_ops_safe(dev);
 	bool has_pasid = msg->flags & IOMMU_PAGE_RESP_PASID_VALID;
 
 	if (!ops->page_response)
@@ -1601,7 +1609,8 @@ EXPORT_SYMBOL_GPL(fsl_mc_device_group);
 
 static int iommu_get_def_domain_type(struct device *dev)
 {
-	const struct iommu_ops *ops = dev_iommu_ops(dev);
+	/* It is not locked but all callers know there is an ops */
+	const struct iommu_ops *ops = dev_iommu_ops_safe(dev);
 
 	if (dev_is_pci(dev) && to_pci_dev(dev)->untrusted)
 		return IOMMU_DOMAIN_DMA;
@@ -1658,9 +1667,9 @@ static int iommu_alloc_default_domain(struct iommu_group *group,
  * to the returned IOMMU group, which will already include the provided
  * device.  The reference should be released with iommu_group_put().
  */
-static struct iommu_group *iommu_group_get_for_dev(struct device *dev)
+static struct iommu_group *iommu_group_get_for_dev(struct device *dev,
+						   const struct iommu_ops *ops)
 {
-	const struct iommu_ops *ops = dev_iommu_ops(dev);
 	struct iommu_group *group;
 	int ret;
 
@@ -1795,7 +1804,8 @@ static int __iommu_group_dma_attach(struct iommu_group *group)
 
 static int iommu_group_do_probe_finalize(struct device *dev, void *data)
 {
-	const struct iommu_ops *ops = dev_iommu_ops(dev);
+	/* It is unlocked but all callers know there is an ops */
+	const struct iommu_ops *ops = dev_iommu_ops_safe(dev);
 
 	if (ops->probe_finalize)
 		ops->probe_finalize(dev);
@@ -1884,13 +1894,9 @@ EXPORT_SYMBOL_GPL(iommu_present);
  */
 bool device_iommu_capable(struct device *dev, enum iommu_cap cap)
 {
-	const struct iommu_ops *ops;
-
-	if (!dev->iommu || !dev->iommu->iommu_dev)
-		return false;
+	const struct iommu_ops *ops = dev_iommu_ops(dev);
 
-	ops = dev_iommu_ops(dev);
-	if (!ops->capable)
+	if (!ops || !ops->capable)
 		return false;
 
 	return ops->capable(dev, cap);
@@ -2620,7 +2626,11 @@ void iommu_get_resv_regions(struct device *dev, struct list_head *list)
 {
 	const struct iommu_ops *ops = dev_iommu_ops(dev);
 
-	if (ops->get_resv_regions)
+	/*
+	 * Non-API groups still expose reserved_regions in sysfs, so filter out
+	 * calls that get here that way.
+	 */
+	if (ops && ops->get_resv_regions)
 		ops->get_resv_regions(dev, list);
 }
 
@@ -2979,6 +2989,11 @@ static ssize_t iommu_group_store_type(struct iommu_group *group,
 	/* Since group has only one device */
 	grp_dev = list_first_entry(&group->devices, struct group_device, list);
 	dev = grp_dev->dev;
+	if (!dev_iommu_ops(dev)) {
+		/* The group doesn't have an iommu driver attached */
+		mutex_unlock(&group->mutex);
+		return -EINVAL;
+	}
 	get_device(dev);
 
 	/*
@@ -3301,7 +3316,7 @@ static void __iommu_remove_group_pasid(struct iommu_group *group,
 	const struct iommu_ops *ops;
 
 	list_for_each_entry(device, &group->devices, list) {
-		ops = dev_iommu_ops(device->dev);
+		ops = dev_iommu_ops_safe(device->dev);
 		ops->remove_dev_pasid(device->dev, pasid);
 	}
 }
@@ -3413,6 +3428,9 @@ struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
 	const struct iommu_ops *ops = dev_iommu_ops(dev);
 	struct iommu_domain *domain;
 
+	if (!ops)
+		return NULL;
+
 	domain = ops->domain_alloc(IOMMU_DOMAIN_SVA);
 	if (!domain)
 		return NULL;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 46e1347bfa2286..60e84f8c7c46e9 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -441,14 +441,11 @@ static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather)
 	};
 }
 
+/* May return NULL if the device has no iommu driver */
 static inline const struct iommu_ops *dev_iommu_ops(struct device *dev)
 {
-	/*
-	 * Assume that valid ops must be installed if iommu_probe_device()
-	 * has succeeded. The device ops are essentially for internal use
-	 * within the IOMMU subsystem itself, so we should be able to trust
-	 * ourselves not to misuse the helper.
-	 */
+	if (!dev->iommu)
+		return NULL;
 	return dev->iommu->iommu_dev->ops;
 }
  
Robin Murphy Jan. 30, 2023, 6:05 p.m. UTC | #3
On 2023-01-30 16:38, Jason Gunthorpe wrote:
> On Thu, Jan 26, 2023 at 06:26:19PM +0000, Robin Murphy wrote:
>> The pattern for picking the first device out of the group list is
>> repeated a few times now, so it's clearly worth factoring out to hide
>> the group_device detail from places that don't need to know. Similarly,
>> the safety check for dev_iommu_ops() at public interfaces starts looking
>> a bit repetitive, and might not be completely obvious at first glance,
>> so let's factor that out for clarity as well, in preparation for more
>> uses of both.
>>
>> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
>> ---
>>
>> v2: - Add dev_iommu_ops_valid() helper
>>      - Add lockdep assertion [Jason]
>>
>>   drivers/iommu/iommu.c | 39 ++++++++++++++++++++++-----------------
>>   1 file changed, 22 insertions(+), 17 deletions(-)
>>
>> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
>> index 77f076030995..440bb3b7bded 100644
>> --- a/drivers/iommu/iommu.c
>> +++ b/drivers/iommu/iommu.c
>> @@ -284,6 +284,12 @@ static void dev_iommu_free(struct device *dev)
>>   	kfree(param);
>>   }
>>   
>> +/* Only needed in public APIs which allow unchecked devices for convenience */
>> +static bool dev_iommu_ops_valid(struct device *dev)
>> +{
>> +	return dev->iommu && dev->iommu->iommu_dev;
>> +}
> 
> I did an audit and more than half the callers need this test, and a
> few places are missing it already.
> 
> We've kind of made a systematic error that assumes being in a group is
> sufficient to prove there are non-NULL ops.
> 
> So I suggest that we make dev_iommu_ops() return NULL in all cases
> where there is no driver and have a new API to get the ops in cases
> where the call chain knows that it is safe, and there are only 5 of
> those cases.
> 
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index de91dd88705bd3..4b04ad50de8d6b 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -98,7 +98,8 @@ static int __iommu_group_set_domain(struct iommu_group *group,
>   				    struct iommu_domain *new_domain);
>   static int iommu_create_device_direct_mappings(struct iommu_group *group,
>   					       struct device *dev);
> -static struct iommu_group *iommu_group_get_for_dev(struct device *dev);
> +static struct iommu_group *iommu_group_get_for_dev(struct device *dev,
> +						   const struct iommu_ops *ops);
>   static ssize_t iommu_group_store_type(struct iommu_group *group,
>   				      const char *buf, size_t count);
>   
> @@ -130,6 +131,19 @@ static struct bus_type * const iommu_buses[] = {
>   #endif
>   };
>   
> +/*
> + * This may only be called if it is already clear from the calling context that
> + * the device has an ops. Seeing the device is part of an iommu_group is not
> + * enough as VFIO and POWER put devices in iommu_groups and do not attach
> + * drivers. Thus the only places that are safe have either already called
> + * dev_iommu_ops() on their call chain, or were responsible for assigning ops in
> + * the first place.
> + */
> +static inline const struct iommu_ops *dev_iommu_ops_safe(struct device *dev)
> +{
> +	return dev->iommu->iommu_dev->ops;
> +}
> +
>   /*
>    * Use a function instead of an array here because the domain-type is a
>    * bit-field, so an array would waste memory.
> @@ -338,7 +352,7 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list
>   	dev->iommu->iommu_dev = iommu_dev;
>   	dev->iommu->max_pasids = dev_iommu_get_max_pasids(dev);
>   
> -	group = iommu_group_get_for_dev(dev);
> +	group = iommu_group_get_for_dev(dev, ops);

Why? We've literally just assigned dev->iommu->iommu_dev 2 lines above; 
it is not allowed to have invalid ops. Plus in future they may 
potentially be a different set of device ops from the ones we initially 
found to provide ->probe_device - in that case we definitely want 
group_get_for_dev to use the proper instance ops. This is the only place 
it is (and should be) called, so I don't see any problem.

>   	if (IS_ERR(group)) {
>   		ret = PTR_ERR(group);
>   		goto out_release;
> @@ -414,7 +428,8 @@ int iommu_probe_device(struct device *dev)
>   	mutex_unlock(&group->mutex);
>   	iommu_group_put(group);
>   
> -	ops = dev_iommu_ops(dev);
> +	/* __iommu_probe_device() set the ops */
> +	ops = dev_iommu_ops_safe(dev);
>   	if (ops->probe_finalize)
>   		ops->probe_finalize(dev);
>   
> @@ -430,14 +445,13 @@ int iommu_probe_device(struct device *dev)
>   
>   void iommu_release_device(struct device *dev)
>   {
> -	const struct iommu_ops *ops;
> +	const struct iommu_ops *ops = dev_iommu_ops(dev);

This is just moving an existing check around.

> -	if (!dev->iommu)
> +	if (!ops)
>   		return;
>   
>   	iommu_device_unlink(dev->iommu->iommu_dev, dev);
>   
> -	ops = dev_iommu_ops(dev);
>   	if (ops->release_device)
>   		ops->release_device(dev);
>   
> @@ -614,13 +628,6 @@ int iommu_get_group_resv_regions(struct iommu_group *group,
>   	list_for_each_entry(device, &group->devices, list) {
>   		struct list_head dev_resv_regions;
>   
> -		/*
> -		 * Non-API groups still expose reserved_regions in sysfs,
> -		 * so filter out calls that get here that way.
> -		 */
> -		if (!device->dev->iommu)
> -			break;
> -
>   		INIT_LIST_HEAD(&dev_resv_regions);
>   		iommu_get_resv_regions(device->dev, &dev_resv_regions);
>   		ret = iommu_insert_device_resv_regions(&dev_resv_regions, head);
> @@ -1332,7 +1339,8 @@ int iommu_page_response(struct device *dev,
>   	struct iommu_fault_event *evt;
>   	struct iommu_fault_page_request *prm;
>   	struct dev_iommu *param = dev->iommu;
> -	const struct iommu_ops *ops = dev_iommu_ops(dev);
> +	/* This API should only be called from an IOMMU driver */
> +	const struct iommu_ops *ops = dev_iommu_ops_safe(dev);
>   	bool has_pasid = msg->flags & IOMMU_PAGE_RESP_PASID_VALID;
>   
>   	if (!ops->page_response)
> @@ -1601,7 +1609,8 @@ EXPORT_SYMBOL_GPL(fsl_mc_device_group);
>   
>   static int iommu_get_def_domain_type(struct device *dev)
>   {
> -	const struct iommu_ops *ops = dev_iommu_ops(dev);
> +	/* It is not locked but all callers know there is an ops */
> +	const struct iommu_ops *ops = dev_iommu_ops_safe(dev);
>   
>   	if (dev_is_pci(dev) && to_pci_dev(dev)->untrusted)
>   		return IOMMU_DOMAIN_DMA;
> @@ -1658,9 +1667,9 @@ static int iommu_alloc_default_domain(struct iommu_group *group,
>    * to the returned IOMMU group, which will already include the provided
>    * device.  The reference should be released with iommu_group_put().
>    */
> -static struct iommu_group *iommu_group_get_for_dev(struct device *dev)
> +static struct iommu_group *iommu_group_get_for_dev(struct device *dev,
> +						   const struct iommu_ops *ops)
>   {
> -	const struct iommu_ops *ops = dev_iommu_ops(dev);
>   	struct iommu_group *group;
>   	int ret;
>   
> @@ -1795,7 +1804,8 @@ static int __iommu_group_dma_attach(struct iommu_group *group)
>   
>   static int iommu_group_do_probe_finalize(struct device *dev, void *data)
>   {
> -	const struct iommu_ops *ops = dev_iommu_ops(dev);
> +	/* It is unlocked but all callers know there is an ops */
> +	const struct iommu_ops *ops = dev_iommu_ops_safe(dev);
>   
>   	if (ops->probe_finalize)
>   		ops->probe_finalize(dev);
> @@ -1884,13 +1894,9 @@ EXPORT_SYMBOL_GPL(iommu_present);
>    */
>   bool device_iommu_capable(struct device *dev, enum iommu_cap cap)
>   {
> -	const struct iommu_ops *ops;
> -
> -	if (!dev->iommu || !dev->iommu->iommu_dev)
> -		return false;
> +	const struct iommu_ops *ops = dev_iommu_ops(dev);
>   
> -	ops = dev_iommu_ops(dev);
> -	if (!ops->capable)
> +	if (!ops || !ops->capable)
>   		return false;
>   
>   	return ops->capable(dev, cap);
> @@ -2620,7 +2626,11 @@ void iommu_get_resv_regions(struct device *dev, struct list_head *list)
>   {
>   	const struct iommu_ops *ops = dev_iommu_ops(dev);
>   
> -	if (ops->get_resv_regions)
> +	/*
> +	 * Non-API groups still expose reserved_regions in sysfs, so filter out
> +	 * calls that get here that way.
> +	 */
> +	if (ops && ops->get_resv_regions)

This is just moving the existing check from the public interface to 
pointlessly impose it on the internal call path as well.

>   		ops->get_resv_regions(dev, list);
>   }
>   
> @@ -2979,6 +2989,11 @@ static ssize_t iommu_group_store_type(struct iommu_group *group,
>   	/* Since group has only one device */
>   	grp_dev = list_first_entry(&group->devices, struct group_device, list);
>   	dev = grp_dev->dev;
> +	if (!dev_iommu_ops(dev)) {
> +		/* The group doesn't have an iommu driver attached */
> +		mutex_unlock(&group->mutex);
> +		return -EINVAL;
> +	}

Without any ops, group->default_domain will be NULL, so we could never 
even get here.

>   	get_device(dev);
>   
>   	/*
> @@ -3301,7 +3316,7 @@ static void __iommu_remove_group_pasid(struct iommu_group *group,
>   	const struct iommu_ops *ops;
>   
>   	list_for_each_entry(device, &group->devices, list) {
> -		ops = dev_iommu_ops(device->dev);
> +		ops = dev_iommu_ops_safe(device->dev);
>   		ops->remove_dev_pasid(device->dev, pasid);
>   	}
>   }
> @@ -3413,6 +3428,9 @@ struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
>   	const struct iommu_ops *ops = dev_iommu_ops(dev);
>   	struct iommu_domain *domain;
>   
> +	if (!ops)
> +		return NULL;

Anyone who incorrectly calls sva_domain_alloc() directly without having 
successfully called iommu_dev_enable_feature() first deserves to crash.

(Incidentally, you've missed enable/disable_feature here.)

> +
>   	domain = ops->domain_alloc(IOMMU_DOMAIN_SVA);
>   	if (!domain)
>   		return NULL;
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 46e1347bfa2286..60e84f8c7c46e9 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -441,14 +441,11 @@ static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather)
>   	};
>   }
>   
> +/* May return NULL if the device has no iommu driver */
>   static inline const struct iommu_ops *dev_iommu_ops(struct device *dev)
>   {
> -	/*
> -	 * Assume that valid ops must be installed if iommu_probe_device()
> -	 * has succeeded. The device ops are essentially for internal use
> -	 * within the IOMMU subsystem itself, so we should be able to trust
> -	 * ourselves not to misuse the helper.
> -	 */
> +	if (!dev->iommu)
> +		return NULL;
>   	return dev->iommu->iommu_dev->ops;

This is actively broken, since dev->iommu may be non-NULL before 
dev->iommu->iommu_dev is set (a fwspec will always be set before the 
instance is registered, and a device may potentially hang around in 
that state forever if the relevant IOMMU instance never appears).

Sorry, I don't think any of this makes sense :/

Thanks,
Robin.
  
Jason Gunthorpe Jan. 30, 2023, 6:20 p.m. UTC | #4
On Mon, Jan 30, 2023 at 06:05:12PM +0000, Robin Murphy wrote:
> >    * Use a function instead of an array here because the domain-type is a
> >    * bit-field, so an array would waste memory.
> > @@ -338,7 +352,7 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list
> >   	dev->iommu->iommu_dev = iommu_dev;
> >   	dev->iommu->max_pasids = dev_iommu_get_max_pasids(dev);
> > -	group = iommu_group_get_for_dev(dev);
> > +	group = iommu_group_get_for_dev(dev, ops);
> 
> Why? We've literally just assigned dev->iommu->iommu_dev 2 lines above; it
> is not allowed to have invalid ops. Plus in future they may potentially be a
> different set of device ops from the ones we initially found to provide
> ->probe_device - in that case we definitely want group_get_for_dev to use
> the proper instance ops. This is the only place it is (and should be)
> called, so I don't see any problem.

Sure, but IMHO it was clearer to pass the ops down than to obtain it
again. But maybe this could be tidied in a different way.

> >   	if (IS_ERR(group)) {
> >   		ret = PTR_ERR(group);
> >   		goto out_release;
> > @@ -414,7 +428,8 @@ int iommu_probe_device(struct device *dev)
> >   	mutex_unlock(&group->mutex);
> >   	iommu_group_put(group);
> > -	ops = dev_iommu_ops(dev);
> > +	/* __iommu_probe_device() set the ops */
> > +	ops = dev_iommu_ops_safe(dev);
> >   	if (ops->probe_finalize)
> >   		ops->probe_finalize(dev);
> > @@ -430,14 +445,13 @@ int iommu_probe_device(struct device *dev)
> >   void iommu_release_device(struct device *dev)
> >   {
> > -	const struct iommu_ops *ops;
> > +	const struct iommu_ops *ops = dev_iommu_ops(dev);
> 
> This is just moving an existing check around.

The point is to have one check. If you need to get the ops and you
don't know the state of dev then you call dev_iommu_ops() and check
for NULL. Simple and consistent.

> > -     if (!dev->iommu)
> > +     if (!ops)
> >               return;

As you pointed out below, this isn't even coded right, since dev->iommu
can be non-NULL.

> > @@ -2620,7 +2626,11 @@ void iommu_get_resv_regions(struct device *dev, struct list_head *list)
> >   {
> >   	const struct iommu_ops *ops = dev_iommu_ops(dev);
> > -	if (ops->get_resv_regions)
> > +	/*
> > +	 * Non-API groups still expose reserved_regions in sysfs, so filter out
> > +	 * calls that get here that way.
> > +	 */
> > +	if (ops && ops->get_resv_regions)
> 
> This is just moving the existing check from the public interface to
> pointlessly impose it on the internal call path as well.

Who cares? We don't need to micro-optimize this stuff. The fact there
are a few bugs already is proof enough of that.

> >   		ops->get_resv_regions(dev, list);
> >   }
> > @@ -2979,6 +2989,11 @@ static ssize_t iommu_group_store_type(struct iommu_group *group,
> >   	/* Since group has only one device */
> >   	grp_dev = list_first_entry(&group->devices, struct group_device, list);
> >   	dev = grp_dev->dev;
> > +	if (!dev_iommu_ops(dev)) {
> > +		/* The group doesn't have an iommu driver attached */
> > +		mutex_unlock(&group->mutex);
> > +		return -EINVAL;
> > +	}
> 
> Withot any ops, group->default_domain will be NULL so we could never even
> get here.

Fair enough, deserves a comment

> >   	get_device(dev);
> >   	/*
> > @@ -3301,7 +3316,7 @@ static void __iommu_remove_group_pasid(struct iommu_group *group,
> >   	const struct iommu_ops *ops;
> >   	list_for_each_entry(device, &group->devices, list) {
> > -		ops = dev_iommu_ops(device->dev);
> > +		ops = dev_iommu_ops_safe(device->dev);
> >   		ops->remove_dev_pasid(device->dev, pasid);
> >   	}
> >   }
> > @@ -3413,6 +3428,9 @@ struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
> >   	const struct iommu_ops *ops = dev_iommu_ops(dev);
> >   	struct iommu_domain *domain;
> > +	if (!ops)
> > +		return NULL;
> 
> Anyone who incorrectly calls sva_domain_alloc() directly without having
> successfully called iommu_dev_enable_feature() first deserves to crash.

Let's not build assumptions like this into the code, please. That
iommu_dev_enable_feature() thing is on my hitlist too :(

> (Incidentally, you've missed enable/disable_feature here.)

Yes, because they don't call dev_iommu_ops for some reason. It should
be fixed too.

> > +/* May return NULL if the device has no iommu driver */
> >   static inline const struct iommu_ops *dev_iommu_ops(struct device *dev)
> >   {
> > -	/*
> > -	 * Assume that valid ops must be installed if iommu_probe_device()
> > -	 * has succeeded. The device ops are essentially for internal use
> > -	 * within the IOMMU subsystem itself, so we should be able to trust
> > -	 * ourselves not to misuse the helper.
> > -	 */
> > +	if (!dev->iommu)
> > +		return NULL;
> >   	return dev->iommu->iommu_dev->ops;
> 
> This is actively broken, since dev->iommu may be non-NULL before
> dev->iommu->iommu_dev is set (a fwspec will always be set before the
> instance is registered, and a device may potentially hang around in that
> state forever if the relevant IOMMU instance never appears).

Sure, I missed a NULL
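
i.e. a corrected sketch would need to check both levels, mirroring the
test device_iommu_capable() already does:

	static inline const struct iommu_ops *dev_iommu_ops(struct device *dev)
	{
		if (!dev->iommu || !dev->iommu->iommu_dev)
			return NULL;
		return dev->iommu->iommu_dev->ops;
	}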

> Sorry, I don't think any of this makes sense :/

The point is to be more consistent and not just blindly add more
helpers. If you need ops then ask for the ops. If you aren't sure the
dev has ops then check ops for NULL. It is pretty simple.

Jason
  
Robin Murphy Jan. 30, 2023, 11:33 p.m. UTC | #5
On 2023-01-30 18:20, Jason Gunthorpe wrote:
> On Mon, Jan 30, 2023 at 06:05:12PM +0000, Robin Murphy wrote:
>>>     * Use a function instead of an array here because the domain-type is a
>>>     * bit-field, so an array would waste memory.
>>> @@ -338,7 +352,7 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list
>>>    	dev->iommu->iommu_dev = iommu_dev;
>>>    	dev->iommu->max_pasids = dev_iommu_get_max_pasids(dev);
>>> -	group = iommu_group_get_for_dev(dev);
>>> +	group = iommu_group_get_for_dev(dev, ops);
>>
>> Why? We've literally just assigned dev->iommu->iommu_dev 2 lines above; it
>> is not allowed to have invalid ops. Plus in future they may potentially be a
>> different set of device ops from the ones we initially found to provide
>> ->probe_device - in that case we definitely want group_get_for_dev to use
>> the proper instance ops. This is the only place it is (and should be)
>> called, so I don't see any problem.
> 
> Sure, but IMHO it was clearer to pass the ops down than to obtain it
> again. But maybe this could be tidied in a different way.

I disagree. In what we have now, every operation which calls into a 
driver consistently uses the instance ops (or domain ops), except for 
->probe_device for obvious reasons. Making ->device_group look special 
for no technical reason is less consistent, and as such less clear. It 
would be the only place in the kernel where ops are called from a 
function argument, and to anyone unfamiliar looking at the code and 
wondering why that is, the answer "because Jason thinks it looks better" 
is not going to be obvious at all.

>>>    	if (IS_ERR(group)) {
>>>    		ret = PTR_ERR(group);
>>>    		goto out_release;
>>> @@ -414,7 +428,8 @@ int iommu_probe_device(struct device *dev)
>>>    	mutex_unlock(&group->mutex);
>>>    	iommu_group_put(group);
>>> -	ops = dev_iommu_ops(dev);
>>> +	/* __iommu_probe_device() set the ops */
>>> +	ops = dev_iommu_ops_safe(dev);
>>>    	if (ops->probe_finalize)
>>>    		ops->probe_finalize(dev);
>>> @@ -430,14 +445,13 @@ int iommu_probe_device(struct device *dev)
>>>    void iommu_release_device(struct device *dev)
>>>    {
>>> -	const struct iommu_ops *ops;
>>> +	const struct iommu_ops *ops = dev_iommu_ops(dev);
>>
>> This is just moving an existing check around.
> 
> The point is to have one check. If you need to get the ops and you
> don't know the state of dev then you calll dev_iommu_ops() and check
> for NULL. Simple and consistent.
> 
>>> -     if (!dev->iommu)
>>> +     if (!ops)
>>>                return;
> 
> As you pointed out below this isn't even coded right since iommu can
> be non-NULL.

Oh, indeed that is technically a bug, although it's pretty much 
impossible to hit in practice because there's no real device hotplug on 
DT-based systems using fwspec - "dynamic" buses like PCI SR-IOV or 
fsl-mc aren't going to be discovered at all until the relevant IOMMU 
associated with the main controller device is ready, thus no removable 
child device would ever be in the "half-probed" state. I missed this one 
since I was looking for the dev->iommu->iommu_dev pattern - since it 
looks like this series is headed for a v3 next cycle, I've added this 
site to $SUBJECT locally.

>>> @@ -2620,7 +2626,11 @@ void iommu_get_resv_regions(struct device *dev, struct list_head *list)
>>>    {
>>>    	const struct iommu_ops *ops = dev_iommu_ops(dev);
>>> -	if (ops->get_resv_regions)
>>> +	/*
>>> +	 * Non-API groups still expose reserved_regions in sysfs, so filter out
>>> +	 * calls that get here that way.
>>> +	 */
>>> +	if (ops && ops->get_resv_regions)
>>
>> This is just moving the existing check from the public interface to
>> pointlessly impose it on the internal call path as well.
> 
> Who cares? We don't need to micro-optimize this stuff. The fact there
> are a few bugs already is proof enough of that.

"a few"? So far there's only one, and it's not even the class of bug 
you're trying to address, because there _is_ an explicit validity check 
already, it just has an oversight in it.

This isn't micro-optimisation, it's consistency: we have validity checks 
close to the entrypoints of public interfaces where they are required. 
On internal paths where they are not required, we do not sometimes have 
checks and sometimes not, leaving people to wonder what the requirements 
actually are - in fact someone went so far as to call such patterns 
"confusing" and "overzealous" back when dev_iommu_ops() was reviewed. It 
was agreed that the lack of a check where ops are retrieved clearly 
documents where they are expected to be valid.
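
For example, that is the shape the v2 hunk at the bottom of this page
gives iommu_dev_enable_feature() (the existing -ENODEV return path is
included here for context): the public entry point does the validity
check once, and only then takes the ops:

	int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat)
	{
		if (dev_iommu_ops_valid(dev)) {
			const struct iommu_ops *ops = dev_iommu_ops(dev);

			if (ops->dev_enable_feat)
				return ops->dev_enable_feat(dev, feat);
		}

		return -ENODEV;
	}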

>>>    		ops->get_resv_regions(dev, list);
>>>    }
>>> @@ -2979,6 +2989,11 @@ static ssize_t iommu_group_store_type(struct iommu_group *group,
>>>    	/* Since group has only one device */
>>>    	grp_dev = list_first_entry(&group->devices, struct group_device, list);
>>>    	dev = grp_dev->dev;
>>> +	if (!dev_iommu_ops(dev)) {
>>> +		/* The group doesn't have an iommu driver attached */
>>> +		mutex_unlock(&group->mutex);
>>> +		return -EINVAL;
>>> +	}
>>
>> Withot any ops, group->default_domain will be NULL so we could never even
>> get here.
> 
> Fair enough, deserves a comment

Great big function specifically for changing the default domain of a 
group... right at the top, literally the second thing it does on entry 
is check that the group and default domain are valid, and return if not. 
If that isn't sufficiently clear, I'm not sure what to say :/

>>>    	get_device(dev);
>>>    	/*
>>> @@ -3301,7 +3316,7 @@ static void __iommu_remove_group_pasid(struct iommu_group *group,
>>>    	const struct iommu_ops *ops;
>>>    	list_for_each_entry(device, &group->devices, list) {
>>> -		ops = dev_iommu_ops(device->dev);
>>> +		ops = dev_iommu_ops_safe(device->dev);
>>>    		ops->remove_dev_pasid(device->dev, pasid);
>>>    	}
>>>    }
>>> @@ -3413,6 +3428,9 @@ struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
>>>    	const struct iommu_ops *ops = dev_iommu_ops(dev);
>>>    	struct iommu_domain *domain;
>>> +	if (!ops)
>>> +		return NULL;
>>
>> Anyone who incorrectly calls sva_domain_alloc() directly without having
>> successfully called iommu_dev_enable_feature() first deserves to crash.
> 
> Lets not build assumptions like this into the code please. That
> iommu_dev_enable_feature() thing is on my hitlist too :(

Huh? The whole public API is full of assumptions that callers aren't 
doing nonsensical things. Pass a bogus group or domain pointer to 
iommu_attach_group()? Boom! Pass a bogus domain pointer to iommu_map()? 
Boom! AFAICT the only thing that should be calling 
iommu_sva_domain_alloc() at all at the moment is 
iommu_sva_bind_device(), so it's effectively an internal interface where 
bogus device pointers really shouldn't be expected at all.

Sure, if you change the interface so that random drivers are free to 
allocate SVA domains directly and feed them to iommu_attach_group(), and 
checks are warranted in different places, then add them then.

>> (Incidentally, you've missed enable/disable_feature here.)
> 
> Yes, because they don't call dev_iommu_ops for some reason. It should
> be fixed too.

It is, in the patch you're replying to.

>>> +/* May return NULL if the device has no iommu driver */
>>>    static inline const struct iommu_ops *dev_iommu_ops(struct device *dev)
>>>    {
>>> -	/*
>>> -	 * Assume that valid ops must be installed if iommu_probe_device()
>>> -	 * has succeeded. The device ops are essentially for internal use
>>> -	 * within the IOMMU subsystem itself, so we should be able to trust
>>> -	 * ourselves not to misuse the helper.
>>> -	 */
>>> +	if (!dev->iommu)
>>> +		return NULL;
>>>    	return dev->iommu->iommu_dev->ops;
>>
>> This is actively broken, since dev->iommu may be non-NULL before
>> dev->iommu->iommu_dev is set (a fwspec will always be set before the
>> instance is registered, and a device may potentially hang around in that
>> state forever if the relevant IOMMU instance never appears).
> 
> Sure, I missed a NULL
> 
>> Sorry, I don't think any of this makes sense :/
> 
> The point is to be more consistent and not just blindly add more
> helpers. If you need ops then ask for the ops. If you aren't sure the
> dev has ops then check ops for NULL. It is pretty simple.

I'm not "blindly" adding more helpers, I'm ratifying clear common 
elements of the code and logic that we already have, to make them that 
much easier to replicate further. And I'm still no less puzzled by this 
thread... adding a new helper to consolidate having one thing in some 
places and another in others so offends your sensibilities that what 
you'd much rather do instead is add a new helper to have one thing in 
some places and another thing in others? There is a logical consistency 
to the current code already, which I assume is sufficiently clear to the 
majority of us because it's what made it through community review and 
what we've all been working on top of for the last year. "If you need 
ops then ask for the ops. If you aren't sure the dev has ops then check 
for valid ops first." Just as simple.

FWIW, personally I find up-front availability checks perfectly intuitive 
and natural, because they can easily be imagined as a real-life interaction:

"Do you have any coffee?"
"No, sorry."

vs.

"Please give me a cup of coffee."
<hands over empty cup>
"Is this coffee?"
"No."

One could possibly even argue that a separate up-front check also 
visibly implies that there are no concurrency or TOCTOU considerations 
expected, which for dev_iommu_ops() there should certainly not be.

Please understand that I'm not going to this length just to be 
objectionable; this is me genuinely being unable to rationalise your 
argument and spending my entire evening on a deep dive into my own 
thought process to lay it out and check for any obvious errors.

Thanks,
Robin.
  
Jason Gunthorpe Jan. 31, 2023, 7:54 p.m. UTC | #6
On Mon, Jan 30, 2023 at 11:33:54PM +0000, Robin Murphy wrote:

> Please understand that I'm not going to this length just to be
> objectionable; this is me genuinely being unable to rationalise your
> argument and spending my entire evening on a deep dive into my own thought
> process to lay it out and check for any obvious errors.

Sorry, I don't want to use up your time like this. It is a minor style
remark; if you don't like it then you should go ahead with your
original.

It is hard to debate style; everyone has their own viewpoint of what
is better style or not.

But to elaborate on my feeling, I find this:

	const struct iommu_ops *ops = dev_iommu_ops(dev);

	if (!ops)
		return -ENODEV;

To be nicer code and more kernel-y than this:

	const struct iommu_ops *ops;

	if (!dev_iommu_ops_valid(dev))
		return -ENODEV;
	ops = dev_iommu_ops(dev);

In part because the former is harder to type wrong, and when used
consistently it is a lot more obvious when the NULL test is missing.
Static checkers like smatch/coverity will even warn about the obvious
possible NULL deref if someone misses the NULL test.

In general in the kernel we have the API flow of calling a function and
checking the result for error; asking permission to call an API is less
typical.

This case is complicated by the effort to try to document the cases
that can't fail. I'm not sure if that is worthwhile, but...

Documenting special cases is better done by labeling them as special,
e.g. with a special function name - think rcu_dereference_protected().
Making them special by the absence of a related function call is
trickier to learn and remember.
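
For example (with hypothetical field and lock names), the special name
plus the explicit condition document at the call site why the normal
check is being skipped:

	p = rcu_dereference_protected(dev->priv_data,
				      lockdep_is_held(&dev->priv_lock));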

Also, if you rely on testing the ops you are sort of forced into a
second function, because the static tools will complain about all the
places that don't test for NULL if only one API is provided.

Basically, if you have dev_iommu_ops/dev_iommu_ops_safe then the
choice to use safe is obviously documented in the code and if you
mis-use dev_iommu_ops then a static tool will complain. Reviewers who
see 'safe' in a diff are reminded to look at it for safety rules.

If you have dev_iommu_ops/dev_iommu_ops_valid then static tools will
never complain and the choice to use 'safe' is implicitly documented
by not calling dev_iommu_ops_valid. Reviewers have to remember to
check every call to dev_iommu_ops() to see if it should have the valid
check.

Regards,
Jason
  

Patch

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 77f076030995..440bb3b7bded 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -284,6 +284,12 @@  static void dev_iommu_free(struct device *dev)
 	kfree(param);
 }
 
+/* Only needed in public APIs which allow unchecked devices for convenience */
+static bool dev_iommu_ops_valid(struct device *dev)
+{
+	return dev->iommu && dev->iommu->iommu_dev;
+}
+
 static u32 dev_iommu_get_max_pasids(struct device *dev)
 {
 	u32 max_pasids = 0, bits = 0;
@@ -1096,6 +1102,12 @@  void iommu_group_remove_device(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(iommu_group_remove_device);
 
+static struct device *iommu_group_first_dev(struct iommu_group *group)
+{
+	lockdep_assert_held(&group->mutex);
+	return list_first_entry(&group->devices, struct group_device, list)->dev;
+}
+
 static int iommu_group_device_count(struct iommu_group *group)
 {
 	struct group_device *entry;
@@ -1907,7 +1919,7 @@  bool device_iommu_capable(struct device *dev, enum iommu_cap cap)
 {
 	const struct iommu_ops *ops;
 
-	if (!dev->iommu || !dev->iommu->iommu_dev)
+	if (!dev_iommu_ops_valid(dev))
 		return false;
 
 	ops = dev_iommu_ops(dev);
@@ -2770,8 +2782,8 @@  EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids);
  */
 int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat)
 {
-	if (dev->iommu && dev->iommu->iommu_dev) {
-		const struct iommu_ops *ops = dev->iommu->iommu_dev->ops;
+	if (dev_iommu_ops_valid(dev)) {
+		const struct iommu_ops *ops = dev_iommu_ops(dev);
 
 		if (ops->dev_enable_feat)
 			return ops->dev_enable_feat(dev, feat);
@@ -2786,8 +2798,8 @@  EXPORT_SYMBOL_GPL(iommu_dev_enable_feature);
  */
 int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat)
 {
-	if (dev->iommu && dev->iommu->iommu_dev) {
-		const struct iommu_ops *ops = dev->iommu->iommu_dev->ops;
+	if (dev_iommu_ops_valid(dev)) {
+		const struct iommu_ops *ops = dev_iommu_ops(dev);
 
 		if (ops->dev_disable_feat)
 			return ops->dev_disable_feat(dev, feat);
@@ -2816,7 +2828,6 @@  static int iommu_change_dev_def_domain(struct iommu_group *group,
 				       struct device *prev_dev, int type)
 {
 	struct iommu_domain *prev_dom;
-	struct group_device *grp_dev;
 	int ret, dev_def_dom;
 	struct device *dev;
 
@@ -2848,8 +2859,7 @@  static int iommu_change_dev_def_domain(struct iommu_group *group,
 	}
 
 	/* Since group has only one device */
-	grp_dev = list_first_entry(&group->devices, struct group_device, list);
-	dev = grp_dev->dev;
+	dev = iommu_group_first_dev(group);
 
 	if (prev_dev != dev) {
 		dev_err_ratelimited(prev_dev, "Cannot change default domain: Device has been changed\n");
@@ -2946,7 +2956,6 @@  static int iommu_change_dev_def_domain(struct iommu_group *group,
 static ssize_t iommu_group_store_type(struct iommu_group *group,
 				      const char *buf, size_t count)
 {
-	struct group_device *grp_dev;
 	struct device *dev;
 	int ret, req_type;
 
@@ -2981,8 +2990,7 @@  static ssize_t iommu_group_store_type(struct iommu_group *group,
 	}
 
 	/* Since group has only one device */
-	grp_dev = list_first_entry(&group->devices, struct group_device, list);
-	dev = grp_dev->dev;
+	dev = iommu_group_first_dev(group);
 	get_device(dev);
 
 	/*
@@ -3107,21 +3115,18 @@  void iommu_device_unuse_default_domain(struct device *dev)
 
 static int __iommu_group_alloc_blocking_domain(struct iommu_group *group)
 {
-	struct group_device *dev =
-		list_first_entry(&group->devices, struct group_device, list);
+	struct device *dev = iommu_group_first_dev(group);
 
 	if (group->blocking_domain)
 		return 0;
 
-	group->blocking_domain =
-		__iommu_domain_alloc(dev->dev->bus, IOMMU_DOMAIN_BLOCKED);
+	group->blocking_domain = __iommu_domain_alloc(dev->bus, IOMMU_DOMAIN_BLOCKED);
 	if (!group->blocking_domain) {
 		/*
 		 * For drivers that do not yet understand IOMMU_DOMAIN_BLOCKED
 		 * create an empty domain instead.
 		 */
-		group->blocking_domain = __iommu_domain_alloc(
-			dev->dev->bus, IOMMU_DOMAIN_UNMANAGED);
+		group->blocking_domain = __iommu_domain_alloc(dev->bus, IOMMU_DOMAIN_UNMANAGED);
 		if (!group->blocking_domain)
 			return -EINVAL;
 	}