[v9,14/14] iommu: Track iopf group instead of last fault
Commit Message
Previously, before a group of page faults was passed to the domain's iopf
handler, the last page fault of the group was kept in the list of
iommu_fault_param::faults. In the page fault response path, the group's
last page fault was used to look up the list, and the page faults were
responded to the device only if there was a matching fault.
The previous approach seems unnecessarily complex and not performance
friendly. Put the page fault group itself onto the outstanding fault list.
It can be removed in the page fault response path or in the
iopf_queue_remove_device() path. The pending list is protected by
iommu_fault_param::lock. To allow checking for the group's presence in
the list using list_empty(), the iopf group should be removed from the
list with list_del_init().
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Tested-by: Yan Zhao <yan.y.zhao@intel.com>
---
include/linux/iommu.h | 2 +
drivers/iommu/io-pgfault.c | 220 +++++++++++++------------------------
2 files changed, 78 insertions(+), 144 deletions(-)
Comments
On Wed, Dec 20, 2023 at 09:23:32AM +0800, Lu Baolu wrote:
> /**
> - * iommu_handle_iopf - IO Page Fault handler
> - * @fault: fault event
> - * @iopf_param: the fault parameter of the device.
> + * iommu_report_device_fault() - Report fault event to device driver
> + * @dev: the device
> + * @evt: fault event data
> *
> - * Add a fault to the device workqueue, to be handled by mm.
> + * Called by IOMMU drivers when a fault is detected, typically in a threaded IRQ
> + * handler. When this function fails and the fault is recoverable, it is the
> + * caller's responsibility to complete the fault.
This patch seems OK for what it does so:
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
However, this seems like a strange design, surely this function should
just call ops->page_response() when it can't enqueue the fault?
It is much cleaner that way, so maybe you can take this into a
following patch (along with the driver fixes to accommodate. (and
perhaps iommu_report_device_fault() should return void too)
Also iopf_group_response() should return void (another patch!),
nothing can do anything with the failure. This implies that
ops->page_response() must also return void - which is consistent with
what the drivers do, the failure paths are all integrity validations
of the fault and should be WARN_ON'd not return codes.
diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c
index 7d11b74e4048e2..2715e24fd64234 100644
--- a/drivers/iommu/io-pgfault.c
+++ b/drivers/iommu/io-pgfault.c
@@ -39,7 +39,7 @@ static void iopf_put_dev_fault_param(struct iommu_fault_param *fault_param)
kfree_rcu(fault_param, rcu);
}
-void iopf_free_group(struct iopf_group *group)
+static void __iopf_free_group(struct iopf_group *group)
{
struct iopf_fault *iopf, *next;
@@ -50,6 +50,11 @@ void iopf_free_group(struct iopf_group *group)
/* Pair with iommu_report_device_fault(). */
iopf_put_dev_fault_param(group->fault_param);
+}
+
+void iopf_free_group(struct iopf_group *group)
+{
+ __iopf_free_group(group);
kfree(group);
}
EXPORT_SYMBOL_GPL(iopf_free_group);
@@ -97,14 +102,49 @@ static int report_partial_fault(struct iommu_fault_param *fault_param,
return 0;
}
+static struct iopf_group *iopf_group_alloc(struct iommu_fault_param *iopf_param,
+ struct iopf_fault *evt,
+ struct iopf_group *abort_group)
+{
+ struct iopf_fault *iopf, *next;
+ struct iopf_group *group;
+
+ group = kzalloc(sizeof(*group), GFP_KERNEL);
+ if (!group) {
+ /*
+ * We always need to construct the group as we need it to abort
+	 * the request at the driver if it can't be handled.
+ */
+ group = abort_group;
+ }
+
+ group->fault_param = iopf_param;
+ group->last_fault.fault = evt->fault;
+ INIT_LIST_HEAD(&group->faults);
+ INIT_LIST_HEAD(&group->pending_node);
+ list_add(&group->last_fault.list, &group->faults);
+
+ /* See if we have partial faults for this group */
+ mutex_lock(&iopf_param->lock);
+ list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) {
+ if (iopf->fault.prm.grpid == evt->fault.prm.grpid)
+ /* Insert *before* the last fault */
+ list_move(&iopf->list, &group->faults);
+ }
+ list_add(&group->pending_node, &iopf_param->faults);
+ mutex_unlock(&iopf_param->lock);
+
+ return group;
+}
+
/**
* iommu_report_device_fault() - Report fault event to device driver
* @dev: the device
* @evt: fault event data
*
* Called by IOMMU drivers when a fault is detected, typically in a threaded IRQ
- * handler. When this function fails and the fault is recoverable, it is the
- * caller's responsibility to complete the fault.
+ * handler. If this function fails then ops->page_response() was called to
+ * complete evt if required.
*
* This module doesn't handle PCI PASID Stop Marker; IOMMU drivers must discard
* them before reporting faults. A PASID Stop Marker (LRW = 0b100) doesn't
@@ -143,22 +183,24 @@ int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt)
{
struct iommu_fault *fault = &evt->fault;
struct iommu_fault_param *iopf_param;
- struct iopf_fault *iopf, *next;
- struct iommu_domain *domain;
+ struct iopf_group abort_group;
struct iopf_group *group;
int ret;
+/*
+ remove this too, it is pointless. The driver should only invoke this function on page_req faults.
if (fault->type != IOMMU_FAULT_PAGE_REQ)
return -EOPNOTSUPP;
+*/
iopf_param = iopf_get_dev_fault_param(dev);
- if (!iopf_param)
+ if (WARN_ON(!iopf_param))
return -ENODEV;
if (!(fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) {
ret = report_partial_fault(iopf_param, fault);
iopf_put_dev_fault_param(iopf_param);
-
+ /* A request that is not the last does not need to be ack'd */
return ret;
}
@@ -170,56 +212,34 @@ int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt)
* will send a response to the hardware. We need to clean up before
* leaving, otherwise partial faults will be stuck.
*/
- domain = get_domain_for_iopf(dev, fault);
- if (!domain) {
- ret = -EINVAL;
- goto cleanup_partial;
- }
-
- group = kzalloc(sizeof(*group), GFP_KERNEL);
- if (!group) {
+ group = iopf_group_alloc(iopf_param, evt, &abort_group);
+ if (group == &abort_group) {
ret = -ENOMEM;
- goto cleanup_partial;
+ goto err_abort;
}
- group->fault_param = iopf_param;
- group->last_fault.fault = *fault;
- INIT_LIST_HEAD(&group->faults);
- INIT_LIST_HEAD(&group->pending_node);
- group->domain = domain;
- list_add(&group->last_fault.list, &group->faults);
-
- /* See if we have partial faults for this group */
- mutex_lock(&iopf_param->lock);
- list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) {
- if (iopf->fault.prm.grpid == fault->prm.grpid)
- /* Insert *before* the last fault */
- list_move(&iopf->list, &group->faults);
+ group->domain = get_domain_for_iopf(dev, fault);
+ if (!group->domain) {
+ ret = -EINVAL;
+ goto err_abort;
}
- list_add(&group->pending_node, &iopf_param->faults);
- mutex_unlock(&iopf_param->lock);
- ret = domain->iopf_handler(group);
- if (ret) {
- mutex_lock(&iopf_param->lock);
- list_del_init(&group->pending_node);
- mutex_unlock(&iopf_param->lock);
+ /*
+ * On success iopf_handler must call iopf_group_response() and
+ * iopf_free_group()
+ */
+ ret = group->domain->iopf_handler(group);
+ if (ret)
+ goto err_abort;
+ return 0;
+
+err_abort:
+ iopf_group_response(group,
+ IOMMU_PAGE_RESP_FAILURE); //?? right code?
+ if (group == &abort_group)
+ __iopf_free_group(group);
+ else
iopf_free_group(group);
- }
-
- return ret;
-
-cleanup_partial:
- mutex_lock(&iopf_param->lock);
- list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) {
- if (iopf->fault.prm.grpid == fault->prm.grpid) {
- list_del(&iopf->list);
- kfree(iopf);
- }
- }
- mutex_unlock(&iopf_param->lock);
- iopf_put_dev_fault_param(iopf_param);
-
return ret;
}
EXPORT_SYMBOL_GPL(iommu_report_device_fault);
@@ -262,7 +282,7 @@ EXPORT_SYMBOL_GPL(iopf_queue_flush_dev);
*
* Return 0 on success and <0 on error.
*/
-int iopf_group_response(struct iopf_group *group,
+void iopf_group_response(struct iopf_group *group,
enum iommu_page_response_code status)
{
struct iommu_fault_param *fault_param = group->fault_param;
@@ -400,9 +420,9 @@ EXPORT_SYMBOL_GPL(iopf_queue_add_device);
*/
void iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev)
{
- struct iopf_fault *iopf, *next;
+ struct iopf_fault *partial_iopf;
+ struct iopf_fault *next;
struct iopf_group *group, *temp;
- struct iommu_page_response resp;
struct dev_iommu *param = dev->iommu;
struct iommu_fault_param *fault_param;
const struct iommu_ops *ops = dev_iommu_ops(dev);
@@ -416,15 +436,16 @@ void iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev)
goto unlock;
mutex_lock(&fault_param->lock);
- list_for_each_entry_safe(iopf, next, &fault_param->partial, list)
- kfree(iopf);
+ list_for_each_entry_safe(partial_iopf, next, &fault_param->partial, list)
+ kfree(partial_iopf);
list_for_each_entry_safe(group, temp, &fault_param->faults, pending_node) {
- memset(&resp, 0, sizeof(struct iommu_page_response));
- iopf = &group->last_fault;
- resp.pasid = iopf->fault.prm.pasid;
- resp.grpid = iopf->fault.prm.grpid;
- resp.code = IOMMU_PAGE_RESP_INVALID;
+ struct iopf_fault *iopf = &group->last_fault;
+ struct iommu_page_response resp = {
+ .pasid = iopf->fault.prm.pasid,
+ .grpid = iopf->fault.prm.grpid,
+ .code = IOMMU_PAGE_RESP_INVALID
+ };
if (iopf->fault.prm.flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID)
resp.flags = IOMMU_PAGE_RESP_PASID_VALID;
On 1/6/24 1:53 AM, Jason Gunthorpe wrote:
> On Wed, Dec 20, 2023 at 09:23:32AM +0800, Lu Baolu wrote:
>> /**
>> - * iommu_handle_iopf - IO Page Fault handler
>> - * @fault: fault event
>> - * @iopf_param: the fault parameter of the device.
>> + * iommu_report_device_fault() - Report fault event to device driver
>> + * @dev: the device
>> + * @evt: fault event data
>> *
>> - * Add a fault to the device workqueue, to be handled by mm.
>> + * Called by IOMMU drivers when a fault is detected, typically in a threaded IRQ
>> + * handler. When this function fails and the fault is recoverable, it is the
>> + * caller's responsibility to complete the fault.
> This patch seems OK for what it does so:
>
> Reviewed-by: Jason Gunthorpe<jgg@nvidia.com>
>
> However, this seems like a strange design, surely this function should
> just call ops->page_response() when it can't enqueue the fault?
>
> It is much cleaner that way, so maybe you can take this into a
> following patch (along with the driver fixes to accommodate. (and
> perhaps iommu_report_device_fault() should return void too)
>
> Also iopf_group_response() should return void (another patch!),
> nothing can do anything with the failure. This implies that
> ops->page_response() must also return void - which is consistent with
> what the drivers do, the failure paths are all integrity validations
> of the fault and should be WARN_ON'd not return codes.
Make sense. I will integrate the code in the next version and convert
iommu_report_device_fault() to return void.
Best regards,
baolu
@@ -129,6 +129,8 @@ struct iopf_fault {
struct iopf_group {
struct iopf_fault last_fault;
struct list_head faults;
+ /* list node for iommu_fault_param::faults */
+ struct list_head pending_node;
struct work_struct work;
struct iommu_domain *domain;
/* The device's fault data parameter. */
@@ -78,12 +78,33 @@ static struct iommu_domain *get_domain_for_iopf(struct device *dev,
return domain;
}
+/* Non-last request of a group. Postpone until the last one. */
+static int report_partial_fault(struct iommu_fault_param *fault_param,
+ struct iommu_fault *fault)
+{
+ struct iopf_fault *iopf;
+
+ iopf = kzalloc(sizeof(*iopf), GFP_KERNEL);
+ if (!iopf)
+ return -ENOMEM;
+
+ iopf->fault = *fault;
+
+ mutex_lock(&fault_param->lock);
+ list_add(&iopf->list, &fault_param->partial);
+ mutex_unlock(&fault_param->lock);
+
+ return 0;
+}
+
/**
- * iommu_handle_iopf - IO Page Fault handler
- * @fault: fault event
- * @iopf_param: the fault parameter of the device.
+ * iommu_report_device_fault() - Report fault event to device driver
+ * @dev: the device
+ * @evt: fault event data
*
- * Add a fault to the device workqueue, to be handled by mm.
+ * Called by IOMMU drivers when a fault is detected, typically in a threaded IRQ
+ * handler. When this function fails and the fault is recoverable, it is the
+ * caller's responsibility to complete the fault.
*
* This module doesn't handle PCI PASID Stop Marker; IOMMU drivers must discard
* them before reporting faults. A PASID Stop Marker (LRW = 0b100) doesn't
@@ -118,34 +139,37 @@ static struct iommu_domain *get_domain_for_iopf(struct device *dev,
*
* Return: 0 on success and <0 on error.
*/
-static int iommu_handle_iopf(struct iommu_fault *fault,
- struct iommu_fault_param *iopf_param)
+int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt)
{
- int ret;
+ struct iommu_fault *fault = &evt->fault;
+ struct iommu_fault_param *iopf_param;
+ struct iopf_fault *iopf, *next;
+ struct iommu_domain *domain;
struct iopf_group *group;
- struct iommu_domain *domain;
- struct iopf_fault *iopf, *next;
- struct device *dev = iopf_param->dev;
-
- lockdep_assert_held(&iopf_param->lock);
+ int ret;
if (fault->type != IOMMU_FAULT_PAGE_REQ)
- /* Not a recoverable page fault */
return -EOPNOTSUPP;
+ iopf_param = iopf_get_dev_fault_param(dev);
+ if (!iopf_param)
+ return -ENODEV;
+
if (!(fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) {
- iopf = kzalloc(sizeof(*iopf), GFP_KERNEL);
- if (!iopf)
- return -ENOMEM;
+ ret = report_partial_fault(iopf_param, fault);
+ iopf_put_dev_fault_param(iopf_param);
- iopf->fault = *fault;
-
- /* Non-last request of a group. Postpone until the last one */
- list_add(&iopf->list, &iopf_param->partial);
-
- return 0;
+ return ret;
}
+ /*
+ * This is the last page fault of a group. Allocate an iopf group and
+ * pass it to domain's page fault handler. The group holds a reference
+ * count of the fault parameter. It will be released after response or
+ * error path of this function. If an error is returned, the caller
+ * will send a response to the hardware. We need to clean up before
+ * leaving, otherwise partial faults will be stuck.
+ */
domain = get_domain_for_iopf(dev, fault);
if (!domain) {
ret = -EINVAL;
@@ -154,11 +178,6 @@ static int iommu_handle_iopf(struct iommu_fault *fault,
group = kzalloc(sizeof(*group), GFP_KERNEL);
if (!group) {
- /*
- * The caller will send a response to the hardware. But we do
- * need to clean up before leaving, otherwise partial faults
- * will be stuck.
- */
ret = -ENOMEM;
goto cleanup_partial;
}
@@ -166,145 +185,45 @@ static int iommu_handle_iopf(struct iommu_fault *fault,
group->fault_param = iopf_param;
group->last_fault.fault = *fault;
INIT_LIST_HEAD(&group->faults);
+ INIT_LIST_HEAD(&group->pending_node);
group->domain = domain;
list_add(&group->last_fault.list, &group->faults);
/* See if we have partial faults for this group */
+ mutex_lock(&iopf_param->lock);
list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) {
if (iopf->fault.prm.grpid == fault->prm.grpid)
/* Insert *before* the last fault */
list_move(&iopf->list, &group->faults);
}
-
+ list_add(&group->pending_node, &iopf_param->faults);
mutex_unlock(&iopf_param->lock);
+
ret = domain->iopf_handler(group);
- mutex_lock(&iopf_param->lock);
- if (ret)
+ if (ret) {
+ mutex_lock(&iopf_param->lock);
+ list_del_init(&group->pending_node);
+ mutex_unlock(&iopf_param->lock);
iopf_free_group(group);
+ }
return ret;
+
cleanup_partial:
+ mutex_lock(&iopf_param->lock);
list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) {
if (iopf->fault.prm.grpid == fault->prm.grpid) {
list_del(&iopf->list);
kfree(iopf);
}
}
- return ret;
-}
-
-/**
- * iommu_report_device_fault() - Report fault event to device driver
- * @dev: the device
- * @evt: fault event data
- *
- * Called by IOMMU drivers when a fault is detected, typically in a threaded IRQ
- * handler. When this function fails and the fault is recoverable, it is the
- * caller's responsibility to complete the fault.
- *
- * Return 0 on success, or an error.
- */
-int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt)
-{
- bool last_prq = evt->fault.type == IOMMU_FAULT_PAGE_REQ &&
- (evt->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE);
- struct iommu_fault_param *fault_param;
- struct iopf_fault *evt_pending;
- int ret;
-
- fault_param = iopf_get_dev_fault_param(dev);
- if (!fault_param)
- return -EINVAL;
-
- mutex_lock(&fault_param->lock);
- if (last_prq) {
- evt_pending = kmemdup(evt, sizeof(struct iopf_fault),
- GFP_KERNEL);
- if (!evt_pending) {
- ret = -ENOMEM;
- goto err_unlock;
- }
- list_add_tail(&evt_pending->list, &fault_param->faults);
- }
-
- ret = iommu_handle_iopf(&evt->fault, fault_param);
- if (ret)
- goto err_free;
-
- mutex_unlock(&fault_param->lock);
- /* The reference count of fault_param is now held by iopf_group. */
- if (!last_prq)
- iopf_put_dev_fault_param(fault_param);
-
- return 0;
-err_free:
- if (last_prq) {
- list_del(&evt_pending->list);
- kfree(evt_pending);
- }
-err_unlock:
- mutex_unlock(&fault_param->lock);
- iopf_put_dev_fault_param(fault_param);
+ mutex_unlock(&iopf_param->lock);
+ iopf_put_dev_fault_param(iopf_param);
return ret;
}
EXPORT_SYMBOL_GPL(iommu_report_device_fault);
-static int iommu_page_response(struct iopf_group *group,
- struct iommu_page_response *msg)
-{
- bool needs_pasid;
- int ret = -EINVAL;
- struct iopf_fault *evt;
- struct iommu_fault_page_request *prm;
- struct device *dev = group->fault_param->dev;
- const struct iommu_ops *ops = dev_iommu_ops(dev);
- bool has_pasid = msg->flags & IOMMU_PAGE_RESP_PASID_VALID;
- struct iommu_fault_param *fault_param = group->fault_param;
-
- /* Only send response if there is a fault report pending */
- mutex_lock(&fault_param->lock);
- if (list_empty(&fault_param->faults)) {
- dev_warn_ratelimited(dev, "no pending PRQ, drop response\n");
- goto done_unlock;
- }
- /*
- * Check if we have a matching page request pending to respond,
- * otherwise return -EINVAL
- */
- list_for_each_entry(evt, &fault_param->faults, list) {
- prm = &evt->fault.prm;
- if (prm->grpid != msg->grpid)
- continue;
-
- /*
- * If the PASID is required, the corresponding request is
- * matched using the group ID, the PASID valid bit and the PASID
- * value. Otherwise only the group ID matches request and
- * response.
- */
- needs_pasid = prm->flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
- if (needs_pasid && (!has_pasid || msg->pasid != prm->pasid))
- continue;
-
- if (!needs_pasid && has_pasid) {
- /* No big deal, just clear it. */
- msg->flags &= ~IOMMU_PAGE_RESP_PASID_VALID;
- msg->pasid = 0;
- }
-
- ret = ops->page_response(dev, evt, msg);
- list_del(&evt->list);
- kfree(evt);
- break;
- }
-
-done_unlock:
- mutex_unlock(&fault_param->lock);
-
- return ret;
-}
-
/**
* iopf_queue_flush_dev - Ensure that all queued faults have been processed
* @dev: the endpoint whose faults need to be flushed.
@@ -346,18 +265,30 @@ EXPORT_SYMBOL_GPL(iopf_queue_flush_dev);
int iopf_group_response(struct iopf_group *group,
enum iommu_page_response_code status)
{
+ struct iommu_fault_param *fault_param = group->fault_param;
struct iopf_fault *iopf = &group->last_fault;
+ struct device *dev = group->fault_param->dev;
+ const struct iommu_ops *ops = dev_iommu_ops(dev);
struct iommu_page_response resp = {
.pasid = iopf->fault.prm.pasid,
.grpid = iopf->fault.prm.grpid,
.code = status,
};
+ int ret = -EINVAL;
if ((iopf->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) &&
(iopf->fault.prm.flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID))
resp.flags = IOMMU_PAGE_RESP_PASID_VALID;
- return iommu_page_response(group, &resp);
+ /* Only send response if there is a fault report pending */
+ mutex_lock(&fault_param->lock);
+ if (!list_empty(&group->pending_node)) {
+ ret = ops->page_response(dev, &group->last_fault, &resp);
+ list_del_init(&group->pending_node);
+ }
+ mutex_unlock(&fault_param->lock);
+
+ return ret;
}
EXPORT_SYMBOL_GPL(iopf_group_response);
@@ -470,6 +401,7 @@ EXPORT_SYMBOL_GPL(iopf_queue_add_device);
void iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev)
{
struct iopf_fault *iopf, *next;
+ struct iopf_group *group, *temp;
struct iommu_page_response resp;
struct dev_iommu *param = dev->iommu;
struct iommu_fault_param *fault_param;
@@ -487,8 +419,9 @@ void iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev)
list_for_each_entry_safe(iopf, next, &fault_param->partial, list)
kfree(iopf);
- list_for_each_entry_safe(iopf, next, &fault_param->faults, list) {
+ list_for_each_entry_safe(group, temp, &fault_param->faults, pending_node) {
memset(&resp, 0, sizeof(struct iommu_page_response));
+ iopf = &group->last_fault;
resp.pasid = iopf->fault.prm.pasid;
resp.grpid = iopf->fault.prm.grpid;
resp.code = IOMMU_PAGE_RESP_INVALID;
@@ -497,8 +430,7 @@ void iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev)
resp.flags = IOMMU_PAGE_RESP_PASID_VALID;
ops->page_response(dev, iopf, &resp);
- list_del(&iopf->list);
- kfree(iopf);
+ list_del_init(&group->pending_node);
}
mutex_unlock(&fault_param->lock);