[v3,09/17] iommufd: Add IOMMU_HWPT_INVALIDATE

Message ID 20230724110406.107212-10-yi.l.liu@intel.com
State New
Headers
Series iommufd: Add nesting infrastructure |

Commit Message

Yi Liu July 24, 2023, 11:03 a.m. UTC
  In nested translation, the stage-1 page table is user-managed and used by
IOMMU hardware, so update of any present page table entry in the stage-1
page table should be followed with an IOTLB invalidation.

This adds IOMMU_HWPT_INVALIDATE for stage-1 IOTLB invalidation.

Co-developed-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
---
 drivers/iommu/iommufd/hw_pagetable.c    | 45 +++++++++++++++++++++++++
 drivers/iommu/iommufd/iommufd_private.h |  9 +++++
 drivers/iommu/iommufd/main.c            |  3 ++
 include/uapi/linux/iommufd.h            | 22 ++++++++++++
 4 files changed, 79 insertions(+)
  

Comments

Jason Gunthorpe July 28, 2023, 6:02 p.m. UTC | #1
On Mon, Jul 24, 2023 at 04:03:58AM -0700, Yi Liu wrote:
> In nested translation, the stage-1 page table is user-managed and used by
> IOMMU hardware, so update of any present page table entry in the stage-1
> page table should be followed with an IOTLB invalidation.
> 
> This adds IOMMU_HWPT_INVALIDATE for stage-1 IOTLB invalidation.
> 
> Co-developed-by: Nicolin Chen <nicolinc@nvidia.com>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> ---
>  drivers/iommu/iommufd/hw_pagetable.c    | 45 +++++++++++++++++++++++++
>  drivers/iommu/iommufd/iommufd_private.h |  9 +++++
>  drivers/iommu/iommufd/main.c            |  3 ++
>  include/uapi/linux/iommufd.h            | 22 ++++++++++++
>  4 files changed, 79 insertions(+)
> 
> diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
> index 97e4114226de..9064e6d181b4 100644
> --- a/drivers/iommu/iommufd/hw_pagetable.c
> +++ b/drivers/iommu/iommufd/hw_pagetable.c
> @@ -286,3 +286,48 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
>  	iommufd_put_object(&idev->obj);
>  	return rc;
>  }
> +
> +int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd)
> +{
> +	struct iommu_hwpt_invalidate *cmd = ucmd->cmd;
> +	struct iommufd_hw_pagetable *hwpt;
> +	u32 user_data_len, klen;
> +	u64 user_ptr;
> +	int rc = 0;
> +
> +	if (!cmd->data_len || cmd->__reserved)
> +		return -EOPNOTSUPP;
> +
> +	hwpt = iommufd_get_hwpt(ucmd, cmd->hwpt_id);
> +	if (IS_ERR(hwpt))
> +		return PTR_ERR(hwpt);
> +
> +	/* Do not allow any kernel-managed hw_pagetable */
> +	if (!hwpt->parent) {

I don't think this is needed because:

> +		rc = -EINVAL;
> +		goto out_put_hwpt;
> +	}
> +
> +	klen = hwpt->domain->ops->cache_invalidate_user_data_len;
> +	if (!hwpt->domain->ops->cache_invalidate_user || !klen) {
> +		rc = -EOPNOTSUPP;

We need to get to a place where the drivers are providing proper ops
for the domains, so this op should never exist for a paging domain.

And return EINVAL here instead.

> +		goto out_put_hwpt;
> +	}
> +
> +	/*
> +	 * Copy the needed fields before reusing the ucmd buffer, this
> +	 * avoids memory allocation in this path.
> +	 */
> +	user_ptr = cmd->data_uptr;
> +	user_data_len = cmd->data_len;

Uhh, who checks that klen < the temporary stack struct?

Jason
  
Yi Liu July 31, 2023, 10:07 a.m. UTC | #2
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Saturday, July 29, 2023 2:03 AM
> 
> On Mon, Jul 24, 2023 at 04:03:58AM -0700, Yi Liu wrote:
> > In nested translation, the stage-1 page table is user-managed and used by
> > IOMMU hardware, so update of any present page table entry in the stage-1
> > page table should be followed with an IOTLB invalidation.
> >
> > This adds IOMMU_HWPT_INVALIDATE for stage-1 IOTLB invalidation.
> >
> > Co-developed-by: Nicolin Chen <nicolinc@nvidia.com>
> > Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> > Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> > ---
> >  drivers/iommu/iommufd/hw_pagetable.c    | 45 +++++++++++++++++++++++++
> >  drivers/iommu/iommufd/iommufd_private.h |  9 +++++
> >  drivers/iommu/iommufd/main.c            |  3 ++
> >  include/uapi/linux/iommufd.h            | 22 ++++++++++++
> >  4 files changed, 79 insertions(+)
> >
> > diff --git a/drivers/iommu/iommufd/hw_pagetable.c
> b/drivers/iommu/iommufd/hw_pagetable.c
> > index 97e4114226de..9064e6d181b4 100644
> > --- a/drivers/iommu/iommufd/hw_pagetable.c
> > +++ b/drivers/iommu/iommufd/hw_pagetable.c
> > @@ -286,3 +286,48 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
> >  	iommufd_put_object(&idev->obj);
> >  	return rc;
> >  }
> > +
> > +int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd)
> > +{
> > +	struct iommu_hwpt_invalidate *cmd = ucmd->cmd;
> > +	struct iommufd_hw_pagetable *hwpt;
> > +	u32 user_data_len, klen;
> > +	u64 user_ptr;
> > +	int rc = 0;
> > +
> > +	if (!cmd->data_len || cmd->__reserved)
> > +		return -EOPNOTSUPP;
> > +
> > +	hwpt = iommufd_get_hwpt(ucmd, cmd->hwpt_id);
> > +	if (IS_ERR(hwpt))
> > +		return PTR_ERR(hwpt);
> > +
> > +	/* Do not allow any kernel-managed hw_pagetable */
> > +	if (!hwpt->parent) {
> 
> I don't think this is needed because:
> 
> > +		rc = -EINVAL;
> > +		goto out_put_hwpt;
> > +	}
> > +
> > +	klen = hwpt->domain->ops->cache_invalidate_user_data_len;
> > +	if (!hwpt->domain->ops->cache_invalidate_user || !klen) {
> > +		rc = -EOPNOTSUPP;
> 
> We need to get to a place where the drivers are providing proper ops
> for the domains, so this op should never exist for a paging domain.

Yes.

> 
> And return EINVAL here instead.

Sure.

> 
> > +		goto out_put_hwpt;
> > +	}
> > +
> > +	/*
> > +	 * Copy the needed fields before reusing the ucmd buffer, this
> > +	 * avoids memory allocation in this path.
> > +	 */
> > +	user_ptr = cmd->data_uptr;
> > +	user_data_len = cmd->data_len;
> 
> Uhh, who checks that klen < the temporary stack struct?

Take vtd as an example. The invalidate structure is struct iommu_hwpt_vtd_s1_invalidate[1].
The klen is sizeof(struct iommu_hwpt_vtd_s1_invalidate)[2]. iommu_hwpt_vtd_s1_invalidate
is also placed in the temporary stack struct (actually it is a union)[1]. So the klen should
be <= temporary stack.

[1] https://lore.kernel.org/linux-iommu/20230724111335.107427-8-yi.l.liu@intel.com/
[2] https://lore.kernel.org/linux-iommu/20230724111335.107427-10-yi.l.liu@intel.com/

It's not so explicit though. Perhaps worth to have a check like below in this patch?

if (unlikely(klen > sizeof(union ucmd_buffer)))
	return -EINVAL;

Regards,
Yi Liu
  
Jason Gunthorpe July 31, 2023, 1:19 p.m. UTC | #3
On Mon, Jul 31, 2023 at 10:07:32AM +0000, Liu, Yi L wrote:
> > > +		goto out_put_hwpt;
> > > +	}
> > > +
> > > +	/*
> > > +	 * Copy the needed fields before reusing the ucmd buffer, this
> > > +	 * avoids memory allocation in this path.
> > > +	 */
> > > +	user_ptr = cmd->data_uptr;
> > > +	user_data_len = cmd->data_len;
> > 
> > Uhh, who checks that klen < the temporary stack struct?
> 
> Take vtd as an example. The invalidate structure is struct iommu_hwpt_vtd_s1_invalidate[1].
> The klen is sizeof(struct iommu_hwpt_vtd_s1_invalidate)[2]. iommu_hwpt_vtd_s1_invalidate
> is also placed in the temporary stack struct (actually it is a union)[1]. So the klen should
> be <= temporary stack.

Ohh, I think I would add a few comments noting that the invalidate
structs need to be added to that union. Easy to miss.

> It's not so explicit though. Perhaps worth to have a check like below in this patch?
> 
> if (unlikely(klen > sizeof(union ucmd_buffer)))
> 	return -EINVAL;

Yes, stick this in the domain allocate path with a WARN_ON. The driver
is broken to allocate a domain with an invalid size.

Jason
  
Nicolin Chen Aug. 3, 2023, 2:07 a.m. UTC | #4
On Fri, Jul 28, 2023 at 03:02:43PM -0300, Jason Gunthorpe wrote:
> > +int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd)
> > +{
> > +	struct iommu_hwpt_invalidate *cmd = ucmd->cmd;
> > +	struct iommufd_hw_pagetable *hwpt;
> > +	u32 user_data_len, klen;
> > +	u64 user_ptr;
> > +	int rc = 0;
> > +
> > +	if (!cmd->data_len || cmd->__reserved)
> > +		return -EOPNOTSUPP;
> > +
> > +	hwpt = iommufd_get_hwpt(ucmd, cmd->hwpt_id);
> > +	if (IS_ERR(hwpt))
> > +		return PTR_ERR(hwpt);
> > +
> > +	/* Do not allow any kernel-managed hw_pagetable */
> > +	if (!hwpt->parent) {
> 
> I don't think this is needed because:
> 
> > +		rc = -EINVAL;
> > +		goto out_put_hwpt;
> > +	}
> > +
> > +	klen = hwpt->domain->ops->cache_invalidate_user_data_len;
> > +	if (!hwpt->domain->ops->cache_invalidate_user || !klen) {
> > +		rc = -EOPNOTSUPP;
> 
> We need to get to a place where the drivers are providing proper ops
> for the domains, so this op should never exist for a paging domain.
> 
> And return EINVAL here instead.

Fixed those two above and added the following in alloc():

-------------------------------------------------------------------------------
diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index 08aa4debc58a..7ddeda22ac62 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -123,6 +123,12 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx,
                rc = -EINVAL;
                goto out_abort;
        }
+       /* Driver is buggy by mixing user-managed ops in kernel-managed ops */
+       if (WARN_ON_ONCE(hwpt->domain->ops->cache_invalidate_user ||
+                        hwpt->domain->ops->cache_invalidate_user_data_len)) {
+               rc = -EINVAL;
+               goto out_abort;
+       }
 
        /*
         * Set the coherency mode before we do iopt_table_add_domain() as some
-------------------------------------------------------------------------------

Thanks
Nic
  
Nicolin Chen Aug. 3, 2023, 2:16 a.m. UTC | #5
On Mon, Jul 31, 2023 at 10:19:09AM -0300, Jason Gunthorpe wrote:
> On Mon, Jul 31, 2023 at 10:07:32AM +0000, Liu, Yi L wrote:
> > > > +		goto out_put_hwpt;
> > > > +	}
> > > > +
> > > > +	/*
> > > > +	 * Copy the needed fields before reusing the ucmd buffer, this
> > > > +	 * avoids memory allocation in this path.
> > > > +	 */
> > > > +	user_ptr = cmd->data_uptr;
> > > > +	user_data_len = cmd->data_len;
> > > 
> > > Uhh, who checks that klen < the temporary stack struct?
> > 
> > Take vtd as an example. The invalidate structure is struct iommu_hwpt_vtd_s1_invalidate[1].
> > The klen is sizeof(struct iommu_hwpt_vtd_s1_invalidate)[2]. iommu_hwpt_vtd_s1_invalidate
> > is also placed in the temporary stack struct (actually it is a union)[1]. So the klen should
> > be <= temporary stack.
> 
> Ohh, I think I would add a few comments noting that the invalidate
> structs need to be added to that union. Easy to miss.

Added here:

-        * Copy the needed fields before reusing the ucmd buffer, this
-        * avoids memory allocation in this path.
+        * Copy the needed fields before reusing the ucmd buffer, this avoids
+        * memory allocation in this path. Again, user invalidate data struct
+        * must be added to the union ucmd_buffer.

> > It's not so explicit though. Perhaps worth to have a check like below in this patch?
> > 
> > if (unlikely(klen > sizeof(union ucmd_buffer)))
> > 	return -EINVAL;
> 
> Yes, stick this in the domain allocate path with a WARN_ON. The driver
> is broken to allocate a domain with an invalid size.

And here too with a WARN_ON_ONCE.

+       /*
+        * Either the driver is broken by having an invalid size, or the user
+        * invalidate data struct used by the driver is missing in the union.
+        */
+       if (WARN_ON_ONCE(hwpt->domain->ops->cache_invalidate_user &&
+                        (!hwpt->domain->ops->cache_invalidate_user_data_len ||
+                         hwpt->domain->ops->cache_invalidate_user_data_len >
+                         sizeof(union ucmd_buffer)))) {
+               rc = -EINVAL;
+               goto out_abort;
+
+       }

Though I am making this cache_invalidate_user optional here, I
wonder if there actually could be a case that a user-managed
domain doesn't need a cache_invalidate_user op...

Thanks
Nicolin
  
Yi Liu Aug. 3, 2023, 2:56 a.m. UTC | #6
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Monday, July 31, 2023 9:19 PM
> 
> On Mon, Jul 31, 2023 at 10:07:32AM +0000, Liu, Yi L wrote:
> > > > +		goto out_put_hwpt;
> > > > +	}
> > > > +
> > > > +	/*
> > > > +	 * Copy the needed fields before reusing the ucmd buffer, this
> > > > +	 * avoids memory allocation in this path.
> > > > +	 */
> > > > +	user_ptr = cmd->data_uptr;
> > > > +	user_data_len = cmd->data_len;
> > >
> > > Uhh, who checks that klen < the temporary stack struct?
> >
> > Take vtd as an example. The invalidate structure is struct
> iommu_hwpt_vtd_s1_invalidate[1].
> > The klen is sizeof(struct iommu_hwpt_vtd_s1_invalidate)[2].
> iommu_hwpt_vtd_s1_invalidate
> > is also placed in the temporary stack struct (actually it is a union)[1]. So the klen should
> > be <= temporary stack.
> 
> Ohh, I think I would add a few comments noting that the invalidate
> structs need to be added to that union. Easy to miss.

Sure. Actually, there are some description as below in patch [1]. Would
add some as well here.

"Cache invalidation path is performance path, so it's better to avoid
memory allocation in such path. To achieve it, this path reuses the
ucmd_buffer to copy user data. So the new data structures are added in
the ucmd_buffer union to avoid overflow."

[1] https://lore.kernel.org/linux-iommu/20230724111335.107427-8-yi.l.liu@intel.com/

> > It's not so explicit though. Perhaps worth to have a check like below in this patch?
> >
> > if (unlikely(klen > sizeof(union ucmd_buffer)))
> > 	return -EINVAL;
> 
> Yes, stick this in the domain allocate path with a WARN_ON. The driver
> is broken to allocate a domain with an invalid size.

Ok. so if any mistake on the data structure, the allocation fails in the first place.

Regards,
Yi Liu
  
Yi Liu Aug. 3, 2023, 3:07 a.m. UTC | #7
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Thursday, August 3, 2023 10:17 AM
> 
> On Mon, Jul 31, 2023 at 10:19:09AM -0300, Jason Gunthorpe wrote:
> > On Mon, Jul 31, 2023 at 10:07:32AM +0000, Liu, Yi L wrote:
> > > > > +		goto out_put_hwpt;
> > > > > +	}
> > > > > +
> > > > > +	/*
> > > > > +	 * Copy the needed fields before reusing the ucmd buffer, this
> > > > > +	 * avoids memory allocation in this path.
> > > > > +	 */
> > > > > +	user_ptr = cmd->data_uptr;
> > > > > +	user_data_len = cmd->data_len;
> > > >
> > > > Uhh, who checks that klen < the temporary stack struct?
> > >
> > > Take vtd as an example. The invalidate structure is struct
> iommu_hwpt_vtd_s1_invalidate[1].
> > > The klen is sizeof(struct iommu_hwpt_vtd_s1_invalidate)[2].
> iommu_hwpt_vtd_s1_invalidate
> > > is also placed in the temporary stack struct (actually it is a union)[1]. So the klen
> should
> > > be <= temporary stack.
> >
> > Ohh, I think I would add a few comments noting that the invalidate
> > structs need to be added to that union. Easy to miss.
> 
> Added here:
> 
> -        * Copy the needed fields before reusing the ucmd buffer, this
> -        * avoids memory allocation in this path.
> +        * Copy the needed fields before reusing the ucmd buffer, this avoids
> +        * memory allocation in this path. Again, user invalidate data struct
> +        * must be added to the union ucmd_buffer.
> 
> > > It's not so explicit though. Perhaps worth to have a check like below in this patch?
> > >
> > > if (unlikely(klen > sizeof(union ucmd_buffer)))
> > > 	return -EINVAL;
> >
> > Yes, stick this in the domain allocate path with a WARN_ON. The driver
> > is broken to allocate a domain with an invalid size.
> 
> And here too with a WARN_ON_ONCE.
> 
> +       /*
> +        * Either the driver is broken by having an invalid size, or the user
> +        * invalidate data struct used by the driver is missing in the union.
> +        */
> +       if (WARN_ON_ONCE(hwpt->domain->ops->cache_invalidate_user &&
> +                        (!hwpt->domain->ops->cache_invalidate_user_data_len ||
> +                         hwpt->domain->ops->cache_invalidate_user_data_len >
> +                         sizeof(union ucmd_buffer)))) {
> +               rc = -EINVAL;
> +               goto out_abort;
> +
> +       }
> 
> Though I am making this cache_invalidate_user optional here, I
> wonder if there actually could be a case that a user-managed
> domain doesn't need a cache_invalidate_user op...

If user-managed domain is the stage-1 domain in nested, then seems not
possible as cache invalidate is a must. But I think this logic is fine as not
all the domains allocated by the user is user-managed. It may be kernel
managed like the s2 domains.

Regards,
Yi Liu
  
Nicolin Chen Aug. 3, 2023, 3:13 a.m. UTC | #8
On Thu, Aug 03, 2023 at 03:07:23AM +0000, Liu, Yi L wrote:
> > > > It's not so explicit though. Perhaps worth to have a check like below in this patch?
> > > >
> > > > if (unlikely(klen > sizeof(union ucmd_buffer)))
> > > >   return -EINVAL;
> > >
> > > Yes, stick this in the domain allocate path with a WARN_ON. The driver
> > > is broken to allocate a domain with an invalid size.
> >
> > And here too with a WARN_ON_ONCE.
> >
> > +       /*
> > +        * Either the driver is broken by having an invalid size, or the user
> > +        * invalidate data struct used by the driver is missing in the union.
> > +        */
> > +       if (WARN_ON_ONCE(hwpt->domain->ops->cache_invalidate_user &&
> > +                        (!hwpt->domain->ops->cache_invalidate_user_data_len ||
> > +                         hwpt->domain->ops->cache_invalidate_user_data_len >
> > +                         sizeof(union ucmd_buffer)))) {
> > +               rc = -EINVAL;
> > +               goto out_abort;
> > +
> > +       }
> >
> > Though I am making this cache_invalidate_user optional here, I
> > wonder if there actually could be a case that a user-managed
> > domain doesn't need a cache_invalidate_user op...
> 
> If user-managed domain is the stage-1 domain in nested, then seems not
> possible as cache invalidate is a must. But I think this logic is fine as not
> all the domains allocated by the user is user-managed. It may be kernel
> managed like the s2 domains.

Oh, I forgot to mention that this piece is in the user-managed
HWPT allocator, following Jason's suggestion to separate them.

So, perhaps should just mark it mandatory:

+	if (WARN_ON_ONCE(!hwpt->domain->ops->cache_invalidate_user ||
+			 !hwpt->domain->ops->cache_invalidate_user_data_len ||
+			 hwpt->domain->ops->cache_invalidate_user_data_len >
+			 sizeof(union iommu_user_cache_invalidate))) {
+		rc = -EINVAL;
+		goto out_abort;
+	}
  

Patch

diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index 97e4114226de..9064e6d181b4 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -286,3 +286,48 @@  int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
 	iommufd_put_object(&idev->obj);
 	return rc;
 }
+
+int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_hwpt_invalidate *cmd = ucmd->cmd;
+	struct iommufd_hw_pagetable *hwpt;
+	u32 user_data_len, klen;
+	u64 user_ptr;
+	int rc = 0;
+
+	if (!cmd->data_len || cmd->__reserved)
+		return -EOPNOTSUPP;
+
+	hwpt = iommufd_get_hwpt(ucmd, cmd->hwpt_id);
+	if (IS_ERR(hwpt))
+		return PTR_ERR(hwpt);
+
+	/* Do not allow any kernel-managed hw_pagetable */
+	if (!hwpt->parent) {
+		rc = -EINVAL;
+		goto out_put_hwpt;
+	}
+
+	klen = hwpt->domain->ops->cache_invalidate_user_data_len;
+	if (!hwpt->domain->ops->cache_invalidate_user || !klen) {
+		rc = -EOPNOTSUPP;
+		goto out_put_hwpt;
+	}
+
+	/*
+	 * Copy the needed fields before reusing the ucmd buffer, this
+	 * avoids memory allocation in this path.
+	 */
+	user_ptr = cmd->data_uptr;
+	user_data_len = cmd->data_len;
+
+	rc = copy_struct_from_user(cmd, klen,
+				   u64_to_user_ptr(user_ptr), user_data_len);
+	if (rc)
+		goto out_put_hwpt;
+
+	rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain, cmd);
+out_put_hwpt:
+	iommufd_put_object(&hwpt->obj);
+	return rc;
+}
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 268ae0d5ae12..047317fa4df0 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -270,6 +270,7 @@  iommufd_hw_pagetable_detach(struct iommufd_device *idev);
 void iommufd_hw_pagetable_destroy(struct iommufd_object *obj);
 void iommufd_hw_pagetable_abort(struct iommufd_object *obj);
 int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd);
+int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd);
 
 static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx,
 					    struct iommufd_hw_pagetable *hwpt)
@@ -281,6 +282,14 @@  static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx,
 		refcount_dec(&hwpt->obj.users);
 }
 
+static inline struct iommufd_hw_pagetable *
+iommufd_get_hwpt(struct iommufd_ucmd *ucmd, u32 id)
+{
+	return container_of(iommufd_get_object(ucmd->ictx, id,
+					       IOMMUFD_OBJ_HW_PAGETABLE),
+			    struct iommufd_hw_pagetable, obj);
+}
+
 struct iommufd_group {
 	struct kref ref;
 	struct mutex lock;
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 5f4420626421..255e8a3c5b0e 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -391,6 +391,7 @@  union ucmd_buffer {
 	struct iommu_destroy destroy;
 	struct iommu_hw_info info;
 	struct iommu_hwpt_alloc hwpt;
+	struct iommu_hwpt_invalidate cache;
 	struct iommu_ioas_alloc alloc;
 	struct iommu_ioas_allow_iovas allow_iovas;
 	struct iommu_ioas_copy ioas_copy;
@@ -427,6 +428,8 @@  static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 		 __reserved),
 	IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc,
 		 data_uptr),
+	IOCTL_OP(IOMMU_HWPT_INVALIDATE, iommufd_hwpt_invalidate,
+		 struct iommu_hwpt_invalidate, data_uptr),
 	IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl,
 		 struct iommu_ioas_alloc, out_ioas_id),
 	IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 73bf9af91e99..034da283cd3a 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -48,6 +48,7 @@  enum {
 	IOMMUFD_CMD_HWPT_ALLOC,
 	IOMMUFD_CMD_GET_HW_INFO,
 	IOMMUFD_CMD_RESV_IOVA_RANGES,
+	IOMMUFD_CMD_HWPT_INVALIDATE,
 };
 
 /**
@@ -486,4 +487,25 @@  struct iommu_resv_iova_ranges {
 	__aligned_u64 resv_iovas;
 };
 #define IOMMU_RESV_IOVA_RANGES _IO(IOMMUFD_TYPE, IOMMUFD_CMD_RESV_IOVA_RANGES)
+
+/**
+ * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE)
+ * @size: sizeof(struct iommu_hwpt_invalidate)
+ * @hwpt_id: HWPT ID of target hardware page table for the invalidation
+ * @data_len: Length of the type specific data
+ * @__reserved: Must be 0
+ * @data_uptr: User pointer to the type specific data
+ *
+ * Invalidate the iommu cache for user-managed page table. Modifications
+ * on user-managed page table should be followed with this operation to
+ * sync the IOTLB. The data in @data_uptr differs per the hwpt type.
+ */
+struct iommu_hwpt_invalidate {
+	__u32 size;
+	__u32 hwpt_id;
+	__u32 data_len;
+	__u32 __reserved;
+	__aligned_u64 data_uptr;
+};
+#define IOMMU_HWPT_INVALIDATE _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_INVALIDATE)
 #endif