diff mbox series

[v6,2/6] iommufd: Add IOMMU_HWPT_INVALIDATE

Message ID	20231117130717.19875-3-yi.l.liu@intel.com
State	New
Headers	Received-SPF: pass (google.com: domain of linux-kernel-owner@vger.kernel.org designates 2620:137:e000::3:7 as permitted sender) client-ip=2620:137:e000::3:7; From: Yi Liu <yi.l.liu@intel.com> To: joro@8bytes.org, alex.williamson@redhat.com, jgg@nvidia.com, kevin.tian@intel.com, robin.murphy@arm.com, baolu.lu@linux.intel.com Cc: cohuck@redhat.com, eric.auger@redhat.com, nicolinc@nvidia.com, kvm@vger.kernel.org, mjrosato@linux.ibm.com, chao.p.peng@linux.intel.com, yi.l.liu@intel.com, yi.y.sun@linux.intel.com, peterx@redhat.com, jasowang@redhat.com, shameerali.kolothum.thodi@huawei.com, lulu@redhat.com, suravee.suthikulpanit@amd.com, iommu@lists.linux.dev, linux-kernel@vger.kernel.org, linux-kselftest@vger.kernel.org, zhenzhong.duan@intel.com, joao.m.martins@oracle.com, xin.zeng@intel.com, yan.y.zhao@intel.com Subject: [PATCH v6 2/6] iommufd: Add IOMMU_HWPT_INVALIDATE Date: Fri, 17 Nov 2023 05:07:13 -0800 Message-Id: <20231117130717.19875-3-yi.l.liu@intel.com> In-Reply-To: <20231117130717.19875-1-yi.l.liu@intel.com> References: <20231117130717.19875-1-yi.l.liu@intel.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Precedence: bulk
Series	iommufd: Add nesting infrastructure (part 2/2) \| [v6,0/6] iommufd: Add nesting infrastructure (part 2/2) [v6,1/6] iommu: Add cache_invalidate_user op [v6,2/6] iommufd: Add IOMMU_HWPT_INVALIDATE [v6,3/6] iommu: Add iommu_copy_struct_from_user_array helper [v6,4/6] iommufd/selftest: Add mock_domain_cache_invalidate_user support [v6,5/6] iommufd/selftest: Add IOMMU_TEST_OP_MD_CHECK_IOTLB test op [v6,6/6] iommufd/selftest: Add coverage for IOMMU_HWPT_INVALIDATE ioctl

Commit Message

Yi Liu Nov. 17, 2023, 1:07 p.m. UTC

  In nested translation, the stage-1 page table is user-managed but cached
by the IOMMU hardware, so an update on present page table entries in the
stage-1 page table should be followed with a cache invalidation.

Add an IOMMU_HWPT_INVALIDATE ioctl to support such a cache invalidation.
It takes hwpt_id to specify the iommu_domain, and a multi-entry array to
support multiple invalidation requests in one ioctl.

Check cache_invalidate_user op in the iommufd_hw_pagetable_alloc_nested,
since all nested domains need that.

Co-developed-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
---
 drivers/iommu/iommufd/hw_pagetable.c    | 35 +++++++++++++++++++++++++
 drivers/iommu/iommufd/iommufd_private.h |  9 +++++++
 drivers/iommu/iommufd/main.c            |  3 +++
 include/uapi/linux/iommufd.h            | 35 +++++++++++++++++++++++++
 4 files changed, 82 insertions(+)

Comments

Tian, Kevin Nov. 20, 2023, 8:09 a.m. UTC | #1

> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Friday, November 17, 2023 9:07 PM
> +
> +	hwpt = iommufd_hw_pagetable_get_nested(ucmd, cmd->hwpt_id);
> +	if (IS_ERR(hwpt))
> +		return PTR_ERR(hwpt);
> +
> +	rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain,
> &data_array,
> +						      &cmd-
> >out_driver_error_code);
> +	cmd->req_num = data_array.entry_num;
> +	if (iommufd_ucmd_respond(ucmd, sizeof(*cmd)))
> +		return -EFAULT;
> +	iommufd_put_object(&hwpt->obj);

the object is not put when ucmd response fails. It can be put right
after calling @cache_invalidate_user()

> @@ -309,6 +309,7 @@ union ucmd_buffer {
>  	struct iommu_hwpt_alloc hwpt;
>  	struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap;
>  	struct iommu_hwpt_set_dirty_tracking set_dirty_tracking;
> +	struct iommu_hwpt_invalidate cache;

In alphabetic order this should be put before "hwpt_set_dirty"

>  	struct iommu_ioas_alloc alloc;
>  	struct iommu_ioas_allow_iovas allow_iovas;
>  	struct iommu_ioas_copy ioas_copy;
> @@ -348,6 +349,8 @@ static const struct iommufd_ioctl_op
> iommufd_ioctl_ops[] = {
>  		 struct iommu_hwpt_get_dirty_bitmap, data),
>  	IOCTL_OP(IOMMU_HWPT_SET_DIRTY_TRACKING,
> iommufd_hwpt_set_dirty_tracking,
>  		 struct iommu_hwpt_set_dirty_tracking, __reserved),
> +	IOCTL_OP(IOMMU_HWPT_INVALIDATE, iommufd_hwpt_invalidate,
> +		 struct iommu_hwpt_invalidate, out_driver_error_code),

ditto

> +/**
> + * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE)
> + * @size: sizeof(struct iommu_hwpt_invalidate)
> + * @hwpt_id: HWPT ID of a nested HWPT for cache invalidation

remove the first 'HWPT'

> + * @reqs_uptr: User pointer to an array having @req_num of cache
> invalidation
> + *             requests. The request entries in the array are of fixed width
> + *             @req_len, and contain a user data structure for invalidation
> + *             request specific to the given hardware page table.
> + * @req_type: One of enum iommu_hwpt_data_type, defining the data
> type of all
> + *            the entries in the invalidation request array. It should suit
> + *            with the data_type passed per the allocation of the hwpt pointed
> + *            by @hwpt_id.

"It should match the data_type given to the specified hwpt"

> + * @req_len: Length (in bytes) of a request entry in the request array
> + * @req_num: Input the number of cache invalidation requests in the array.
> + *           Output the number of requests successfully handled by kernel.
> + * @out_driver_error_code: Report a driver speicifc error code upon failure.
> + *                         It's optional, driver has a choice to fill it or
> + *                         not.

Being optional how does the user tell whether the code is filled or not?

> + *
> + * Invalidate the iommu cache for user-managed page table. Modifications
> on a
> + * user-managed page table should be followed by this operation to sync
> cache.
> + * Each ioctl can support one or more cache invalidation requests in the
> array
> + * that has a total size of @req_len * @req_num.
> + */
> +struct iommu_hwpt_invalidate {
> +	__u32 size;
> +	__u32 hwpt_id;
> +	__aligned_u64 reqs_uptr;
> +	__u32 req_type;
> +	__u32 req_len;
> +	__u32 req_num;
> +	__u32 out_driver_error_code;
> +};

Probably 'data_uptr', 'data_type', "entry_len' and 'entry_num" read slightly
better.

Yi Liu Nov. 20, 2023, 8:29 a.m. UTC | #2

On 2023/11/20 16:09, Tian, Kevin wrote:
>> From: Liu, Yi L <yi.l.liu@intel.com>
>> Sent: Friday, November 17, 2023 9:07 PM
>> +
>> +	hwpt = iommufd_hw_pagetable_get_nested(ucmd, cmd->hwpt_id);
>> +	if (IS_ERR(hwpt))
>> +		return PTR_ERR(hwpt);
>> +
>> +	rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain,
>> &data_array,
>> +						      &cmd-
>>> out_driver_error_code);
>> +	cmd->req_num = data_array.entry_num;
>> +	if (iommufd_ucmd_respond(ucmd, sizeof(*cmd)))
>> +		return -EFAULT;
>> +	iommufd_put_object(&hwpt->obj);
> 
> the object is not put when ucmd response fails. It can be put right
> after calling @cache_invalidate_user()

yes.

>> @@ -309,6 +309,7 @@ union ucmd_buffer {
>>   	struct iommu_hwpt_alloc hwpt;
>>   	struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap;
>>   	struct iommu_hwpt_set_dirty_tracking set_dirty_tracking;
>> +	struct iommu_hwpt_invalidate cache;
> 
> In alphabetic order this should be put before "hwpt_set_dirty"

yes.

>>   	struct iommu_ioas_alloc alloc;
>>   	struct iommu_ioas_allow_iovas allow_iovas;
>>   	struct iommu_ioas_copy ioas_copy;
>> @@ -348,6 +349,8 @@ static const struct iommufd_ioctl_op
>> iommufd_ioctl_ops[] = {
>>   		 struct iommu_hwpt_get_dirty_bitmap, data),
>>   	IOCTL_OP(IOMMU_HWPT_SET_DIRTY_TRACKING,
>> iommufd_hwpt_set_dirty_tracking,
>>   		 struct iommu_hwpt_set_dirty_tracking, __reserved),
>> +	IOCTL_OP(IOMMU_HWPT_INVALIDATE, iommufd_hwpt_invalidate,
>> +		 struct iommu_hwpt_invalidate, out_driver_error_code),
> 
> ditto
> 
>> +/**
>> + * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE)
>> + * @size: sizeof(struct iommu_hwpt_invalidate)
>> + * @hwpt_id: HWPT ID of a nested HWPT for cache invalidation
> 
> remove the first 'HWPT'

>> + * @reqs_uptr: User pointer to an array having @req_num of cache
>> invalidation
>> + *             requests. The request entries in the array are of fixed width
>> + *             @req_len, and contain a user data structure for invalidation
>> + *             request specific to the given hardware page table.
>> + * @req_type: One of enum iommu_hwpt_data_type, defining the data
>> type of all
>> + *            the entries in the invalidation request array. It should suit
>> + *            with the data_type passed per the allocation of the hwpt pointed
>> + *            by @hwpt_id.
> 
> "It should match the data_type given to the specified hwpt"

above comments are well received. :)

>> + * @req_len: Length (in bytes) of a request entry in the request array
>> + * @req_num: Input the number of cache invalidation requests in the array.
>> + *           Output the number of requests successfully handled by kernel.
>> + * @out_driver_error_code: Report a driver speicifc error code upon failure.
>> + *                         It's optional, driver has a choice to fill it or
>> + *                         not.
> 
> Being optional how does the user tell whether the code is filled or not?

seems like we need a flag for it. otherwise, a reserved special value is
required. or is it enough to just document it that this field is available
or not per the iommu_hw_info_type?

>> + *
>> + * Invalidate the iommu cache for user-managed page table. Modifications
>> on a
>> + * user-managed page table should be followed by this operation to sync
>> cache.
>> + * Each ioctl can support one or more cache invalidation requests in the
>> array
>> + * that has a total size of @req_len * @req_num.
>> + */
>> +struct iommu_hwpt_invalidate {
>> +	__u32 size;
>> +	__u32 hwpt_id;
>> +	__aligned_u64 reqs_uptr;
>> +	__u32 req_type;
>> +	__u32 req_len;
>> +	__u32 req_num;
>> +	__u32 out_driver_error_code;
>> +};
> 
> Probably 'data_uptr', 'data_type', "entry_len' and 'entry_num" read slightly
> better.

sure.

Tian, Kevin Nov. 20, 2023, 8:34 a.m. UTC | #3

> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Monday, November 20, 2023 4:30 PM
> 
> On 2023/11/20 16:09, Tian, Kevin wrote:
> >> From: Liu, Yi L <yi.l.liu@intel.com>
> >> Sent: Friday, November 17, 2023 9:07 PM
> >> + * @req_len: Length (in bytes) of a request entry in the request array
> >> + * @req_num: Input the number of cache invalidation requests in the
> array.
> >> + *           Output the number of requests successfully handled by kernel.
> >> + * @out_driver_error_code: Report a driver speicifc error code upon
> failure.
> >> + *                         It's optional, driver has a choice to fill it or
> >> + *                         not.
> >
> > Being optional how does the user tell whether the code is filled or not?
> 
> seems like we need a flag for it. otherwise, a reserved special value is
> required. or is it enough to just document it that this field is available
> or not per the iommu_hw_info_type?
> 

No guarantee that a reserved special value applies to all vendors.

I'll just remove the optional part. the viommu driver knows what a valid
code is, if the driver fills it.

Nicolin Chen Nov. 20, 2023, 5:36 p.m. UTC | #4

On Mon, Nov 20, 2023 at 08:34:58AM +0000, Tian, Kevin wrote:
> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Monday, November 20, 2023 4:30 PM
> >
> > On 2023/11/20 16:09, Tian, Kevin wrote:
> > >> From: Liu, Yi L <yi.l.liu@intel.com>
> > >> Sent: Friday, November 17, 2023 9:07 PM
> > >> + * @req_len: Length (in bytes) of a request entry in the request array
> > >> + * @req_num: Input the number of cache invalidation requests in the
> > array.
> > >> + *           Output the number of requests successfully handled by kernel.
> > >> + * @out_driver_error_code: Report a driver speicifc error code upon
> > failure.
> > >> + *                         It's optional, driver has a choice to fill it or
> > >> + *                         not.
> > >
> > > Being optional how does the user tell whether the code is filled or not?

Well, naming it "error_code" indicates zero means no error while
non-zero means something? An error return from this ioctl could
also tell the user space to look up for this driver error code,
if it ever cares.

> > seems like we need a flag for it. otherwise, a reserved special value is
> > required. or is it enough to just document it that this field is available
> > or not per the iommu_hw_info_type?
> >
> 
> No guarantee that a reserved special value applies to all vendors.
> 
> I'll just remove the optional part. the viommu driver knows what a valid
> code is, if the driver fills it.

Hmm, remove out_driver_error_code? Do you mean by reusing ioctl
error code to tell the user space what driver error code it gets?

Nicolin

Tian, Kevin Nov. 21, 2023, 2:50 a.m. UTC | #5

> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Tuesday, November 21, 2023 1:37 AM
> 
> On Mon, Nov 20, 2023 at 08:34:58AM +0000, Tian, Kevin wrote:
> > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > Sent: Monday, November 20, 2023 4:30 PM
> > >
> > > On 2023/11/20 16:09, Tian, Kevin wrote:
> > > >> From: Liu, Yi L <yi.l.liu@intel.com>
> > > >> Sent: Friday, November 17, 2023 9:07 PM
> > > >> + * @req_len: Length (in bytes) of a request entry in the request array
> > > >> + * @req_num: Input the number of cache invalidation requests in the
> > > array.
> > > >> + *           Output the number of requests successfully handled by
> kernel.
> > > >> + * @out_driver_error_code: Report a driver speicifc error code upon
> > > failure.
> > > >> + *                         It's optional, driver has a choice to fill it or
> > > >> + *                         not.
> > > >
> > > > Being optional how does the user tell whether the code is filled or not?
> 
> Well, naming it "error_code" indicates zero means no error while
> non-zero means something? An error return from this ioctl could
> also tell the user space to look up for this driver error code,
> if it ever cares.

probably over-thinking but I'm not sure whether zero is guaranteed to
mean no error in all implementations...

> 
> > > seems like we need a flag for it. otherwise, a reserved special value is
> > > required. or is it enough to just document it that this field is available
> > > or not per the iommu_hw_info_type?
> > >
> >
> > No guarantee that a reserved special value applies to all vendors.
> >
> > I'll just remove the optional part. the viommu driver knows what a valid
> > code is, if the driver fills it.
> 
> Hmm, remove out_driver_error_code? Do you mean by reusing ioctl
> error code to tell the user space what driver error code it gets?
> 

No. I just meant to remove the last sentence "It's optional ..."

Baolu Lu Nov. 21, 2023, 5:02 a.m. UTC | #6

On 11/17/23 9:07 PM, Yi Liu wrote:
> In nested translation, the stage-1 page table is user-managed but cached
> by the IOMMU hardware, so an update on present page table entries in the
> stage-1 page table should be followed with a cache invalidation.
> 
> Add an IOMMU_HWPT_INVALIDATE ioctl to support such a cache invalidation.
> It takes hwpt_id to specify the iommu_domain, and a multi-entry array to
> support multiple invalidation requests in one ioctl.
> 
> Check cache_invalidate_user op in the iommufd_hw_pagetable_alloc_nested,
> since all nested domains need that.
> 
> Co-developed-by: Nicolin Chen<nicolinc@nvidia.com>
> Signed-off-by: Nicolin Chen<nicolinc@nvidia.com>
> Signed-off-by: Yi Liu<yi.l.liu@intel.com>
> ---
>   drivers/iommu/iommufd/hw_pagetable.c    | 35 +++++++++++++++++++++++++
>   drivers/iommu/iommufd/iommufd_private.h |  9 +++++++
>   drivers/iommu/iommufd/main.c            |  3 +++
>   include/uapi/linux/iommufd.h            | 35 +++++++++++++++++++++++++
>   4 files changed, 82 insertions(+)
> 
> diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
> index 2abbeafdbd22..367459d92f69 100644
> --- a/drivers/iommu/iommufd/hw_pagetable.c
> +++ b/drivers/iommu/iommufd/hw_pagetable.c
> @@ -238,6 +238,11 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
>   		rc = -EINVAL;
>   		goto out_abort;
>   	}
> +	/* Driver is buggy by missing cache_invalidate_user in domain_ops */
> +	if (WARN_ON_ONCE(!hwpt->domain->ops->cache_invalidate_user)) {
> +		rc = -EINVAL;
> +		goto out_abort;
> +	}
>   	return hwpt_nested;

The WARN message here may cause kernel regression when users bisect
issues. Till this patch, there are no drivers support the
cache_invalidation_user callback yet.

Best regards,
baolu

Nicolin Chen Nov. 21, 2023, 5:19 a.m. UTC | #7

On Tue, Nov 21, 2023 at 01:02:49PM +0800, Baolu Lu wrote:
> On 11/17/23 9:07 PM, Yi Liu wrote:
> > In nested translation, the stage-1 page table is user-managed but cached
> > by the IOMMU hardware, so an update on present page table entries in the
> > stage-1 page table should be followed with a cache invalidation.
> > 
> > Add an IOMMU_HWPT_INVALIDATE ioctl to support such a cache invalidation.
> > It takes hwpt_id to specify the iommu_domain, and a multi-entry array to
> > support multiple invalidation requests in one ioctl.
> > 
> > Check cache_invalidate_user op in the iommufd_hw_pagetable_alloc_nested,
> > since all nested domains need that.
> > 
> > Co-developed-by: Nicolin Chen<nicolinc@nvidia.com>
> > Signed-off-by: Nicolin Chen<nicolinc@nvidia.com>
> > Signed-off-by: Yi Liu<yi.l.liu@intel.com>
> > ---
> >   drivers/iommu/iommufd/hw_pagetable.c    | 35 +++++++++++++++++++++++++
> >   drivers/iommu/iommufd/iommufd_private.h |  9 +++++++
> >   drivers/iommu/iommufd/main.c            |  3 +++
> >   include/uapi/linux/iommufd.h            | 35 +++++++++++++++++++++++++
> >   4 files changed, 82 insertions(+)
> > 
> > diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
> > index 2abbeafdbd22..367459d92f69 100644
> > --- a/drivers/iommu/iommufd/hw_pagetable.c
> > +++ b/drivers/iommu/iommufd/hw_pagetable.c
> > @@ -238,6 +238,11 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
> >               rc = -EINVAL;
> >               goto out_abort;
> >       }
> > +     /* Driver is buggy by missing cache_invalidate_user in domain_ops */
> > +     if (WARN_ON_ONCE(!hwpt->domain->ops->cache_invalidate_user)) {
> > +             rc = -EINVAL;
> > +             goto out_abort;
> > +     }
> >       return hwpt_nested;
> 
> The WARN message here may cause kernel regression when users bisect
> issues. Till this patch, there are no drivers support the
> cache_invalidation_user callback yet.

Ah, this is an unintended consequence from our uAPI bisect to
merge the nesting alloc first...

Would removing the WARN_ON_ONCE be okay? Although having this
WARN is actually the point here...

Thanks
Nic

Nicolin Chen Nov. 21, 2023, 5:24 a.m. UTC | #8

On Tue, Nov 21, 2023 at 02:50:05AM +0000, Tian, Kevin wrote:
> > From: Nicolin Chen <nicolinc@nvidia.com>
> > Sent: Tuesday, November 21, 2023 1:37 AM
> >
> > On Mon, Nov 20, 2023 at 08:34:58AM +0000, Tian, Kevin wrote:
> > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > Sent: Monday, November 20, 2023 4:30 PM
> > > >
> > > > On 2023/11/20 16:09, Tian, Kevin wrote:
> > > > >> From: Liu, Yi L <yi.l.liu@intel.com>
> > > > >> Sent: Friday, November 17, 2023 9:07 PM
> > > > >> + * @req_len: Length (in bytes) of a request entry in the request array
> > > > >> + * @req_num: Input the number of cache invalidation requests in the
> > > > array.
> > > > >> + *           Output the number of requests successfully handled by
> > kernel.
> > > > >> + * @out_driver_error_code: Report a driver speicifc error code upon
> > > > failure.
> > > > >> + *                         It's optional, driver has a choice to fill it or
> > > > >> + *                         not.
> > > > >
> > > > > Being optional how does the user tell whether the code is filled or not?
> >
> > Well, naming it "error_code" indicates zero means no error while
> > non-zero means something? An error return from this ioctl could
> > also tell the user space to look up for this driver error code,
> > if it ever cares.
> 
> probably over-thinking but I'm not sure whether zero is guaranteed to
> mean no error in all implementations...

Well, you are right. Usually HW conveniently raises a flag in a
register to indicate something wrong, yet it is probably unsafe
to say it definitely.

> > > > seems like we need a flag for it. otherwise, a reserved special value is
> > > > required. or is it enough to just document it that this field is available
> > > > or not per the iommu_hw_info_type?
> > > >
> > >
> > > No guarantee that a reserved special value applies to all vendors.
> > >
> > > I'll just remove the optional part. the viommu driver knows what a valid
> > > code is, if the driver fills it.
> >
> > Hmm, remove out_driver_error_code? Do you mean by reusing ioctl
> > error code to tell the user space what driver error code it gets?
> >
> 
> No. I just meant to remove the last sentence "It's optional ..."

OK. Would it force all IOMMU drivers to report something upon
an error/failure?

Thanks
Nic

Tian, Kevin Nov. 24, 2023, 2:36 a.m. UTC | #9

> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Tuesday, November 21, 2023 1:24 PM
> 
> On Tue, Nov 21, 2023 at 02:50:05AM +0000, Tian, Kevin wrote:
> > > From: Nicolin Chen <nicolinc@nvidia.com>
> > > Sent: Tuesday, November 21, 2023 1:37 AM
> > >
> > > On Mon, Nov 20, 2023 at 08:34:58AM +0000, Tian, Kevin wrote:
> > > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > > Sent: Monday, November 20, 2023 4:30 PM
> > > > >
> > > > > On 2023/11/20 16:09, Tian, Kevin wrote:
> > > > > >> From: Liu, Yi L <yi.l.liu@intel.com>
> > > > > >> Sent: Friday, November 17, 2023 9:07 PM
> > > > > >> + * @req_len: Length (in bytes) of a request entry in the request
> array
> > > > > >> + * @req_num: Input the number of cache invalidation requests in
> the
> > > > > array.
> > > > > >> + *           Output the number of requests successfully handled by
> > > kernel.
> > > > > >> + * @out_driver_error_code: Report a driver speicifc error code
> upon
> > > > > failure.
> > > > > >> + *                         It's optional, driver has a choice to fill it or
> > > > > >> + *                         not.
> > > > > >
> > > > > > Being optional how does the user tell whether the code is filled or
> not?
> > >
> > > Well, naming it "error_code" indicates zero means no error while
> > > non-zero means something? An error return from this ioctl could
> > > also tell the user space to look up for this driver error code,
> > > if it ever cares.
> >
> > probably over-thinking but I'm not sure whether zero is guaranteed to
> > mean no error in all implementations...
> 
> Well, you are right. Usually HW conveniently raises a flag in a
> register to indicate something wrong, yet it is probably unsafe
> to say it definitely.
> 

this reminds me one open. What about an implementation having
a hierarchical error code layout e.g. one main error register with
each bit representing an error category then multiple error code
registers each for one error category? In this case probably
a single out_driver_error_code cannot carry that raw information.

Instead the iommu driver may need to define a customized error
code convention in uapi header which is converted from the
raw error information.

From this angle should we simply say that the error code definition
must be included in the uapi header? If raw error information can
be carried by this field then this hw can simply say that the error
code format is same as the hw spec defines.

With that explicit information then the viommu can easily tell
whether error code is filled or not based on its own convention.

Nicolin Chen Nov. 27, 2023, 7:53 p.m. UTC | #10

On Fri, Nov 24, 2023 at 02:36:29AM +0000, Tian, Kevin wrote:

> > > > > > >> + * @out_driver_error_code: Report a driver speicifc error code
> > upon
> > > > > > failure.
> > > > > > >> + *                         It's optional, driver has a choice to fill it or
> > > > > > >> + *                         not.
> > > > > > >
> > > > > > > Being optional how does the user tell whether the code is filled or
> > not?
> > > >
> > > > Well, naming it "error_code" indicates zero means no error while
> > > > non-zero means something? An error return from this ioctl could
> > > > also tell the user space to look up for this driver error code,
> > > > if it ever cares.
> > >
> > > probably over-thinking but I'm not sure whether zero is guaranteed to
> > > mean no error in all implementations...
> >
> > Well, you are right. Usually HW conveniently raises a flag in a
> > register to indicate something wrong, yet it is probably unsafe
> > to say it definitely.
> >
> 
> this reminds me one open. What about an implementation having
> a hierarchical error code layout e.g. one main error register with
> each bit representing an error category then multiple error code
> registers each for one error category? In this case probably
> a single out_driver_error_code cannot carry that raw information.

Hmm, good point.

> Instead the iommu driver may need to define a customized error
> code convention in uapi header which is converted from the
> raw error information.
> 
> From this angle should we simply say that the error code definition
> must be included in the uapi header? If raw error information can
> be carried by this field then this hw can simply say that the error
> code format is same as the hw spec defines.
> 
> With that explicit information then the viommu can easily tell
> whether error code is filled or not based on its own convention.

That'd be to put this error_code field into the driver uAPI
structure right?

I also thought about making this out_driver_error_code per HW.
Yet, an error can be either per array or per entry/quest. The
array-related error should be reported in the array structure
that is a core uAPI, v.s. the per-HW entry structure. Though
we could still report an array error in the entry structure
at the first entry (or indexed by "array->entry_num")?

Thanks
Nic

Yi Liu Nov. 28, 2023, 5:54 a.m. UTC | #11

On 2023/11/21 13:19, Nicolin Chen wrote:
> On Tue, Nov 21, 2023 at 01:02:49PM +0800, Baolu Lu wrote:
>> On 11/17/23 9:07 PM, Yi Liu wrote:
>>> In nested translation, the stage-1 page table is user-managed but cached
>>> by the IOMMU hardware, so an update on present page table entries in the
>>> stage-1 page table should be followed with a cache invalidation.
>>>
>>> Add an IOMMU_HWPT_INVALIDATE ioctl to support such a cache invalidation.
>>> It takes hwpt_id to specify the iommu_domain, and a multi-entry array to
>>> support multiple invalidation requests in one ioctl.
>>>
>>> Check cache_invalidate_user op in the iommufd_hw_pagetable_alloc_nested,
>>> since all nested domains need that.
>>>
>>> Co-developed-by: Nicolin Chen<nicolinc@nvidia.com>
>>> Signed-off-by: Nicolin Chen<nicolinc@nvidia.com>
>>> Signed-off-by: Yi Liu<yi.l.liu@intel.com>
>>> ---
>>>    drivers/iommu/iommufd/hw_pagetable.c    | 35 +++++++++++++++++++++++++
>>>    drivers/iommu/iommufd/iommufd_private.h |  9 +++++++
>>>    drivers/iommu/iommufd/main.c            |  3 +++
>>>    include/uapi/linux/iommufd.h            | 35 +++++++++++++++++++++++++
>>>    4 files changed, 82 insertions(+)
>>>
>>> diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
>>> index 2abbeafdbd22..367459d92f69 100644
>>> --- a/drivers/iommu/iommufd/hw_pagetable.c
>>> +++ b/drivers/iommu/iommufd/hw_pagetable.c
>>> @@ -238,6 +238,11 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
>>>                rc = -EINVAL;
>>>                goto out_abort;
>>>        }
>>> +     /* Driver is buggy by missing cache_invalidate_user in domain_ops */
>>> +     if (WARN_ON_ONCE(!hwpt->domain->ops->cache_invalidate_user)) {
>>> +             rc = -EINVAL;
>>> +             goto out_abort;
>>> +     }
>>>        return hwpt_nested;
>>
>> The WARN message here may cause kernel regression when users bisect
>> issues. Till this patch, there are no drivers support the
>> cache_invalidation_user callback yet.
> 
> Ah, this is an unintended consequence from our uAPI bisect to
> merge the nesting alloc first...
> 
> Would removing the WARN_ON_ONCE be okay? Although having this
> WARN is actually the point here...

seems like we may need to remove it. how about your opinion, @Jason?

> Thanks
> Nic

Yi Liu Nov. 28, 2023, 6:01 a.m. UTC | #12

On 2023/11/28 03:53, Nicolin Chen wrote:
> On Fri, Nov 24, 2023 at 02:36:29AM +0000, Tian, Kevin wrote:
> 
>>>>>>>>> + * @out_driver_error_code: Report a driver speicifc error code
>>> upon
>>>>>>> failure.
>>>>>>>>> + *                         It's optional, driver has a choice to fill it or
>>>>>>>>> + *                         not.
>>>>>>>>
>>>>>>>> Being optional how does the user tell whether the code is filled or
>>> not?
>>>>>
>>>>> Well, naming it "error_code" indicates zero means no error while
>>>>> non-zero means something? An error return from this ioctl could
>>>>> also tell the user space to look up for this driver error code,
>>>>> if it ever cares.
>>>>
>>>> probably over-thinking but I'm not sure whether zero is guaranteed to
>>>> mean no error in all implementations...
>>>
>>> Well, you are right. Usually HW conveniently raises a flag in a
>>> register to indicate something wrong, yet it is probably unsafe
>>> to say it definitely.
>>>
>>
>> this reminds me one open. What about an implementation having
>> a hierarchical error code layout e.g. one main error register with
>> each bit representing an error category then multiple error code
>> registers each for one error category? In this case probably
>> a single out_driver_error_code cannot carry that raw information.
> 
> Hmm, good point.
> 
>> Instead the iommu driver may need to define a customized error
>> code convention in uapi header which is converted from the
>> raw error information.
>>
>>  From this angle should we simply say that the error code definition
>> must be included in the uapi header? If raw error information can
>> be carried by this field then this hw can simply say that the error
>> code format is same as the hw spec defines.
>>
>> With that explicit information then the viommu can easily tell
>> whether error code is filled or not based on its own convention.
> 
> That'd be to put this error_code field into the driver uAPI
> structure right?

looks to be. Then it would be convenient to reserve a code for
the case of no error (either no error happened or just not used)

> 
> I also thought about making this out_driver_error_code per HW.
> Yet, an error can be either per array or per entry/quest. The
> array-related error should be reported in the array structure
> that is a core uAPI, v.s. the per-HW entry structure. Though
> we could still report an array error in the entry structure
> at the first entry (or indexed by "array->entry_num")?

per-entry error code seems like to be a completion code. Each
entry in the array can have a corresponding code (0 for succ,
others for failure). do you already have such a need?

Tian, Kevin Nov. 28, 2023, 8:03 a.m. UTC | #13

> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Tuesday, November 28, 2023 3:53 AM
> 
> On Fri, Nov 24, 2023 at 02:36:29AM +0000, Tian, Kevin wrote:
> 
> > > > > > > >> + * @out_driver_error_code: Report a driver speicifc error code
> > > upon
> > > > > > > failure.
> > > > > > > >> + *                         It's optional, driver has a choice to fill it or
> > > > > > > >> + *                         not.
> > > > > > > >
> > > > > > > > Being optional how does the user tell whether the code is filled
> or
> > > not?
> > > > >
> > > > > Well, naming it "error_code" indicates zero means no error while
> > > > > non-zero means something? An error return from this ioctl could
> > > > > also tell the user space to look up for this driver error code,
> > > > > if it ever cares.
> > > >
> > > > probably over-thinking but I'm not sure whether zero is guaranteed to
> > > > mean no error in all implementations...
> > >
> > > Well, you are right. Usually HW conveniently raises a flag in a
> > > register to indicate something wrong, yet it is probably unsafe
> > > to say it definitely.
> > >
> >
> > this reminds me one open. What about an implementation having
> > a hierarchical error code layout e.g. one main error register with
> > each bit representing an error category then multiple error code
> > registers each for one error category? In this case probably
> > a single out_driver_error_code cannot carry that raw information.
> 
> Hmm, good point.
> 
> > Instead the iommu driver may need to define a customized error
> > code convention in uapi header which is converted from the
> > raw error information.
> >
> > From this angle should we simply say that the error code definition
> > must be included in the uapi header? If raw error information can
> > be carried by this field then this hw can simply say that the error
> > code format is same as the hw spec defines.
> >
> > With that explicit information then the viommu can easily tell
> > whether error code is filled or not based on its own convention.
> 
> That'd be to put this error_code field into the driver uAPI
> structure right?
> 
> I also thought about making this out_driver_error_code per HW.
> Yet, an error can be either per array or per entry/quest. The
> array-related error should be reported in the array structure
> that is a core uAPI, v.s. the per-HW entry structure. Though
> we could still report an array error in the entry structure
> at the first entry (or indexed by "array->entry_num")?
> 

why would there be an array error? array is just a software
entity containing actual HW invalidation cmds. If there is
any error with the array itself it should be reported via
ioctl errno.

Jason, how about your opinion? I didn't spot big issues
except this one. Hope it can make into 6.8.

Nicolin Chen Nov. 29, 2023, 12:51 a.m. UTC | #14

On Tue, Nov 28, 2023 at 08:03:35AM +0000, Tian, Kevin wrote:
> > From: Nicolin Chen <nicolinc@nvidia.com>
> > Sent: Tuesday, November 28, 2023 3:53 AM
> >
> > On Fri, Nov 24, 2023 at 02:36:29AM +0000, Tian, Kevin wrote:
> >
> > > > > > > > >> + * @out_driver_error_code: Report a driver speicifc error code
> > > > upon
> > > > > > > > failure.
> > > > > > > > >> + *                         It's optional, driver has a choice to fill it or
> > > > > > > > >> + *                         not.
> > > > > > > > >
> > > > > > > > > Being optional how does the user tell whether the code is filled
> > or
> > > > not?
> > > > > >
> > > > > > Well, naming it "error_code" indicates zero means no error while
> > > > > > non-zero means something? An error return from this ioctl could
> > > > > > also tell the user space to look up for this driver error code,
> > > > > > if it ever cares.
> > > > >
> > > > > probably over-thinking but I'm not sure whether zero is guaranteed to
> > > > > mean no error in all implementations...
> > > >
> > > > Well, you are right. Usually HW conveniently raises a flag in a
> > > > register to indicate something wrong, yet it is probably unsafe
> > > > to say it definitely.
> > > >
> > >
> > > this reminds me one open. What about an implementation having
> > > a hierarchical error code layout e.g. one main error register with
> > > each bit representing an error category then multiple error code
> > > registers each for one error category? In this case probably
> > > a single out_driver_error_code cannot carry that raw information.
> >
> > Hmm, good point.
> >
> > > Instead the iommu driver may need to define a customized error
> > > code convention in uapi header which is converted from the
> > > raw error information.
> > >
> > > From this angle should we simply say that the error code definition
> > > must be included in the uapi header? If raw error information can
> > > be carried by this field then this hw can simply say that the error
> > > code format is same as the hw spec defines.
> > >
> > > With that explicit information then the viommu can easily tell
> > > whether error code is filled or not based on its own convention.
> >
> > That'd be to put this error_code field into the driver uAPI
> > structure right?
> >
> > I also thought about making this out_driver_error_code per HW.
> > Yet, an error can be either per array or per entry/quest. The
> > array-related error should be reported in the array structure
> > that is a core uAPI, v.s. the per-HW entry structure. Though
> > we could still report an array error in the entry structure
> > at the first entry (or indexed by "array->entry_num")?
> >
> 
> why would there be an array error? array is just a software
> entity containing actual HW invalidation cmds. If there is
> any error with the array itself it should be reported via
> ioctl errno.

User array reading is a software operation, but kernel array
reading is a hardware operation that can raise an error when
the memory location to the array is incorrect or so.

With that being said, I think errno (-EIO) could do the job,
as you suggested too.

Thanks
Nic

> Jason, how about your opinion? I didn't spot big issues
> except this one. Hope it can make into 6.8.

Nicolin Chen Nov. 29, 2023, 12:54 a.m. UTC | #15

On Tue, Nov 28, 2023 at 02:01:59PM +0800, Yi Liu wrote:
> On 2023/11/28 03:53, Nicolin Chen wrote:
> > On Fri, Nov 24, 2023 at 02:36:29AM +0000, Tian, Kevin wrote:
> > 
> > > > > > > > > > + * @out_driver_error_code: Report a driver speicifc error code
> > > > upon
> > > > > > > > failure.
> > > > > > > > > > + *                         It's optional, driver has a choice to fill it or
> > > > > > > > > > + *                         not.
> > > > > > > > > 
> > > > > > > > > Being optional how does the user tell whether the code is filled or
> > > > not?
> > > > > > 
> > > > > > Well, naming it "error_code" indicates zero means no error while
> > > > > > non-zero means something? An error return from this ioctl could
> > > > > > also tell the user space to look up for this driver error code,
> > > > > > if it ever cares.
> > > > > 
> > > > > probably over-thinking but I'm not sure whether zero is guaranteed to
> > > > > mean no error in all implementations...
> > > > 
> > > > Well, you are right. Usually HW conveniently raises a flag in a
> > > > register to indicate something wrong, yet it is probably unsafe
> > > > to say it definitely.
> > > > 
> > > 
> > > this reminds me one open. What about an implementation having
> > > a hierarchical error code layout e.g. one main error register with
> > > each bit representing an error category then multiple error code
> > > registers each for one error category? In this case probably
> > > a single out_driver_error_code cannot carry that raw information.
> > 
> > Hmm, good point.
> > 
> > > Instead the iommu driver may need to define a customized error
> > > code convention in uapi header which is converted from the
> > > raw error information.
> > > 
> > >  From this angle should we simply say that the error code definition
> > > must be included in the uapi header? If raw error information can
> > > be carried by this field then this hw can simply say that the error
> > > code format is same as the hw spec defines.
> > > 
> > > With that explicit information then the viommu can easily tell
> > > whether error code is filled or not based on its own convention.
> > 
> > That'd be to put this error_code field into the driver uAPI
> > structure right?
> 
> looks to be. Then it would be convenient to reserve a code for
> the case of no error (either no error happened or just not used)
> 
> > 
> > I also thought about making this out_driver_error_code per HW.
> > Yet, an error can be either per array or per entry/quest. The
> > array-related error should be reported in the array structure
> > that is a core uAPI, v.s. the per-HW entry structure. Though
> > we could still report an array error in the entry structure
> > at the first entry (or indexed by "array->entry_num")?
> 
> per-entry error code seems like to be a completion code. Each
> entry in the array can have a corresponding code (0 for succ,
> others for failure). do you already have such a need?

Yes, SMMU can report a PREFETCH error if reading an queue/array
itself fails, and a ILLCMD error if the command is illegal.

We can move the error field to driver specific entry structure.
On the other hand, if something about the array fails, we just
return -EIO.

Thanks
Nic

Jason Gunthorpe Nov. 29, 2023, 12:57 a.m. UTC | #16

On Tue, Nov 28, 2023 at 04:51:21PM -0800, Nicolin Chen wrote:
> > > I also thought about making this out_driver_error_code per HW.
> > > Yet, an error can be either per array or per entry/quest. The
> > > array-related error should be reported in the array structure
> > > that is a core uAPI, v.s. the per-HW entry structure. Though
> > > we could still report an array error in the entry structure
> > > at the first entry (or indexed by "array->entry_num")?
> > >
> > 
> > why would there be an array error? array is just a software
> > entity containing actual HW invalidation cmds. If there is
> > any error with the array itself it should be reported via
> > ioctl errno.
> 
> User array reading is a software operation, but kernel array
> reading is a hardware operation that can raise an error when
> the memory location to the array is incorrect or so.

Well, we shouldn't get into a situation like that.. By the time the HW
got the address it should be valid.

> With that being said, I think errno (-EIO) could do the job,
> as you suggested too.

Do we have any idea what HW failures can be generated by the commands
this will execture? IIRC I don't remember seeing any smmu specific
codes related to invalid invalidation? Everything is a valid input?

Can vt-d fail single commands? What about AMD?

> > Jason, how about your opinion? I didn't spot big issues
> > except this one. Hope it can make into 6.8.

Yes, lets try

Jason

Nicolin Chen Nov. 29, 2023, 1:09 a.m. UTC | #17

On Tue, Nov 28, 2023 at 08:57:15PM -0400, Jason Gunthorpe wrote:
> On Tue, Nov 28, 2023 at 04:51:21PM -0800, Nicolin Chen wrote:
> > > > I also thought about making this out_driver_error_code per HW.
> > > > Yet, an error can be either per array or per entry/quest. The
> > > > array-related error should be reported in the array structure
> > > > that is a core uAPI, v.s. the per-HW entry structure. Though
> > > > we could still report an array error in the entry structure
> > > > at the first entry (or indexed by "array->entry_num")?
> > > >
> > > 
> > > why would there be an array error? array is just a software
> > > entity containing actual HW invalidation cmds. If there is
> > > any error with the array itself it should be reported via
> > > ioctl errno.
> > 
> > User array reading is a software operation, but kernel array
> > reading is a hardware operation that can raise an error when
> > the memory location to the array is incorrect or so.
> 
> Well, we shouldn't get into a situation like that.. By the time the HW
> got the address it should be valid.

Oh, that's true. I was trying to say that out_driver_error_code
was to mimic such a queue validation for user space if an error
happens to the array.

> > With that being said, I think errno (-EIO) could do the job,
> > as you suggested too.
> 
> Do we have any idea what HW failures can be generated by the commands
> this will execture? IIRC I don't remember seeing any smmu specific
> codes related to invalid invalidation? Everything is a valid input?

"7.1 Command queue errors" has the info.

Thanks
Nic

Jason Gunthorpe Nov. 29, 2023, 7:58 p.m. UTC | #18

On Tue, Nov 28, 2023 at 05:09:07PM -0800, Nicolin Chen wrote:

> > > With that being said, I think errno (-EIO) could do the job,
> > > as you suggested too.
> > 
> > Do we have any idea what HW failures can be generated by the commands
> > this will execture? IIRC I don't remember seeing any smmu specific
> > codes related to invalid invalidation? Everything is a valid input?
> 
> "7.1 Command queue errors" has the info.

Hmm CERROR_ATC_INV_SYNC needs to be forwarded to the guest somehow

Jason

Nicolin Chen Nov. 29, 2023, 10:07 p.m. UTC | #19

On Wed, Nov 29, 2023 at 03:58:04PM -0400, Jason Gunthorpe wrote:
> On Tue, Nov 28, 2023 at 05:09:07PM -0800, Nicolin Chen wrote:
> 
> > > > With that being said, I think errno (-EIO) could do the job,
> > > > as you suggested too.
> > > 
> > > Do we have any idea what HW failures can be generated by the commands
> > > this will execture? IIRC I don't remember seeing any smmu specific
> > > codes related to invalid invalidation? Everything is a valid input?
> > 
> > "7.1 Command queue errors" has the info.
> 
> Hmm CERROR_ATC_INV_SYNC needs to be forwarded to the guest somehow

Oh, for sure. That's typically triggered with an asynchronous
timeout from the eventq, so we'd need the io page fault series
as you previously remarked. Though I also wonder if an invalid
vSID that doesn't link to a pSID should be CERROR_ATC_INV_SYNC
also v.s. CERROR_ILL.

Jason Gunthorpe Nov. 30, 2023, 12:08 a.m. UTC | #20

On Wed, Nov 29, 2023 at 02:07:58PM -0800, Nicolin Chen wrote:
> On Wed, Nov 29, 2023 at 03:58:04PM -0400, Jason Gunthorpe wrote:
> > On Tue, Nov 28, 2023 at 05:09:07PM -0800, Nicolin Chen wrote:
> > 
> > > > > With that being said, I think errno (-EIO) could do the job,
> > > > > as you suggested too.
> > > > 
> > > > Do we have any idea what HW failures can be generated by the commands
> > > > this will execture? IIRC I don't remember seeing any smmu specific
> > > > codes related to invalid invalidation? Everything is a valid input?
> > > 
> > > "7.1 Command queue errors" has the info.
> > 
> > Hmm CERROR_ATC_INV_SYNC needs to be forwarded to the guest somehow
> 
> Oh, for sure. That's typically triggered with an asynchronous
> timeout from the eventq, so we'd need the io page fault series
> as you previously remarked. Though I also wonder if an invalid
> vSID that doesn't link to a pSID should be CERROR_ATC_INV_SYNC
> also v.s. CERROR_ILL.

Yes, something like that make sense

Presumably sync becomes emulated and turns into something generated
when the ioctl returns. So userspace would have to read the event FD
before returning to be correct?

Maybe the kernel can somehow return a flag to indicate the event fd
has data in it?

If yes then all errors would flow through the event fd?

Would Intel be basically the same too?

Jason

Nicolin Chen Nov. 30, 2023, 8:41 p.m. UTC | #21

On Wed, Nov 29, 2023 at 08:08:16PM -0400, Jason Gunthorpe wrote:
> On Wed, Nov 29, 2023 at 02:07:58PM -0800, Nicolin Chen wrote:
> > On Wed, Nov 29, 2023 at 03:58:04PM -0400, Jason Gunthorpe wrote:
> > > On Tue, Nov 28, 2023 at 05:09:07PM -0800, Nicolin Chen wrote:
> > > 
> > > > > > With that being said, I think errno (-EIO) could do the job,
> > > > > > as you suggested too.
> > > > > 
> > > > > Do we have any idea what HW failures can be generated by the commands
> > > > > this will execture? IIRC I don't remember seeing any smmu specific
> > > > > codes related to invalid invalidation? Everything is a valid input?
> > > > 
> > > > "7.1 Command queue errors" has the info.
> > > 
> > > Hmm CERROR_ATC_INV_SYNC needs to be forwarded to the guest somehow
> > 
> > Oh, for sure. That's typically triggered with an asynchronous
> > timeout from the eventq, so we'd need the io page fault series
> > as you previously remarked. Though I also wonder if an invalid
> > vSID that doesn't link to a pSID should be CERROR_ATC_INV_SYNC
> > also v.s. CERROR_ILL.
> 
> Yes, something like that make sense
> 
> Presumably sync becomes emulated and turns into something generated
> when the ioctl returns.

CMD_SYNC right? Yes. When the ioctl returns, VMM simply moves the
CONS index next to CMD_SYNC upon a success, or stops the index at
the faulty command upon a failure.

> So userspace would have to read the event FD
> before returning to be correct?
> 
> Maybe the kernel can somehow return a flag to indicate the event fd
> has data in it?
> 
> If yes then all errors would flow through the event fd?

I think it'd be nicer to return an immediate error to stop guest
CMDQ to raise a fault there accordingly, similar to returning a
-EIO for a bad STE in your SMMU part-3 series.

If the "return a flag" is an errno of the ioctl, it could work by
reading from a separate memory that belongs to the event fd. Yet,
in this case, an eventfd signal (assuming there is one to trigger
VMM's fault handler) becomes unnecessary, since the invalidation
ioctl is already handling it?

Thanks
Nic

> Would Intel be basically the same too?
> 
> Jason

Jason Gunthorpe Dec. 1, 2023, 12:45 a.m. UTC | #22

On Thu, Nov 30, 2023 at 12:41:20PM -0800, Nicolin Chen wrote:

> > So userspace would have to read the event FD
> > before returning to be correct?
> > 
> > Maybe the kernel can somehow return a flag to indicate the event fd
> > has data in it?
> > 
> > If yes then all errors would flow through the event fd?
> 
> I think it'd be nicer to return an immediate error to stop guest
> CMDQ to raise a fault there accordingly, similar to returning a
> -EIO for a bad STE in your SMMU part-3 series.
> 
> If the "return a flag" is an errno of the ioctl, it could work by
> reading from a separate memory that belongs to the event fd. Yet,
> in this case, an eventfd signal (assuming there is one to trigger
> VMM's fault handler) becomes unnecessary, since the invalidation
> ioctl is already handling it?

My concern is how does all this fit together and do we push the right
things to the right places in the right order when an error occurs.

I did not study the spec carefully to see what exactly is supposed to
happen here, and I don't see things in Linux that make me think it
particularly cares..

ie Linux doesn't seem like it will know that an async event was even
triggered while processing the sync to generate an EIO. It looks like
it just gets ETIMEDOUT? Presumably we should be checking the event
queue to detect a pushed error?

It is worth understanding if the spec has language that requires
certain order so we can try to follow it.

Jason

Yi Liu Dec. 1, 2023, 3:51 a.m. UTC | #23

On 2023/11/29 08:57, Jason Gunthorpe wrote:
> On Tue, Nov 28, 2023 at 04:51:21PM -0800, Nicolin Chen wrote:
>>>> I also thought about making this out_driver_error_code per HW.
>>>> Yet, an error can be either per array or per entry/quest. The
>>>> array-related error should be reported in the array structure
>>>> that is a core uAPI, v.s. the per-HW entry structure. Though
>>>> we could still report an array error in the entry structure
>>>> at the first entry (or indexed by "array->entry_num")?
>>>>
>>>
>>> why would there be an array error? array is just a software
>>> entity containing actual HW invalidation cmds. If there is
>>> any error with the array itself it should be reported via
>>> ioctl errno.
>>
>> User array reading is a software operation, but kernel array
>> reading is a hardware operation that can raise an error when
>> the memory location to the array is incorrect or so.
> 
> Well, we shouldn't get into a situation like that.. By the time the HW
> got the address it should be valid.
> 
>> With that being said, I think errno (-EIO) could do the job,
>> as you suggested too.
> 
> Do we have any idea what HW failures can be generated by the commands
> this will execture? IIRC I don't remember seeing any smmu specific
> codes related to invalid invalidation? Everything is a valid input?
> 
> Can vt-d fail single commands? What about AMD?

Intel VT-d side, after each invalidation request, there is a wait 
descriptor which either provide an interrupt or an address for the
hw to notify software the request before the wait descriptor has been
completed. While, if there is error happened on the invalidation request,
a flag (IQE, ICE, ITE) would be set in the Fault Status Register, and some
detailed information would be recorded in the Invalidation Queue Error
Record Register. So an invalidation request may be failed with some error
reported. If no error, will return completion via the wait descriptor. Is
this what you mean by "fail a single command"?

>>> Jason, how about your opinion? I didn't spot big issues
>>> except this one. Hope it can make into 6.8.
> 
> Yes, lets try
> 
> Jason

Nicolin Chen Dec. 1, 2023, 4:29 a.m. UTC | #24

On Thu, Nov 30, 2023 at 08:45:23PM -0400, Jason Gunthorpe wrote:
> On Thu, Nov 30, 2023 at 12:41:20PM -0800, Nicolin Chen wrote:
> 
> > > So userspace would have to read the event FD
> > > before returning to be correct?
> > > 
> > > Maybe the kernel can somehow return a flag to indicate the event fd
> > > has data in it?
> > > 
> > > If yes then all errors would flow through the event fd?
> > 
> > I think it'd be nicer to return an immediate error to stop guest
> > CMDQ to raise a fault there accordingly, similar to returning a
> > -EIO for a bad STE in your SMMU part-3 series.
> > 
> > If the "return a flag" is an errno of the ioctl, it could work by
> > reading from a separate memory that belongs to the event fd. Yet,
> > in this case, an eventfd signal (assuming there is one to trigger
> > VMM's fault handler) becomes unnecessary, since the invalidation
> > ioctl is already handling it?
> 
> My concern is how does all this fit together and do we push the right
> things to the right places in the right order when an error occurs.
> 
> I did not study the spec carefully to see what exactly is supposed to
> happen here, and I don't see things in Linux that make me think it
> particularly cares..
> 
> ie Linux doesn't seem like it will know that an async event was even
> triggered while processing the sync to generate an EIO. It looks like
> it just gets ETIMEDOUT? Presumably we should be checking the event
> queue to detect a pushed error?
> 
> It is worth understanding if the spec has language that requires
> certain order so we can try to follow it.

Oh, I replied one misinformation previously. Actually eventq
doesn't report a CERROR. The global error interrupt does.

7.1 has that sequence: 1) CMDQ stops 2) Log current index
to the CONS register 3) Log error code to the CONS register
4) Set bit-0 "CMDQ error" of GERROR register to rise an irq.

FWIW, both gerror and cmdq are global. So we can't know if
the error is for which master or domain. So, the only way
is to get errno from the arm_smmu_cmdq_issue_cmd_with_sync
call in our user invalidate function, where we can then get
the error code. But this feels very much synchronous, since
both the error code and faulty CONS index could be simply
returned without an async eventfd.

Thanks
Nic

Nicolin Chen Dec. 1, 2023, 4:50 a.m. UTC | #25

On Fri, Dec 01, 2023 at 11:51:26AM +0800, Yi Liu wrote:
> On 2023/11/29 08:57, Jason Gunthorpe wrote:
> > On Tue, Nov 28, 2023 at 04:51:21PM -0800, Nicolin Chen wrote:
> > > > > I also thought about making this out_driver_error_code per HW.
> > > > > Yet, an error can be either per array or per entry/quest. The
> > > > > array-related error should be reported in the array structure
> > > > > that is a core uAPI, v.s. the per-HW entry structure. Though
> > > > > we could still report an array error in the entry structure
> > > > > at the first entry (or indexed by "array->entry_num")?
> > > > > 
> > > > 
> > > > why would there be an array error? array is just a software
> > > > entity containing actual HW invalidation cmds. If there is
> > > > any error with the array itself it should be reported via
> > > > ioctl errno.
> > > 
> > > User array reading is a software operation, but kernel array
> > > reading is a hardware operation that can raise an error when
> > > the memory location to the array is incorrect or so.
> > 
> > Well, we shouldn't get into a situation like that.. By the time the HW
> > got the address it should be valid.
> > 
> > > With that being said, I think errno (-EIO) could do the job,
> > > as you suggested too.
> > 
> > Do we have any idea what HW failures can be generated by the commands
> > this will execture? IIRC I don't remember seeing any smmu specific
> > codes related to invalid invalidation? Everything is a valid input?
> > 
> > Can vt-d fail single commands? What about AMD?
> 
> Intel VT-d side, after each invalidation request, there is a wait
> descriptor which either provide an interrupt or an address for the
> hw to notify software the request before the wait descriptor has been
> completed. While, if there is error happened on the invalidation request,
> a flag (IQE, ICE, ITE) would be set in the Fault Status Register, and some
> detailed information would be recorded in the Invalidation Queue Error
> Record Register. So an invalidation request may be failed with some error
> reported. If no error, will return completion via the wait descriptor. Is
> this what you mean by "fail a single command"?

I see the current VT-d series marking those as "REVISIT". How
will it report an error to the user space from those register?

Are they global status registers so that it might be difficult
to direct the error to the nested domain for an event fd?

Nic

Tian, Kevin Dec. 1, 2023, 5:19 a.m. UTC | #26

> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Friday, December 1, 2023 12:50 PM
> 
> On Fri, Dec 01, 2023 at 11:51:26AM +0800, Yi Liu wrote:
> > On 2023/11/29 08:57, Jason Gunthorpe wrote:
> > > On Tue, Nov 28, 2023 at 04:51:21PM -0800, Nicolin Chen wrote:
> > > > > > I also thought about making this out_driver_error_code per HW.
> > > > > > Yet, an error can be either per array or per entry/quest. The
> > > > > > array-related error should be reported in the array structure
> > > > > > that is a core uAPI, v.s. the per-HW entry structure. Though
> > > > > > we could still report an array error in the entry structure
> > > > > > at the first entry (or indexed by "array->entry_num")?
> > > > > >
> > > > >
> > > > > why would there be an array error? array is just a software
> > > > > entity containing actual HW invalidation cmds. If there is
> > > > > any error with the array itself it should be reported via
> > > > > ioctl errno.
> > > >
> > > > User array reading is a software operation, but kernel array
> > > > reading is a hardware operation that can raise an error when
> > > > the memory location to the array is incorrect or so.
> > >
> > > Well, we shouldn't get into a situation like that.. By the time the HW
> > > got the address it should be valid.
> > >
> > > > With that being said, I think errno (-EIO) could do the job,
> > > > as you suggested too.
> > >
> > > Do we have any idea what HW failures can be generated by the
> commands
> > > this will execture? IIRC I don't remember seeing any smmu specific
> > > codes related to invalid invalidation? Everything is a valid input?
> > >
> > > Can vt-d fail single commands? What about AMD?
> >
> > Intel VT-d side, after each invalidation request, there is a wait
> > descriptor which either provide an interrupt or an address for the
> > hw to notify software the request before the wait descriptor has been
> > completed. While, if there is error happened on the invalidation request,
> > a flag (IQE, ICE, ITE) would be set in the Fault Status Register, and some
> > detailed information would be recorded in the Invalidation Queue Error
> > Record Register. So an invalidation request may be failed with some error
> > reported. If no error, will return completion via the wait descriptor. Is
> > this what you mean by "fail a single command"?
> 
> I see the current VT-d series marking those as "REVISIT". How
> will it report an error to the user space from those register?
> 
> Are they global status registers so that it might be difficult
> to direct the error to the nested domain for an event fd?
> 

They are global registers but invalidation queue is also the global
resource. intel-iommu driver polls the status register after queueing
new invalidation descriptors. The submission is serialized.

If the error is related to a descriptor itself (e.g. format issue) then
the head register points to the problematic descriptor so software
can direct it to the related domain.

If the error is related to device tlb invalidation (e.g. timeout) there
is no way to associate the error with a specific descriptor by current
spec. But intel-iommu driver batches descriptors per domain so
we can still direct the error to the nested domain.

But I don't see the need of doing it via eventfd.

The poll semantics in intel-iommu driver is essentially a sync model.
vt-d spec does allow software to optionally enable notification upon
those errors but it's not used so far.

With that I still prefer to having driver-specific error code defined
in the entry. If ARM is an event-driven model then we can define
that field at least in vtd specific data structure.

btw given vtd doesn't use native format in uAPI it doesn't make
sense to forward descriptor formatting errors back to userspace.
Those, if happen, are driver's own problem. intel-iommu driver
should verify the uAPI structure and return -EINVAL or proper
errno to userspace purely in software.

With that Yi please just define error codes for device tlb related
errors for vtd.

Thanks
Kevin

Yi Liu Dec. 1, 2023, 7:05 a.m. UTC | #27

On 2023/12/1 13:19, Tian, Kevin wrote:
>> From: Nicolin Chen <nicolinc@nvidia.com>
>> Sent: Friday, December 1, 2023 12:50 PM
>>
>> On Fri, Dec 01, 2023 at 11:51:26AM +0800, Yi Liu wrote:
>>> On 2023/11/29 08:57, Jason Gunthorpe wrote:
>>>> On Tue, Nov 28, 2023 at 04:51:21PM -0800, Nicolin Chen wrote:
>>>>>>> I also thought about making this out_driver_error_code per HW.
>>>>>>> Yet, an error can be either per array or per entry/quest. The
>>>>>>> array-related error should be reported in the array structure
>>>>>>> that is a core uAPI, v.s. the per-HW entry structure. Though
>>>>>>> we could still report an array error in the entry structure
>>>>>>> at the first entry (or indexed by "array->entry_num")?
>>>>>>>
>>>>>>
>>>>>> why would there be an array error? array is just a software
>>>>>> entity containing actual HW invalidation cmds. If there is
>>>>>> any error with the array itself it should be reported via
>>>>>> ioctl errno.
>>>>>
>>>>> User array reading is a software operation, but kernel array
>>>>> reading is a hardware operation that can raise an error when
>>>>> the memory location to the array is incorrect or so.
>>>>
>>>> Well, we shouldn't get into a situation like that.. By the time the HW
>>>> got the address it should be valid.
>>>>
>>>>> With that being said, I think errno (-EIO) could do the job,
>>>>> as you suggested too.
>>>>
>>>> Do we have any idea what HW failures can be generated by the
>> commands
>>>> this will execture? IIRC I don't remember seeing any smmu specific
>>>> codes related to invalid invalidation? Everything is a valid input?
>>>>
>>>> Can vt-d fail single commands? What about AMD?
>>>
>>> Intel VT-d side, after each invalidation request, there is a wait
>>> descriptor which either provide an interrupt or an address for the
>>> hw to notify software the request before the wait descriptor has been
>>> completed. While, if there is error happened on the invalidation request,
>>> a flag (IQE, ICE, ITE) would be set in the Fault Status Register, and some
>>> detailed information would be recorded in the Invalidation Queue Error
>>> Record Register. So an invalidation request may be failed with some error
>>> reported. If no error, will return completion via the wait descriptor. Is
>>> this what you mean by "fail a single command"?
>>
>> I see the current VT-d series marking those as "REVISIT". How
>> will it report an error to the user space from those register?
>>
>> Are they global status registers so that it might be difficult
>> to direct the error to the nested domain for an event fd?
>>
> 
> They are global registers but invalidation queue is also the global
> resource. intel-iommu driver polls the status register after queueing
> new invalidation descriptors. The submission is serialized.
> 
> If the error is related to a descriptor itself (e.g. format issue) then
> the head register points to the problematic descriptor so software
> can direct it to the related domain.
> 
> If the error is related to device tlb invalidation (e.g. timeout) there
> is no way to associate the error with a specific descriptor by current
> spec. But intel-iommu driver batches descriptors per domain so
> we can still direct the error to the nested domain.
> 
> But I don't see the need of doing it via eventfd.
> 
> The poll semantics in intel-iommu driver is essentially a sync model.
> vt-d spec does allow software to optionally enable notification upon
> those errors but it's not used so far.
> 
> With that I still prefer to having driver-specific error code defined
> in the entry. If ARM is an event-driven model then we can define
> that field at least in vtd specific data structure.
> 
> btw given vtd doesn't use native format in uAPI it doesn't make
> sense to forward descriptor formatting errors back to userspace.
> Those, if happen, are driver's own problem. intel-iommu driver
> should verify the uAPI structure and return -EINVAL or proper
> errno to userspace purely in software.
> 
> With that Yi please just define error codes for device tlb related
> errors for vtd.

hmmm, this sounds like customized error code. is it? So far, VT-d
spec has two errors (ICE and ITE). ITE is valuable to let userspace
know. For ICE, looks like no much value. Intel iommu driver should
be responsible to submit a valid device-tlb invalidation to device.
is it?

Tian, Kevin Dec. 1, 2023, 7:10 a.m. UTC | #28

> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Friday, December 1, 2023 3:05 PM
> 
> On 2023/12/1 13:19, Tian, Kevin wrote:
> >> From: Nicolin Chen <nicolinc@nvidia.com>
> >> Sent: Friday, December 1, 2023 12:50 PM
> >>
> >> On Fri, Dec 01, 2023 at 11:51:26AM +0800, Yi Liu wrote:
> >>> On 2023/11/29 08:57, Jason Gunthorpe wrote:
> >>>> On Tue, Nov 28, 2023 at 04:51:21PM -0800, Nicolin Chen wrote:
> >>>>>>> I also thought about making this out_driver_error_code per HW.
> >>>>>>> Yet, an error can be either per array or per entry/quest. The
> >>>>>>> array-related error should be reported in the array structure
> >>>>>>> that is a core uAPI, v.s. the per-HW entry structure. Though
> >>>>>>> we could still report an array error in the entry structure
> >>>>>>> at the first entry (or indexed by "array->entry_num")?
> >>>>>>>
> >>>>>>
> >>>>>> why would there be an array error? array is just a software
> >>>>>> entity containing actual HW invalidation cmds. If there is
> >>>>>> any error with the array itself it should be reported via
> >>>>>> ioctl errno.
> >>>>>
> >>>>> User array reading is a software operation, but kernel array
> >>>>> reading is a hardware operation that can raise an error when
> >>>>> the memory location to the array is incorrect or so.
> >>>>
> >>>> Well, we shouldn't get into a situation like that.. By the time the HW
> >>>> got the address it should be valid.
> >>>>
> >>>>> With that being said, I think errno (-EIO) could do the job,
> >>>>> as you suggested too.
> >>>>
> >>>> Do we have any idea what HW failures can be generated by the
> >> commands
> >>>> this will execture? IIRC I don't remember seeing any smmu specific
> >>>> codes related to invalid invalidation? Everything is a valid input?
> >>>>
> >>>> Can vt-d fail single commands? What about AMD?
> >>>
> >>> Intel VT-d side, after each invalidation request, there is a wait
> >>> descriptor which either provide an interrupt or an address for the
> >>> hw to notify software the request before the wait descriptor has been
> >>> completed. While, if there is error happened on the invalidation request,
> >>> a flag (IQE, ICE, ITE) would be set in the Fault Status Register, and some
> >>> detailed information would be recorded in the Invalidation Queue Error
> >>> Record Register. So an invalidation request may be failed with some
> error
> >>> reported. If no error, will return completion via the wait descriptor. Is
> >>> this what you mean by "fail a single command"?
> >>
> >> I see the current VT-d series marking those as "REVISIT". How
> >> will it report an error to the user space from those register?
> >>
> >> Are they global status registers so that it might be difficult
> >> to direct the error to the nested domain for an event fd?
> >>
> >
> > They are global registers but invalidation queue is also the global
> > resource. intel-iommu driver polls the status register after queueing
> > new invalidation descriptors. The submission is serialized.
> >
> > If the error is related to a descriptor itself (e.g. format issue) then
> > the head register points to the problematic descriptor so software
> > can direct it to the related domain.
> >
> > If the error is related to device tlb invalidation (e.g. timeout) there
> > is no way to associate the error with a specific descriptor by current
> > spec. But intel-iommu driver batches descriptors per domain so
> > we can still direct the error to the nested domain.
> >
> > But I don't see the need of doing it via eventfd.
> >
> > The poll semantics in intel-iommu driver is essentially a sync model.
> > vt-d spec does allow software to optionally enable notification upon
> > those errors but it's not used so far.
> >
> > With that I still prefer to having driver-specific error code defined
> > in the entry. If ARM is an event-driven model then we can define
> > that field at least in vtd specific data structure.
> >
> > btw given vtd doesn't use native format in uAPI it doesn't make
> > sense to forward descriptor formatting errors back to userspace.
> > Those, if happen, are driver's own problem. intel-iommu driver
> > should verify the uAPI structure and return -EINVAL or proper
> > errno to userspace purely in software.
> >
> > With that Yi please just define error codes for device tlb related
> > errors for vtd.
> 
> hmmm, this sounds like customized error code. is it? So far, VT-d

yes. there is no need to replicate hardware registers/bits if most
of them are irrelevant to userspace.

> spec has two errors (ICE and ITE). ITE is valuable to let userspace
> know. For ICE, looks like no much value. Intel iommu driver should
> be responsible to submit a valid device-tlb invalidation to device.

it's an invalid completion message from the device which could be
caused by various reasons (not exactly due to the invalidation 
request by iommu driver). so it still makes sense to forward.

> is it?
> 
> --
> Regards,
> Yi Liu

Yi Liu Dec. 1, 2023, 9:08 a.m. UTC | #29

On 2023/12/1 15:10, Tian, Kevin wrote:
>> From: Liu, Yi L <yi.l.liu@intel.com>
>> Sent: Friday, December 1, 2023 3:05 PM
>>
>> On 2023/12/1 13:19, Tian, Kevin wrote:
>>>> From: Nicolin Chen <nicolinc@nvidia.com>
>>>> Sent: Friday, December 1, 2023 12:50 PM
>>>>
>>>> On Fri, Dec 01, 2023 at 11:51:26AM +0800, Yi Liu wrote:
>>>>> On 2023/11/29 08:57, Jason Gunthorpe wrote:
>>>>>> On Tue, Nov 28, 2023 at 04:51:21PM -0800, Nicolin Chen wrote:
>>>>>>>>> I also thought about making this out_driver_error_code per HW.
>>>>>>>>> Yet, an error can be either per array or per entry/quest. The
>>>>>>>>> array-related error should be reported in the array structure
>>>>>>>>> that is a core uAPI, v.s. the per-HW entry structure. Though
>>>>>>>>> we could still report an array error in the entry structure
>>>>>>>>> at the first entry (or indexed by "array->entry_num")?
>>>>>>>>>
>>>>>>>>
>>>>>>>> why would there be an array error? array is just a software
>>>>>>>> entity containing actual HW invalidation cmds. If there is
>>>>>>>> any error with the array itself it should be reported via
>>>>>>>> ioctl errno.
>>>>>>>
>>>>>>> User array reading is a software operation, but kernel array
>>>>>>> reading is a hardware operation that can raise an error when
>>>>>>> the memory location to the array is incorrect or so.
>>>>>>
>>>>>> Well, we shouldn't get into a situation like that.. By the time the HW
>>>>>> got the address it should be valid.
>>>>>>
>>>>>>> With that being said, I think errno (-EIO) could do the job,
>>>>>>> as you suggested too.
>>>>>>
>>>>>> Do we have any idea what HW failures can be generated by the
>>>> commands
>>>>>> this will execture? IIRC I don't remember seeing any smmu specific
>>>>>> codes related to invalid invalidation? Everything is a valid input?
>>>>>>
>>>>>> Can vt-d fail single commands? What about AMD?
>>>>>
>>>>> Intel VT-d side, after each invalidation request, there is a wait
>>>>> descriptor which either provide an interrupt or an address for the
>>>>> hw to notify software the request before the wait descriptor has been
>>>>> completed. While, if there is error happened on the invalidation request,
>>>>> a flag (IQE, ICE, ITE) would be set in the Fault Status Register, and some
>>>>> detailed information would be recorded in the Invalidation Queue Error
>>>>> Record Register. So an invalidation request may be failed with some
>> error
>>>>> reported. If no error, will return completion via the wait descriptor. Is
>>>>> this what you mean by "fail a single command"?
>>>>
>>>> I see the current VT-d series marking those as "REVISIT". How
>>>> will it report an error to the user space from those register?
>>>>
>>>> Are they global status registers so that it might be difficult
>>>> to direct the error to the nested domain for an event fd?
>>>>
>>>
>>> They are global registers but invalidation queue is also the global
>>> resource. intel-iommu driver polls the status register after queueing
>>> new invalidation descriptors. The submission is serialized.
>>>
>>> If the error is related to a descriptor itself (e.g. format issue) then
>>> the head register points to the problematic descriptor so software
>>> can direct it to the related domain.
>>>
>>> If the error is related to device tlb invalidation (e.g. timeout) there
>>> is no way to associate the error with a specific descriptor by current
>>> spec. But intel-iommu driver batches descriptors per domain so
>>> we can still direct the error to the nested domain.
>>>
>>> But I don't see the need of doing it via eventfd.
>>>
>>> The poll semantics in intel-iommu driver is essentially a sync model.
>>> vt-d spec does allow software to optionally enable notification upon
>>> those errors but it's not used so far.
>>>
>>> With that I still prefer to having driver-specific error code defined
>>> in the entry. If ARM is an event-driven model then we can define
>>> that field at least in vtd specific data structure.
>>>
>>> btw given vtd doesn't use native format in uAPI it doesn't make
>>> sense to forward descriptor formatting errors back to userspace.
>>> Those, if happen, are driver's own problem. intel-iommu driver
>>> should verify the uAPI structure and return -EINVAL or proper
>>> errno to userspace purely in software.
>>>
>>> With that Yi please just define error codes for device tlb related
>>> errors for vtd.
>>
>> hmmm, this sounds like customized error code. is it? So far, VT-d
> 
> yes. there is no need to replicate hardware registers/bits if most
> of them are irrelevant to userspace.
> 
>> spec has two errors (ICE and ITE). ITE is valuable to let userspace
>> know. For ICE, looks like no much value. Intel iommu driver should
>> be responsible to submit a valid device-tlb invalidation to device.
> 
> it's an invalid completion message from the device which could be
> caused by various reasons (not exactly due to the invalidation
> request by iommu driver). so it still makes sense to forward.

ok. so we may need to define a field to forward the detailed info to
user as well. This data is error-code specific. @Nic, are we aligned
that the error_code field and error data reporting should be moved
to the driver-specific part since it is different between vendors?

Jason Gunthorpe Dec. 1, 2023, 12:55 p.m. UTC | #30

On Thu, Nov 30, 2023 at 08:29:34PM -0800, Nicolin Chen wrote:
> On Thu, Nov 30, 2023 at 08:45:23PM -0400, Jason Gunthorpe wrote:
> > On Thu, Nov 30, 2023 at 12:41:20PM -0800, Nicolin Chen wrote:
> > 
> > > > So userspace would have to read the event FD
> > > > before returning to be correct?
> > > > 
> > > > Maybe the kernel can somehow return a flag to indicate the event fd
> > > > has data in it?
> > > > 
> > > > If yes then all errors would flow through the event fd?
> > > 
> > > I think it'd be nicer to return an immediate error to stop guest
> > > CMDQ to raise a fault there accordingly, similar to returning a
> > > -EIO for a bad STE in your SMMU part-3 series.
> > > 
> > > If the "return a flag" is an errno of the ioctl, it could work by
> > > reading from a separate memory that belongs to the event fd. Yet,
> > > in this case, an eventfd signal (assuming there is one to trigger
> > > VMM's fault handler) becomes unnecessary, since the invalidation
> > > ioctl is already handling it?
> > 
> > My concern is how does all this fit together and do we push the right
> > things to the right places in the right order when an error occurs.
> > 
> > I did not study the spec carefully to see what exactly is supposed to
> > happen here, and I don't see things in Linux that make me think it
> > particularly cares..
> > 
> > ie Linux doesn't seem like it will know that an async event was even
> > triggered while processing the sync to generate an EIO. It looks like
> > it just gets ETIMEDOUT? Presumably we should be checking the event
> > queue to detect a pushed error?
> > 
> > It is worth understanding if the spec has language that requires
> > certain order so we can try to follow it.
> 
> Oh, I replied one misinformation previously. Actually eventq
> doesn't report a CERROR. The global error interrupt does.
> 
> 7.1 has that sequence: 1) CMDQ stops 2) Log current index
> to the CONS register 3) Log error code to the CONS register
> 4) Set bit-0 "CMDQ error" of GERROR register to rise an irq.

Which triggers some async mechanism that seems to restart the command
queue and convert the error into a printk.

It seems there is not a simple way to realize this error back to
userspace since we can't block the global command queue and we proceed
to complete commands that the real HW would not have completed.

To actually emulate this the gerror handler would have to capture all
the necessary registers, return them back to the thread doing
invalidate_user and all of that would have to return back to userspace
to go into the virtual version of all the same registers.

Yes, it can be synchronous it seems, but we don't have any
infrastructure in the driver to do this.

Given this is pretty niche maybe we just don't support error
forwarding and simply ensure it could be added to the uapi later?

Jason

Nicolin Chen Dec. 1, 2023, 7:58 p.m. UTC | #31

On Fri, Dec 01, 2023 at 08:55:38AM -0400, Jason Gunthorpe wrote:
> On Thu, Nov 30, 2023 at 08:29:34PM -0800, Nicolin Chen wrote:
> > On Thu, Nov 30, 2023 at 08:45:23PM -0400, Jason Gunthorpe wrote:
> > > On Thu, Nov 30, 2023 at 12:41:20PM -0800, Nicolin Chen wrote:
> > > 
> > > > > So userspace would have to read the event FD
> > > > > before returning to be correct?
> > > > > 
> > > > > Maybe the kernel can somehow return a flag to indicate the event fd
> > > > > has data in it?
> > > > > 
> > > > > If yes then all errors would flow through the event fd?
> > > > 
> > > > I think it'd be nicer to return an immediate error to stop guest
> > > > CMDQ to raise a fault there accordingly, similar to returning a
> > > > -EIO for a bad STE in your SMMU part-3 series.
> > > > 
> > > > If the "return a flag" is an errno of the ioctl, it could work by
> > > > reading from a separate memory that belongs to the event fd. Yet,
> > > > in this case, an eventfd signal (assuming there is one to trigger
> > > > VMM's fault handler) becomes unnecessary, since the invalidation
> > > > ioctl is already handling it?
> > > 
> > > My concern is how does all this fit together and do we push the right
> > > things to the right places in the right order when an error occurs.
> > > 
> > > I did not study the spec carefully to see what exactly is supposed to
> > > happen here, and I don't see things in Linux that make me think it
> > > particularly cares..
> > > 
> > > ie Linux doesn't seem like it will know that an async event was even
> > > triggered while processing the sync to generate an EIO. It looks like
> > > it just gets ETIMEDOUT? Presumably we should be checking the event
> > > queue to detect a pushed error?
> > > 
> > > It is worth understanding if the spec has language that requires
> > > certain order so we can try to follow it.
> > 
> > Oh, I replied one misinformation previously. Actually eventq
> > doesn't report a CERROR. The global error interrupt does.
> > 
> > 7.1 has that sequence: 1) CMDQ stops 2) Log current index
> > to the CONS register 3) Log error code to the CONS register
> > 4) Set bit-0 "CMDQ error" of GERROR register to rise an irq.
> 
> Which triggers some async mechanism that seems to restart the command
> queue and convert the error into a printk.

Yes. For CERROR_ILL, it replaces the commands with another SYNC.

> It seems there is not a simple way to realize this error back to
> userspace since we can't block the global command queue and we proceed
> to complete commands that the real HW would not have completed.
> 
> To actually emulate this the gerror handler would have to capture all
> the necessary registers, return them back to the thread doing
> invalidate_user and all of that would have to return back to userspace
> to go into the virtual version of all the same registers.
> 
> Yes, it can be synchronous it seems, but we don't have any
> infrastructure in the driver to do this.
>
> Given this is pretty niche maybe we just don't support error
> forwarding and simply ensure it could be added to the uapi later?

If arm_smmu_cmdq_issue_cmdlist in arm_smmu_cache_invalidate_user
fails with ETIMEOUT, we polls the CONS register to get the error
code. This can cover CERROR_ABT and CERROR_ATC_INV.

As you remarked that we can't block the global CMDQ, so we have
to let a real CERROR_ILL go. Yet, we can make sure commands to
be fully sanitized before being issued, as we should immediately
reject faulty commands anyway, for errors such as unsupported op
codes, unzero-ed reserved fields, and unlinked vSIDs. This can
at least largely reduce the probability of a real CERROR_ILL.

So, combining these two, we can still have a basic synchronous
way by returning an errno to the invalidate ioctl? I see Kevin
replied something similar too.

Thanks
Nic

Jason Gunthorpe Dec. 1, 2023, 8:43 p.m. UTC | #32

On Fri, Dec 01, 2023 at 11:58:07AM -0800, Nicolin Chen wrote:
> > It seems there is not a simple way to realize this error back to
> > userspace since we can't block the global command queue and we proceed
> > to complete commands that the real HW would not have completed.
> > 
> > To actually emulate this the gerror handler would have to capture all
> > the necessary registers, return them back to the thread doing
> > invalidate_user and all of that would have to return back to userspace
> > to go into the virtual version of all the same registers.
> > 
> > Yes, it can be synchronous it seems, but we don't have any
> > infrastructure in the driver to do this.
> >
> > Given this is pretty niche maybe we just don't support error
> > forwarding and simply ensure it could be added to the uapi later?
> 
> If arm_smmu_cmdq_issue_cmdlist in arm_smmu_cache_invalidate_user
> fails with ETIMEOUT, we polls the CONS register to get the error
> code. This can cover CERROR_ABT and CERROR_ATC_INV.

Why is timeout linked to these two? Or rather, it doesn't have to be
linked like that. Any gerror is effectively synchronous because it
halts the queue and allows SW time to inspect which command failed and
record the gerror flags. So each and every command can get an error
indication.

Restarting the queue is done by putting sync in there to effectively
nop the failed command and we hope for the best and let it rip.

> As you remarked that we can't block the global CMDQ, so we have
> to let a real CERROR_ILL go. Yet, we can make sure commands to
> be fully sanitized before being issued, as we should immediately
> reject faulty commands anyway, for errors such as unsupported op
> codes, unzero-ed reserved fields, and unlinked vSIDs. This can
> at least largely reduce the probability of a real CERROR_ILL.

I'm more a little more concerend with ATC_INV as a malfunctioning
device can trigger this..

> So, combining these two, we can still have a basic synchronous
> way by returning an errno to the invalidate ioctl? I see Kevin
> replied something similar too.

It isn't enough information, you don't know which gerror bits to set
and you don't know what cons index to stick to indicate the error
triggering command with just a simple errno.

It does need to return a bunch of data to get it all right.

Though again, there is no driver infrastructure to do all this and it
doesn't seem that important so maybe we can just ensure there is a
future extension possiblity and have userspace understand an errno
means to generate CERROR_ILL on the last command in the batch?

Jason

Nicolin Chen Dec. 1, 2023, 10:12 p.m. UTC | #33

On Fri, Dec 01, 2023 at 04:43:40PM -0400, Jason Gunthorpe wrote:
> On Fri, Dec 01, 2023 at 11:58:07AM -0800, Nicolin Chen wrote:
> > > It seems there is not a simple way to realize this error back to
> > > userspace since we can't block the global command queue and we proceed
> > > to complete commands that the real HW would not have completed.
> > > 
> > > To actually emulate this the gerror handler would have to capture all
> > > the necessary registers, return them back to the thread doing
> > > invalidate_user and all of that would have to return back to userspace
> > > to go into the virtual version of all the same registers.
> > > 
> > > Yes, it can be synchronous it seems, but we don't have any
> > > infrastructure in the driver to do this.
> > >
> > > Given this is pretty niche maybe we just don't support error
> > > forwarding and simply ensure it could be added to the uapi later?
> > 
> > If arm_smmu_cmdq_issue_cmdlist in arm_smmu_cache_invalidate_user
> > fails with ETIMEOUT, we polls the CONS register to get the error
> > code. This can cover CERROR_ABT and CERROR_ATC_INV.
> 
> Why is timeout linked to these two? Or rather, it doesn't have to be
> linked like that. Any gerror is effectively synchronous because it
> halts the queue and allows SW time to inspect which command failed and
> record the gerror flags. So each and every command can get an error
> indication.
> 
> Restarting the queue is done by putting sync in there to effectively
> nop the failed command and we hope for the best and let it rip.

I see that SMMU driver only restarts the queue when dealing with
CERROR_ILL. So only CERROR_ABT or CERROR_ATC_INV would result in
-ETIMEOUT.

> > As you remarked that we can't block the global CMDQ, so we have
> > to let a real CERROR_ILL go. Yet, we can make sure commands to
> > be fully sanitized before being issued, as we should immediately
> > reject faulty commands anyway, for errors such as unsupported op
> > codes, unzero-ed reserved fields, and unlinked vSIDs. This can
> > at least largely reduce the probability of a real CERROR_ILL.
> 
> I'm more a little more concerend with ATC_INV as a malfunctioning
> device can trigger this..

How about making sure that the invalidate handler always issues
one CMD_ATC_INV at a time, so each arm_smmu_cmdq_issue_cmdlist()
call has a chance to timeout? Then, we can simply know which one
in the user array fails.

> > So, combining these two, we can still have a basic synchronous
> > way by returning an errno to the invalidate ioctl? I see Kevin
> > replied something similar too.
> 
> It isn't enough information, you don't know which gerror bits to set
> and you don't know what cons index to stick to indicate the error
> triggering command with just a simple errno.
>
> It does need to return a bunch of data to get it all right.

The array structure returns req_num to indicate the index. This
works, even if the command consumption stops in the middle:
 * @req_num: Input the number of cache invalidation requests in the array.
 *           Output the number of requests successfully handled by kernel.

So we only need an error code of CERROR_ABT/ILL/ATC_INV.

Or am I missing some point here?

> Though again, there is no driver infrastructure to do all this and it
> doesn't seem that important so maybe we can just ensure there is a
> future extension possiblity and have userspace understand an errno
> means to generate CERROR_ILL on the last command in the batch?

Yea, it certainly can be a plan.

Thanks
Nicolin

Jason Gunthorpe Dec. 4, 2023, 2:48 p.m. UTC | #34

On Fri, Dec 01, 2023 at 02:12:28PM -0800, Nicolin Chen wrote:
> > Why is timeout linked to these two? Or rather, it doesn't have to be
> > linked like that. Any gerror is effectively synchronous because it
> > halts the queue and allows SW time to inspect which command failed and
> > record the gerror flags. So each and every command can get an error
> > indication.
> > 
> > Restarting the queue is done by putting sync in there to effectively
> > nop the failed command and we hope for the best and let it rip.
> 
> I see that SMMU driver only restarts the queue when dealing with
> CERROR_ILL. So only CERROR_ABT or CERROR_ATC_INV would result in
> -ETIMEOUT.

I'm not sure that is the best thing to do. ABT is basically the
machine caught fire, so sure there is no recovery for that.

But ATC_INV could be recovered and should ideally be canceled then
forwarded to the VM.
 
> > > As you remarked that we can't block the global CMDQ, so we have
> > > to let a real CERROR_ILL go. Yet, we can make sure commands to
> > > be fully sanitized before being issued, as we should immediately
> > > reject faulty commands anyway, for errors such as unsupported op
> > > codes, unzero-ed reserved fields, and unlinked vSIDs. This can
> > > at least largely reduce the probability of a real CERROR_ILL.
> > 
> > I'm more a little more concerend with ATC_INV as a malfunctioning
> > device can trigger this..
> 
> How about making sure that the invalidate handler always issues
> one CMD_ATC_INV at a time, so each arm_smmu_cmdq_issue_cmdlist()
> call has a chance to timeout? Then, we can simply know which one
> in the user array fails.

That sounds slow

> > > So, combining these two, we can still have a basic synchronous
> > > way by returning an errno to the invalidate ioctl? I see Kevin
> > > replied something similar too.
> > 
> > It isn't enough information, you don't know which gerror bits to set
> > and you don't know what cons index to stick to indicate the error
> > triggering command with just a simple errno.
> >
> > It does need to return a bunch of data to get it all right.
> 
> The array structure returns req_num to indicate the index. This
> works, even if the command consumption stops in the middle:
>  * @req_num: Input the number of cache invalidation requests in the array.
>  *           Output the number of requests successfully handled by kernel.
> 
> So we only need an error code of CERROR_ABT/ILL/ATC_INV.

Yes

> Or am I missing some point here?

It sounds Ok, we just have to understand what userspace should be
doing and how much of this the kernel should implement.

It seems to me that the error code should return the gerror and the
req_num should indicate the halted cons. The vmm should relay both
into the virtual registers.

Jason

Nicolin Chen Dec. 5, 2023, 5:33 p.m. UTC | #35

On Mon, Dec 04, 2023 at 10:48:50AM -0400, Jason Gunthorpe wrote:

> > Or am I missing some point here?
> 
> It sounds Ok, we just have to understand what userspace should be
> doing and how much of this the kernel should implement.
> 
> It seems to me that the error code should return the gerror and the
> req_num should indicate the halted cons. The vmm should relay both
> into the virtual registers.

I see your concern. I will take a closer look and see if we can
add to the initial version of arm_smmu_cache_invalidate_user().
Otherwise, we can add later.

Btw, VT-d seems to want the error_code and reports in the VT-d
specific invalidate entry structure, as Kevin and Yi had that
discussion in the other side of the thread.

Thanks
Nicolin

Jason Gunthorpe Dec. 6, 2023, 12:48 p.m. UTC | #36

On Tue, Dec 05, 2023 at 09:33:30AM -0800, Nicolin Chen wrote:
> On Mon, Dec 04, 2023 at 10:48:50AM -0400, Jason Gunthorpe wrote:
>  
> > > Or am I missing some point here?
> > 
> > It sounds Ok, we just have to understand what userspace should be
> > doing and how much of this the kernel should implement.
> > 
> > It seems to me that the error code should return the gerror and the
> > req_num should indicate the halted cons. The vmm should relay both
> > into the virtual registers.
> 
> I see your concern. I will take a closer look and see if we can
> add to the initial version of arm_smmu_cache_invalidate_user().
> Otherwise, we can add later.
> 
> Btw, VT-d seems to want the error_code and reports in the VT-d
> specific invalidate entry structure, as Kevin and Yi had that
> discussion in the other side of the thread.

It could be fine to shift it into the driver data blob. It isn't
really generic in the end.

Jason

Jason Gunthorpe Dec. 6, 2023, 6:33 p.m. UTC | #37

On Fri, Nov 17, 2023 at 05:07:13AM -0800, Yi Liu wrote:
> +/**
> + * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE)
> + * @size: sizeof(struct iommu_hwpt_invalidate)
> + * @hwpt_id: HWPT ID of a nested HWPT for cache invalidation
> + * @reqs_uptr: User pointer to an array having @req_num of cache invalidation
> + *             requests. The request entries in the array are of fixed width
> + *             @req_len, and contain a user data structure for invalidation
> + *             request specific to the given hardware page table.
> + * @req_type: One of enum iommu_hwpt_data_type, defining the data type of all
> + *            the entries in the invalidation request array. It should suit
> + *            with the data_type passed per the allocation of the hwpt pointed
> + *            by @hwpt_id.
> + * @req_len: Length (in bytes) of a request entry in the request array
> + * @req_num: Input the number of cache invalidation requests in the array.
> + *           Output the number of requests successfully handled by kernel.
> + * @out_driver_error_code: Report a driver speicifc error code upon failure.
> + *                         It's optional, driver has a choice to fill it or
> + *                         not.

"specific"

Jason

Yi Liu Dec. 7, 2023, 6:59 a.m. UTC | #38

On 2023/11/17 21:07, Yi Liu wrote:
> In nested translation, the stage-1 page table is user-managed but cached
> by the IOMMU hardware, so an update on present page table entries in the
> stage-1 page table should be followed with a cache invalidation.
> 
> Add an IOMMU_HWPT_INVALIDATE ioctl to support such a cache invalidation.
> It takes hwpt_id to specify the iommu_domain, and a multi-entry array to
> support multiple invalidation requests in one ioctl.
> 
> Check cache_invalidate_user op in the iommufd_hw_pagetable_alloc_nested,
> since all nested domains need that.
> 
> Co-developed-by: Nicolin Chen <nicolinc@nvidia.com>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> ---
>   drivers/iommu/iommufd/hw_pagetable.c    | 35 +++++++++++++++++++++++++
>   drivers/iommu/iommufd/iommufd_private.h |  9 +++++++
>   drivers/iommu/iommufd/main.c            |  3 +++
>   include/uapi/linux/iommufd.h            | 35 +++++++++++++++++++++++++
>   4 files changed, 82 insertions(+)
> 
> diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
> index 2abbeafdbd22..367459d92f69 100644
> --- a/drivers/iommu/iommufd/hw_pagetable.c
> +++ b/drivers/iommu/iommufd/hw_pagetable.c
> @@ -238,6 +238,11 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
>   		rc = -EINVAL;
>   		goto out_abort;
>   	}
> +	/* Driver is buggy by missing cache_invalidate_user in domain_ops */
> +	if (WARN_ON_ONCE(!hwpt->domain->ops->cache_invalidate_user)) {
> +		rc = -EINVAL;
> +		goto out_abort;
> +	}
>   	return hwpt_nested;
>   
>   out_abort:
> @@ -370,4 +375,34 @@ int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd)
>   
>   	iommufd_put_object(&hwpt_paging->common.obj);
>   	return rc;
> +};
> +
> +int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd)
> +{
> +	struct iommu_hwpt_invalidate *cmd = ucmd->cmd;
> +	struct iommu_user_data_array data_array = {
> +		.type = cmd->req_type,
> +		.uptr = u64_to_user_ptr(cmd->reqs_uptr),
> +		.entry_len = cmd->req_len,
> +		.entry_num = cmd->req_num,
> +	};
> +	struct iommufd_hw_pagetable *hwpt;
> +	int rc = 0;
> +
> +	if (cmd->req_type == IOMMU_HWPT_DATA_NONE)
> +		return -EINVAL;
> +	if (!cmd->reqs_uptr || !cmd->req_len || !cmd->req_num)
> +		return -EINVAL;
> +
> +	hwpt = iommufd_hw_pagetable_get_nested(ucmd, cmd->hwpt_id);
> +	if (IS_ERR(hwpt))
> +		return PTR_ERR(hwpt);
> +
> +	rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain, &data_array,
> +						      &cmd->out_driver_error_code);
> +	cmd->req_num = data_array.entry_num;
> +	if (iommufd_ucmd_respond(ucmd, sizeof(*cmd)))
> +		return -EFAULT;
> +	iommufd_put_object(&hwpt->obj);
> +	return rc;
>   }
> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
> index a74cfefffbc6..160521800d9b 100644
> --- a/drivers/iommu/iommufd/iommufd_private.h
> +++ b/drivers/iommu/iommufd/iommufd_private.h
> @@ -301,6 +301,7 @@ void iommufd_hwpt_paging_abort(struct iommufd_object *obj);
>   void iommufd_hwpt_nested_destroy(struct iommufd_object *obj);
>   void iommufd_hwpt_nested_abort(struct iommufd_object *obj);
>   int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd);
> +int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd);
>   
>   static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx,
>   					    struct iommufd_hw_pagetable *hwpt)
> @@ -318,6 +319,14 @@ static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx,
>   	refcount_dec(&hwpt->obj.users);
>   }
>   
> +static inline struct iommufd_hw_pagetable *
> +iommufd_hw_pagetable_get_nested(struct iommufd_ucmd *ucmd, u32 id)
> +{
> +	return container_of(iommufd_get_object(ucmd->ictx, id,
> +					       IOMMUFD_OBJ_HWPT_NESTED),
> +			    struct iommufd_hw_pagetable, obj);
> +}
> +
>   struct iommufd_group {
>   	struct kref ref;
>   	struct mutex lock;
> diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
> index 45b9d40773b1..6edef860f91c 100644
> --- a/drivers/iommu/iommufd/main.c
> +++ b/drivers/iommu/iommufd/main.c
> @@ -309,6 +309,7 @@ union ucmd_buffer {
>   	struct iommu_hwpt_alloc hwpt;
>   	struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap;
>   	struct iommu_hwpt_set_dirty_tracking set_dirty_tracking;
> +	struct iommu_hwpt_invalidate cache;
>   	struct iommu_ioas_alloc alloc;
>   	struct iommu_ioas_allow_iovas allow_iovas;
>   	struct iommu_ioas_copy ioas_copy;
> @@ -348,6 +349,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
>   		 struct iommu_hwpt_get_dirty_bitmap, data),
>   	IOCTL_OP(IOMMU_HWPT_SET_DIRTY_TRACKING, iommufd_hwpt_set_dirty_tracking,
>   		 struct iommu_hwpt_set_dirty_tracking, __reserved),
> +	IOCTL_OP(IOMMU_HWPT_INVALIDATE, iommufd_hwpt_invalidate,
> +		 struct iommu_hwpt_invalidate, out_driver_error_code),
>   	IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl,
>   		 struct iommu_ioas_alloc, out_ioas_id),
>   	IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
> diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
> index 0b2bc6252e2c..7f92cecc87d7 100644
> --- a/include/uapi/linux/iommufd.h
> +++ b/include/uapi/linux/iommufd.h
> @@ -49,6 +49,7 @@ enum {
>   	IOMMUFD_CMD_GET_HW_INFO,
>   	IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING,
>   	IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP,
> +	IOMMUFD_CMD_HWPT_INVALIDATE,
>   };
>   
>   /**
> @@ -613,4 +614,38 @@ struct iommu_hwpt_get_dirty_bitmap {
>   #define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \
>   					IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP)
>   
> +/**
> + * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE)
> + * @size: sizeof(struct iommu_hwpt_invalidate)
> + * @hwpt_id: HWPT ID of a nested HWPT for cache invalidation
> + * @reqs_uptr: User pointer to an array having @req_num of cache invalidation
> + *             requests. The request entries in the array are of fixed width
> + *             @req_len, and contain a user data structure for invalidation
> + *             request specific to the given hardware page table.
> + * @req_type: One of enum iommu_hwpt_data_type, defining the data type of all
> + *            the entries in the invalidation request array. It should suit
> + *            with the data_type passed per the allocation of the hwpt pointed
> + *            by @hwpt_id.

@Jason and Kevin,

Here a check with you two. I had a conversation with Nic on the definition
of req_type here. It was added to support potential multiple kinds of cache
invalidation data types for a invalidating cache for a single hwpt type[1].
But we defined it as reusing the hwpt_data_type. In this way, it is not
able to support the potential case in[1]. is it? Shall we define a separate
enum for invalidation data types? And how can we let user know the
available invalidation data types for a hwpt type? Any idea?

[1] https://lore.kernel.org/linux-iommu/20231018163720.GA3952@nvidia.com/


> + * @req_len: Length (in bytes) of a request entry in the request array
> + * @req_num: Input the number of cache invalidation requests in the array.
> + *           Output the number of requests successfully handled by kernel.
> + * @out_driver_error_code: Report a driver speicifc error code upon failure.
> + *                         It's optional, driver has a choice to fill it or
> + *                         not.
> + *
> + * Invalidate the iommu cache for user-managed page table. Modifications on a
> + * user-managed page table should be followed by this operation to sync cache.
> + * Each ioctl can support one or more cache invalidation requests in the array
> + * that has a total size of @req_len * @req_num.
> + */
> +struct iommu_hwpt_invalidate {
> +	__u32 size;
> +	__u32 hwpt_id;
> +	__aligned_u64 reqs_uptr;
> +	__u32 req_type;
> +	__u32 req_len;
> +	__u32 req_num;
> +	__u32 out_driver_error_code;
> +};
> +#define IOMMU_HWPT_INVALIDATE _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_INVALIDATE)
>   #endif

Tian, Kevin Dec. 7, 2023, 9:04 a.m. UTC | #39

> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Thursday, December 7, 2023 2:59 PM
> 
> On 2023/11/17 21:07, Yi Liu wrote:
> > @@ -613,4 +614,38 @@ struct iommu_hwpt_get_dirty_bitmap {
> >   #define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \
> >
> 	IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP)
> >
> > +/**
> > + * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE)
> > + * @size: sizeof(struct iommu_hwpt_invalidate)
> > + * @hwpt_id: HWPT ID of a nested HWPT for cache invalidation
> > + * @reqs_uptr: User pointer to an array having @req_num of cache
> invalidation
> > + *             requests. The request entries in the array are of fixed width
> > + *             @req_len, and contain a user data structure for invalidation
> > + *             request specific to the given hardware page table.
> > + * @req_type: One of enum iommu_hwpt_data_type, defining the data
> type of all
> > + *            the entries in the invalidation request array. It should suit
> > + *            with the data_type passed per the allocation of the hwpt pointed
> > + *            by @hwpt_id.
> 
> @Jason and Kevin,
> 
> Here a check with you two. I had a conversation with Nic on the definition
> of req_type here. It was added to support potential multiple kinds of cache
> invalidation data types for a invalidating cache for a single hwpt type[1].
> But we defined it as reusing the hwpt_data_type. In this way, it is not
> able to support the potential case in[1]. is it? Shall we define a separate
> enum for invalidation data types? And how can we let user know the
> available invalidation data types for a hwpt type? Any idea?
> 
> [1] https://lore.kernel.org/linux-
> iommu/20231018163720.GA3952@nvidia.com/
> 

From that thread Jason mentioned to make the invalidation format
part of domain allocation. If that is the direction to go then there
won't be multiple invalidation formats per hwpt. The user should
create multiple hwpt's per invalidation format (though mixing
formats in one virtual platform is very unlikely)?

Jason Gunthorpe Dec. 7, 2023, 2:42 p.m. UTC | #40

On Thu, Dec 07, 2023 at 09:04:00AM +0000, Tian, Kevin wrote:
> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Thursday, December 7, 2023 2:59 PM
> > 
> > On 2023/11/17 21:07, Yi Liu wrote:
> > > @@ -613,4 +614,38 @@ struct iommu_hwpt_get_dirty_bitmap {
> > >   #define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \
> > >
> > 	IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP)
> > >
> > > +/**
> > > + * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE)
> > > + * @size: sizeof(struct iommu_hwpt_invalidate)
> > > + * @hwpt_id: HWPT ID of a nested HWPT for cache invalidation
> > > + * @reqs_uptr: User pointer to an array having @req_num of cache
> > invalidation
> > > + *             requests. The request entries in the array are of fixed width
> > > + *             @req_len, and contain a user data structure for invalidation
> > > + *             request specific to the given hardware page table.
> > > + * @req_type: One of enum iommu_hwpt_data_type, defining the data
> > type of all
> > > + *            the entries in the invalidation request array. It should suit
> > > + *            with the data_type passed per the allocation of the hwpt pointed
> > > + *            by @hwpt_id.
> > 
> > @Jason and Kevin,
> > 
> > Here a check with you two. I had a conversation with Nic on the definition
> > of req_type here. It was added to support potential multiple kinds of cache
> > invalidation data types for a invalidating cache for a single hwpt type[1].
> > But we defined it as reusing the hwpt_data_type. In this way, it is not
> > able to support the potential case in[1]. is it? Shall we define a separate
> > enum for invalidation data types? And how can we let user know the
> > available invalidation data types for a hwpt type? Any idea?
> > 
> > [1] https://lore.kernel.org/linux-
> > iommu/20231018163720.GA3952@nvidia.com/
> > 
> 
> From that thread Jason mentioned to make the invalidation format
> part of domain allocation. If that is the direction to go then there
> won't be multiple invalidation formats per hwpt. The user should
> create multiple hwpt's per invalidation format (though mixing
> formats in one virtual platform is very unlikely)?

I think we could do either, but I have a vauge cleanness preference
that the enums are just different? That would follow a pretty typical
pattern for a structure tag to reflect the content of the structure.

Jason

Yi Liu Dec. 11, 2023, 7:49 a.m. UTC | #41

On 2023/12/7 17:04, Tian, Kevin wrote:
>> From: Liu, Yi L <yi.l.liu@intel.com>
>> Sent: Thursday, December 7, 2023 2:59 PM
>>
>> On 2023/11/17 21:07, Yi Liu wrote:
>>> @@ -613,4 +614,38 @@ struct iommu_hwpt_get_dirty_bitmap {
>>>    #define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \
>>>
>> 	IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP)
>>>
>>> +/**
>>> + * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE)
>>> + * @size: sizeof(struct iommu_hwpt_invalidate)
>>> + * @hwpt_id: HWPT ID of a nested HWPT for cache invalidation
>>> + * @reqs_uptr: User pointer to an array having @req_num of cache
>> invalidation
>>> + *             requests. The request entries in the array are of fixed width
>>> + *             @req_len, and contain a user data structure for invalidation
>>> + *             request specific to the given hardware page table.
>>> + * @req_type: One of enum iommu_hwpt_data_type, defining the data
>> type of all
>>> + *            the entries in the invalidation request array. It should suit
>>> + *            with the data_type passed per the allocation of the hwpt pointed
>>> + *            by @hwpt_id.
>>
>> @Jason and Kevin,
>>
>> Here a check with you two. I had a conversation with Nic on the definition
>> of req_type here. It was added to support potential multiple kinds of cache
>> invalidation data types for a invalidating cache for a single hwpt type[1].
>> But we defined it as reusing the hwpt_data_type. In this way, it is not
>> able to support the potential case in[1]. is it? Shall we define a separate
>> enum for invalidation data types? And how can we let user know the
>> available invalidation data types for a hwpt type? Any idea?
>>
>> [1] https://lore.kernel.org/linux-
>> iommu/20231018163720.GA3952@nvidia.com/
>>
> 
>  From that thread Jason mentioned to make the invalidation format
> part of domain allocation. If that is the direction to go then there
> won't be multiple invalidation formats per hwpt. The user should
> create multiple hwpt's per invalidation format (though mixing
> formats in one virtual platform is very unlikely)?

yes, following that, the hwpt data type would be a kind of a combination
of page table type and cache invalidation type. In this way, the req_type
may also be dropped as the hwpt allocation path can record the type and
reuse it in the cache invalidation path.

Yi Liu Dec. 11, 2023, 7:53 a.m. UTC | #42

On 2023/12/7 22:42, Jason Gunthorpe wrote:
> On Thu, Dec 07, 2023 at 09:04:00AM +0000, Tian, Kevin wrote:
>>> From: Liu, Yi L <yi.l.liu@intel.com>
>>> Sent: Thursday, December 7, 2023 2:59 PM
>>>
>>> On 2023/11/17 21:07, Yi Liu wrote:
>>>> @@ -613,4 +614,38 @@ struct iommu_hwpt_get_dirty_bitmap {
>>>>    #define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \
>>>>
>>> 	IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP)
>>>>
>>>> +/**
>>>> + * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE)
>>>> + * @size: sizeof(struct iommu_hwpt_invalidate)
>>>> + * @hwpt_id: HWPT ID of a nested HWPT for cache invalidation
>>>> + * @reqs_uptr: User pointer to an array having @req_num of cache
>>> invalidation
>>>> + *             requests. The request entries in the array are of fixed width
>>>> + *             @req_len, and contain a user data structure for invalidation
>>>> + *             request specific to the given hardware page table.
>>>> + * @req_type: One of enum iommu_hwpt_data_type, defining the data
>>> type of all
>>>> + *            the entries in the invalidation request array. It should suit
>>>> + *            with the data_type passed per the allocation of the hwpt pointed
>>>> + *            by @hwpt_id.
>>>
>>> @Jason and Kevin,
>>>
>>> Here a check with you two. I had a conversation with Nic on the definition
>>> of req_type here. It was added to support potential multiple kinds of cache
>>> invalidation data types for a invalidating cache for a single hwpt type[1].
>>> But we defined it as reusing the hwpt_data_type. In this way, it is not
>>> able to support the potential case in[1]. is it? Shall we define a separate
>>> enum for invalidation data types? And how can we let user know the
>>> available invalidation data types for a hwpt type? Any idea?
>>>
>>> [1] https://lore.kernel.org/linux-
>>> iommu/20231018163720.GA3952@nvidia.com/
>>>
>>
>>  From that thread Jason mentioned to make the invalidation format
>> part of domain allocation. If that is the direction to go then there
>> won't be multiple invalidation formats per hwpt. The user should
>> create multiple hwpt's per invalidation format (though mixing
>> formats in one virtual platform is very unlikely)?
> 
> I think we could do either, but I have a vauge cleanness preference
> that the enums are just different? That would follow a pretty typical
> pattern for a structure tag to reflect the content of the structure.

Is this still following the direction to make the invalidation format
part of domain allocation?

Jason Gunthorpe Dec. 11, 2023, 1:21 p.m. UTC | #43

On Mon, Dec 11, 2023 at 03:53:40PM +0800, Yi Liu wrote:

> > >  From that thread Jason mentioned to make the invalidation format
> > > part of domain allocation. If that is the direction to go then there
> > > won't be multiple invalidation formats per hwpt. The user should
> > > create multiple hwpt's per invalidation format (though mixing
> > > formats in one virtual platform is very unlikely)?
> > 
> > I think we could do either, but I have a vauge cleanness preference
> > that the enums are just different? That would follow a pretty typical
> > pattern for a structure tag to reflect the content of the structure.
> 
> Is this still following the direction to make the invalidation format
> part of domain allocation?

I think you should make it seperate

Jason

Yi Liu Dec. 12, 2023, 1:45 p.m. UTC | #44

> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Monday, December 11, 2023 9:22 PM
> 
> On Mon, Dec 11, 2023 at 03:53:40PM +0800, Yi Liu wrote:
> 
> > > >  From that thread Jason mentioned to make the invalidation format
> > > > part of domain allocation. If that is the direction to go then there
> > > > won't be multiple invalidation formats per hwpt. The user should
> > > > create multiple hwpt's per invalidation format (though mixing
> > > > formats in one virtual platform is very unlikely)?
> > >
> > > I think we could do either, but I have a vauge cleanness preference
> > > that the enums are just different? That would follow a pretty typical
> > > pattern for a structure tag to reflect the content of the structure.
> >
> > Is this still following the direction to make the invalidation format
> > part of domain allocation?
> 
> I think you should make it seperate

Sure. I'll add a separate enum for cache invalidation format. Just to
see if it is good to pass down the invalidation format in the hwpt
alloc path? So iommu driver can check if the invalidation format
can be used for the hwpt to be allocated.

Regards,
Yi Liu

Jason Gunthorpe Dec. 12, 2023, 2:40 p.m. UTC | #45

On Tue, Dec 12, 2023 at 01:45:16PM +0000, Liu, Yi L wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Monday, December 11, 2023 9:22 PM
> > 
> > On Mon, Dec 11, 2023 at 03:53:40PM +0800, Yi Liu wrote:
> > 
> > > > >  From that thread Jason mentioned to make the invalidation format
> > > > > part of domain allocation. If that is the direction to go then there
> > > > > won't be multiple invalidation formats per hwpt. The user should
> > > > > create multiple hwpt's per invalidation format (though mixing
> > > > > formats in one virtual platform is very unlikely)?
> > > >
> > > > I think we could do either, but I have a vauge cleanness preference
> > > > that the enums are just different? That would follow a pretty typical
> > > > pattern for a structure tag to reflect the content of the structure.
> > >
> > > Is this still following the direction to make the invalidation format
> > > part of domain allocation?
> > 
> > I think you should make it seperate
> 
> Sure. I'll add a separate enum for cache invalidation format. Just to
> see if it is good to pass down the invalidation format in the hwpt
> alloc path? So iommu driver can check if the invalidation format
> can be used for the hwpt to be allocated.

I would skip it in the invalidation. If necessary drivers can push a 0
length invalidation to do 'try and fail' discovery of supported types.

Jason

Yi Liu Dec. 13, 2023, 1:47 p.m. UTC | #46

> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, December 12, 2023 10:40 PM
> 
> On Tue, Dec 12, 2023 at 01:45:16PM +0000, Liu, Yi L wrote:
> > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > Sent: Monday, December 11, 2023 9:22 PM
> > >
> > > On Mon, Dec 11, 2023 at 03:53:40PM +0800, Yi Liu wrote:
> > >
> > > > > >  From that thread Jason mentioned to make the invalidation format
> > > > > > part of domain allocation. If that is the direction to go then there
> > > > > > won't be multiple invalidation formats per hwpt. The user should
> > > > > > create multiple hwpt's per invalidation format (though mixing
> > > > > > formats in one virtual platform is very unlikely)?
> > > > >
> > > > > I think we could do either, but I have a vauge cleanness preference
> > > > > that the enums are just different? That would follow a pretty typical
> > > > > pattern for a structure tag to reflect the content of the structure.
> > > >
> > > > Is this still following the direction to make the invalidation format
> > > > part of domain allocation?
> > >
> > > I think you should make it seperate
> >
> > Sure. I'll add a separate enum for cache invalidation format. Just to
> > see if it is good to pass down the invalidation format in the hwpt
> > alloc path? So iommu driver can check if the invalidation format
> > can be used for the hwpt to be allocated.
> 
> I would skip it in the invalidation. If necessary drivers can push a 0
> length invalidation to do 'try and fail' discovery of supported types.

I think you mean keep passing the req_type in cache invalidation path
instead of the way I proposed. For the 'try and fail' discovery, we should
allow a zero-length array, is it? If the req_type is supported by the iommu
driver, then return successful, otherwise fail the ioctl. Is it?

Regards,
Yi Liu

Jason Gunthorpe Dec. 13, 2023, 2:11 p.m. UTC | #47

On Wed, Dec 13, 2023 at 01:47:54PM +0000, Liu, Yi L wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Tuesday, December 12, 2023 10:40 PM
> > 
> > On Tue, Dec 12, 2023 at 01:45:16PM +0000, Liu, Yi L wrote:
> > > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > > Sent: Monday, December 11, 2023 9:22 PM
> > > >
> > > > On Mon, Dec 11, 2023 at 03:53:40PM +0800, Yi Liu wrote:
> > > >
> > > > > > >  From that thread Jason mentioned to make the invalidation format
> > > > > > > part of domain allocation. If that is the direction to go then there
> > > > > > > won't be multiple invalidation formats per hwpt. The user should
> > > > > > > create multiple hwpt's per invalidation format (though mixing
> > > > > > > formats in one virtual platform is very unlikely)?
> > > > > >
> > > > > > I think we could do either, but I have a vauge cleanness preference
> > > > > > that the enums are just different? That would follow a pretty typical
> > > > > > pattern for a structure tag to reflect the content of the structure.
> > > > >
> > > > > Is this still following the direction to make the invalidation format
> > > > > part of domain allocation?
> > > >
> > > > I think you should make it seperate
> > >
> > > Sure. I'll add a separate enum for cache invalidation format. Just to
> > > see if it is good to pass down the invalidation format in the hwpt
> > > alloc path? So iommu driver can check if the invalidation format
> > > can be used for the hwpt to be allocated.
> > 
> > I would skip it in the invalidation. If necessary drivers can push a 0
> > length invalidation to do 'try and fail' discovery of supported types.
> 
> I think you mean keep passing the req_type in cache invalidation path
> instead of the way I proposed. For the 'try and fail' discovery, we should
> allow a zero-length array, is it? If the req_type is supported by the iommu
> driver, then return successful, otherwise fail the ioctl. Is it?

Yes

Jason

diff mbox series

Patch

diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index 2abbeafdbd22..367459d92f69 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -238,6 +238,11 @@  iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
 		rc = -EINVAL;
 		goto out_abort;
 	}
+	/* Driver is buggy by missing cache_invalidate_user in domain_ops */
+	if (WARN_ON_ONCE(!hwpt->domain->ops->cache_invalidate_user)) {
+		rc = -EINVAL;
+		goto out_abort;
+	}
 	return hwpt_nested;
 
 out_abort:
@@ -370,4 +375,34 @@  int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd)
 
 	iommufd_put_object(&hwpt_paging->common.obj);
 	return rc;
+};
+
+int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_hwpt_invalidate *cmd = ucmd->cmd;
+	struct iommu_user_data_array data_array = {
+		.type = cmd->req_type,
+		.uptr = u64_to_user_ptr(cmd->reqs_uptr),
+		.entry_len = cmd->req_len,
+		.entry_num = cmd->req_num,
+	};
+	struct iommufd_hw_pagetable *hwpt;
+	int rc = 0;
+
+	if (cmd->req_type == IOMMU_HWPT_DATA_NONE)
+		return -EINVAL;
+	if (!cmd->reqs_uptr || !cmd->req_len || !cmd->req_num)
+		return -EINVAL;
+
+	hwpt = iommufd_hw_pagetable_get_nested(ucmd, cmd->hwpt_id);
+	if (IS_ERR(hwpt))
+		return PTR_ERR(hwpt);
+
+	rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain, &data_array,
+						      &cmd->out_driver_error_code);
+	cmd->req_num = data_array.entry_num;
+	if (iommufd_ucmd_respond(ucmd, sizeof(*cmd)))
+		return -EFAULT;
+	iommufd_put_object(&hwpt->obj);
+	return rc;
 }
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index a74cfefffbc6..160521800d9b 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -301,6 +301,7 @@  void iommufd_hwpt_paging_abort(struct iommufd_object *obj);
 void iommufd_hwpt_nested_destroy(struct iommufd_object *obj);
 void iommufd_hwpt_nested_abort(struct iommufd_object *obj);
 int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd);
+int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd);
 
 static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx,
 					    struct iommufd_hw_pagetable *hwpt)
@@ -318,6 +319,14 @@  static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx,
 	refcount_dec(&hwpt->obj.users);
 }
 
+static inline struct iommufd_hw_pagetable *
+iommufd_hw_pagetable_get_nested(struct iommufd_ucmd *ucmd, u32 id)
+{
+	return container_of(iommufd_get_object(ucmd->ictx, id,
+					       IOMMUFD_OBJ_HWPT_NESTED),
+			    struct iommufd_hw_pagetable, obj);
+}
+
 struct iommufd_group {
 	struct kref ref;
 	struct mutex lock;
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 45b9d40773b1..6edef860f91c 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -309,6 +309,7 @@  union ucmd_buffer {
 	struct iommu_hwpt_alloc hwpt;
 	struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap;
 	struct iommu_hwpt_set_dirty_tracking set_dirty_tracking;
+	struct iommu_hwpt_invalidate cache;
 	struct iommu_ioas_alloc alloc;
 	struct iommu_ioas_allow_iovas allow_iovas;
 	struct iommu_ioas_copy ioas_copy;
@@ -348,6 +349,8 @@  static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 		 struct iommu_hwpt_get_dirty_bitmap, data),
 	IOCTL_OP(IOMMU_HWPT_SET_DIRTY_TRACKING, iommufd_hwpt_set_dirty_tracking,
 		 struct iommu_hwpt_set_dirty_tracking, __reserved),
+	IOCTL_OP(IOMMU_HWPT_INVALIDATE, iommufd_hwpt_invalidate,
+		 struct iommu_hwpt_invalidate, out_driver_error_code),
 	IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl,
 		 struct iommu_ioas_alloc, out_ioas_id),
 	IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 0b2bc6252e2c..7f92cecc87d7 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -49,6 +49,7 @@  enum {
 	IOMMUFD_CMD_GET_HW_INFO,
 	IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING,
 	IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP,
+	IOMMUFD_CMD_HWPT_INVALIDATE,
 };
 
 /**
@@ -613,4 +614,38 @@  struct iommu_hwpt_get_dirty_bitmap {
 #define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \
 					IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP)
 
+/**
+ * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE)
+ * @size: sizeof(struct iommu_hwpt_invalidate)
+ * @hwpt_id: HWPT ID of a nested HWPT for cache invalidation
+ * @reqs_uptr: User pointer to an array having @req_num of cache invalidation
+ *             requests. The request entries in the array are of fixed width
+ *             @req_len, and contain a user data structure for invalidation
+ *             request specific to the given hardware page table.
+ * @req_type: One of enum iommu_hwpt_data_type, defining the data type of all
+ *            the entries in the invalidation request array. It should suit
+ *            with the data_type passed per the allocation of the hwpt pointed
+ *            by @hwpt_id.
+ * @req_len: Length (in bytes) of a request entry in the request array
+ * @req_num: Input the number of cache invalidation requests in the array.
+ *           Output the number of requests successfully handled by kernel.
+ * @out_driver_error_code: Report a driver speicifc error code upon failure.
+ *                         It's optional, driver has a choice to fill it or
+ *                         not.
+ *
+ * Invalidate the iommu cache for user-managed page table. Modifications on a
+ * user-managed page table should be followed by this operation to sync cache.
+ * Each ioctl can support one or more cache invalidation requests in the array
+ * that has a total size of @req_len * @req_num.
+ */
+struct iommu_hwpt_invalidate {
+	__u32 size;
+	__u32 hwpt_id;
+	__aligned_u64 reqs_uptr;
+	__u32 req_type;
+	__u32 req_len;
+	__u32 req_num;
+	__u32 out_driver_error_code;
+};
+#define IOMMU_HWPT_INVALIDATE _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_INVALIDATE)
 #endif