[v3,01/10] iommufd: Add data structure for Intel VT-d stage-1 domain allocation
Commit Message
This adds IOMMU_HWPT_TYPE_VTD_S1 for stage-1 hw_pagetable of Intel VT-d
and the corressponding data structure for userspace specified parameter
for the domain allocation.
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
---
include/uapi/linux/iommufd.h | 57 ++++++++++++++++++++++++++++++++++++
1 file changed, 57 insertions(+)
Comments
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Thursday, May 11, 2023 10:51 PM
>
> @@ -353,9 +353,64 @@ struct iommu_vfio_ioas {
> /**
> * enum iommu_hwpt_type - IOMMU HWPT Type
> * @IOMMU_HWPT_TYPE_DEFAULT: default
> + * @IOMMU_HWPT_TYPE_VTD_S1: Intel VT-d stage-1 page table
> */
> enum iommu_hwpt_type {
> IOMMU_HWPT_TYPE_DEFAULT,
> + IOMMU_HWPT_TYPE_VTD_S1,
> +};
> +
> +/**
> + * enum iommu_hwpt_intel_vtd_flags - Intel VT-d stage-1 page
> + * table entry attributes
No need to have both 'intel' and 'vtd' in one name. There won't
be another vtd. 😊
Also since it's for s1, let's be specific on the naming:
enum iommu_hwpt_vtd_s1_flags
> + * @IOMMU_VTD_PGTBL_SRE: Supervisor request
> + * @IOMMU_VTD_PGTBL_EAFE: Extended access enable
> + * @IOMMU_VTD_PGTBL_PCD: Page-level cache disable
> + * @IOMMU_VTD_PGTBL_PWT: Page-level write through
> + * @IOMMU_VTD_PGTBL_EMTE: Extended mem type enable
> + * @IOMMU_VTD_PGTBL_CD: PASID-level cache disable
> + * @IOMMU_VTD_PGTBL_WPE: Write protect enable
similarly above should be IOMMU_VTD_S1_SRE.
PGTBL can be skipped.
> + */
> +enum iommu_hwpt_intel_vtd_flags {
> + IOMMU_VTD_PGTBL_SRE = 1 << 0,
> + IOMMU_VTD_PGTBL_EAFE = 1 << 1,
> + IOMMU_VTD_PGTBL_PCD = 1 << 2,
> + IOMMU_VTD_PGTBL_PWT = 1 << 3,
> + IOMMU_VTD_PGTBL_EMTE = 1 << 4,
> + IOMMU_VTD_PGTBL_CD = 1 << 5,
> + IOMMU_VTD_PGTBL_WPE = 1 << 6,
> + IOMMU_VTD_PGTBL_LAST = 1 << 7,
> +};
> +
> +/**
> + * struct iommu_hwpt_intel_vtd - Intel VT-d specific user-managed
> + * stage-1 page table info
struct iommu_hwpt_vtd_s1
> + * @flags: Combination of enum iommu_hwpt_intel_vtd_flags
> + * @pgtbl_addr: The base address of the user-managed stage-1 page table.
no need to highlight 'user-managed' in every reference to stage-1
> + * @pat: Page attribute table data to compute effective memory type
> + * @emt: Extended memory type
> + * @addr_width: The address width of the untranslated addresses that are
> + * subjected to the user-managed stage-1 page table.
"The address width of the stage-1 page table"
> + * @__reserved: Must be 0
> + *
> + * The Intel VT-d specific data for creating hw_pagetable to represent
> + * the user-managed stage-1 page table that is used in nested translation.
"VT-d specific data for creating a stage-1 page table that ..."
> + *
> + * In nested translation, the stage-1 page table locates in the address
> + * space that defined by the corresponding stage-2 page table. Hence the
"locates in the address space of the specified stage-2 page table"
> + * stage-1 page table base address value should not be higher than the
> + * maximum untranslated address of stage-2 page table.
"should not exceed the maximum allowed input address of stage-2"
> + *
> + * The paging level of the stage-1 page table should be compatible with
> + * the hardware iommu. Otherwise, the allocation would be failed.
there is no information about paging level in this structure
> + */
> +struct iommu_hwpt_intel_vtd {
> + __u64 flags;
> + __u64 pgtbl_addr;
> + __u32 pat;
> + __u32 emt;
> + __u32 addr_width;
> + __u32 __reserved;
> };
>
I'd prefer to move vendor specific definitions before 'enum iommu_hwpt_type'
so any new added type can be easily correlated between the type enumeration
and following change in the comment of 'struct iommu_hwpt_alloc'
> /**
> @@ -391,6 +446,8 @@ enum iommu_hwpt_type {
> * +------------------------------+-------------------------------------+-----------+
> * | IOMMU_HWPT_TYPE_DEFAULT | N/A | IOAS |
> * +------------------------------+-------------------------------------+-----------+
> + * | IOMMU_HWPT_TYPE_VTD_S1 | struct iommu_hwpt_intel_vtd |
> HWPT |
> + * +------------------------------+-------------------------------------+-----------+
> */
> struct iommu_hwpt_alloc {
> __u32 size;
> --
> 2.34.1
Hi,
> -----Original Message-----
> From: Yi Liu <yi.l.liu@intel.com>
> Sent: Thursday, May 11, 2023 10:51 PM
> To: joro@8bytes.org; alex.williamson@redhat.com; jgg@nvidia.com; Tian,
> Kevin <kevin.tian@intel.com>; robin.murphy@arm.com;
> baolu.lu@linux.intel.com
> Cc: cohuck@redhat.com; eric.auger@redhat.com; nicolinc@nvidia.com;
> kvm@vger.kernel.org; mjrosato@linux.ibm.com;
> chao.p.peng@linux.intel.com; Liu, Yi L <yi.l.liu@intel.com>;
> yi.y.sun@linux.intel.com; peterx@redhat.com; jasowang@redhat.com;
> shameerali.kolothum.thodi@huawei.com; lulu@redhat.com;
> suravee.suthikulpanit@amd.com; iommu@lists.linux.dev; linux-
> kernel@vger.kernel.org; linux-kselftest@vger.kernel.org; Duan, Zhenzhong
> <zhenzhong.duan@intel.com>
> Subject: [PATCH v3 01/10] iommufd: Add data structure for Intel VT-d stage-1
> domain allocation
>
> This adds IOMMU_HWPT_TYPE_VTD_S1 for stage-1 hw_pagetable of Intel VT-
> d and the corressponding data structure for userspace specified parameter
> for the domain allocation.
>
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> ---
> include/uapi/linux/iommufd.h | 57
> ++++++++++++++++++++++++++++++++++++
> 1 file changed, 57 insertions(+)
>
> diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
> index 06dcad6ab082..c2658394827a 100644
> --- a/include/uapi/linux/iommufd.h
> +++ b/include/uapi/linux/iommufd.h
> @@ -353,9 +353,64 @@ struct iommu_vfio_ioas {
> /**
> * enum iommu_hwpt_type - IOMMU HWPT Type
> * @IOMMU_HWPT_TYPE_DEFAULT: default
> + * @IOMMU_HWPT_TYPE_VTD_S1: Intel VT-d stage-1 page table
> */
> enum iommu_hwpt_type {
> IOMMU_HWPT_TYPE_DEFAULT,
> + IOMMU_HWPT_TYPE_VTD_S1,
> +};
> +
> +/**
> + * enum iommu_hwpt_intel_vtd_flags - Intel VT-d stage-1 page
> + * table entry attributes
> + * @IOMMU_VTD_PGTBL_SRE: Supervisor request
> + * @IOMMU_VTD_PGTBL_EAFE: Extended access enable
> + * @IOMMU_VTD_PGTBL_PCD: Page-level cache disable
> + * @IOMMU_VTD_PGTBL_PWT: Page-level write through
> + * @IOMMU_VTD_PGTBL_EMTE: Extended mem type enable
> + * @IOMMU_VTD_PGTBL_CD: PASID-level cache disable
> + * @IOMMU_VTD_PGTBL_WPE: Write protect enable */ enum
> +iommu_hwpt_intel_vtd_flags {
> + IOMMU_VTD_PGTBL_SRE = 1 << 0,
> + IOMMU_VTD_PGTBL_EAFE = 1 << 1,
> + IOMMU_VTD_PGTBL_PCD = 1 << 2,
> + IOMMU_VTD_PGTBL_PWT = 1 << 3,
> + IOMMU_VTD_PGTBL_EMTE = 1 << 4,
> + IOMMU_VTD_PGTBL_CD = 1 << 5,
> + IOMMU_VTD_PGTBL_WPE = 1 << 6,
> + IOMMU_VTD_PGTBL_LAST = 1 << 7,
> +};
> +
> +/**
> + * struct iommu_hwpt_intel_vtd - Intel VT-d specific user-managed
> + * stage-1 page table info
> + * @flags: Combination of enum iommu_hwpt_intel_vtd_flags
> + * @pgtbl_addr: The base address of the user-managed stage-1 page table.
> + * @pat: Page attribute table data to compute effective memory type
> + * @emt: Extended memory type
> + * @addr_width: The address width of the untranslated addresses that are
> + * subjected to the user-managed stage-1 page table.
> + * @__reserved: Must be 0
> + *
> + * The Intel VT-d specific data for creating hw_pagetable to represent
> + * the user-managed stage-1 page table that is used in nested translation.
> + *
> + * In nested translation, the stage-1 page table locates in the address
> + * space that defined by the corresponding stage-2 page table. Hence
> +the
> + * stage-1 page table base address value should not be higher than the
> + * maximum untranslated address of stage-2 page table.
> + *
> + * The paging level of the stage-1 page table should be compatible with
> + * the hardware iommu. Otherwise, the allocation would be failed.
> + */
> +struct iommu_hwpt_intel_vtd {
> + __u64 flags;
> + __u64 pgtbl_addr;
> + __u32 pat;
> + __u32 emt;
Do we need the emt field as part of the stage-1 page table info? IIUC, according to vt-d spec, the emt field is in stage-2 page table entries. Since in nested mode we only expose stage-1 page table to guest vt-d driver, I'm curious how does guest vt-d driver populate this EMT?
Thanks
-Tina
> + __u32 addr_width;
> + __u32 __reserved;
> };
>
> /**
> @@ -391,6 +446,8 @@ enum iommu_hwpt_type {
> * +------------------------------+-------------------------------------+-----------+
> * | IOMMU_HWPT_TYPE_DEFAULT | N/A | IOAS |
> * +------------------------------+-------------------------------------+-----------+
> + * | IOMMU_HWPT_TYPE_VTD_S1 | struct iommu_hwpt_intel_vtd |
> HWPT |
> + *
> + +------------------------------+-------------------------------------+
> + -----------+
> */
> struct iommu_hwpt_alloc {
> __u32 size;
> --
> 2.34.1
>
On Thu, May 11, 2023 at 07:51:01AM -0700, Yi Liu wrote:
> This adds IOMMU_HWPT_TYPE_VTD_S1 for stage-1 hw_pagetable of Intel VT-d
> +/**
> + * struct iommu_hwpt_intel_vtd - Intel VT-d specific user-managed
> + * stage-1 page table info
> + * @flags: Combination of enum iommu_hwpt_intel_vtd_flags
> + * @pgtbl_addr: The base address of the user-managed stage-1 page table.
> + * @pat: Page attribute table data to compute effective memory type
> + * @emt: Extended memory type
> + * @addr_width: The address width of the untranslated addresses that are
> + * subjected to the user-managed stage-1 page table.
> + * @__reserved: Must be 0
> + *
> + * The Intel VT-d specific data for creating hw_pagetable to represent
> + * the user-managed stage-1 page table that is used in nested translation.
> + *
> + * In nested translation, the stage-1 page table locates in the address
> + * space that defined by the corresponding stage-2 page table. Hence the
> + * stage-1 page table base address value should not be higher than the
> + * maximum untranslated address of stage-2 page table.
> + *
> + * The paging level of the stage-1 page table should be compatible with
> + * the hardware iommu. Otherwise, the allocation would be failed.
> + */
> +struct iommu_hwpt_intel_vtd {
> + __u64 flags;
> + __u64 pgtbl_addr;
__aligned_u64
> + __u32 pat;
> + __u32 emt;
> + __u32 addr_width;
> + __u32 __reserved;
> };
>
> /**
> @@ -391,6 +446,8 @@ enum iommu_hwpt_type {
> * +------------------------------+-------------------------------------+-----------+
> * | IOMMU_HWPT_TYPE_DEFAULT | N/A | IOAS |
> * +------------------------------+-------------------------------------+-----------+
> + * | IOMMU_HWPT_TYPE_VTD_S1 | struct iommu_hwpt_intel_vtd | HWPT |
> + * +------------------------------+-------------------------------------+-----------+
Please don't make ascii art tables.
Note beside the struct what enum it is for
Jason
On Thu, May 25, 2023 at 02:28:18AM +0000, Zhang, Tina wrote:
> > +struct iommu_hwpt_intel_vtd {
> > + __u64 flags;
> > + __u64 pgtbl_addr;
> > + __u32 pat;
> > + __u32 emt;
> Do we need the emt field as part of the stage-1 page table info?
> IIUC, according to vt-d spec, the emt field is in stage-2 page table
> entries. Since in nested mode we only expose stage-1 page table to
> guest vt-d driver, I'm curious how does guest vt-d driver populate
> this EMT?
Indeed. The EMT is controlling how the iommu HW parses memory that is
controlled by the kernel - this simply should not be something that
userspace controls.
The S2 itself has to decide if it is populating the EMT bits in the
IOPTE and if so it could enable EMT. Does userspace need to be
involved here?
The seemingly more tricky thing is that it feels like EMT and PAT
would like to be per-iova and we don't have a means for that right
now. (and even if we did, how would qemu decide what to do ?)
So probably drop it.
Jason
@@ -353,9 +353,64 @@ struct iommu_vfio_ioas {
/**
* enum iommu_hwpt_type - IOMMU HWPT Type
* @IOMMU_HWPT_TYPE_DEFAULT: default
+ * @IOMMU_HWPT_TYPE_VTD_S1: Intel VT-d stage-1 page table
*/
enum iommu_hwpt_type {
IOMMU_HWPT_TYPE_DEFAULT,
+ IOMMU_HWPT_TYPE_VTD_S1,
+};
+
+/**
+ * enum iommu_hwpt_intel_vtd_flags - Intel VT-d stage-1 page
+ * table entry attributes
+ * @IOMMU_VTD_PGTBL_SRE: Supervisor request
+ * @IOMMU_VTD_PGTBL_EAFE: Extended access enable
+ * @IOMMU_VTD_PGTBL_PCD: Page-level cache disable
+ * @IOMMU_VTD_PGTBL_PWT: Page-level write through
+ * @IOMMU_VTD_PGTBL_EMTE: Extended mem type enable
+ * @IOMMU_VTD_PGTBL_CD: PASID-level cache disable
+ * @IOMMU_VTD_PGTBL_WPE: Write protect enable
+ */
+enum iommu_hwpt_intel_vtd_flags {
+ IOMMU_VTD_PGTBL_SRE = 1 << 0,
+ IOMMU_VTD_PGTBL_EAFE = 1 << 1,
+ IOMMU_VTD_PGTBL_PCD = 1 << 2,
+ IOMMU_VTD_PGTBL_PWT = 1 << 3,
+ IOMMU_VTD_PGTBL_EMTE = 1 << 4,
+ IOMMU_VTD_PGTBL_CD = 1 << 5,
+ IOMMU_VTD_PGTBL_WPE = 1 << 6,
+ IOMMU_VTD_PGTBL_LAST = 1 << 7,
+};
+
+/**
+ * struct iommu_hwpt_intel_vtd - Intel VT-d specific user-managed
+ * stage-1 page table info
+ * @flags: Combination of enum iommu_hwpt_intel_vtd_flags
+ * @pgtbl_addr: The base address of the user-managed stage-1 page table.
+ * @pat: Page attribute table data to compute effective memory type
+ * @emt: Extended memory type
+ * @addr_width: The address width of the untranslated addresses that are
+ * subjected to the user-managed stage-1 page table.
+ * @__reserved: Must be 0
+ *
+ * The Intel VT-d specific data for creating hw_pagetable to represent
+ * the user-managed stage-1 page table that is used in nested translation.
+ *
+ * In nested translation, the stage-1 page table locates in the address
+ * space that defined by the corresponding stage-2 page table. Hence the
+ * stage-1 page table base address value should not be higher than the
+ * maximum untranslated address of stage-2 page table.
+ *
+ * The paging level of the stage-1 page table should be compatible with
+ * the hardware iommu. Otherwise, the allocation would be failed.
+ */
+struct iommu_hwpt_intel_vtd {
+ __u64 flags;
+ __u64 pgtbl_addr;
+ __u32 pat;
+ __u32 emt;
+ __u32 addr_width;
+ __u32 __reserved;
};
/**
@@ -391,6 +446,8 @@ enum iommu_hwpt_type {
* +------------------------------+-------------------------------------+-----------+
* | IOMMU_HWPT_TYPE_DEFAULT | N/A | IOAS |
* +------------------------------+-------------------------------------+-----------+
+ * | IOMMU_HWPT_TYPE_VTD_S1 | struct iommu_hwpt_intel_vtd | HWPT |
+ * +------------------------------+-------------------------------------+-----------+
*/
struct iommu_hwpt_alloc {
__u32 size;