[v1,01/14] iommu: Add iommu_get_unmanaged_domain helper

Message ID 9b1077601cace998533129327f5e7ad946752d29.1678348754.git.nicolinc@nvidia.com
State New
Series Add Nested Translation Support for SMMUv3

Commit Message

Nicolin Chen March 9, 2023, 10:53 a.m. UTC
  ITS virtualization on ARM is done via hypercalls, so the kernel handles
all IOVA mappings for the MSI doorbell in iommu_dma_prepare_msi() and
iommu_dma_compose_msi_msg(). The current virtualization solution with a
2-stage nested translation setup is to do 1:1 IOVA mappings at the stage-1
guest-level IO page table via an RMR region in the guest-level IORT, aligned
with an IOVA region that is predefined and mapped in the host kernel:

  [stage-2 host level]
  #define MSI_IOVA_BASE		0x8000000
  #define MSI_IOVA_LENGTH	0x100000
  ...
  iommu_get_msi_cookie():
	cookie->msi_iova = MSI_IOVA_BASE;
  ...
  iommu_dma_prepare_msi(its_pa):
	domain = iommu_get_domain_for_dev(dev);
	iommu_dma_get_msi_page(its_pa, domain):
		cookie = domain->iova_cookie;
		iova = iommu_dma_alloc_iova():
			return cookie->msi_iova - size;
		iommu_map(iova, its_pa, ...);

  [stage-1 guest level]
  // Define in IORT a RMR [MSI_IOVA_BASE, MSI_IOVA_LENGTH]
  ...
  iommu_create_device_direct_mappings():
	iommu_map(iova=MSI_IOVA_BASE, pa=MSI_IOVA_BASE, len=MSI_IOVA_LENGTH);

This solution, which calls iommu_get_domain_for_dev(), requires the device
to be attached to a host-level iommu_domain that holds the msi_cookie.

On the other hand, IOMMUFD designs two iommu_domain objects to represent
the two stages: a stage-1 domain (IOMMU_DOMAIN_NESTED type) and a stage-2
domain (IOMMU_DOMAIN_UNMANAGED type). In this design, the device will be
attached to the stage-1 domain representing a guest-level IO page table,
or a Context Descriptor Table in SMMU terms.

This is obviously a mismatch, as iommu_get_domain_for_dev() does not
return the correct domain pointer in iommu_dma_prepare_msi().

Add an iommu_get_unmanaged_domain helper to allow drivers to return the
correct IOMMU_DOMAIN_UNMANAGED iommu_domain holding the IOVA mappings for
the msi_cookie. Keep it in the iommu-priv header for internal use only.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/dma-iommu.c  |  5 +++--
 drivers/iommu/iommu-priv.h | 15 +++++++++++++++
 include/linux/iommu.h      |  2 ++
 3 files changed, 20 insertions(+), 2 deletions(-)
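
For illustration only (not part of this patch), the driver side of the new op
could look roughly like the sketch below. The arm_smmu_nested_domain wrapper
and its s2_parent field are assumptions about how an SMMUv3 nested domain
might record the kernel-owned stage-2 domain it was allocated against.

/*
 * Hypothetical sketch: a nested domain remembers its kernel-owned stage-2
 * parent, so the op can hand that back for iommu_dma_prepare_msi() to find
 * the msi_cookie.
 */
struct arm_smmu_nested_domain {
	struct iommu_domain domain;	/* IOMMU_DOMAIN_NESTED */
	struct iommu_domain *s2_parent;	/* assumed: fixed at domain_alloc time */
};

static struct iommu_domain *arm_smmu_get_unmanaged_domain(struct device *dev)
{
	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
	struct arm_smmu_nested_domain *nested;

	/* Kernel-owned domains (DMA, identity, unmanaged) pass through as-is */
	if (!domain || domain->type != IOMMU_DOMAIN_NESTED)
		return domain;

	nested = container_of(domain, struct arm_smmu_nested_domain, domain);
	return nested->s2_parent;	/* the UNMANAGED S2 holding the msi_cookie */
}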
  

Comments

Robin Murphy March 9, 2023, 12:51 p.m. UTC | #1
On 2023-03-09 10:53, Nicolin Chen wrote:
> The nature of ITS virtualization on ARM is done via hypercalls, so kernel
> handles all IOVA mappings for the MSI doorbell in iommu_dma_prepare_msi()
> and iommu_dma_compose_msi_msg(). The current virtualization solution with
> a 2-stage nested translation setup is to do 1:1 IOVA mappings at stage-1
> guest-level IO page table via a RMR region in guest-level IORT, aligning
> with an IOVA region that's predefined and mapped in the host kernel:
> 
>    [stage-2 host level]
>    #define MSI_IOVA_BASE		0x8000000
>    #define MSI_IOVA_LENGTH	0x100000
>    ...
>    iommu_get_msi_cookie():
> 	cookie->msi_iova = MSI_IOVA_BASE;
>    ...
>    iommu_dma_prepare_msi(its_pa):
> 	domain = iommu_get_domain_for_dev(dev);
> 	iommu_dma_get_msi_page(its_pa, domain):
> 		cookie = domain->iova_cookie;
> 		iova = iommu_dma_alloc_iova():
> 			return cookie->msi_iova - size;
> 		iommu_map(iova, its_pa, ...);
> 
>    [stage-1 guest level]
>    // Define in IORT a RMR [MSI_IOVA_BASE, MSI_IOVA_LENGTH]
>    ...
>    iommu_create_device_direct_mappings():
> 	iommu_map(iova=MSI_IOVA_BASE, pa=MSI_IOVA_BASE, len=MSI_IOVA_LENGTH);
> 
> This solution calling iommu_get_domain_for_dev() needs the device to get
> attached to a host-level iommu_domain that has the msi_cookie.
> 
> On the other hand, IOMMUFD designs two iommu_domain objects to represent
> the two stages: a stage-1 domain (IOMMU_DOMAIN_NESTED type) and a stage-2
> domain (IOMMU_DOMAIN_UNMANAGED type). In this design, the device will be
> attached to the stage-1 domain representing a guest-level IO page table,
> or a Context Descriptor Table in SMMU's term.
> 
> This is obviously a mismatch, as the iommu_get_domain_for_dev() does not
> return the correct domain pointer in iommu_dma_prepare_msi().
> 
> Add an iommu_get_unmanaged_domain helper to allow drivers to return the
> correct IOMMU_DOMAIN_UNMANAGED iommu_domain having the IOVA mappings for
> the msi_cookie. Keep it in the iommu-priv header for internal use only.
> 
> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> ---
>   drivers/iommu/dma-iommu.c  |  5 +++--
>   drivers/iommu/iommu-priv.h | 15 +++++++++++++++
>   include/linux/iommu.h      |  2 ++
>   3 files changed, 20 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index 99b2646cb5c7..6b0409d0ff85 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -31,6 +31,7 @@
>   #include <linux/vmalloc.h>
>   
>   #include "dma-iommu.h"
> +#include "iommu-priv.h"
>   
>   struct iommu_dma_msi_page {
>   	struct list_head	list;
> @@ -1652,7 +1653,7 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
>   int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
>   {
>   	struct device *dev = msi_desc_to_dev(desc);
> -	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
> +	struct iommu_domain *domain = iommu_get_unmanaged_domain(dev);

This still doesn't make sense - most of the time this will be expected 
to return the default DMA/identity domain if that's what the device is 
currently using. We can't know whether the current domain is managed or 
not until we look at it.

Just like every other caller of iommu_get_domain_for_dev(), what we want 
here is the current kernel-owned domain that we can inspect and maybe do 
standard IOMMU API things with. Why can't iommu_get_domain_for_dev() 
simply maintain that established usage model and return the kernel-owned 
s2_domain from a nested domain automatically? No IOMMU API user expects 
or needs it to return anything else (and IOMMUFD should certainly not be 
losing track of a nested domain within its own higher-level abstractions 
and needing to fall back on iommu_get_domain_for_dev()), so I really 
don't see a valid reason to overcomplicate things.

Please note I stress "valid" since I'm not buying arbitrarily made-up 
conceptual purity arguments. A nested domain cannot be the "one true 
domain" that is an opaque combination of S1+S2; the IOMMU API view has 
to be more like the device is attached to both the nested domain and the 
parent stage 2 domain somewhat in parallel. Even when nesting is active, 
the S2 domain still exists as a domain in its own right, and still needs 
to be visible and operated on as such, for instance if memory is 
hotplugged in or out of the VM.

TBH I'd also move the s2_domain pointer into the iommu_domain itself, 
since it's going to be a common feature for all nesting implementations, 
thus there seems little need to indirect lookups through the drivers at all.
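
A rough sketch of what Robin describes here, with the s2_domain field assumed
rather than taken from existing code: iommu_get_domain_for_dev() itself could
transparently resolve a nested domain to its kernel-owned parent.

struct iommu_domain *iommu_get_domain_for_dev(struct device *dev)
{
	struct iommu_group *group = iommu_group_get(dev);
	struct iommu_domain *domain;

	if (!group)
		return NULL;

	domain = group->domain;
	iommu_group_put(group);

	/* Assumed: nesting drivers set s2_domain at allocation time */
	if (domain && domain->type == IOMMU_DOMAIN_NESTED && domain->s2_domain)
		return domain->s2_domain;
	return domain;
}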

Thanks,
Robin.

>   	struct iommu_dma_msi_page *msi_page;
>   	static DEFINE_MUTEX(msi_prepare_lock); /* see below */
>   
> @@ -1685,7 +1686,7 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
>   void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
>   {
>   	struct device *dev = msi_desc_to_dev(desc);
> -	const struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
> +	const struct iommu_domain *domain = iommu_get_unmanaged_domain(dev);
>   	const struct iommu_dma_msi_page *msi_page;
>   
>   	msi_page = msi_desc_get_iommu_cookie(desc);
> diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h
> index a6e694f59f64..da8044da9ad8 100644
> --- a/drivers/iommu/iommu-priv.h
> +++ b/drivers/iommu/iommu-priv.h
> @@ -15,6 +15,21 @@ static inline const struct iommu_ops *dev_iommu_ops(struct device *dev)
>   	return dev->iommu->iommu_dev->ops;
>   }
>   
> +static inline struct iommu_domain *iommu_get_unmanaged_domain(struct device *dev)
> +{
> +	const struct iommu_ops *ops;
> +
> +	if (!dev->iommu || !dev->iommu->iommu_dev)
> +		goto attached_domain;
> +
> +	ops = dev_iommu_ops(dev);
> +	if (ops->get_unmanaged_domain)
> +		return ops->get_unmanaged_domain(dev);
> +
> +attached_domain:
> +	return iommu_get_domain_for_dev(dev);
> +}
> +
>   int iommu_group_replace_domain(struct iommu_group *group,
>   			       struct iommu_domain *new_domain);
>   
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 080278c8154d..76c65cc4fc15 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -275,6 +275,8 @@ struct iommu_ops {
>   						  struct iommu_domain *parent,
>   						  const void *user_data);
>   
> +	struct iommu_domain *(*get_unmanaged_domain)(struct device *dev);
> +
>   	struct iommu_device *(*probe_device)(struct device *dev);
>   	void (*release_device)(struct device *dev);
>   	void (*probe_finalize)(struct device *dev);
  
Jason Gunthorpe March 9, 2023, 2:19 p.m. UTC | #2
On Thu, Mar 09, 2023 at 12:51:20PM +0000, Robin Murphy wrote:

> Please note I stress "valid" since I'm not buying arbitrarily made-up
> conceptual purity arguments. A nested domain cannot be the "one true domain"
> that is an opaque combination of S1+S2; the IOMMU API view has to be more
> like the device is attached to both the nested domain and the parent stage 2
> domain somewhat in parallel.

I strongly disagree with this.

The design we have from the core perspective is an opaque domain that
is a hidden combination of S1/S2 inside the driver. We do not want to
change the basic design of the iommu core: there is only one domain
attached to a device/group at a time.

This patch should be seen as a temporary hack to allow the ARM ITS
stuff to hobble on a little longer. We already know that iommufd use
cases are incompatible with the design and we need to fix it. The
fixed solution must have iommufd install the ITS pages at domain
allocation time and so it will not need these APIs at all. This
temporary code should not dictate the overall design of the iommu core.

If we believe exposing the S1/S2 relationships through the iommu core
is necessary for another legitimate purpose I would like to hear about
it. In my opinion using APIs to peek into these details is far more
likely to be buggy and long term I prefer to block the ability to
learn the S2 externally from iommufd completely.

Thus the overall design of the iommu core APIs is not being changed.
The core API design follows this logic with and without nesting:
   iommu_attach_device(domain);
   WARN_ON(domain != iommu_get_domain_for_dev());

The hack in this patch gets its own special single-use APIs so we can
purge them once they are not needed and do not confusingly contaminate
the whole design. For this reason the ops call back should only be
implemented by SMMUv3.

> Even when nesting is active, the S2 domain still exists as a domain
> in its own right, and still needs to be visible and operated on as
> such, for instance if memory is hotplugged in or out of the VM.

It exists in iommufd and iommufd will operate it. This is not a
problem.

iommufd is not using a dual attach model.

The S2 is provided to the S1's domain allocation function as creation
data. The returned S1 domain opaquely embeds the S2. The embedded S2
cannot be changed once the S1 domain is allocated.

Attaching the S1 always causes the embedded S2 to be used too - they
are not separable so we don't have APIs talking about
"attaching" the S2.

Regards,
Jason
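
A compressed sketch of the allocation flow Jason describes above; the
domain_alloc_user name and signature are assumed from the struct iommu_ops
fragment in this patch rather than quoted from real code:

static struct iommu_domain *
alloc_and_attach_nested(struct device *dev, const struct iommu_ops *ops,
			struct iommu_group *group, const void *user_data)
{
	struct iommu_domain *s2, *s1;
	int ret;

	/* The kernel-owned S2 is created first and is managed by iommufd */
	s2 = ops->domain_alloc(IOMMU_DOMAIN_UNMANAGED);
	if (!s2)
		return ERR_PTR(-ENOMEM);

	/* The S2 is creation data for the S1 and cannot be swapped later */
	s1 = ops->domain_alloc_user(dev, s2, user_data);	/* assumed op */
	if (IS_ERR(s1)) {
		iommu_domain_free(s2);
		return s1;
	}

	/* Only the S1 is ever attached; the embedded S2 is used implicitly */
	ret = iommu_attach_group(s1, group);
	if (ret) {
		iommu_domain_free(s1);
		iommu_domain_free(s2);
		return ERR_PTR(ret);
	}
	return s1;
}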
  
Robin Murphy March 9, 2023, 7:04 p.m. UTC | #3
On 2023-03-09 14:19, Jason Gunthorpe wrote:
> On Thu, Mar 09, 2023 at 12:51:20PM +0000, Robin Murphy wrote:
> 
>> Please note I stress "valid" since I'm not buying arbitrarily made-up
>> conceptual purity arguments. A nested domain cannot be the "one true domain"
>> that is an opaque combination of S1+S2; the IOMMU API view has to be more
>> like the device is attached to both the nested domain and the parent stage 2
>> domain somewhat in parallel.
> 
> I strongly disagree with this.
> 
> The design we have from the core perspective is an opaque domain that
> is a hidden combination of S1/S2 inside the driver. We do not want to
> change the basic design of the iommu core: there is only one domain
> attached to a device/group at a time.
> 
> This patch should be seen as a temporary hack to allow the ARM ITS
> stuff to hobble on a little longer. We already know that iommufd use
> cases are incompatible with the design and we need to fix it. The
> fixed solution must have iommufd install the ITS pages at domain
> allocation time and so it will not need these APIs at all. This
> tempoary code should not dictate the overall design of the iommu core.
> 
> If we belive exposing the S1/S2 relationships through the iommu core
> is necessary for another legitimate purpose I would like to hear about
> it. In my opinion using APIs to peek into these details is far more
> likely to be buggy and long term I prefer to block the ability to
> learn the S2 externally from iommufd completely.
> 
> Thus the overall design of the iommu core APIs is not being changed.
> The core API design follows this logic with and without nesting:
>     iommu_attach_device(domin);
>     WARN_ON(domain != iommu_get_domain_for_dev());

That is indeed one of many conditions that are true today, but the facts 
are that nothing makes that specific assertion, nothing should ever 
*need* to make that specific assertion, and any driver sufficiently dumb 
to not bother keeping track of its own domain and relying on 
iommu_get_domain_for_dev() to retrieve it should most definitely not be 
allowed anywhere near nesting.

The overall design of the iommu core APIs *is* being changed, because 
the current design is also that iommu_get_domain_for_dev() always 
returns the correct domain that iommu_dma_prepare_msi() needs, which 
breaks with nesting. You are literally insisting on changing this core 
API, to work around intentionally breaking an existing behaviour which 
could easily be preserved (far less invasively), for the sake of 
preserving some other theoretical behaviour that IS NOT USEFUL.

The overall design of the iommu core APIs *is* being changed, because 
the core API design also follows this logic for any domain type:

	domain = iommu_get_domain_for_dev();
	phys = iommu_iova_to_phys(domain, iova);
	//phys meaningfully represents whether iova was valid

Yes, even blocking and identity domains, because drivers ACTUALLY DO 
THIS. I'm not sure there even is a single correct thing that nesting 
domains could do to satisfy all the possible expectations that callers 
of iommu_iova_to_phys() may have. However if the grand design says it's 
not OK for iommu_get_domain_for_dev() to return what 
iommu_dma_prepare_msi() needs even though nobody else should ever be 
passing a nesting domain to it, then it's also not OK for 
iommu_iova_to_phys() to crash or lie and return 0 when a valid 
translation (by some notion) exists, even though nobody should ever pass 
a nesting domain in there either.

Forgive me for getting wound up, but I'm a pragmatist and my tolerance 
for ignoring reality is low.

> The hack in this patch gets its own special single-use APIs so we can
> purge them once they are not needed and do not confusingly contaminate
> the whole design. For this reason the ops call back should only be
> implemented by SMMUv3.
> 
>> Even when nesting is active, the S2 domain still exists as a domain
>> in its own right, and still needs to be visible and operated on as
>> such, for instance if memory is hotplugged in or out of the VM.
> 
> It exists in iommufd and iommufd will operate it. This is not a
> problem.
> 
> iommufd is not using a dual attach model.
> 
> The S2 is provided to the S1's domain allocation function as creation
> data. The returned S1 domain opaquely embeds the S2. The embedded S2
> cannot be changed once the S1 domain is allocated.
> 
> Attaching the S1 always causes the embedded S2 to be used too - they
> are not separable so we don't have APIs talking about about
> "attaching" the S2.

Yes, that is one way of viewing things, but it's far from the only way. 
The typical lifecycle will involve starting the VM with S2 alone, then 
enabling nesting later - we can view that as allocating a nested domain 
based on S2, then "replacing" S2 with nested, but we could equally view 
it as just attaching the nested domain on top of the existing S2, like 
pushing to a stack (I do agree that trying to model it as two completely 
independent and orthogonal attachments would not be sensible). It's 
really just semantics of how we prefer to describe things, and whether 
the s2_domain pointer is passed up-front to domain_alloc or later to 
attach_dev.

The opaque nested domain looks clean in some ways, but on the other hand 
it also heavily obfuscates how translations may be shared between one 
nested domain and other nested and non-nested domains, such that 
changing mappings in one may affect others. This is a major and 
potentially surprising paradigm shift away from the current notion that 
all domains represent individual isolated address spaces, so abstracting 
it more openly within the core API internals would be clearer about 
what's really going on. And putting common concepts at common levels of 
abstraction makes things simpler and is why we have core API code at all.

Frankly it's not like we don't already have various API-internal stuff 
in struct iommu_domain that nobody external should ever be looking at, 
but if that angst remains overwhelming then I can't imagine it being all 
that much work to move the definition to iommu-priv.h - AFAICS it 
shouldn't need much more than some helpers for the handful of 
iommu_get_domain_for_dev() users currently inspecting the type, 
pgsize_bitmap, or geometry - which would ultimately be a good deal 
neater and more productive than adding yet more special-case ops that 
every driver is just going to implement identically.

And to even imagine the future possibility of things like S2 pagetable 
sharing with KVM, or unpinned S2 with ATS/PRI or SMMU stalls, by 
tunnelling more private nesting interfaces directly between IOMMUFD and 
individual drivers without some degree of common IOMMU API abstraction 
in between... no. Just... no.

Thanks,
Robin.
  
Jason Gunthorpe March 10, 2023, 12:23 a.m. UTC | #4
On Thu, Mar 09, 2023 at 07:04:19PM +0000, Robin Murphy wrote:

> You are literally insisting on changing this core API, to work
> around intentionally breaking an existing behaviour which could easily be
> preserved (far less invasively), for the sake of preserving some other
> theoretical behaviour that IS NOT USEFUL.

No! I am insisting that the core API *make logical sense* and not be a
random mismash of things that just happen to work. I'm not optimizing
for LOC here.

The end goal is to remove access to the S2 and not have this API at
all. It makes no sense to have a temporary step that makes the S2 even
more available then go and undo that.

This is a 'code smell' annotation that it needs to use a special API
because this code has a direct special requirement based on ARM's
definition to work on the S2 of the nest.

The goal for iommufd is to have the ITS page mapping already done at
domain allocation time. It cannot be deferred until
irq_domain_ops.alloc() time. Obviously once we do that we don't have a
need to obtain the S2 from a nest domain.

> The overall design of the iommu core APIs *is* being changed, because the
> core API design also follows this logic for any domain type:
> 
> 	domain = iommu_get_domain_for_dev();
> 	phys = iommu_iova_to_phys(domain, iova);
> 	//phys meaningfully represents whether iova was valid

If a nesting domain is used then this really should not translate the
IOVA through the S2.

IMHO the proper answer for a nesting domain is the same as for an SVA
domain - EOPNOTSUPP.

But more importantly it is illegal to call this API unless the caller
already has some range lock on the IOVA being queried. It is
impossible to obtain a range lock on a NEST or SVA, so this is not
allowed to be called on those domains.

Currently it looks like it crashes if something calls it with an
SVA/NEST to drive this point home :)
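
A sketch of the guard that behaviour implies; this is not current mainline
code, and since iommu_iova_to_phys() returns a phys_addr_t, returning 0
stands in for the EOPNOTSUPP above:

phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
{
	if (domain->type == IOMMU_DOMAIN_IDENTITY)
		return iova;
	if (domain->type == IOMMU_DOMAIN_BLOCKED)
		return 0;

	/*
	 * SVA and nested domains: the caller cannot hold a range lock on the
	 * IOVA here, so refuse the walk instead of crashing on a missing or
	 * meaningless ->iova_to_phys.
	 */
	if (domain->type == IOMMU_DOMAIN_SVA ||
	    domain->type == IOMMU_DOMAIN_NESTED ||
	    !domain->ops->iova_to_phys)
		return 0;

	return domain->ops->iova_to_phys(domain, iova);
}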

> Forgive me for getting wound up, but I'm a pragmatist and my tolerance for
> ignoring reality is low.

Well, I would like this settled and it seems bike-shedding to me.

> > Attaching the S1 always causes the embedded S2 to be used too - they
> > are not separable so we don't have APIs talking about about
> > "attaching" the S2.
> 
> Yes, that is one way of viewing things, but it's far from the only
> way.

Sure, but it is the design we are going with. It is the design that
was built into iommufd from day 1.

If there is a good reason to change it, I'm open to hear it, but we've
gone through a lot of use cases now and it is holding up well.

> typical lifecycle will involve starting the VM with S2 alone, then enabling
> nesting later - we can view that as allocating a nested domain based on S2,
> then "replacing" S2 with nested, but we could equally view it as just
> attaching the nested domain on top of the existing S2, like pushing to a
> stack (I do agree that trying to model it as two completely independent and
> orthogonal attachments would not be sensible). It's really just semantics of
> how we prefer to describe things, and whether the s2_domain pointer is
> passed up-front to domain_alloc or later to attach_dev.

We settled on domain_alloc time for a pretty basic reason - it keeps
the invariant that the IOVA to Phys of an iommu_domain is
universal. Meaning IOVA to Phys always gives the same answer
regardless of what device is attached.

We directly disallow mixing a S1 with different S2's. I also don't
like the idea of 'first to reach attach_dev assigns the S2', partially
initialized iommu_domains have not worked well so far IMHO.

I think this makes it easier for the IOMMU to assign cache tags as the
S1 iommu_domain always gives the same translation so it can
trivially be assigned to a cache tag.

Ie a basic Intel implementation can assign a DID to the S1. We know
that no matter which device the NEST iommu_domain is used with, the DID
is the correct cache tag because of the universal translation rule.

ARM will assign a VMID to the S2, the S1 is actually a CD table handle
and has its own invalidation.

AFAICT AMD will assign a DID per-device to the S1, because they don't
have ASIDs in their PASID table :\

> The opaque nested domain looks clean in some ways, but on the other hand it
> also heavily obfuscates how translations may be shared between one nested
> domain and other nested and non-nested domains, such that changing mappings
> in one may affect others.

Well, it keeps the logic in iommufd which should be the only user of
this stuff.

If we develop a non-iommufd user then we can revisit where the
abstractions should live, but for now iommufd is handling the thin
common abstraction.

> This is a major and potentially surprising
> paradigm shift away from the current notion that all domains represent
> individual isolated address spaces, so abstracting it more openly within the
> core API internals would be clearer about what's really going on. And
> putting common concepts at common levels of abstraction makes things simpler
> and is why we have core API code at all.

I'm all for more common concepts, but I'm pragmatic here - I'd like to
see duplicated code in the drivers become unduplicated by the
abstraction. I'm not sure what this is in the S2 area, so far I
haven't noticed anything in the ARM and Intel patch series.

Without a need for S2 pointers in any code outside iommufd and the
driver I'd prefer to keep APIs out of there to prevent abuse.

> ultimately be a good deal neater and more productive than adding yet more
> special-case ops that every driver is just going to implement identically.

To be clear for this patch only SMMUv3 will implement this op because
only SMMUv3 has this ITS problem. Once we fix it the op will be
deleted.

It really is a hack because we are all too scared to fix this properly
right now :)

> And to even imagine the future possibility of things like S2 pagetable
> sharing with KVM, or unpinned S2 with ATS/PRI or SMMU stalls, by tunnelling
> more private nesting interfaces directly between IOMMUFD and individual
> drivers without some degree of common IOMMU API abstraction in between...
> no. Just... no.

I'm all for abstractions that remove duplicated code from drivers, I'm
just not sure right now what they will actually be...

Like I say, I don't view this as duplicated code; this is a hack for ARM
to temporarily break the architectural model of a hidden S2, not a true
lasting design.

Thanks,
Jason
  
Eric Auger March 10, 2023, 8:41 a.m. UTC | #5
Hi,

On 3/9/23 13:51, Robin Murphy wrote:
> On 2023-03-09 10:53, Nicolin Chen wrote:
>> The nature of ITS virtualization on ARM is done via hypercalls, so
>> kernel
>> handles all IOVA mappings for the MSI doorbell in
>> iommu_dma_prepare_msi()
>> and iommu_dma_compose_msi_msg(). The current virtualization solution
>> with
>> a 2-stage nested translation setup is to do 1:1 IOVA mappings at stage-1
>> guest-level IO page table via a RMR region in guest-level IORT, aligning
>> with an IOVA region that's predefined and mapped in the host kernel:
>>
>>    [stage-2 host level]
>>    #define MSI_IOVA_BASE        0x8000000
>>    #define MSI_IOVA_LENGTH    0x100000
>>    ...
>>    iommu_get_msi_cookie():
>>     cookie->msi_iova = MSI_IOVA_BASE;
>>    ...
>>    iommu_dma_prepare_msi(its_pa):
>>     domain = iommu_get_domain_for_dev(dev);
>>     iommu_dma_get_msi_page(its_pa, domain):
>>         cookie = domain->iova_cookie;
>>         iova = iommu_dma_alloc_iova():
>>             return cookie->msi_iova - size;
>>         iommu_map(iova, its_pa, ...);
>>
>>    [stage-1 guest level]
>>    // Define in IORT a RMR [MSI_IOVA_BASE, MSI_IOVA_LENGTH]
>>    ...
>>    iommu_create_device_direct_mappings():
>>     iommu_map(iova=MSI_IOVA_BASE, pa=MSI_IOVA_BASE,
>> len=MSI_IOVA_LENGTH);
>>
>> This solution calling iommu_get_domain_for_dev() needs the device to get
>> attached to a host-level iommu_domain that has the msi_cookie.
>>
>> On the other hand, IOMMUFD designs two iommu_domain objects to represent
>> the two stages: a stage-1 domain (IOMMU_DOMAIN_NESTED type) and a
>> stage-2
>> domain (IOMMU_DOMAIN_UNMANAGED type). In this design, the device will be
>> attached to the stage-1 domain representing a guest-level IO page table,
>> or a Context Descriptor Table in SMMU's term.
>>
>> This is obviously a mismatch, as the iommu_get_domain_for_dev() does not
>> return the correct domain pointer in iommu_dma_prepare_msi().
>>
>> Add an iommu_get_unmanaged_domain helper to allow drivers to return the
>> correct IOMMU_DOMAIN_UNMANAGED iommu_domain having the IOVA mappings for
>> the msi_cookie. Keep it in the iommu-priv header for internal use only.
>>
>> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
>> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
>> ---
>>   drivers/iommu/dma-iommu.c  |  5 +++--
>>   drivers/iommu/iommu-priv.h | 15 +++++++++++++++
>>   include/linux/iommu.h      |  2 ++
>>   3 files changed, 20 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
>> index 99b2646cb5c7..6b0409d0ff85 100644
>> --- a/drivers/iommu/dma-iommu.c
>> +++ b/drivers/iommu/dma-iommu.c
>> @@ -31,6 +31,7 @@
>>   #include <linux/vmalloc.h>
>>     #include "dma-iommu.h"
>> +#include "iommu-priv.h"
>>     struct iommu_dma_msi_page {
>>       struct list_head    list;
>> @@ -1652,7 +1653,7 @@ static struct iommu_dma_msi_page
>> *iommu_dma_get_msi_page(struct device *dev,
>>   int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
>>   {
>>       struct device *dev = msi_desc_to_dev(desc);
>> -    struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
>> +    struct iommu_domain *domain = iommu_get_unmanaged_domain(dev);
>
> This still doesn't make sense - most of the time this will be expected
> to return the default DMA/identity domain if that's what the device is
> currently using. We can't know whether the current domain is managed
> or not until we look at it.

I tend to agree with Robin here. This was first introduced by

[PATCH v7 21/22] iommu/dma: Add support for mapping MSIs
https://lore.kernel.org/all/2273af20d844bd618c6a90b57e639700328ebf7f.1473695704.git.robin.murphy@arm.com/

even before the support of the VFIO use case, which came later on. So using the "unmanaged" terminology sounds improper to me, at least.
Couldn't we use a parent/child terminology, as used in the past in
[RFC v2] /dev/iommu uAPI proposal?
https://lore.kernel.org/all/BN9PR11MB5433B1E4AE5B0480369F97178C189@BN9PR11MB5433.namprd11.prod.outlook.com/

This would still hold for the former use case.

Thanks

Eric



>
> Just like every other caller of iommu_get_domain_for_dev(), what we
> want here is the current kernel-owned domain that we can inspect and
> maybe do standard IOMMU API things with. Why can't
> iommu_get_domain_for_dev() simply maintain that established usage
> model and return the kernel-owned s2_domain from a nested domain
> automatically? No IOMMU API user expects or needs it to return
> anything else (and IOMMUFD should certainly not be losing track of a
> nested domain within its own higher-level abstractions and needing to
> fall back on iommu_get_domain_for_dev()), so I really don't see a
> valid reason to overcomplicate things.
>
> Please note I stress "valid" since I'm not buying arbitrarily made-up
> conceptual purity arguments. A nested domain cannot be the "one true
> domain" that is an opaque combination of S1+S2; the IOMMU API view has
> to be more like the device is attached to both the nested domain and
> the parent stage 2 domain somewhat in parallel. Even when nesting is
> active, the S2 domain still exists as a domain in its own right, and
> still needs to be visible and operated on as such, for instance if
> memory is hotplugged in or out of the VM.
>
> TBH I'd also move the s2_domain pointer into the iommu_domain itself,
> since it's going to be a common feature for all nesting
> implementations, thus there seems little need to indirect lookups
> through the drivers at all.
>
> Thanks,
> Robin.
>
>>       struct iommu_dma_msi_page *msi_page;
>>       static DEFINE_MUTEX(msi_prepare_lock); /* see below */
>>   @@ -1685,7 +1686,7 @@ int iommu_dma_prepare_msi(struct msi_desc
>> *desc, phys_addr_t msi_addr)
>>   void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct
>> msi_msg *msg)
>>   {
>>       struct device *dev = msi_desc_to_dev(desc);
>> -    const struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
>> +    const struct iommu_domain *domain =
>> iommu_get_unmanaged_domain(dev);
>>       const struct iommu_dma_msi_page *msi_page;
>>         msi_page = msi_desc_get_iommu_cookie(desc);
>> diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h
>> index a6e694f59f64..da8044da9ad8 100644
>> --- a/drivers/iommu/iommu-priv.h
>> +++ b/drivers/iommu/iommu-priv.h
>> @@ -15,6 +15,21 @@ static inline const struct iommu_ops
>> *dev_iommu_ops(struct device *dev)
>>       return dev->iommu->iommu_dev->ops;
>>   }
>>   +static inline struct iommu_domain
>> *iommu_get_unmanaged_domain(struct device *dev)
>> +{
>> +    const struct iommu_ops *ops;
>> +
>> +    if (!dev->iommu || !dev->iommu->iommu_dev)
>> +        goto attached_domain;
>> +
>> +    ops = dev_iommu_ops(dev);
>> +    if (ops->get_unmanaged_domain)
>> +        return ops->get_unmanaged_domain(dev);
>> +
>> +attached_domain:
>> +    return iommu_get_domain_for_dev(dev);
>> +}
>> +
>>   int iommu_group_replace_domain(struct iommu_group *group,
>>                      struct iommu_domain *new_domain);
>>   diff --git a/include/linux/iommu.h b/include/linux/iommu.h
>> index 080278c8154d..76c65cc4fc15 100644
>> --- a/include/linux/iommu.h
>> +++ b/include/linux/iommu.h
>> @@ -275,6 +275,8 @@ struct iommu_ops {
>>                             struct iommu_domain *parent,
>>                             const void *user_data);
>>   +    struct iommu_domain *(*get_unmanaged_domain)(struct device *dev);
>> +
>>       struct iommu_device *(*probe_device)(struct device *dev);
>>       void (*release_device)(struct device *dev);
>>       void (*probe_finalize)(struct device *dev);
>
  
Eric Auger March 10, 2023, 10:14 a.m. UTC | #6
Hi Nicolin,

On 3/9/23 11:53, Nicolin Chen wrote:
> The nature of ITS virtualization on ARM is done via hypercalls, so kernel
> handles all IOVA mappings for the MSI doorbell in iommu_dma_prepare_msi()
> and iommu_dma_compose_msi_msg(). The current virtualization solution with
> a 2-stage nested translation setup is to do 1:1 IOVA mappings at stage-1
Note that if we still intend to use that trick, there is a known issue on
the kernel side that needs to be fixed.

ARM DEN 0049E.b IORT specification mandates that when
RMRs are present, the OS must preserve PCIe configuration
performed by the boot FW.

As discussed in the past, enforcing this causes issue with PCI devices
with IO ports. See qemu commit
40c3472a29c9 ("Revert "acpi/gpex: Inform os to keep firmware resource
map"). This seemed to require a fix at kernel level. I am not sure this
fix has been worked on.

Thanks

Eric

> guest-level IO page table via a RMR region in guest-level IORT, aligning
> with an IOVA region that's predefined and mapped in the host kernel:
>
>   [stage-2 host level]
>   #define MSI_IOVA_BASE		0x8000000
>   #define MSI_IOVA_LENGTH	0x100000
>   ...
>   iommu_get_msi_cookie():
> 	cookie->msi_iova = MSI_IOVA_BASE;
>   ...
>   iommu_dma_prepare_msi(its_pa):
> 	domain = iommu_get_domain_for_dev(dev);
> 	iommu_dma_get_msi_page(its_pa, domain):
> 		cookie = domain->iova_cookie;
> 		iova = iommu_dma_alloc_iova():
> 			return cookie->msi_iova - size;
> 		iommu_map(iova, its_pa, ...);
>
>   [stage-1 guest level]
>   // Define in IORT a RMR [MSI_IOVA_BASE, MSI_IOVA_LENGTH]
>   ...
>   iommu_create_device_direct_mappings():
> 	iommu_map(iova=MSI_IOVA_BASE, pa=MSI_IOVA_BASE, len=MSI_IOVA_LENGTH);
>
> This solution calling iommu_get_domain_for_dev() needs the device to get
> attached to a host-level iommu_domain that has the msi_cookie.
>
> On the other hand, IOMMUFD designs two iommu_domain objects to represent
> the two stages: a stage-1 domain (IOMMU_DOMAIN_NESTED type) and a stage-2
> domain (IOMMU_DOMAIN_UNMANAGED type). In this design, the device will be
> attached to the stage-1 domain representing a guest-level IO page table,
> or a Context Descriptor Table in SMMU's term.
>
> This is obviously a mismatch, as the iommu_get_domain_for_dev() does not
> return the correct domain pointer in iommu_dma_prepare_msi().
>
> Add an iommu_get_unmanaged_domain helper to allow drivers to return the
> correct IOMMU_DOMAIN_UNMANAGED iommu_domain having the IOVA mappings for
> the msi_cookie. Keep it in the iommu-priv header for internal use only.
>
> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> ---
>  drivers/iommu/dma-iommu.c  |  5 +++--
>  drivers/iommu/iommu-priv.h | 15 +++++++++++++++
>  include/linux/iommu.h      |  2 ++
>  3 files changed, 20 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index 99b2646cb5c7..6b0409d0ff85 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -31,6 +31,7 @@
>  #include <linux/vmalloc.h>
>  
>  #include "dma-iommu.h"
> +#include "iommu-priv.h"
>  
>  struct iommu_dma_msi_page {
>  	struct list_head	list;
> @@ -1652,7 +1653,7 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
>  int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
>  {
>  	struct device *dev = msi_desc_to_dev(desc);
> -	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
> +	struct iommu_domain *domain = iommu_get_unmanaged_domain(dev);
>  	struct iommu_dma_msi_page *msi_page;
>  	static DEFINE_MUTEX(msi_prepare_lock); /* see below */
>  
> @@ -1685,7 +1686,7 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
>  void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
>  {
>  	struct device *dev = msi_desc_to_dev(desc);
> -	const struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
> +	const struct iommu_domain *domain = iommu_get_unmanaged_domain(dev);
>  	const struct iommu_dma_msi_page *msi_page;
>  
>  	msi_page = msi_desc_get_iommu_cookie(desc);
> diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h
> index a6e694f59f64..da8044da9ad8 100644
> --- a/drivers/iommu/iommu-priv.h
> +++ b/drivers/iommu/iommu-priv.h
> @@ -15,6 +15,21 @@ static inline const struct iommu_ops *dev_iommu_ops(struct device *dev)
>  	return dev->iommu->iommu_dev->ops;
>  }
>  
> +static inline struct iommu_domain *iommu_get_unmanaged_domain(struct device *dev)
> +{
> +	const struct iommu_ops *ops;
> +
> +	if (!dev->iommu || !dev->iommu->iommu_dev)
> +		goto attached_domain;
> +
> +	ops = dev_iommu_ops(dev);
> +	if (ops->get_unmanaged_domain)
> +		return ops->get_unmanaged_domain(dev);
> +
> +attached_domain:
> +	return iommu_get_domain_for_dev(dev);
> +}
> +
>  int iommu_group_replace_domain(struct iommu_group *group,
>  			       struct iommu_domain *new_domain);
>  
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 080278c8154d..76c65cc4fc15 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -275,6 +275,8 @@ struct iommu_ops {
>  						  struct iommu_domain *parent,
>  						  const void *user_data);
>  
> +	struct iommu_domain *(*get_unmanaged_domain)(struct device *dev);
> +
>  	struct iommu_device *(*probe_device)(struct device *dev);
>  	void (*release_device)(struct device *dev);
>  	void (*probe_finalize)(struct device *dev);
  
Jason Gunthorpe March 10, 2023, 3:33 p.m. UTC | #7
On Fri, Mar 10, 2023 at 11:14:59AM +0100, Eric Auger wrote:
> Hi Nicolin,
> 
> On 3/9/23 11:53, Nicolin Chen wrote:
> > The nature of ITS virtualization on ARM is done via hypercalls, so kernel
> > handles all IOVA mappings for the MSI doorbell in iommu_dma_prepare_msi()
> > and iommu_dma_compose_msi_msg(). The current virtualization solution with
> > a 2-stage nested translation setup is to do 1:1 IOVA mappings at stage-1
> Note that if we still intend to use that trick there is a known issue at
> kernel side that needs to be fixed.
> 
> ARM DEN 0049E.b IORT specification mandates that when
> RMRs are present, the OS must preserve PCIe configuration
> performed by the boot FW.

This limitation doesn't seem necessary for this MSI stuff?

What is it for?

Jason
  
Shameerali Kolothum Thodi March 10, 2023, 3:44 p.m. UTC | #8
> -----Original Message-----
> From: Jason Gunthorpe [mailto:jgg@nvidia.com]
> Sent: 10 March 2023 15:33
> To: Eric Auger <eric.auger@redhat.com>
> Cc: Nicolin Chen <nicolinc@nvidia.com>; robin.murphy@arm.com;
> will@kernel.org; kevin.tian@intel.com; baolu.lu@linux.intel.com;
> joro@8bytes.org; Shameerali Kolothum Thodi
> <shameerali.kolothum.thodi@huawei.com>; jean-philippe@linaro.org;
> linux-arm-kernel@lists.infradead.org; iommu@lists.linux.dev;
> linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v1 01/14] iommu: Add iommu_get_unmanaged_domain
> helper
> 
> On Fri, Mar 10, 2023 at 11:14:59AM +0100, Eric Auger wrote:
> > Hi Nicolin,
> >
> > On 3/9/23 11:53, Nicolin Chen wrote:
> > > The nature of ITS virtualization on ARM is done via hypercalls, so
> > > kernel handles all IOVA mappings for the MSI doorbell in
> > > iommu_dma_prepare_msi() and iommu_dma_compose_msi_msg(). The
> current
> > > virtualization solution with a 2-stage nested translation setup is
> > > to do 1:1 IOVA mappings at stage-1
> > Note that if we still intend to use that trick there is a known issue
> > at kernel side that needs to be fixed.
> >
> > ARM DEN 0049E.b IORT specification mandates that when RMRs are
> > present, the OS must preserve PCIe configuration performed by the boot
> > FW.
> 
> This limitation doesn't seem necessary for this MSI stuff?
> 
> What is it for?

That is to make sure the Stream IDs specified in the RMR are still valid and are not being
reassigned by the OS. The kernel checks for this (iort_rmr_has_dev()),
https://lore.kernel.org/linux-arm-kernel/20220420164836.1181-5-shameerali.kolothum.thodi@huawei.com/

Thanks,
Shameer
  
Jason Gunthorpe March 10, 2023, 3:55 p.m. UTC | #9
On Fri, Mar 10, 2023 at 09:41:01AM +0100, Eric Auger wrote:

> I tend to agree with Robin here. This was first introduced by
> 
> [PATCH v7 21/22] iommu/dma: Add support for mapping MSIs <https://lore.kernel.org/all/2273af20d844bd618c6a90b57e639700328ebf7f.1473695704.git.robin.murphy@arm.com/#r>
> https://lore.kernel.org/all/2273af20d844bd618c6a90b57e639700328ebf7f.1473695704.git.robin.murphy@arm.com/

Presumably it had to use iommu_get_domain_for_dev() instead of
iommu_get_dma_domain() to support ARM32 arm-iommu. Ie it is poking
into the arm-iommu-owned domain as well. VFIO just ended up being the
same flow.

> even before the support un VFIO use case which came later on. So
> using the "unmanaged" terminology sounds improper to me, at least.
> Couldn't we use a parent/child terminology as used in the past in

No objection to a better name...

Actually how about if we write it like this? Robin would you be
happier? I think it much more clearly explains why this function is
special within our single domain attachment model.

"get_unmanaged_msi_domain" seems like a much more narrowly
purpose-specific name.

int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
{
	struct device *dev = msi_desc_to_dev(desc);
	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
	struct iommu_dma_msi_page *msi_page;
	static DEFINE_MUTEX(msi_prepare_lock); /* see below */

	desc->iommu_cookie = NULL;

	/*
	 * This probably shouldn't happen as the ARM32 systems should only have
	 * NULL if arm-iommu has been disconnected during setup/destruction.
	 * Assume it is an identity domain.
	 */
	if (!domain)
		return 0;

	/* Caller is expected to use msi_addr for the page */
	if (domain->type == IOMMU_DOMAIN_IDENTITY)
		return 0;

	/*
	 * The current domain is some driver opaque thing. We assume the
	 * driver/user knows what it is doing regarding ARM ITS MSI pages and we
	 * want to try to install the page into some kind of kernel owned
	 * unmanaged domain. Eg for nesting this will install the ITS page into
	 * the S2 domain and then we assume that the S1 domain has independently
	 * made it mapped at the same address.
	 */
	// FIXME wrap in a function
	if (domain->type != IOMMU_DOMAIN_UNMANAGED &&
	    domain->ops->get_unmanaged_msi_domain)
		domain = domain->ops->get_unmanaged_msi_domain(domain);

	if (!domain || domain->type != IOMMU_DOMAIN_UNMANAGED)
		return -EINVAL;

	// ???
	if (!domain->iova_cookie)
		return 0;

	/*
	 * In fact the whole prepare operation should already be serialised by
	 * irq_domain_mutex further up the callchain, but that's pretty subtle
	 * on its own, so consider this locking as failsafe documentation...
	 */
	mutex_lock(&msi_prepare_lock);
	msi_page = iommu_dma_get_msi_page(dev, msi_addr, domain);
	mutex_unlock(&msi_prepare_lock);

	msi_desc_set_iommu_cookie(desc, msi_page);

	if (!msi_page)
		return -ENOMEM;
	return 0;
}

Jason
  
Jason Gunthorpe March 10, 2023, 3:56 p.m. UTC | #10
On Fri, Mar 10, 2023 at 03:44:02PM +0000, Shameerali Kolothum Thodi wrote:
> 
> 
> > -----Original Message-----
> > From: Jason Gunthorpe [mailto:jgg@nvidia.com]
> > Sent: 10 March 2023 15:33
> > To: Eric Auger <eric.auger@redhat.com>
> > Cc: Nicolin Chen <nicolinc@nvidia.com>; robin.murphy@arm.com;
> > will@kernel.org; kevin.tian@intel.com; baolu.lu@linux.intel.com;
> > joro@8bytes.org; Shameerali Kolothum Thodi
> > <shameerali.kolothum.thodi@huawei.com>; jean-philippe@linaro.org;
> > linux-arm-kernel@lists.infradead.org; iommu@lists.linux.dev;
> > linux-kernel@vger.kernel.org
> > Subject: Re: [PATCH v1 01/14] iommu: Add iommu_get_unmanaged_domain
> > helper
> > 
> > On Fri, Mar 10, 2023 at 11:14:59AM +0100, Eric Auger wrote:
> > > Hi Nicolin,
> > >
> > > On 3/9/23 11:53, Nicolin Chen wrote:
> > > > The nature of ITS virtualization on ARM is done via hypercalls, so
> > > > kernel handles all IOVA mappings for the MSI doorbell in
> > > > iommu_dma_prepare_msi() and iommu_dma_compose_msi_msg(). The
> > current
> > > > virtualization solution with a 2-stage nested translation setup is
> > > > to do 1:1 IOVA mappings at stage-1
> > > Note that if we still intend to use that trick there is a known issue
> > > at kernel side that needs to be fixed.
> > >
> > > ARM DEN 0049E.b IORT specification mandates that when RMRs are
> > > present, the OS must preserve PCIe configuration performed by the boot
> > > FW.
> > 
> > This limitation doesn't seem necessary for this MSI stuff?
> > 
> > What is it for?
> 
> That is to make sure the Stream Ids specified in RMR are still valid and is not being
> reassigned by OS. The kernel checks for this(iort_rmr_has_dev()),
> https://lore.kernel.org/linux-arm-kernel/20220420164836.1181-5-shameerali.kolothum.thodi@huawei.com/

So "boot configuration" is more like "don't change the RIDs"? Ie don't
enable SRIOV?

Jason
  
Shameerali Kolothum Thodi March 10, 2023, 4:07 p.m. UTC | #11
> -----Original Message-----
> From: Jason Gunthorpe [mailto:jgg@nvidia.com]
> Sent: 10 March 2023 15:57
> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>
> Cc: Eric Auger <eric.auger@redhat.com>; Nicolin Chen
> <nicolinc@nvidia.com>; robin.murphy@arm.com; will@kernel.org;
> kevin.tian@intel.com; baolu.lu@linux.intel.com; joro@8bytes.org;
> jean-philippe@linaro.org; linux-arm-kernel@lists.infradead.org;
> iommu@lists.linux.dev; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v1 01/14] iommu: Add iommu_get_unmanaged_domain
> helper
> 
> On Fri, Mar 10, 2023 at 03:44:02PM +0000, Shameerali Kolothum Thodi
> wrote:
> >
> >
> > > -----Original Message-----
> > > From: Jason Gunthorpe [mailto:jgg@nvidia.com]
> > > Sent: 10 March 2023 15:33
> > > To: Eric Auger <eric.auger@redhat.com>
> > > Cc: Nicolin Chen <nicolinc@nvidia.com>; robin.murphy@arm.com;
> > > will@kernel.org; kevin.tian@intel.com; baolu.lu@linux.intel.com;
> > > joro@8bytes.org; Shameerali Kolothum Thodi
> > > <shameerali.kolothum.thodi@huawei.com>; jean-philippe@linaro.org;
> > > linux-arm-kernel@lists.infradead.org; iommu@lists.linux.dev;
> > > linux-kernel@vger.kernel.org
> > > Subject: Re: [PATCH v1 01/14] iommu: Add
> iommu_get_unmanaged_domain
> > > helper
> > >
> > > On Fri, Mar 10, 2023 at 11:14:59AM +0100, Eric Auger wrote:
> > > > Hi Nicolin,
> > > >
> > > > On 3/9/23 11:53, Nicolin Chen wrote:
> > > > > The nature of ITS virtualization on ARM is done via hypercalls,
> > > > > so kernel handles all IOVA mappings for the MSI doorbell in
> > > > > iommu_dma_prepare_msi() and iommu_dma_compose_msi_msg().
> The
> > > current
> > > > > virtualization solution with a 2-stage nested translation setup
> > > > > is to do 1:1 IOVA mappings at stage-1
> > > > Note that if we still intend to use that trick there is a known
> > > > issue at kernel side that needs to be fixed.
> > > >
> > > > ARM DEN 0049E.b IORT specification mandates that when RMRs are
> > > > present, the OS must preserve PCIe configuration performed by the
> > > > boot FW.
> > >
> > > This limitation doesn't seem necessary for this MSI stuff?
> > >
> > > What is it for?
> >
> > That is to make sure the Stream Ids specified in RMR are still valid
> > and is not being reassigned by OS. The kernel checks for
> > this(iort_rmr_has_dev()),
> >
> https://lore.kernel.org/linux-arm-kernel/20220420164836.1181-5-shameer
> > ali.kolothum.thodi@huawei.com/
> 
> So "boot configration" is more like "don't change the RIDs"? Ie don't enable
> SRIOV?

Yes. Don't think it will work with SR-IOV if you can't guarantee the RMR specified 
SID.

Thanks,
Shameer
  
Jason Gunthorpe March 10, 2023, 4:21 p.m. UTC | #12
On Fri, Mar 10, 2023 at 04:07:38PM +0000, Shameerali Kolothum Thodi wrote:
> > https://lore.kernel.org/linux-arm-kernel/20220420164836.1181-5-shameer
> > > ali.kolothum.thodi@huawei.com/
> > 
> > So "boot configration" is more like "don't change the RIDs"? Ie don't enable
> > SRIOV?
> 
> Yes. Don't think it will work with SR-IOV if you can't guarantee the RMR specified 
> SID.

So I think we are probably good then because vSR-IOV is already not
supported by qemu, so it is impossible for a VM to change the PCI
configuration in a way that would alter the RID to SID mapping?

Jason
  
Shameerali Kolothum Thodi March 10, 2023, 4:30 p.m. UTC | #13
> -----Original Message-----
> From: Jason Gunthorpe [mailto:jgg@nvidia.com]
> Sent: 10 March 2023 16:21
> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>
> Cc: Eric Auger <eric.auger@redhat.com>; Nicolin Chen
> <nicolinc@nvidia.com>; robin.murphy@arm.com; will@kernel.org;
> kevin.tian@intel.com; baolu.lu@linux.intel.com; joro@8bytes.org;
> jean-philippe@linaro.org; linux-arm-kernel@lists.infradead.org;
> iommu@lists.linux.dev; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v1 01/14] iommu: Add iommu_get_unmanaged_domain
> helper
> 
> On Fri, Mar 10, 2023 at 04:07:38PM +0000, Shameerali Kolothum Thodi
> wrote:
> > >
> https://lore.kernel.org/linux-arm-kernel/20220420164836.1181-5-shameer
> > > > ali.kolothum.thodi@huawei.com/
> > >
> > > So "boot configration" is more like "don't change the RIDs"? Ie don't
> enable
> > > SRIOV?
> >
> > Yes. Don't think it will work with SR-IOV if you can't guarantee the RMR
> specified
> > SID.
> 
> So I think we are probably good them because vSR-IOV is already not
> supported by qemu, so it impossible for a VM to change the PCI
> configuration in a way that would alter the RID to SID mapping?
> 

Provided we fix the issue mentioned by Eric. This was discussed here previously,

https://lore.kernel.org/linux-arm-kernel/bb3688c7-8f42-039e-e22f-6529078da97d@redhat.com/

Thanks,
Shameer
  
Jason Gunthorpe March 10, 2023, 5:03 p.m. UTC | #14
On Fri, Mar 10, 2023 at 04:30:03PM +0000, Shameerali Kolothum Thodi wrote:
> 
> 
> > -----Original Message-----
> > From: Jason Gunthorpe [mailto:jgg@nvidia.com]
> > Sent: 10 March 2023 16:21
> > To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>
> > Cc: Eric Auger <eric.auger@redhat.com>; Nicolin Chen
> > <nicolinc@nvidia.com>; robin.murphy@arm.com; will@kernel.org;
> > kevin.tian@intel.com; baolu.lu@linux.intel.com; joro@8bytes.org;
> > jean-philippe@linaro.org; linux-arm-kernel@lists.infradead.org;
> > iommu@lists.linux.dev; linux-kernel@vger.kernel.org
> > Subject: Re: [PATCH v1 01/14] iommu: Add iommu_get_unmanaged_domain
> > helper
> > 
> > On Fri, Mar 10, 2023 at 04:07:38PM +0000, Shameerali Kolothum Thodi
> > wrote:
> > > >
> > https://lore.kernel.org/linux-arm-kernel/20220420164836.1181-5-shameer
> > > > > ali.kolothum.thodi@huawei.com/
> > > >
> > > > So "boot configration" is more like "don't change the RIDs"? Ie don't
> > enable
> > > > SRIOV?
> > >
> > > Yes. Don't think it will work with SR-IOV if you can't guarantee the RMR
> > specified
> > > SID.
> > 
> > So I think we are probably good them because vSR-IOV is already not
> > supported by qemu, so it impossible for a VM to change the PCI
> > configuration in a way that would alter the RID to SID mapping?
> > 
> 
> Provided we fix the issue mentioned by Eric. This was discussed here previously,
> 
> https://lore.kernel.org/linux-arm-kernel/bb3688c7-8f42-039e-e22f-6529078da97d@redhat.com/

Ah, I see so that we don't renumber the buses during PCI discovery..

It seems like Eric's issue is overly broad if we just want to block
RID reassignment that doesn't impact MMIO layout.

But, still, why do we care about this?

The vIOMMU should virtualize the vSIDs right? So why does qemu give a
vSID list to the guest anyhow? Shouldn't the guest use an algorithmic
calculation from the vRID so that qemu can reverse it to the correct
vPCI device and thus the correct vfio_device and then dev id in the
iommu_domain?

Jason
  
Nicolin Chen March 16, 2023, 1:21 a.m. UTC | #15
Hi Robin,

What do you think about Jason's proposal below? I'd like to see
us come to an agreement on an acceptable solution...

Thanks
Nic

On Fri, Mar 10, 2023 at 11:55:07AM -0400, Jason Gunthorpe wrote:
> On Fri, Mar 10, 2023 at 09:41:01AM +0100, Eric Auger wrote:
> 
> > I tend to agree with Robin here. This was first introduced by
> > 
> > [PATCH v7 21/22] iommu/dma: Add support for mapping MSIs <https://lore.kernel.org/all/2273af20d844bd618c6a90b57e639700328ebf7f.1473695704.git.robin.murphy@arm.com/#r>
> > https://lore.kernel.org/all/2273af20d844bd618c6a90b57e639700328ebf7f.1473695704.git.robin.murphy@arm.com/
> 
> Presumably it had to use the iommu_get_domain_for_dev() instead of
> iommu_get_dma_domain() to support ARM 32 arm-iommu. Ie it is poking
> into the arm-iommu owned domain as well. VFIO just ended being the
> same flow
> 
> > even before the support un VFIO use case which came later on. So
> > using the "unmanaged" terminology sounds improper to me, at least.
> > Couldn't we use a parent/child terminology as used in the past in
> 
> No objection to a better name...
> 
> Actually how about if we write it like this? Robin would you be
> happier? I think it much more clearly explains why this function is
> special within our single domain attachment model.
> 
> "get_unmanaged_msi_domain" seems like a much more narrowly specific to
> the purpose name.
> 
> int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
> {
> 	struct device *dev = msi_desc_to_dev(desc);
> 	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
> 	struct iommu_dma_msi_page *msi_page;
> 	static DEFINE_MUTEX(msi_prepare_lock); /* see below */
> 
> 	desc->iommu_cookie = NULL;
> 
> 	/*
> 	 * This probably shouldn't happen as the ARM32 systems should only have
> 	 * NULL if arm-iommu has been disconnected during setup/destruction.
> 	 * Assume it is an identity domain.
> 	 */
> 	if (!domain)
> 		return 0;
> 
> 	/* Caller is expected to use msi_addr for the page */
> 	if (domain->type == IOMMU_DOMAIN_IDENTITY)
> 		return 0;
> 
> 	/*
> 	 * The current domain is some driver opaque thing. We assume the
> 	 * driver/user knows what it is doing regarding ARM ITS MSI pages and we
> 	 * want to try to install the page into some kind of kernel owned
> 	 * unmanaged domain. Eg for nesting this will install the ITS page into
> 	 * the S2 domain and then we assume that the S1 domain has independently
> 	 * made it mapped at the same address.
> 	 */
> 	// FIXME wrap in a function
> 	if (domain->type != IOMMU_DOMAIN_UNMANAGED &&
> 	    domain->ops->get_unmanged_msi_domain)
> 		domain = domain->ops->get_unmanged_msi_domain(domain);
> 
> 	if (!domain || domain->type != IOMMU_DOMAIN_UNMANAGED)
> 		return -EINVAL;
> 
> 	// ???
> 	if (!domain->iova_cookie)
> 		return 0;
> 
> 	/*
> 	 * In fact the whole prepare operation should already be serialised by
> 	 * irq_domain_mutex further up the callchain, but that's pretty subtle
> 	 * on its own, so consider this locking as failsafe documentation...
> 	 */
> 	mutex_lock(&msi_prepare_lock);
> 	msi_page = iommu_dma_get_msi_page(dev, msi_addr, domain);
> 	mutex_unlock(&msi_prepare_lock);
> 
> 	msi_desc_set_iommu_cookie(desc, msi_page);
> 
> 	if (!msi_page)
> 		return -ENOMEM;
> 	return 0;
> }
> 
> Jason
  
Robin Murphy March 16, 2023, 6:42 p.m. UTC | #16
On 16/03/2023 1:21 am, Nicolin Chen wrote:
> Hi Robin,
> 
> How do you think about Jason's proposal below? I'd like to see
> us come to an agreement on an acceptable solution...

I think it's so thoroughly broken that I suspect Cunningham's law might
be at play, but fine, you win :) Hopefully it's sufficiently obvious how
the other pieces would fit around the patch below. FWIW I'd still prefer
a generic domain->s2_domain pointer rather than any op at all, but I'm
happy enough with this compromise.

Thanks,
Robin.

----->8-----
Subject: [PATCH] iommu/dma: Support MSIs through nested domains

Currently, iommu-dma is the only place outside of IOMMUFD and drivers
which might need to be aware of the stage 2 domain encapsulated within
a nested domain. This would be in the legacy-VFIO-style case where we're
using host-managed MSIs with an identity mapping at stage 1, where it is
the underlying stage 2 domain which owns an MSI cookie and holds the
corresponding dynamic mappings. Hook up the new op to resolve what we
need from a nested domain.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---
  drivers/iommu/dma-iommu.c | 18 ++++++++++++++++--
  1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 99b2646cb5c7..66b0d5fa49f8 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1642,6 +1642,20 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
  	return NULL;
  }
  
+/*
+ * Nested domains may not have an MSI cookie or accept mappings, but they may
+ * be related to a domain which does, so we let them tell us what they need.
+ */
+static struct iommu_domain *iommu_dma_get_msi_mapping_domain(struct device *dev)
+{
+	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+
+	if (domain && domain->type == IOMMU_DOMAIN_NESTED &&
+	    domain->ops->get_msi_mapping_domain)
+		domain = domain->ops->get_msi_mapping_domain(domain);
+	return domain;
+}
+
  /**
   * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain
   * @desc: MSI descriptor, will store the MSI page
@@ -1652,7 +1666,7 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
  int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
  {
  	struct device *dev = msi_desc_to_dev(desc);
-	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+	struct iommu_domain *domain = iommu_dma_get_msi_mapping_domain(dev);
  	struct iommu_dma_msi_page *msi_page;
  	static DEFINE_MUTEX(msi_prepare_lock); /* see below */
  
@@ -1685,7 +1699,7 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
  void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
  {
  	struct device *dev = msi_desc_to_dev(desc);
-	const struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+	const struct iommu_domain *domain = iommu_dma_get_msi_mapping_domain(dev);
  	const struct iommu_dma_msi_page *msi_page;
  
  	msi_page = msi_desc_get_iommu_cookie(desc);
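
Not part of the patch above, but for illustration, a driver-side
implementation could look roughly like the sketch below. The structure and
field names are hypothetical; the assumption is simply that a nesting driver
records the stage-2 (unmanaged) parent it was created against, i.e. the
domain that owns the msi_cookie:

struct my_nested_domain {
	struct iommu_domain domain;		/* IOMMU_DOMAIN_NESTED */
	struct iommu_domain *s2_parent;		/* IOMMU_DOMAIN_UNMANAGED */
};

static struct iommu_domain *
my_nested_get_msi_mapping_domain(struct iommu_domain *domain)
{
	struct my_nested_domain *nested =
		container_of(domain, struct my_nested_domain, domain);

	/* Hand iommu-dma the kernel-owned S2 domain holding the MSI cookie */
	return nested->s2_parent;
}

static const struct iommu_domain_ops my_nested_domain_ops = {
	/* ... other nested-domain ops ... */
	.get_msi_mapping_domain	= my_nested_get_msi_mapping_domain,
};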
  
Nicolin Chen March 16, 2023, 7:51 p.m. UTC | #17
Hi Shameer/Eric,

On Fri, Mar 10, 2023 at 04:30:03PM +0000, Shameerali Kolothum Thodi wrote:
> 
> > -----Original Message-----
> > From: Jason Gunthorpe [mailto:jgg@nvidia.com]
> > Sent: 10 March 2023 16:21
> > To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>
> > Cc: Eric Auger <eric.auger@redhat.com>; Nicolin Chen
> > <nicolinc@nvidia.com>; robin.murphy@arm.com; will@kernel.org;
> > kevin.tian@intel.com; baolu.lu@linux.intel.com; joro@8bytes.org;
> > jean-philippe@linaro.org; linux-arm-kernel@lists.infradead.org;
> > iommu@lists.linux.dev; linux-kernel@vger.kernel.org
> > Subject: Re: [PATCH v1 01/14] iommu: Add iommu_get_unmanaged_domain
> > helper
> >
> > On Fri, Mar 10, 2023 at 04:07:38PM +0000, Shameerali Kolothum Thodi
> > wrote:
> > > >
> > > > > https://lore.kernel.org/linux-arm-kernel/20220420164836.1181-5-shameerali.kolothum.thodi@huawei.com/
> > > >
> > > > So "boot configuration" is more like "don't change the RIDs"? I.e. don't
> > > > enable SR-IOV?
> > >
> > > Yes. Don't think it will work with SR-IOV if you can't guarantee the
> > > RMR-specified SID.
> >
> > So I think we are probably good then, because vSR-IOV is already not
> > supported by qemu, so it is impossible for a VM to change the PCI
> > configuration in a way that would alter the RID-to-SID mapping?
> >
> 
> Provided we fix the issue mentioned by Eric. This was discussed here previously,
> 
> https://lore.kernel.org/linux-arm-kernel/bb3688c7-8f42-039e-e22f-6529078da97d@redhat.com/

Have we fixed the issue? I saw Lorenzo replying in that thread:
https://lore.kernel.org/linux-arm-kernel/Yi8nV8H4Jjlpadmk@lpieralisi/

Or, what's remaining here regarding this topic? Is it a blocker?

Thanks
Nic
  
Shameerali Kolothum Thodi March 16, 2023, 7:56 p.m. UTC | #18
> -----Original Message-----
> From: Nicolin Chen [mailto:nicolinc@nvidia.com]
> Sent: 16 March 2023 19:51
> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;
> Eric Auger <eric.auger@redhat.com>
> Cc: Jason Gunthorpe <jgg@nvidia.com>; robin.murphy@arm.com;
> will@kernel.org; kevin.tian@intel.com; baolu.lu@linux.intel.com;
> joro@8bytes.org; jean-philippe@linaro.org;
> linux-arm-kernel@lists.infradead.org; iommu@lists.linux.dev;
> linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v1 01/14] iommu: Add iommu_get_unmanaged_domain
> helper
> 
> Hi Shameer/Eric,
> 
> On Fri, Mar 10, 2023 at 04:30:03PM +0000, Shameerali Kolothum Thodi
> wrote:
> >
> > > -----Original Message-----
> > > From: Jason Gunthorpe [mailto:jgg@nvidia.com]
> > > Sent: 10 March 2023 16:21
> > > To: Shameerali Kolothum Thodi
> <shameerali.kolothum.thodi@huawei.com>
> > > Cc: Eric Auger <eric.auger@redhat.com>; Nicolin Chen
> > > <nicolinc@nvidia.com>; robin.murphy@arm.com; will@kernel.org;
> > > kevin.tian@intel.com; baolu.lu@linux.intel.com; joro@8bytes.org;
> > > jean-philippe@linaro.org; linux-arm-kernel@lists.infradead.org;
> > > iommu@lists.linux.dev; linux-kernel@vger.kernel.org
> > > Subject: Re: [PATCH v1 01/14] iommu: Add
> iommu_get_unmanaged_domain
> > > helper
> > >
> > > On Fri, Mar 10, 2023 at 04:07:38PM +0000, Shameerali Kolothum Thodi
> > > wrote:
> > > > >
> > >
> > > > > > https://lore.kernel.org/linux-arm-kernel/20220420164836.1181-5-shameerali.kolothum.thodi@huawei.com/
> > > > >
> > > > > So "boot configuration" is more like "don't change the RIDs"? I.e.
> > > > > don't enable SR-IOV?
> > > >
> > > > Yes. Don't think it will work with SR-IOV if you can't guarantee
> > > > the RMR-specified SID.
> > >
> > > So I think we are probably good then, because vSR-IOV is already not
> > > supported by qemu, so it is impossible for a VM to change the PCI
> > > configuration in a way that would alter the RID-to-SID mapping?
> > >
> >
> > Provided we fix the issue mentioned by Eric. This was discussed here
> > previously,
> >
> > https://lore.kernel.org/linux-arm-kernel/bb3688c7-8f42-039e-e22f-6529078da97d@redhat.com/
> 
> Have we fixed the issue? I saw Lorenzo replying in that thread:
> https://lore.kernel.org/linux-arm-kernel/Yi8nV8H4Jjlpadmk@lpieralisi/
> 
> Or, what's remaining here regarding this topic? Is it a blocker?

[+Lorenzo]

Not sure it is fixed yet. Also, assuming we take the RMR path, do we plan to
support DT-based guests at all?

Thanks,
Shameer
  
Nicolin Chen March 16, 2023, 8:01 p.m. UTC | #19
On Thu, Mar 16, 2023 at 06:42:07PM +0000, Robin Murphy wrote:
> 
> On 16/03/2023 1:21 am, Nicolin Chen wrote:
> > Hi Robin,
> > 
> > What do you think about Jason's proposal below? I'd like to see
> > us come to an agreement on an acceptable solution...
> 
> I think it's so thoroughly broken that I suspect Cunningham's law might
> be at play, but fine, you win :) Hopefully it's sufficiently obvious how
> the other pieces would fit around the patch below. FWIW I'd still prefer
> a generic domain->s2_domain pointer rather than any op at all, but I'm
> happy enough with this compromise.

Oh, I appreciate that! Looks like we can move on with a v2 :)

I will include this patch replacing mine in the next version.

Thanks!
Nic

> ----->8-----
> Subject: [PATCH] iommu/dma: Support MSIs through nested domains
> 
> Currently, iommu-dma is the only place outside of IOMMUFD and drivers
> which might need to be aware of the stage 2 domain encapsulated within
> a nested domain. This would be in the legacy-VFIO-style case where we're
> using host-managed MSIs with an identity mapping at stage 1, where it is
> the underlying stage 2 domain which owns an MSI cookie and holds the
> corresponding dynamic mappings. Hook up the new op to resolve what we
> need from a nested domain.
> 
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
>  drivers/iommu/dma-iommu.c | 18 ++++++++++++++++--
>  1 file changed, 16 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index 99b2646cb5c7..66b0d5fa49f8 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -1642,6 +1642,20 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
>        return NULL;
>  }
> 
> +/*
> + * Nested domains may not have an MSI cookie or accept mappings, but they may
> + * be related to a domain which does, so we let them tell us what they need.
> + */
> +static struct iommu_domain *iommu_dma_get_msi_mapping_domain(struct device *dev)
> +{
> +       struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
> +
> +       if (domain && domain->type == IOMMU_DOMAIN_NESTED &&
> +           domain->ops->get_msi_mapping_domain)
> +               domain = domain->ops->get_msi_mapping_domain(domain);
> +       return domain;
> +}
> +
>  /**
>   * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain
>   * @desc: MSI descriptor, will store the MSI page
> @@ -1652,7 +1666,7 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
>  int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
>  {
>        struct device *dev = msi_desc_to_dev(desc);
> -       struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
> +       struct iommu_domain *domain = iommu_dma_get_msi_mapping_domain(dev);
>        struct iommu_dma_msi_page *msi_page;
>        static DEFINE_MUTEX(msi_prepare_lock); /* see below */
> 
> @@ -1685,7 +1699,7 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
>  void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
>  {
>        struct device *dev = msi_desc_to_dev(desc);
> -       const struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
> +       const struct iommu_domain *domain = iommu_dma_get_msi_mapping_domain(dev);
>        const struct iommu_dma_msi_page *msi_page;
> 
>        msi_page = msi_desc_get_iommu_cookie(desc);
> --
> 2.39.2.101.g768bb238c484.dirty
>
  
Jason Gunthorpe March 20, 2023, 12:51 p.m. UTC | #20
On Thu, Mar 16, 2023 at 06:42:07PM +0000, Robin Murphy wrote:
> On 16/03/2023 1:21 am, Nicolin Chen wrote:
> > Hi Robin,
> > 
> > What do you think about Jason's proposal below? I'd like to see
> > us come to an agreement on an acceptable solution...
> 
> I think it's so thoroughly broken that I suspect Cunningham's law might
> be at play, but fine, you win :)

Not to belabor this, but what was wrong with it?

Jason
  
Eric Auger March 22, 2023, 3:44 p.m. UTC | #21
Hi,

On 3/16/23 20:56, Shameerali Kolothum Thodi wrote:
>
>> -----Original Message-----
>> From: Nicolin Chen [mailto:nicolinc@nvidia.com]
>> Sent: 16 March 2023 19:51
>> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;
>> Eric Auger <eric.auger@redhat.com>
>> Cc: Jason Gunthorpe <jgg@nvidia.com>; robin.murphy@arm.com;
>> will@kernel.org; kevin.tian@intel.com; baolu.lu@linux.intel.com;
>> joro@8bytes.org; jean-philippe@linaro.org;
>> linux-arm-kernel@lists.infradead.org; iommu@lists.linux.dev;
>> linux-kernel@vger.kernel.org
>> Subject: Re: [PATCH v1 01/14] iommu: Add iommu_get_unmanaged_domain
>> helper
>>
>> Hi Shameer/Eric,
>>
>> On Fri, Mar 10, 2023 at 04:30:03PM +0000, Shameerali Kolothum Thodi
>> wrote:
>>>
>>>> -----Original Message-----
>>>> From: Jason Gunthorpe [mailto:jgg@nvidia.com]
>>>> Sent: 10 March 2023 16:21
>>>> To: Shameerali Kolothum Thodi
>> <shameerali.kolothum.thodi@huawei.com>
>>>> Cc: Eric Auger <eric.auger@redhat.com>; Nicolin Chen
>>>> <nicolinc@nvidia.com>; robin.murphy@arm.com; will@kernel.org;
>>>> kevin.tian@intel.com; baolu.lu@linux.intel.com; joro@8bytes.org;
>>>> jean-philippe@linaro.org; linux-arm-kernel@lists.infradead.org;
>>>> iommu@lists.linux.dev; linux-kernel@vger.kernel.org
>>>> Subject: Re: [PATCH v1 01/14] iommu: Add
>> iommu_get_unmanaged_domain
>>>> helper
>>>>
>>>> On Fri, Mar 10, 2023 at 04:07:38PM +0000, Shameerali Kolothum Thodi
>>>> wrote:
>>>>>>> https://lore.kernel.org/linux-arm-kernel/20220420164836.1181-5-shameerali.kolothum.thodi@huawei.com/
>>>>>> So "boot configuration" is more like "don't change the RIDs"? I.e.
>>>>>> don't enable SR-IOV?
>>>>> Yes. Don't think it will work with SR-IOV if you can't guarantee
>>>>> the RMR-specified SID.
>>>> So I think we are probably good then, because vSR-IOV is already not
>>>> supported by qemu, so it is impossible for a VM to change the PCI
>>>> configuration in a way that would alter the RID-to-SID mapping?
>>>>
>>> Provided we fix the issue mentioned by Eric. This was discussed here
>>> previously,
>>>
>>> https://lore.kernel.org/linux-arm-kernel/bb3688c7-8f42-039e-e22f-6529078da97d@redhat.com/
>> Have we fixed the issue? I saw Lorenzo replying in that thread:
>> https://lore.kernel.org/linux-arm-kernel/Yi8nV8H4Jjlpadmk@lpieralisi/
>>
>> Or, what's remaining here regarding this topic? Is it a blocker?
> [+Lorenzo]

I am not aware of any change in the situation.

Thanks

Eric
>
> Not sure it is fixed yet. Also, assuming we take the RMR path, do we plan to
> support DT-based guests at all?
>
> Thanks,
> Shameer
>
  
Eric Auger March 22, 2023, 4:07 p.m. UTC | #22
Hi Jason,

On 3/10/23 18:03, Jason Gunthorpe wrote:
> On Fri, Mar 10, 2023 at 04:30:03PM +0000, Shameerali Kolothum Thodi wrote:
>>
>>> -----Original Message-----
>>> From: Jason Gunthorpe [mailto:jgg@nvidia.com]
>>> Sent: 10 March 2023 16:21
>>> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>
>>> Cc: Eric Auger <eric.auger@redhat.com>; Nicolin Chen
>>> <nicolinc@nvidia.com>; robin.murphy@arm.com; will@kernel.org;
>>> kevin.tian@intel.com; baolu.lu@linux.intel.com; joro@8bytes.org;
>>> jean-philippe@linaro.org; linux-arm-kernel@lists.infradead.org;
>>> iommu@lists.linux.dev; linux-kernel@vger.kernel.org
>>> Subject: Re: [PATCH v1 01/14] iommu: Add iommu_get_unmanaged_domain
>>> helper
>>>
>>> On Fri, Mar 10, 2023 at 04:07:38PM +0000, Shameerali Kolothum Thodi
>>> wrote:
>>>>>> https://lore.kernel.org/linux-arm-kernel/20220420164836.1181-5-shameerali.kolothum.thodi@huawei.com/
>>>>> So "boot configuration" is more like "don't change the RIDs"? I.e. don't
>>>>> enable SR-IOV?
>>>> Yes. Don't think it will work with SR-IOV if you can't guarantee the
>>>> RMR-specified SID.
>>> So I think we are probably good then, because vSR-IOV is already not
>>> supported by qemu, so it is impossible for a VM to change the PCI
>>> configuration in a way that would alter the RID-to-SID mapping?
>>>
>> Provided we fix the issue mentioned by Eric. This was discussed here previously,
>>
>> https://lore.kernel.org/linux-arm-kernel/bb3688c7-8f42-039e-e22f-6529078da97d@redhat.com/
> Ah, I see, so that we don't renumber the buses during PCI discovery...
>
> It seems like Eric's issue is overly broad if we just want to block
> RID reassignment that doesn't impact MMIO layout.
IORT spec says

"
If reserved memory regions are present, the OS must preserve PCIe
configuration performed by the boot firmware. This preservation is required
to ensure functional continuity of the endpoints that are using the reserved
memory regions. Therefore, RMR nodes must be supported by the inclusion of
the PCI Firmware defined _DSM for ignoring PCI boot configuration,
Function 5, in the ACPI device object of the PCIe host bridge in ACPI
namespace. The _DSM method should return a value of 0 to indicate that the
OS must honour the PCI configuration that the firmware has done at boot
time. See [PCIFW] for more details on this _DSM method.
"

Enforcing preservation was attempted in the past in QEMU and then
reverted due to the aforementioned bug.

qemu commit: 40c3472a29  Revert "acpi/gpex: Inform os to keep firmware
resource map"

So if we want to rely on RMRs and re-introduce that change I don't see
how we can avoid fixing the kernel issue.
>
> But, still, why do we care about this?
>
> The vIOMMU should virtualize the vSIDs right? So why does qemu give a
> vSID list to the guest anyhow? Shouldn't the guest use an algorithmic
> calculation from the vRID so that qemu can reverse it to the correct
> vPCI device and thus the correct vfio_device and then dev id in the
> iommu_domain?
I don't understand how this changes the above picture?


Thanks

Eric
>
> Jason
>
  
Jason Gunthorpe March 22, 2023, 5:02 p.m. UTC | #23
On Wed, Mar 22, 2023 at 05:07:39PM +0100, Eric Auger wrote:

> > It seems like Eric's issue is overly broad if we just want to block
> > RID reassignment that doesn't impact MMIO layout.
> IORT spec says
> 
> "
> If reserved memory regions are present, the OS must preserve PCIe
> configuration performed by the boot
> firmware. This preservation is required to ensure functional continuity
> of the endpoints that are using the reserved
> memory regions. Therefore, RMR nodes must be supported by the inclusion
> of the PCI Firmware defined _DSM
> for ignoring PCI boot configuration, Function 5, in the ACPI device
> object of the PCIe host bridge in ACPI
> namespace. The _DSM method should return a value of 0 to indicate that
> the OS must honour the PCI
> configuration that the firmware has done at boot time. See [PCIFW] for
> more details on this _DSM method.
> "

I would say this spec language is overly broad. If the FW knows the
reserved memory regions it creates are not sensitive to PCI layout
then it should not be forced to set this flag.

> > But, still, why do we care about this?
> >
> > The vIOMMU should virtualize the vSIDs right? So why does qemu give a
> > vSID list to the guest anyhow? Shouldn't the guest use an algorithmic
> > calculation from the vRID so that qemu can reverse it to the correct
> > vPCI device and thus the correct vfio_device and then dev id in the
> > iommu_domain?
> I don't understand how this changes the above picture?

We are forced to use RMR because of the hacky GIC ITS stuff.

ITS placement is not sensitive to PCI layout.

ITS is not sensitive to bus numbers/etc.

The vSID-to-dev_id mapping should also be taken care of by QEMU even if bus
numbers change, and doesn't need to be fixed.

So let's have a reason why we need to do all this weird stuff beyond "the
spec says so".

If there is no actual functional issue we should not restrict the
guest and provide RMR without the DSM method. Someone should go and
update the spec if this offends them :)

Jason
  
Eric Auger March 22, 2023, 5:41 p.m. UTC | #24
Hi Jason,

On 3/22/23 18:02, Jason Gunthorpe wrote:
> On Wed, Mar 22, 2023 at 05:07:39PM +0100, Eric Auger wrote:
>
>>> It seems like Eric's issue is overly broad if we just want to block
>>> RID reassignment that doesn't impact MMIO layout.
>> IORT spec says
>>
>> "
>> If reserved memory regions are present, the OS must preserve PCIe
>> configuration performed by the boot firmware. This preservation is
>> required to ensure functional continuity of the endpoints that are using
>> the reserved memory regions. Therefore, RMR nodes must be supported by the
>> inclusion of the PCI Firmware defined _DSM for ignoring PCI boot
>> configuration, Function 5, in the ACPI device object of the PCIe host
>> bridge in ACPI namespace. The _DSM method should return a value of 0 to
>> indicate that the OS must honour the PCI configuration that the firmware
>> has done at boot time. See [PCIFW] for more details on this _DSM method.
>> "
> I would say this spec language is overly broad. If the FW knows the
> reserved memory regions it creates are not sensitive to PCI layout
> then it should not be forced to set this flag.

But do we have any guarantee the bus numbers can't change? I thought the
guest was allowed to re-number at will. Thinking about it further, shouldn't
all RID-to-ID mappings be affected by this concern, not only the RID-to-RMR
ones? What am I missing?
>
>>> But, still, why do we care about this?
>>>
>>> The vIOMMU should virtualize the vSIDs right? So why does qemu give a
>>> vSID list to the guest anyhow? Shouldn't the guest use an algorithmic
>>> calculation from the vRID so that qemu can reverse it to the correct
>>> vPCI device and thus the correct vfio_device and then dev id in the
>>> iommu_domain?
>> I don't understand how this changes the above picture?
> We are forced to use RMR because of the hacky GIC ITS stuff.
Well, we are not obliged to use RMRs. My first revisions did not use them
and created a non-direct S1 mapping. This is just a convenience that
simplifies the integration and was nicely suggested by Jean.
>
> ITS placement is not sensitive to PCI layout.
>
> ITS is not sensitive to bus numbers/etc.
>
> vSID to dev_id should also be taken care of by QEMU even if bus
> numbers change and doesn't need to be fixed.
agreed, hence the above question.
>
> So let's have a reason why we need to do all this weird stuff beyond
> the spec says so.
>
> If there is no actual functional issue we should not restrict the
> guest and provide RMR without the DSM method. Someone should go and
> update the spec if this offends them :)
>
> Jason
>
Thanks

Eric
  
Jason Gunthorpe March 22, 2023, 6:07 p.m. UTC | #25
On Wed, Mar 22, 2023 at 06:41:42PM +0100, Eric Auger wrote:
> > I would say this spec language is overly broad. If the FW knows the
> > reserved memory regions it creates are not sensitive to PCI layout
> > then it should not be forced to set this flag.
> 
> But do we have any guarantee the bus numbers can't change? I thought the
> guest was allowed to re-number at will. Thinking about it further, shouldn't
> all RID-to-ID mappings be affected by this concern, not only the RID-to-RMR
> ones? What am I missing?

Bus number changing is allowed, but qemu should not be sensitive to
this.

qemu always knows the current guest-assigned bus number for the vPCI device,
since it traps the bus number changes like anything else.

Thus when an STE is configured, qemu has access to accurate data to convert
the vSID to the vPCI device and vfio_device, even if the bus numbers have
changed since boot.
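
To make the "algorithmic calculation" concrete, a sketch of the kind of fixed
encoding meant here, where the vSID is simply the current BDF of the vPCI
device (an assumption for illustration, not existing QEMU code):

#include <stdint.h>

/* vSID == vRID: recomputed from the device's current bus/devfn, so there is
 * no table that can go stale when the guest renumbers buses. */
static inline uint32_t vrid_to_vsid(uint8_t bus, uint8_t devfn)
{
	return ((uint32_t)bus << 8) | devfn;
}

static inline void vsid_to_vrid(uint32_t vsid, uint8_t *bus, uint8_t *devfn)
{
	*bus   = (vsid >> 8) & 0xff;
	*devfn = vsid & 0xff;
}

With such an encoding, a vSID trapped from an STE write can always be
reversed to the vPCI device and its vfio_device, regardless of when the guest
last renumbered the bus.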

> > We are forced to use RMR because of the hacky GIC ITS stuff.
> Well, we are not obliged to use RMRs. My first revisions did not use them
> and created a non-direct S1 mapping. This is just a convenience that
> simplifies the integration and was nicely suggested by Jean.

I understand it is ARM's architectural preference...

Personally I would prefer the vGIC model include the ITS page itself
and that the guest put the ITS page into the S1 mapping in the usual
way. But we are a long way away from that...

Jason
  

Patch

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 99b2646cb5c7..6b0409d0ff85 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -31,6 +31,7 @@ 
 #include <linux/vmalloc.h>
 
 #include "dma-iommu.h"
+#include "iommu-priv.h"
 
 struct iommu_dma_msi_page {
 	struct list_head	list;
@@ -1652,7 +1653,7 @@  static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
 int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
 {
 	struct device *dev = msi_desc_to_dev(desc);
-	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+	struct iommu_domain *domain = iommu_get_unmanaged_domain(dev);
 	struct iommu_dma_msi_page *msi_page;
 	static DEFINE_MUTEX(msi_prepare_lock); /* see below */
 
@@ -1685,7 +1686,7 @@  int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
 void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
 {
 	struct device *dev = msi_desc_to_dev(desc);
-	const struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+	const struct iommu_domain *domain = iommu_get_unmanaged_domain(dev);
 	const struct iommu_dma_msi_page *msi_page;
 
 	msi_page = msi_desc_get_iommu_cookie(desc);
diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h
index a6e694f59f64..da8044da9ad8 100644
--- a/drivers/iommu/iommu-priv.h
+++ b/drivers/iommu/iommu-priv.h
@@ -15,6 +15,21 @@  static inline const struct iommu_ops *dev_iommu_ops(struct device *dev)
 	return dev->iommu->iommu_dev->ops;
 }
 
+static inline struct iommu_domain *iommu_get_unmanaged_domain(struct device *dev)
+{
+	const struct iommu_ops *ops;
+
+	if (!dev->iommu || !dev->iommu->iommu_dev)
+		goto attached_domain;
+
+	ops = dev_iommu_ops(dev);
+	if (ops->get_unmanaged_domain)
+		return ops->get_unmanaged_domain(dev);
+
+attached_domain:
+	return iommu_get_domain_for_dev(dev);
+}
+
 int iommu_group_replace_domain(struct iommu_group *group,
 			       struct iommu_domain *new_domain);
 
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 080278c8154d..76c65cc4fc15 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -275,6 +275,8 @@  struct iommu_ops {
 						  struct iommu_domain *parent,
 						  const void *user_data);
 
+	struct iommu_domain *(*get_unmanaged_domain)(struct device *dev);
+
 	struct iommu_device *(*probe_device)(struct device *dev);
 	void (*release_device)(struct device *dev);
 	void (*probe_finalize)(struct device *dev);