[20/21] ARM: dma-mapping: split out arch_dma_mark_clean() helper

Message ID 20230327121317.4081816-21-arnd@kernel.org
State New
Headers
Series dma-mapping: unify support for cache flushes |

Commit Message

Arnd Bergmann March 27, 2023, 12:13 p.m. UTC
  From: Arnd Bergmann <arnd@arndb.de>

The arm version of the arch_sync_dma_for_cpu() function annotates pages as
PG_dcache_clean after a DMA, but no other architecture does this here. On
ia64, the same thing is done in arch_sync_dma_for_cpu(), so it makes sense
to use the same hook in order to have identical arch_sync_dma_for_cpu()
semantics as all other architectures.

Splitting this out has multiple effects:

 - for dma-direct, this now gets called after arch_sync_dma_for_cpu()
   for DMA_FROM_DEVICE mappings, but not for DMA_BIDIRECTIONAL. While
   it would not be harmful to keep doing it for bidirectional mappings,
   those are apparently not used in any callers that care about the flag.

 - Since arm has its own dma-iommu abstraction, this now also needs to
   call the same function, so the calls are added there to mirror the
   dma-direct version.

 - Like dma-direct, the dma-iommu version now marks the dcache clean
   for both coherent and noncoherent devices after a DMA, but it only
   does this for DMA_FROM_DEVICE, not DMA_BIDIRECTIONAL.

[ HELP NEEDED: can anyone confirm that it is a correct assumption
  on arm that a cache-coherent device writing to a page always results
  in it being in a PG_dcache_clean state like on ia64, or can a device
  write directly into the dcache?]

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm/Kconfig          |  1 +
 arch/arm/mm/dma-mapping.c | 71 +++++++++++++++++++++++----------------
 2 files changed, 43 insertions(+), 29 deletions(-)
  

Comments

Robin Murphy March 27, 2023, 12:48 p.m. UTC | #1
On 2023-03-27 13:13, Arnd Bergmann wrote:
> From: Arnd Bergmann <arnd@arndb.de>
> 
> The arm version of the arch_sync_dma_for_cpu() function annotates pages as
> PG_dcache_clean after a DMA, but no other architecture does this here. On
> ia64, the same thing is done in arch_sync_dma_for_cpu(), so it makes sense
> to use the same hook in order to have identical arch_sync_dma_for_cpu()
> semantics as all other architectures.
> 
> Splitting this out has multiple effects:
> 
>   - for dma-direct, this now gets called after arch_sync_dma_for_cpu()
>     for DMA_FROM_DEVICE mappings, but not for DMA_BIDIRECTIONAL. While
>     it would not be harmful to keep doing it for bidirectional mappings,
>     those are apparently not used in any callers that care about the flag.
> 
>   - Since arm has its own dma-iommu abstraction, this now also needs to
>     call the same function, so the calls are added there to mirror the
>     dma-direct version.
> 
>   - Like dma-direct, the dma-iommu version now marks the dcache clean
>     for both coherent and noncoherent devices after a DMA, but it only
>     does this for DMA_FROM_DEVICE, not DMA_BIDIRECTIONAL.
> 
> [ HELP NEEDED: can anyone confirm that it is a correct assumption
>    on arm that a cache-coherent device writing to a page always results
>    in it being in a PG_dcache_clean state like on ia64, or can a device
>    write directly into the dcache?]

In AMBA at least, if a snooping write hits in a cache then the data is 
most likely going to get routed directly into that cache. If it has 
write-back write-allocate attributes it could also land in any cache 
along its normal path to RAM; it wouldn't have to go all the way.

Hence all the fun we have where treating a coherent device as 
non-coherent can still be almost as broken as the other way round :)

Cheers,
Robin.

> Signed-off-by: Arnd Bergmann <arnd@arndb.de>
> ---
>   arch/arm/Kconfig          |  1 +
>   arch/arm/mm/dma-mapping.c | 71 +++++++++++++++++++++++----------------
>   2 files changed, 43 insertions(+), 29 deletions(-)
> 
> diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> index e24a9820e12f..125d58c54ab1 100644
> --- a/arch/arm/Kconfig
> +++ b/arch/arm/Kconfig
> @@ -7,6 +7,7 @@ config ARM
>   	select ARCH_HAS_BINFMT_FLAT
>   	select ARCH_HAS_CURRENT_STACK_POINTER
>   	select ARCH_HAS_DEBUG_VIRTUAL if MMU
> +	select ARCH_HAS_DMA_MARK_CLEAN if MMU
>   	select ARCH_HAS_DMA_WRITE_COMBINE if !ARM_DMA_MEM_BUFFERABLE
>   	select ARCH_HAS_ELF_RANDOMIZE
>   	select ARCH_HAS_FORTIFY_SOURCE
> diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
> index cc702cb27ae7..b703cb83d27e 100644
> --- a/arch/arm/mm/dma-mapping.c
> +++ b/arch/arm/mm/dma-mapping.c
> @@ -665,6 +665,28 @@ static void dma_cache_maint(phys_addr_t paddr,
>   	} while (left);
>   }
>   
> +/*
> + * Mark the D-cache clean for these pages to avoid extra flushing.
> + */
> +void arch_dma_mark_clean(phys_addr_t paddr, size_t size)
> +{
> +	unsigned long pfn = PFN_UP(paddr);
> +	unsigned long off = paddr & (PAGE_SIZE - 1);
> +	size_t left = size;
> +
> +	if (size < PAGE_SIZE)
> +		return;
> +
> +	if (off)
> +		left -= PAGE_SIZE - off;
> +
> +	while (left >= PAGE_SIZE) {
> +		struct page *page = pfn_to_page(pfn++);
> +		set_bit(PG_dcache_clean, &page->flags);
> +		left -= PAGE_SIZE;
> +	}
> +}
> +
>   static bool arch_sync_dma_cpu_needs_post_dma_flush(void)
>   {
>   	if (IS_ENABLED(CONFIG_CPU_V6) ||
> @@ -715,24 +737,6 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
>   		outer_inv_range(paddr, paddr + size);
>   		dma_cache_maint(paddr, size, dmac_inv_range);
>   	}
> -
> -	/*
> -	 * Mark the D-cache clean for these pages to avoid extra flushing.
> -	 */
> -	if (dir != DMA_TO_DEVICE && size >= PAGE_SIZE) {
> -		unsigned long pfn = PFN_UP(paddr);
> -		unsigned long off = paddr & (PAGE_SIZE - 1);
> -		size_t left = size;
> -
> -		if (off)
> -			left -= PAGE_SIZE - off;
> -
> -		while (left >= PAGE_SIZE) {
> -			struct page *page = pfn_to_page(pfn++);
> -			set_bit(PG_dcache_clean, &page->flags);
> -			left -= PAGE_SIZE;
> -		}
> -	}
>   }
>   
>   #ifdef CONFIG_ARM_DMA_USE_IOMMU
> @@ -1294,6 +1298,17 @@ static int arm_iommu_map_sg(struct device *dev, struct scatterlist *sg,
>   	return -EINVAL;
>   }
>   
> +static void arm_iommu_sync_dma_for_cpu(phys_addr_t phys, size_t len,
> +				       enum dma_data_direction dir,
> +				       bool dma_coherent)
> +{
> +	if (!dma_coherent)
> +		arch_sync_dma_for_cpu(phys, s->length, dir);
> +
> +	if (dir == DMA_FROM_DEVICE)
> +		arch_dma_mark_clean(phys, s->length);
> +}
> +
>   /**
>    * arm_iommu_unmap_sg - unmap a set of SG buffers mapped by dma_map_sg
>    * @dev: valid struct device pointer
> @@ -1316,8 +1331,9 @@ static void arm_iommu_unmap_sg(struct device *dev,
>   		if (sg_dma_len(s))
>   			__iommu_remove_mapping(dev, sg_dma_address(s),
>   					       sg_dma_len(s));
> -		if (!dev->dma_coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
> -			arch_sync_dma_for_cpu(sg_phys(s), s->length, dir);
> +		if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
> +			arm_iommu_sync_dma_for_cpu(sg_phys(s), s->length, dir,
> +						   dev->dma_coherent);
>   	}
>   }
>   
> @@ -1335,12 +1351,9 @@ static void arm_iommu_sync_sg_for_cpu(struct device *dev,
>   	struct scatterlist *s;
>   	int i;
>   
> -	if (dev->dma_coherent)
> -		return;
> -
>   	for_each_sg(sg, s, nents, i)
> -		arch_sync_dma_for_cpu(sg_phys(s), s->length, dir);
> -
> +		arm_iommu_sync_dma_for_cpu(sg_phys(s), s->length, dir,
> +					   dev->dma_coherent);
>   }
>   
>   /**
> @@ -1425,9 +1438,9 @@ static void arm_iommu_unmap_page(struct device *dev, dma_addr_t handle,
>   	if (!iova)
>   		return;
>   
> -	if (!dev->dma_coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
> +	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
>   		phys = iommu_iova_to_phys(mapping->domain, handle);
> -		arch_sync_dma_for_cpu(phys, size, dir);
> +		arm_iommu_sync_dma_for_cpu(phys, size, dir, dev->dma_coherent);
>   	}
>   
>   	iommu_unmap(mapping->domain, iova, len);
> @@ -1497,11 +1510,11 @@ static void arm_iommu_sync_single_for_cpu(struct device *dev,
>   	struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(dev);
>   	phys_addr_t phys;
>   
> -	if (dev->dma_coherent || !(handle & PAGE_MASK))
> +	if (!(handle & PAGE_MASK))
>   		return;
>   
>   	phys = iommu_iova_to_phys(mapping->domain, handle);
> -	arch_sync_dma_for_cpu(phys, size, dir);
> +	arm_iommu_sync_dma_for_cpu(phys, size, dir, dev->dma_coherent);
>   }
>   
>   static void arm_iommu_sync_single_for_device(struct device *dev,
  
Russell King (Oracle) March 27, 2023, 3:01 p.m. UTC | #2
On Mon, Mar 27, 2023 at 02:13:16PM +0200, Arnd Bergmann wrote:
> From: Arnd Bergmann <arnd@arndb.de>
> 
> The arm version of the arch_sync_dma_for_cpu() function annotates pages as
> PG_dcache_clean after a DMA, but no other architecture does this here.

... because this is an arm32 specific feature. Generically, it's
PG_arch_1, which is a page flag free for architecture use. On arm32
we decided to use this to mark whether we can skip dcache writebacks
when establishing a PTE - and thus it was decided to call it
PG_dcache_clean to reflect how arm32 decided to use that bit.

This isn't just a DMA thing, there are other places that we update
the bit, such as flush_dcache_page() and copy_user_highpage().

So thinking that the arm32 PG_dcache_clean is something for DMA is
actually wrong.

Other architectures are free to do their own other optimisations
using that bit, and their implementations may be DMA-centric.
  
Arnd Bergmann March 31, 2023, 2 p.m. UTC | #3
On Mon, Mar 27, 2023, at 14:48, Robin Murphy wrote:
> On 2023-03-27 13:13, Arnd Bergmann wrote:
>> 
>> [ HELP NEEDED: can anyone confirm that it is a correct assumption
>>    on arm that a cache-coherent device writing to a page always results
>>    in it being in a PG_dcache_clean state like on ia64, or can a device
>>    write directly into the dcache?]
>
> In AMBA at least, if a snooping write hits in a cache then the data is 
> most likely going to get routed directly into that cache. If it has 
> write-back write-allocate attributes it could also land in any cache 
> along its normal path to RAM; it wouldn't have to go all the way.
>
> Hence all the fun we have where treating a coherent device as 
> non-coherent can still be almost as broken as the other way round :)

Ok, thanks for the information. I'm still not sure whether this can
result in the situation where PG_dcache_clean is wrong though.

Specifically, the question is whether a DMA to a coherent buffer
can end up in a dirty L1 dcache of one core and require to write
back the dcache before invalidating the icache for that page.

On ia64, this is not the case, the optimization here is to
only flush the icache after a coherent DMA into an executable
user page, while Arm only does this for noncoherent DMA but not
coherent DMA.

From your explanation it sounds like this might happen,
even though that would mean that "coherent" DMA is slightly
less coherent than it is elsewhere.

To be on the safe side, I'd have to pass a flag into
arch_dma_mark_clean() about coherency, to let the arm
implementation still require the extra dcache flush
for coherent DMA, while ia64 can ignore that flag.

    Arnd
  
Arnd Bergmann March 31, 2023, 2:06 p.m. UTC | #4
On Mon, Mar 27, 2023, at 17:01, Russell King (Oracle) wrote:
> On Mon, Mar 27, 2023 at 02:13:16PM +0200, Arnd Bergmann wrote:
>> From: Arnd Bergmann <arnd@arndb.de>
>> 
>> The arm version of the arch_sync_dma_for_cpu() function annotates pages as
>> PG_dcache_clean after a DMA, but no other architecture does this here.
>
> ... because this is an arm32 specific feature. Generically, it's
> PG_arch_1, which is a page flag free for architecture use. On arm32
> we decided to use this to mark whether we can skip dcache writebacks
> when establishing a PTE - and thus it was decided to call it
> PG_dcache_clean to reflect how arm32 decided to use that bit.
>
> This isn't just a DMA thing, there are other places that we update
> the bit, such as flush_dcache_page() and copy_user_highpage().
>
> So thinking that the arm32 PG_dcache_clean is something for DMA is
> actually wrong.
>
> Other architectures are free to do their own other optimisations
> using that bit, and their implementations may be DMA-centric.

The flag is used the same way on most architectures, though some
use the opposite polarity and call it PG_dcache_dirty. The only
other architecture that uses it for DMA is ia64, with the difference
being that this also marks the page as clean even for coherent
DMA, not just when doing a flush as part of noncoherent DMA.

Based on Robin's reply it sounds that this is not a valid assumption
on Arm, if a coherent DMA can target a dirty dcache line without
cleaning it.

     Arnd
  
Robin Murphy March 31, 2023, 3:12 p.m. UTC | #5
On 31/03/2023 3:00 pm, Arnd Bergmann wrote:
> On Mon, Mar 27, 2023, at 14:48, Robin Murphy wrote:
>> On 2023-03-27 13:13, Arnd Bergmann wrote:
>>>
>>> [ HELP NEEDED: can anyone confirm that it is a correct assumption
>>>     on arm that a cache-coherent device writing to a page always results
>>>     in it being in a PG_dcache_clean state like on ia64, or can a device
>>>     write directly into the dcache?]
>>
>> In AMBA at least, if a snooping write hits in a cache then the data is
>> most likely going to get routed directly into that cache. If it has
>> write-back write-allocate attributes it could also land in any cache
>> along its normal path to RAM; it wouldn't have to go all the way.
>>
>> Hence all the fun we have where treating a coherent device as
>> non-coherent can still be almost as broken as the other way round :)
> 
> Ok, thanks for the information. I'm still not sure whether this can
> result in the situation where PG_dcache_clean is wrong though.
> 
> Specifically, the question is whether a DMA to a coherent buffer
> can end up in a dirty L1 dcache of one core and require to write
> back the dcache before invalidating the icache for that page.
> 
> On ia64, this is not the case, the optimization here is to
> only flush the icache after a coherent DMA into an executable
> user page, while Arm only does this for noncoherent DMA but not
> coherent DMA.
> 
>  From your explanation it sounds like this might happen,
> even though that would mean that "coherent" DMA is slightly
> less coherent than it is elsewhere.
> 
> To be on the safe side, I'd have to pass a flag into
> arch_dma_mark_clean() about coherency, to let the arm
> implementation still require the extra dcache flush
> for coherent DMA, while ia64 can ignore that flag.

Coherent DMA on Arm is assumed to be inner-shareable, so a coherent DMA 
write should be pretty much equivalent to a coherent write by another 
CPU (or indeed the local CPU itself) - nothing says that it *couldn't* 
dirty a line in a data cache above the level of unification, so in 
general the assumption must be that, yes, if coherent DMA is writing 
data intended to be executable, then it's going to want a Dcache clean 
to PoU and an Icache invalidate to PoU before trying to execute it. By 
comparison, a non-coherent DMA transfer will inherently have to 
invalidate the Dcache all the way to PoC in its dma_unmap, thus cannot 
leave dirty data above the PoU, so only the Icache maintenance is 
required in the executable case.

(FWIW I believe the Armv8 IDC/DIC features can safely be considered 
irrelevant to 32-bit kernels)

I don't know a great deal about IA-64, but it appears to be using its 
PG_arch_1 flag in a subtly different manner to Arm, namely to optimise 
out the *Icache* maintenance. So if anything, it seems IA-64 is the 
weirdo here (who'd have guessed?) where DMA manages to be *more* 
coherent than the CPUs themselves :)

This is all now making me think we need some careful consideration of 
whether the benefits of consolidating code outweigh the confusion of 
conflating multiple different meanings of "clean" together...

Thanks,
Robin.
  
Russell King (Oracle) March 31, 2023, 3:54 p.m. UTC | #6
On Fri, Mar 31, 2023 at 04:06:37PM +0200, Arnd Bergmann wrote:
> On Mon, Mar 27, 2023, at 17:01, Russell King (Oracle) wrote:
> > On Mon, Mar 27, 2023 at 02:13:16PM +0200, Arnd Bergmann wrote:
> >> From: Arnd Bergmann <arnd@arndb.de>
> >> 
> >> The arm version of the arch_sync_dma_for_cpu() function annotates pages as
> >> PG_dcache_clean after a DMA, but no other architecture does this here.
> >
> > ... because this is an arm32 specific feature. Generically, it's
> > PG_arch_1, which is a page flag free for architecture use. On arm32
> > we decided to use this to mark whether we can skip dcache writebacks
> > when establishing a PTE - and thus it was decided to call it
> > PG_dcache_clean to reflect how arm32 decided to use that bit.
> >
> > This isn't just a DMA thing, there are other places that we update
> > the bit, such as flush_dcache_page() and copy_user_highpage().
> >
> > So thinking that the arm32 PG_dcache_clean is something for DMA is
> > actually wrong.
> >
> > Other architectures are free to do their own other optimisations
> > using that bit, and their implementations may be DMA-centric.
> 
> The flag is used the same way on most architectures, though some
> use the opposite polarity and call it PG_dcache_dirty. The only
> other architecture that uses it for DMA is ia64, with the difference
> being that this also marks the page as clean even for coherent
> DMA, not just when doing a flush as part of noncoherent DMA.
> 
> Based on Robin's reply it sounds that this is not a valid assumption
> on Arm, if a coherent DMA can target a dirty dcache line without
> cleaning it.

The other thing to note here is that PG_dcache_clean doesn't have
much meaning on modern CPUs with PIPT caches. For these,
cache_is_vipt_nonaliasing() will be true, and
cache_ops_need_broadcast() will be false.

Firstly, if we're using coherent DMA, then PG_dcache_clean is
intentionally not touched, because the data cache isn't cleaned
in any way by DMA operations.

flush_dcache_page() turns into a no-op apart from clearing
PG_dcache_clean if it was set.

__sync_icache_dcache() will do nothing for non-executable pages,
but will write-back a page that isn't marked PG_dcache_clean to
ensure that it is visible to the instruction stream. This is only
used to ensure that a the instructions are visible to a newly
established executable mapping when e.g. the page has been DMA'd
in. The default state of PG_dcache_clean is zero on any new
allocation, so this has the effect of causing any executable page
to be flushed such that the instruction stream can see the
instructions, but only for the first establishment of the mapping.
That means that e.g. libc text pages don't keep getting flushed on
the start of every program.

update_mmu_cache() isn't compiled, so it's use of PG_dcache_clean
is irrelevant.

v6_copy_user_highpage_aliasing() won't be called because we're not
using an aliasing cache.

So, for modern ARM systems with DMA-coherent PG_dcache_clean only
serves for the __sync_icache_dcache() optimisation.

ARMs use of this remains valid in this circumstance.
  
Arnd Bergmann March 31, 2023, 5:20 p.m. UTC | #7
On Fri, Mar 31, 2023, at 17:12, Robin Murphy wrote:
> On 31/03/2023 3:00 pm, Arnd Bergmann wrote:
>> On Mon, Mar 27, 2023, at 14:48, Robin Murphy wrote:
>> 
>> To be on the safe side, I'd have to pass a flag into
>> arch_dma_mark_clean() about coherency, to let the arm
>> implementation still require the extra dcache flush
>> for coherent DMA, while ia64 can ignore that flag.
>
> Coherent DMA on Arm is assumed to be inner-shareable, so a coherent DMA 
> write should be pretty much equivalent to a coherent write by another 
> CPU (or indeed the local CPU itself) - nothing says that it *couldn't* 
> dirty a line in a data cache above the level of unification, so in 
> general the assumption must be that, yes, if coherent DMA is writing 
> data intended to be executable, then it's going to want a Dcache clean 
> to PoU and an Icache invalidate to PoU before trying to execute it. By 
> comparison, a non-coherent DMA transfer will inherently have to 
> invalidate the Dcache all the way to PoC in its dma_unmap, thus cannot 
> leave dirty data above the PoU, so only the Icache maintenance is 
> required in the executable case.

Ok, makes sense. I've already started reworking my patch for it.

> (FWIW I believe the Armv8 IDC/DIC features can safely be considered 
> irrelevant to 32-bit kernels)
>
> I don't know a great deal about IA-64, but it appears to be using its 
> PG_arch_1 flag in a subtly different manner to Arm, namely to optimise 
> out the *Icache* maintenance. So if anything, it seems IA-64 is the 
> weirdo here (who'd have guessed?) where DMA manages to be *more* 
> coherent than the CPUs themselves :)

I checked this in the ia64 manual, and as far as I can tell, it originally
only had one cacheflush instruction that flushes the dcache and invalidates
the icache at the same time. So flush_icache_range() actually does
both and flush_dcache_page() instead just marks the page as dirty to
ensure flush_icache_range() does not get skipped after a writing a
page from the kernel.

On later Itaniums, there is apparently a separate icache flush
instruction that gets used in flush_icache_range(), but that
still works for the DMA case that is allowed to skip the flush.

> This is all now making me think we need some careful consideration of 
> whether the benefits of consolidating code outweigh the confusion of 
> conflating multiple different meanings of "clean" together...

The difference in usage of PG_dcache_clean/PG_dcache_dirty/PG_arch_1
across architectures is certainly big enough that we can't just
define a a common arch_dma_mark_clean() across architectures, but
I think the idea of having a common entry point for
arch_dma_mark_clean() to be called from the dma-mapping code
to do something architecture specific after a DMA is clean still
makes sense, 

     Arnd
  
Geert Uytterhoeven July 3, 2023, 7:54 a.m. UTC | #8
Hi Arnd,

On Mon, Mar 27, 2023 at 2:16 PM Arnd Bergmann <arnd@kernel.org> wrote:
> From: Arnd Bergmann <arnd@arndb.de>
>
> The arm version of the arch_sync_dma_for_cpu() function annotates pages as
> PG_dcache_clean after a DMA, but no other architecture does this here. On
> ia64, the same thing is done in arch_sync_dma_for_cpu(), so it makes sense
> to use the same hook in order to have identical arch_sync_dma_for_cpu()
> semantics as all other architectures.
>
> Splitting this out has multiple effects:
>
>  - for dma-direct, this now gets called after arch_sync_dma_for_cpu()
>    for DMA_FROM_DEVICE mappings, but not for DMA_BIDIRECTIONAL. While
>    it would not be harmful to keep doing it for bidirectional mappings,
>    those are apparently not used in any callers that care about the flag.
>
>  - Since arm has its own dma-iommu abstraction, this now also needs to
>    call the same function, so the calls are added there to mirror the
>    dma-direct version.
>
>  - Like dma-direct, the dma-iommu version now marks the dcache clean
>    for both coherent and noncoherent devices after a DMA, but it only
>    does this for DMA_FROM_DEVICE, not DMA_BIDIRECTIONAL.
>
> [ HELP NEEDED: can anyone confirm that it is a correct assumption
>   on arm that a cache-coherent device writing to a page always results
>   in it being in a PG_dcache_clean state like on ia64, or can a device
>   write directly into the dcache?]
>
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>

Thanks for your patch, which is now commit 322dbe898f82fd8a
("ARM: dma-mapping: split out arch_dma_mark_clean() helper") in
esmil/jh7100-dmapool.

If CONFIG_ARM_DMA_USE_IOMMU=y, the build fails.

> --- a/arch/arm/mm/dma-mapping.c
> +++ b/arch/arm/mm/dma-mapping.c

> @@ -1294,6 +1298,17 @@ static int arm_iommu_map_sg(struct device *dev, struct scatterlist *sg,
>         return -EINVAL;
>  }
>
> +static void arm_iommu_sync_dma_for_cpu(phys_addr_t phys, size_t len,
> +                                      enum dma_data_direction dir,
> +                                      bool dma_coherent)
> +{
> +       if (!dma_coherent)
> +               arch_sync_dma_for_cpu(phys, s->length, dir);

s/s->length/len/

> +
> +       if (dir == DMA_FROM_DEVICE)
> +               arch_dma_mark_clean(phys, s->length);

Likewise.

> +}
> +
>  /**
>   * arm_iommu_unmap_sg - unmap a set of SG buffers mapped by dma_map_sg
>   * @dev: valid struct device pointer

> @@ -1425,9 +1438,9 @@ static void arm_iommu_unmap_page(struct device *dev, dma_addr_t handle,
>         if (!iova)
>                 return;
>
> -       if (!dev->dma_coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
> +       if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))

Missing opening curly brace.

>                 phys = iommu_iova_to_phys(mapping->domain, handle);
> -               arch_sync_dma_for_cpu(phys, size, dir);
> +               arm_iommu_sync_dma_for_cpu(phys, size, dir, dev->dma_coherent);
>         }
>
>         iommu_unmap(mapping->domain, iova, len);

With the above fixed, it builds and boots fine (on R-Car M2-W).

Gr{oetje,eeting}s,

                        Geert
  
Christoph Hellwig July 6, 2023, 2:11 p.m. UTC | #9
> Thanks for your patch, which is now commit 322dbe898f82fd8a
> ("ARM: dma-mapping: split out arch_dma_mark_clean() helper") in
> esmil/jh7100-dmapool.

Well, something is wrong with that branch then, and this series still
needs more work, and should eventually be merged through the dma-mapping
tree.
  

Patch

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index e24a9820e12f..125d58c54ab1 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -7,6 +7,7 @@  config ARM
 	select ARCH_HAS_BINFMT_FLAT
 	select ARCH_HAS_CURRENT_STACK_POINTER
 	select ARCH_HAS_DEBUG_VIRTUAL if MMU
+	select ARCH_HAS_DMA_MARK_CLEAN if MMU
 	select ARCH_HAS_DMA_WRITE_COMBINE if !ARM_DMA_MEM_BUFFERABLE
 	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_FORTIFY_SOURCE
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index cc702cb27ae7..b703cb83d27e 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -665,6 +665,28 @@  static void dma_cache_maint(phys_addr_t paddr,
 	} while (left);
 }
 
+/*
+ * Mark the D-cache clean for these pages to avoid extra flushing.
+ */
+void arch_dma_mark_clean(phys_addr_t paddr, size_t size)
+{
+	unsigned long pfn = PFN_UP(paddr);
+	unsigned long off = paddr & (PAGE_SIZE - 1);
+	size_t left = size;
+
+	if (size < PAGE_SIZE)
+		return;
+
+	if (off)
+		left -= PAGE_SIZE - off;
+
+	while (left >= PAGE_SIZE) {
+		struct page *page = pfn_to_page(pfn++);
+		set_bit(PG_dcache_clean, &page->flags);
+		left -= PAGE_SIZE;
+	}
+}
+
 static bool arch_sync_dma_cpu_needs_post_dma_flush(void)
 {
 	if (IS_ENABLED(CONFIG_CPU_V6) ||
@@ -715,24 +737,6 @@  void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
 		outer_inv_range(paddr, paddr + size);
 		dma_cache_maint(paddr, size, dmac_inv_range);
 	}
-
-	/*
-	 * Mark the D-cache clean for these pages to avoid extra flushing.
-	 */
-	if (dir != DMA_TO_DEVICE && size >= PAGE_SIZE) {
-		unsigned long pfn = PFN_UP(paddr);
-		unsigned long off = paddr & (PAGE_SIZE - 1);
-		size_t left = size;
-
-		if (off)
-			left -= PAGE_SIZE - off;
-
-		while (left >= PAGE_SIZE) {
-			struct page *page = pfn_to_page(pfn++);
-			set_bit(PG_dcache_clean, &page->flags);
-			left -= PAGE_SIZE;
-		}
-	}
 }
 
 #ifdef CONFIG_ARM_DMA_USE_IOMMU
@@ -1294,6 +1298,17 @@  static int arm_iommu_map_sg(struct device *dev, struct scatterlist *sg,
 	return -EINVAL;
 }
 
+static void arm_iommu_sync_dma_for_cpu(phys_addr_t phys, size_t len,
+				       enum dma_data_direction dir,
+				       bool dma_coherent)
+{
+	if (!dma_coherent)
+		arch_sync_dma_for_cpu(phys, s->length, dir);
+
+	if (dir == DMA_FROM_DEVICE)
+		arch_dma_mark_clean(phys, s->length);
+}
+
 /**
  * arm_iommu_unmap_sg - unmap a set of SG buffers mapped by dma_map_sg
  * @dev: valid struct device pointer
@@ -1316,8 +1331,9 @@  static void arm_iommu_unmap_sg(struct device *dev,
 		if (sg_dma_len(s))
 			__iommu_remove_mapping(dev, sg_dma_address(s),
 					       sg_dma_len(s));
-		if (!dev->dma_coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
-			arch_sync_dma_for_cpu(sg_phys(s), s->length, dir);
+		if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+			arm_iommu_sync_dma_for_cpu(sg_phys(s), s->length, dir,
+						   dev->dma_coherent);
 	}
 }
 
@@ -1335,12 +1351,9 @@  static void arm_iommu_sync_sg_for_cpu(struct device *dev,
 	struct scatterlist *s;
 	int i;
 
-	if (dev->dma_coherent)
-		return;
-
 	for_each_sg(sg, s, nents, i)
-		arch_sync_dma_for_cpu(sg_phys(s), s->length, dir);
-
+		arm_iommu_sync_dma_for_cpu(sg_phys(s), s->length, dir,
+					   dev->dma_coherent);
 }
 
 /**
@@ -1425,9 +1438,9 @@  static void arm_iommu_unmap_page(struct device *dev, dma_addr_t handle,
 	if (!iova)
 		return;
 
-	if (!dev->dma_coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 		phys = iommu_iova_to_phys(mapping->domain, handle);
-		arch_sync_dma_for_cpu(phys, size, dir);
+		arm_iommu_sync_dma_for_cpu(phys, size, dir, dev->dma_coherent);
 	}
 
 	iommu_unmap(mapping->domain, iova, len);
@@ -1497,11 +1510,11 @@  static void arm_iommu_sync_single_for_cpu(struct device *dev,
 	struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(dev);
 	phys_addr_t phys;
 
-	if (dev->dma_coherent || !(handle & PAGE_MASK))
+	if (!(handle & PAGE_MASK))
 		return;
 
 	phys = iommu_iova_to_phys(mapping->domain, handle);
-	arch_sync_dma_for_cpu(phys, size, dir);
+	arm_iommu_sync_dma_for_cpu(phys, size, dir, dev->dma_coherent);
 }
 
 static void arm_iommu_sync_single_for_device(struct device *dev,