[net-next,2/7] dma: avoid expensive redundant calls for sync operations

Message ID 20240126135456.704351-3-aleksander.lobakin@intel.com
State New
Headers
Series dma: skip calling no-op sync ops when possible |

Commit Message

Alexander Lobakin Jan. 26, 2024, 1:54 p.m. UTC
  From: Eric Dumazet <edumazet@google.com>

Quite often, NIC devices do not need dma_sync operations on x86_64
at least.
Indeed, when dev_is_dma_coherent(dev) is true and
dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
and friends do nothing.

However, indirectly calling them when CONFIG_RETPOLINE=y consumes about
10% of cycles on a cpu receiving packets from softirq at ~100Gbit rate.
Even if/when CONFIG_RETPOLINE is not set, there is a cost of about 3%.

Add dev->skip_dma_sync boolean which is set during the device
initialization depending on the setup: dev_is_dma_coherent() for direct
DMA, !(sync_single_for_device || sync_single_for_cpu) or positive result
from the new callback, dma_map_ops::can_skip_sync for non-NULL DMA ops.
Then later, if/when swiotlb is used for the first time, the flag
is turned off, from swiotlb_tbl_map_single().

On iavf, the UDP trafficgen with XDP_DROP in skb mode test shows
+3-5% increase for direct DMA.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Co-developed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
---
 include/linux/device.h      |  5 +++++
 include/linux/dma-map-ops.h | 17 +++++++++++++++++
 include/linux/dma-mapping.h | 12 ++++++++++--
 drivers/base/dd.c           |  2 ++
 kernel/dma/mapping.c        | 34 +++++++++++++++++++++++++++++++---
 kernel/dma/swiotlb.c        | 14 ++++++++++++++
 6 files changed, 79 insertions(+), 5 deletions(-)
  

Comments

Robin Murphy Jan. 26, 2024, 3:48 p.m. UTC | #1
On 26/01/2024 1:54 pm, Alexander Lobakin wrote:
> From: Eric Dumazet <edumazet@google.com>
> 
> Quite often, NIC devices do not need dma_sync operations on x86_64
> at least.
> Indeed, when dev_is_dma_coherent(dev) is true and
> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
> and friends do nothing.
> 
> However, indirectly calling them when CONFIG_RETPOLINE=y consumes about
> 10% of cycles on a cpu receiving packets from softirq at ~100Gbit rate.
> Even if/when CONFIG_RETPOLINE is not set, there is a cost of about 3%.
> 
> Add dev->skip_dma_sync boolean which is set during the device
> initialization depending on the setup: dev_is_dma_coherent() for direct
> DMA, !(sync_single_for_device || sync_single_for_cpu) or positive result
> from the new callback, dma_map_ops::can_skip_sync for non-NULL DMA ops.
> Then later, if/when swiotlb is used for the first time, the flag
> is turned off, from swiotlb_tbl_map_single().

I think you could probably just promote the dma_uses_io_tlb flag from 
SWIOTLB_DYNAMIC to a general SWIOTLB thing to serve this purpose now.

Similarly I don't think a new op is necessary now that we have 
dma_map_ops.flags. A simple static flag to indicate that sync may be 
skipped under the same conditions as implied for dma-direct - i.e. 
dev_is_dma_coherent(dev) && !dev->dma_use_io_tlb - seems like it ought 
to suffice.

Thanks,
Robin.

> On iavf, the UDP trafficgen with XDP_DROP in skb mode test shows
> +3-5% increase for direct DMA.
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Co-developed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
> Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
> ---
>   include/linux/device.h      |  5 +++++
>   include/linux/dma-map-ops.h | 17 +++++++++++++++++
>   include/linux/dma-mapping.h | 12 ++++++++++--
>   drivers/base/dd.c           |  2 ++
>   kernel/dma/mapping.c        | 34 +++++++++++++++++++++++++++++++---
>   kernel/dma/swiotlb.c        | 14 ++++++++++++++
>   6 files changed, 79 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/device.h b/include/linux/device.h
> index 97c4b046c09d..f23e6a32bea0 100644
> --- a/include/linux/device.h
> +++ b/include/linux/device.h
> @@ -686,6 +686,8 @@ struct device_physical_location {
>    *		other devices probe successfully.
>    * @dma_coherent: this particular device is dma coherent, even if the
>    *		architecture supports non-coherent devices.
> + * @dma_skip_sync: DMA sync operations can be skipped for coherent non-SWIOTLB
> + *		buffers.
>    * @dma_ops_bypass: If set to %true then the dma_ops are bypassed for the
>    *		streaming DMA operations (->map_* / ->unmap_* / ->sync_*),
>    *		and optionall (if the coherent mask is large enough) also
> @@ -800,6 +802,9 @@ struct device {
>       defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
>   	bool			dma_coherent:1;
>   #endif
> +#ifdef CONFIG_DMA_NEED_SYNC
> +	bool			dma_skip_sync:1;
> +#endif
>   #ifdef CONFIG_DMA_OPS_BYPASS
>   	bool			dma_ops_bypass : 1;
>   #endif
> diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
> index 4abc60f04209..937c295e9da8 100644
> --- a/include/linux/dma-map-ops.h
> +++ b/include/linux/dma-map-ops.h
> @@ -78,6 +78,7 @@ struct dma_map_ops {
>   			int nents, enum dma_data_direction dir);
>   	void (*cache_sync)(struct device *dev, void *vaddr, size_t size,
>   			enum dma_data_direction direction);
> +	bool (*can_skip_sync)(struct device *dev);
>   	int (*dma_supported)(struct device *dev, u64 mask);
>   	u64 (*get_required_mask)(struct device *dev);
>   	size_t (*max_mapping_size)(struct device *dev);
> @@ -111,6 +112,22 @@ static inline void set_dma_ops(struct device *dev,
>   }
>   #endif /* CONFIG_DMA_OPS */
>   
> +#ifdef CONFIG_DMA_NEED_SYNC
> +
> +static inline void dma_set_skip_sync(struct device *dev, bool skip)
> +{
> +	dev->dma_skip_sync = skip;
> +}
> +
> +void dma_setup_skip_sync(struct device *dev);
> +
> +#else /* !CONFIG_DMA_NEED_SYNC */
> +
> +#define dma_set_skip_sync(dev, skip)		do { } while (0)
> +#define dma_setup_skip_sync(dev)		do { } while (0)
> +
> +#endif /* !CONFIG_DMA_NEED_SYNC */
> +
>   #ifdef CONFIG_DMA_CMA
>   extern struct cma *dma_contiguous_default_area;
>   
> diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
> index 9dd7e1578bf6..bc9f67e0c139 100644
> --- a/include/linux/dma-mapping.h
> +++ b/include/linux/dma-mapping.h
> @@ -365,9 +365,17 @@ __dma_sync_single_range_for_device(struct device *dev, dma_addr_t addr,
>   
>   #ifdef CONFIG_DMA_NEED_SYNC
>   
> -#define dma_skip_sync(dev)			false
> +static inline bool dma_skip_sync(const struct device *dev)
> +{
> +	return dev->dma_skip_sync;
> +}
> +
> +bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr);
>   
> -bool dma_need_sync(struct device *dev, dma_addr_t dma_addr);
> +static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
> +{
> +	return dma_skip_sync(dev) ? false : __dma_need_sync(dev, dma_addr);
> +}
>   
>   #else /* !CONFIG_DMA_NEED_SYNC */
>   
> diff --git a/drivers/base/dd.c b/drivers/base/dd.c
> index 85152537dbf1..67ad3e1d51f6 100644
> --- a/drivers/base/dd.c
> +++ b/drivers/base/dd.c
> @@ -642,6 +642,8 @@ static int really_probe(struct device *dev, struct device_driver *drv)
>   			goto pinctrl_bind_failed;
>   	}
>   
> +	dma_setup_skip_sync(dev);
> +
>   	ret = driver_sysfs_add(dev);
>   	if (ret) {
>   		pr_err("%s: driver_sysfs_add(%s) failed\n",
> diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
> index a30f37f9d4db..8fa464b3954e 100644
> --- a/kernel/dma/mapping.c
> +++ b/kernel/dma/mapping.c
> @@ -842,15 +842,43 @@ size_t dma_opt_mapping_size(struct device *dev)
>   EXPORT_SYMBOL_GPL(dma_opt_mapping_size);
>   
>   #ifdef CONFIG_DMA_NEED_SYNC
> -bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
> +bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr)
>   {
>   	const struct dma_map_ops *ops = get_dma_ops(dev);
>   
>   	if (dma_map_direct(dev, ops))
> +		/*
> +		 * dma_skip_sync could've been set to false on first SWIOTLB
> +		 * buffer mapping, but @dma_addr is not necessary an SWIOTLB
> +		 * buffer. In this case, fall back to more granular check.
> +		 */
>   		return dma_direct_need_sync(dev, dma_addr);
> -	return ops->sync_single_for_cpu || ops->sync_single_for_device;
> +
> +	return true;
> +}
> +EXPORT_SYMBOL_GPL(__dma_need_sync);
> +
> +void dma_setup_skip_sync(struct device *dev)
> +{
> +	const struct dma_map_ops *ops = get_dma_ops(dev);
> +	bool skip;
> +
> +	if (dma_map_direct(dev, ops))
> +		/*
> +		 * dma_skip_sync will be set to false on first SWIOTLB buffer
> +		 * mapping, if any. During the device initialization, it's
> +		 * enough to check only for DMA coherence.
> +		 */
> +		skip = dev_is_dma_coherent(dev);
> +	else if (!ops->sync_single_for_device && !ops->sync_single_for_cpu)
> +		skip = true;
> +	else if (ops->can_skip_sync)
> +		skip = ops->can_skip_sync(dev);
> +	else
> +		skip = false;
> +
> +	dma_set_skip_sync(dev, skip);
>   }
> -EXPORT_SYMBOL_GPL(dma_need_sync);
>   #endif /* CONFIG_DMA_NEED_SYNC */
>   
>   unsigned long dma_get_merge_boundary(struct device *dev)
> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> index b079a9a8e087..b62ea0a4f106 100644
> --- a/kernel/dma/swiotlb.c
> +++ b/kernel/dma/swiotlb.c
> @@ -1286,6 +1286,16 @@ static unsigned long mem_used(struct io_tlb_mem *mem)
>   
>   #endif /* CONFIG_DEBUG_FS */
>   
> +static inline void swiotlb_disable_dma_skip_sync(struct device *dev)
> +{
> +	/*
> +	 * If dma_skip_sync was set, reset it to false on first SWIOTLB buffer
> +	 * mapping/allocation to always sync SWIOTLB buffers.
> +	 */
> +	if (unlikely(dma_skip_sync(dev)))
> +		dma_set_skip_sync(dev, false);
> +}
> +
>   phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
>   		size_t mapping_size, size_t alloc_size,
>   		unsigned int alloc_align_mask, enum dma_data_direction dir,
> @@ -1323,6 +1333,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
>   		return (phys_addr_t)DMA_MAPPING_ERROR;
>   	}
>   
> +	swiotlb_disable_dma_skip_sync(dev);
> +
>   	/*
>   	 * Save away the mapping from the original address to the DMA address.
>   	 * This is needed when we sync the memory.  Then we sync the buffer if
> @@ -1640,6 +1652,8 @@ struct page *swiotlb_alloc(struct device *dev, size_t size)
>   	if (index == -1)
>   		return NULL;
>   
> +	swiotlb_disable_dma_skip_sync(dev);
> +
>   	tlb_addr = slot_addr(pool->start, index);
>   
>   	return pfn_to_page(PFN_DOWN(tlb_addr));
  
Alexander Lobakin Jan. 26, 2024, 4:45 p.m. UTC | #2
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 26 Jan 2024 15:48:54 +0000

> On 26/01/2024 1:54 pm, Alexander Lobakin wrote:
>> From: Eric Dumazet <edumazet@google.com>
>>
>> Quite often, NIC devices do not need dma_sync operations on x86_64
>> at least.
>> Indeed, when dev_is_dma_coherent(dev) is true and
>> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
>> and friends do nothing.
>>
>> However, indirectly calling them when CONFIG_RETPOLINE=y consumes about
>> 10% of cycles on a cpu receiving packets from softirq at ~100Gbit rate.
>> Even if/when CONFIG_RETPOLINE is not set, there is a cost of about 3%.
>>
>> Add dev->skip_dma_sync boolean which is set during the device
>> initialization depending on the setup: dev_is_dma_coherent() for direct
>> DMA, !(sync_single_for_device || sync_single_for_cpu) or positive result
>> from the new callback, dma_map_ops::can_skip_sync for non-NULL DMA ops.
>> Then later, if/when swiotlb is used for the first time, the flag
>> is turned off, from swiotlb_tbl_map_single().
> 
> I think you could probably just promote the dma_uses_io_tlb flag from
> SWIOTLB_DYNAMIC to a general SWIOTLB thing to serve this purpose now.

Nice catch!

> 
> Similarly I don't think a new op is necessary now that we have
> dma_map_ops.flags. A simple static flag to indicate that sync may be> skipped under the same conditions as implied for dma-direct - i.e.
> dev_is_dma_coherent(dev) && !dev->dma_use_io_tlb - seems like it ought
> to suffice.

In my initial implementation, I used a new dma_map_ops flag, but then I
realized different DMA ops may require or not require syncing under
different conditions, not only dev_is_dma_coherent().
Or am I wrong and they would always be the same?

> 
> Thanks,
> Robin.

Thanks,
Olek
  
Robin Murphy Jan. 26, 2024, 5:21 p.m. UTC | #3
On 26/01/2024 4:45 pm, Alexander Lobakin wrote:
> From: Robin Murphy <robin.murphy@arm.com>
> Date: Fri, 26 Jan 2024 15:48:54 +0000
> 
>> On 26/01/2024 1:54 pm, Alexander Lobakin wrote:
>>> From: Eric Dumazet <edumazet@google.com>
>>>
>>> Quite often, NIC devices do not need dma_sync operations on x86_64
>>> at least.
>>> Indeed, when dev_is_dma_coherent(dev) is true and
>>> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
>>> and friends do nothing.
>>>
>>> However, indirectly calling them when CONFIG_RETPOLINE=y consumes about
>>> 10% of cycles on a cpu receiving packets from softirq at ~100Gbit rate.
>>> Even if/when CONFIG_RETPOLINE is not set, there is a cost of about 3%.
>>>
>>> Add dev->skip_dma_sync boolean which is set during the device
>>> initialization depending on the setup: dev_is_dma_coherent() for direct
>>> DMA, !(sync_single_for_device || sync_single_for_cpu) or positive result
>>> from the new callback, dma_map_ops::can_skip_sync for non-NULL DMA ops.
>>> Then later, if/when swiotlb is used for the first time, the flag
>>> is turned off, from swiotlb_tbl_map_single().
>>
>> I think you could probably just promote the dma_uses_io_tlb flag from
>> SWIOTLB_DYNAMIC to a general SWIOTLB thing to serve this purpose now.
> 
> Nice catch!
> 
>>
>> Similarly I don't think a new op is necessary now that we have
>> dma_map_ops.flags. A simple static flag to indicate that sync may be> skipped under the same conditions as implied for dma-direct - i.e.
>> dev_is_dma_coherent(dev) && !dev->dma_use_io_tlb - seems like it ought
>> to suffice.
> 
> In my initial implementation, I used a new dma_map_ops flag, but then I
> realized different DMA ops may require or not require syncing under
> different conditions, not only dev_is_dma_coherent().
> Or am I wrong and they would always be the same?

I think it's safe to assume that, as with P2P support, this will only 
matter for dma-direct and iommu-dma for the foreseeable future, and 
those do currently share the same conditions as above. Thus we may as 
well keep things simple for now, and if anything ever does have cause to 
change, it can be the future's problem to keep this mechanism working as 
intended.

Thanks,
Robin.
  
Petr Tesařík Jan. 26, 2024, 6:48 p.m. UTC | #4
On Fri, 26 Jan 2024 17:21:24 +0000
Robin Murphy <robin.murphy@arm.com> wrote:

> On 26/01/2024 4:45 pm, Alexander Lobakin wrote:
> > From: Robin Murphy <robin.murphy@arm.com>
> > Date: Fri, 26 Jan 2024 15:48:54 +0000
> >   
> >> On 26/01/2024 1:54 pm, Alexander Lobakin wrote:  
> >>> From: Eric Dumazet <edumazet@google.com>
> >>>
> >>> Quite often, NIC devices do not need dma_sync operations on x86_64
> >>> at least.
> >>> Indeed, when dev_is_dma_coherent(dev) is true and
> >>> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
> >>> and friends do nothing.
> >>>
> >>> However, indirectly calling them when CONFIG_RETPOLINE=y consumes about
> >>> 10% of cycles on a cpu receiving packets from softirq at ~100Gbit rate.
> >>> Even if/when CONFIG_RETPOLINE is not set, there is a cost of about 3%.
> >>>
> >>> Add dev->skip_dma_sync boolean which is set during the device
> >>> initialization depending on the setup: dev_is_dma_coherent() for direct
> >>> DMA, !(sync_single_for_device || sync_single_for_cpu) or positive result
> >>> from the new callback, dma_map_ops::can_skip_sync for non-NULL DMA ops.
> >>> Then later, if/when swiotlb is used for the first time, the flag
> >>> is turned off, from swiotlb_tbl_map_single().  
> >>
> >> I think you could probably just promote the dma_uses_io_tlb flag from
> >> SWIOTLB_DYNAMIC to a general SWIOTLB thing to serve this purpose now.  
> > 
> > Nice catch!
> >   
> >>
> >> Similarly I don't think a new op is necessary now that we have
> >> dma_map_ops.flags. A simple static flag to indicate that sync may be> skipped under the same conditions as implied for dma-direct - i.e.
> >> dev_is_dma_coherent(dev) && !dev->dma_use_io_tlb - seems like it ought
> >> to suffice.  
> > 
> > In my initial implementation, I used a new dma_map_ops flag, but then I
> > realized different DMA ops may require or not require syncing under
> > different conditions, not only dev_is_dma_coherent().
> > Or am I wrong and they would always be the same?  
> 
> I think it's safe to assume that, as with P2P support, this will only 
> matter for dma-direct and iommu-dma for the foreseeable future, and 
> those do currently share the same conditions as above. Thus we may as 
> well keep things simple for now, and if anything ever does have cause to 
> change, it can be the future's problem to keep this mechanism working as 
> intended.

Can we have a comment that states this assumption along with the flag?
Because when it breaks, it will keep someone cursing for days why DMA
sometimes fails on their device before they find out it's not synced.
And then wondering why the code makes such silly assumptions...

My two cents
Petr T
  
Robin Murphy Jan. 26, 2024, 7:13 p.m. UTC | #5
On 26/01/2024 6:48 pm, Petr Tesařík wrote:
> On Fri, 26 Jan 2024 17:21:24 +0000
> Robin Murphy <robin.murphy@arm.com> wrote:
> 
>> On 26/01/2024 4:45 pm, Alexander Lobakin wrote:
>>> From: Robin Murphy <robin.murphy@arm.com>
>>> Date: Fri, 26 Jan 2024 15:48:54 +0000
>>>    
>>>> On 26/01/2024 1:54 pm, Alexander Lobakin wrote:
>>>>> From: Eric Dumazet <edumazet@google.com>
>>>>>
>>>>> Quite often, NIC devices do not need dma_sync operations on x86_64
>>>>> at least.
>>>>> Indeed, when dev_is_dma_coherent(dev) is true and
>>>>> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
>>>>> and friends do nothing.
>>>>>
>>>>> However, indirectly calling them when CONFIG_RETPOLINE=y consumes about
>>>>> 10% of cycles on a cpu receiving packets from softirq at ~100Gbit rate.
>>>>> Even if/when CONFIG_RETPOLINE is not set, there is a cost of about 3%.
>>>>>
>>>>> Add dev->skip_dma_sync boolean which is set during the device
>>>>> initialization depending on the setup: dev_is_dma_coherent() for direct
>>>>> DMA, !(sync_single_for_device || sync_single_for_cpu) or positive result
>>>>> from the new callback, dma_map_ops::can_skip_sync for non-NULL DMA ops.
>>>>> Then later, if/when swiotlb is used for the first time, the flag
>>>>> is turned off, from swiotlb_tbl_map_single().
>>>>
>>>> I think you could probably just promote the dma_uses_io_tlb flag from
>>>> SWIOTLB_DYNAMIC to a general SWIOTLB thing to serve this purpose now.
>>>
>>> Nice catch!
>>>    
>>>>
>>>> Similarly I don't think a new op is necessary now that we have
>>>> dma_map_ops.flags. A simple static flag to indicate that sync may be> skipped under the same conditions as implied for dma-direct - i.e.
>>>> dev_is_dma_coherent(dev) && !dev->dma_use_io_tlb - seems like it ought
>>>> to suffice.
>>>
>>> In my initial implementation, I used a new dma_map_ops flag, but then I
>>> realized different DMA ops may require or not require syncing under
>>> different conditions, not only dev_is_dma_coherent().
>>> Or am I wrong and they would always be the same?
>>
>> I think it's safe to assume that, as with P2P support, this will only
>> matter for dma-direct and iommu-dma for the foreseeable future, and
>> those do currently share the same conditions as above. Thus we may as
>> well keep things simple for now, and if anything ever does have cause to
>> change, it can be the future's problem to keep this mechanism working as
>> intended.
> 
> Can we have a comment that states this assumption along with the flag?
> Because when it breaks, it will keep someone cursing for days why DMA
> sometimes fails on their device before they find out it's not synced.
> And then wondering why the code makes such silly assumptions...

Indeed, apologies if it wasn't totally clear, but I really was implying 
a literal "may skip sync if coherent and not using SWIOTLB (which 
matches dma-direct)" flag, documented as such, and not trying to dress 
it up as anything more generic. I just can't suggest a suitably concise 
name for that of the top of my head... :)

Thanks,
Robin.
  
Christoph Hellwig Jan. 29, 2024, 6:09 a.m. UTC | #6
On Fri, Jan 26, 2024 at 07:13:05PM +0000, Robin Murphy wrote:
>> Can we have a comment that states this assumption along with the flag?
>> Because when it breaks, it will keep someone cursing for days why DMA
>> sometimes fails on their device before they find out it's not synced.
>> And then wondering why the code makes such silly assumptions...
>
> Indeed, apologies if it wasn't totally clear, but I really was implying a 
> literal "may skip sync if coherent and not using SWIOTLB (which matches 
> dma-direct)" flag, documented as such, and not trying to dress it up as 
> anything more generic. I just can't suggest a suitably concise name for 
> that of the top of my head... :)

Yes, that seems like the right way to go.
  
Alexander Lobakin Jan. 29, 2024, 2:07 p.m. UTC | #7
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Fri, 26 Jan 2024 17:45:11 +0100

> From: Robin Murphy <robin.murphy@arm.com>
> Date: Fri, 26 Jan 2024 15:48:54 +0000
> 
>> On 26/01/2024 1:54 pm, Alexander Lobakin wrote:
>>> From: Eric Dumazet <edumazet@google.com>
>>>
>>> Quite often, NIC devices do not need dma_sync operations on x86_64
>>> at least.
>>> Indeed, when dev_is_dma_coherent(dev) is true and
>>> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
>>> and friends do nothing.
>>>
>>> However, indirectly calling them when CONFIG_RETPOLINE=y consumes about
>>> 10% of cycles on a cpu receiving packets from softirq at ~100Gbit rate.
>>> Even if/when CONFIG_RETPOLINE is not set, there is a cost of about 3%.
>>>
>>> Add dev->skip_dma_sync boolean which is set during the device
>>> initialization depending on the setup: dev_is_dma_coherent() for direct
>>> DMA, !(sync_single_for_device || sync_single_for_cpu) or positive result
>>> from the new callback, dma_map_ops::can_skip_sync for non-NULL DMA ops.
>>> Then later, if/when swiotlb is used for the first time, the flag
>>> is turned off, from swiotlb_tbl_map_single().
>>
>> I think you could probably just promote the dma_uses_io_tlb flag from
>> SWIOTLB_DYNAMIC to a general SWIOTLB thing to serve this purpose now.
> 
> Nice catch!

BTW, this implies such hotpath check:

	if (dev->dma_skip_sync && !READ_ONCE(dev->dma_uses_io_tlb))
		// ...

This seems less effective than just resetting dma_skip_sync on first
allocation.

> 
>>
>> Similarly I don't think a new op is necessary now that we have
>> dma_map_ops.flags. A simple static flag to indicate that sync may be> skipped under the same conditions as implied for dma-direct - i.e.
>> dev_is_dma_coherent(dev) && !dev->dma_use_io_tlb - seems like it ought
>> to suffice.
> 
> In my initial implementation, I used a new dma_map_ops flag, but then I
> realized different DMA ops may require or not require syncing under
> different conditions, not only dev_is_dma_coherent().
> Or am I wrong and they would always be the same?
> 
>>
>> Thanks,
>> Robin.
> 
> Thanks,
> Olek

Thanks,
Olek
  
Robin Murphy Jan. 29, 2024, 2:29 p.m. UTC | #8
On 2024-01-29 2:07 pm, Alexander Lobakin wrote:
> From: Alexander Lobakin <aleksander.lobakin@intel.com>
> Date: Fri, 26 Jan 2024 17:45:11 +0100
> 
>> From: Robin Murphy <robin.murphy@arm.com>
>> Date: Fri, 26 Jan 2024 15:48:54 +0000
>>
>>> On 26/01/2024 1:54 pm, Alexander Lobakin wrote:
>>>> From: Eric Dumazet <edumazet@google.com>
>>>>
>>>> Quite often, NIC devices do not need dma_sync operations on x86_64
>>>> at least.
>>>> Indeed, when dev_is_dma_coherent(dev) is true and
>>>> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
>>>> and friends do nothing.
>>>>
>>>> However, indirectly calling them when CONFIG_RETPOLINE=y consumes about
>>>> 10% of cycles on a cpu receiving packets from softirq at ~100Gbit rate.
>>>> Even if/when CONFIG_RETPOLINE is not set, there is a cost of about 3%.
>>>>
>>>> Add dev->skip_dma_sync boolean which is set during the device
>>>> initialization depending on the setup: dev_is_dma_coherent() for direct
>>>> DMA, !(sync_single_for_device || sync_single_for_cpu) or positive result
>>>> from the new callback, dma_map_ops::can_skip_sync for non-NULL DMA ops.
>>>> Then later, if/when swiotlb is used for the first time, the flag
>>>> is turned off, from swiotlb_tbl_map_single().
>>>
>>> I think you could probably just promote the dma_uses_io_tlb flag from
>>> SWIOTLB_DYNAMIC to a general SWIOTLB thing to serve this purpose now.
>>
>> Nice catch!
> 
> BTW, this implies such hotpath check:
> 
> 	if (dev->dma_skip_sync && !READ_ONCE(dev->dma_uses_io_tlb))
> 		// ...
> 
> This seems less effective than just resetting dma_skip_sync on first
> allocation.

Well, my point is not to have a dma_skip_sync at all; I'm suggesting the 
check would be:

	if (dev_is_dma_coherent(dev) && dev_uses_io_tlb(dev))
		...

where on the platform which cares about this most, that first condition 
is a compile-time constant (and as implied, the second would want to be 
similarly wrapped for !SWIOTLB configs).

Thanks,
Robin.
  
Alexander Lobakin Jan. 29, 2024, 2:34 p.m. UTC | #9
From: Robin Murphy <robin.murphy@arm.com>
Date: Mon, 29 Jan 2024 14:29:43 +0000

> On 2024-01-29 2:07 pm, Alexander Lobakin wrote:
>> From: Alexander Lobakin <aleksander.lobakin@intel.com>
>> Date: Fri, 26 Jan 2024 17:45:11 +0100
>>
>>> From: Robin Murphy <robin.murphy@arm.com>
>>> Date: Fri, 26 Jan 2024 15:48:54 +0000
>>>
>>>> On 26/01/2024 1:54 pm, Alexander Lobakin wrote:
>>>>> From: Eric Dumazet <edumazet@google.com>
>>>>>
>>>>> Quite often, NIC devices do not need dma_sync operations on x86_64
>>>>> at least.
>>>>> Indeed, when dev_is_dma_coherent(dev) is true and
>>>>> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
>>>>> and friends do nothing.
>>>>>
>>>>> However, indirectly calling them when CONFIG_RETPOLINE=y consumes
>>>>> about
>>>>> 10% of cycles on a cpu receiving packets from softirq at ~100Gbit
>>>>> rate.
>>>>> Even if/when CONFIG_RETPOLINE is not set, there is a cost of about 3%.
>>>>>
>>>>> Add dev->skip_dma_sync boolean which is set during the device
>>>>> initialization depending on the setup: dev_is_dma_coherent() for
>>>>> direct
>>>>> DMA, !(sync_single_for_device || sync_single_for_cpu) or positive
>>>>> result
>>>>> from the new callback, dma_map_ops::can_skip_sync for non-NULL DMA
>>>>> ops.
>>>>> Then later, if/when swiotlb is used for the first time, the flag
>>>>> is turned off, from swiotlb_tbl_map_single().
>>>>
>>>> I think you could probably just promote the dma_uses_io_tlb flag from
>>>> SWIOTLB_DYNAMIC to a general SWIOTLB thing to serve this purpose now.
>>>
>>> Nice catch!
>>
>> BTW, this implies such hotpath check:
>>
>>     if (dev->dma_skip_sync && !READ_ONCE(dev->dma_uses_io_tlb))
>>         // ...
>>
>> This seems less effective than just resetting dma_skip_sync on first
>> allocation.
> 
> Well, my point is not to have a dma_skip_sync at all; I'm suggesting the
> check would be:
> 
>     if (dev_is_dma_coherent(dev) && dev_uses_io_tlb(dev))
>         ...

Aaah, okay, but what about dma_map_ops?
It would then become:

	if ((!dev->dma_ops ||
	     (!dev->dma_ops->sync_single_for_device &&
	      !dev->dma_ops->sync_single_for_cpu)) ||
	     (dev->dma_ops->flags & DMA_F_SKIP_SYNC)) &&
	    dev_is_dma_coherent(dev) && !dev_uses_io_tlb(dev))
		dma_sync_ ...

Quite a lot and everything except dev_uses_io_tlb() is known at device
probing time, that's why I decided to cache the result into a new flag...

> 
> where on the platform which cares about this most, that first condition
> is a compile-time constant (and as implied, the second would want to be
> similarly wrapped for !SWIOTLB configs).
> 
> Thanks,
> Robin.

Thanks,
Olek
  
Alexander Lobakin Jan. 29, 2024, 2:36 p.m. UTC | #10
From: Petr Tesařík <petr@tesarici.cz>
Date: Fri, 26 Jan 2024 19:48:19 +0100

> On Fri, 26 Jan 2024 17:21:24 +0000
> Robin Murphy <robin.murphy@arm.com> wrote:
> 
>> On 26/01/2024 4:45 pm, Alexander Lobakin wrote:
>>> From: Robin Murphy <robin.murphy@arm.com>
>>> Date: Fri, 26 Jan 2024 15:48:54 +0000
>>>   
>>>> On 26/01/2024 1:54 pm, Alexander Lobakin wrote:  
>>>>> From: Eric Dumazet <edumazet@google.com>
>>>>>
>>>>> Quite often, NIC devices do not need dma_sync operations on x86_64
>>>>> at least.
>>>>> Indeed, when dev_is_dma_coherent(dev) is true and
>>>>> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
>>>>> and friends do nothing.
>>>>>
>>>>> However, indirectly calling them when CONFIG_RETPOLINE=y consumes about
>>>>> 10% of cycles on a cpu receiving packets from softirq at ~100Gbit rate.
>>>>> Even if/when CONFIG_RETPOLINE is not set, there is a cost of about 3%.
>>>>>
>>>>> Add dev->skip_dma_sync boolean which is set during the device
>>>>> initialization depending on the setup: dev_is_dma_coherent() for direct
>>>>> DMA, !(sync_single_for_device || sync_single_for_cpu) or positive result
>>>>> from the new callback, dma_map_ops::can_skip_sync for non-NULL DMA ops.
>>>>> Then later, if/when swiotlb is used for the first time, the flag
>>>>> is turned off, from swiotlb_tbl_map_single().  
>>>>
>>>> I think you could probably just promote the dma_uses_io_tlb flag from
>>>> SWIOTLB_DYNAMIC to a general SWIOTLB thing to serve this purpose now.  
>>>
>>> Nice catch!
>>>   
>>>>
>>>> Similarly I don't think a new op is necessary now that we have
>>>> dma_map_ops.flags. A simple static flag to indicate that sync may be> skipped under the same conditions as implied for dma-direct - i.e.
>>>> dev_is_dma_coherent(dev) && !dev->dma_use_io_tlb - seems like it ought
>>>> to suffice.  
>>>
>>> In my initial implementation, I used a new dma_map_ops flag, but then I
>>> realized different DMA ops may require or not require syncing under
>>> different conditions, not only dev_is_dma_coherent().
>>> Or am I wrong and they would always be the same?  
>>
>> I think it's safe to assume that, as with P2P support, this will only 
>> matter for dma-direct and iommu-dma for the foreseeable future, and 
>> those do currently share the same conditions as above. Thus we may as 
>> well keep things simple for now, and if anything ever does have cause to 
>> change, it can be the future's problem to keep this mechanism working as 
>> intended.
> 
> Can we have a comment that states this assumption along with the flag?
> Because when it breaks, it will keep someone cursing for days why DMA
> sometimes fails on their device before they find out it's not synced.

BTW, dma_skip_sync is set right before driver->probe(), so that if any
problematic device appears, it could easily be fixed by adding one line
to its probe callback.

> And then wondering why the code makes such silly assumptions...
> 
> My two cents
> Petr T

Thanks,
Olek
  
Petr Tesařík Jan. 29, 2024, 4:15 p.m. UTC | #11
On Mon, 29 Jan 2024 15:36:35 +0100
Alexander Lobakin <aleksander.lobakin@intel.com> wrote:

> From: Petr Tesařík <petr@tesarici.cz>
> Date: Fri, 26 Jan 2024 19:48:19 +0100
> 
> > On Fri, 26 Jan 2024 17:21:24 +0000
> > Robin Murphy <robin.murphy@arm.com> wrote:
> >   
> >> On 26/01/2024 4:45 pm, Alexander Lobakin wrote:  
> >>> From: Robin Murphy <robin.murphy@arm.com>
> >>> Date: Fri, 26 Jan 2024 15:48:54 +0000
> >>>     
> >>>> On 26/01/2024 1:54 pm, Alexander Lobakin wrote:    
> >>>>> From: Eric Dumazet <edumazet@google.com>
> >>>>>
> >>>>> Quite often, NIC devices do not need dma_sync operations on x86_64
> >>>>> at least.
> >>>>> Indeed, when dev_is_dma_coherent(dev) is true and
> >>>>> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
> >>>>> and friends do nothing.
> >>>>>
> >>>>> However, indirectly calling them when CONFIG_RETPOLINE=y consumes about
> >>>>> 10% of cycles on a cpu receiving packets from softirq at ~100Gbit rate.
> >>>>> Even if/when CONFIG_RETPOLINE is not set, there is a cost of about 3%.
> >>>>>
> >>>>> Add dev->skip_dma_sync boolean which is set during the device
> >>>>> initialization depending on the setup: dev_is_dma_coherent() for direct
> >>>>> DMA, !(sync_single_for_device || sync_single_for_cpu) or positive result
> >>>>> from the new callback, dma_map_ops::can_skip_sync for non-NULL DMA ops.
> >>>>> Then later, if/when swiotlb is used for the first time, the flag
> >>>>> is turned off, from swiotlb_tbl_map_single().    
> >>>>
> >>>> I think you could probably just promote the dma_uses_io_tlb flag from
> >>>> SWIOTLB_DYNAMIC to a general SWIOTLB thing to serve this purpose now    
> >>>
> >>> Nice catch!
> >>>     
> >>>>
> >>>> Similarly I don't think a new op is necessary now that we have
> >>>> dma_map_ops.flags. A simple static flag to indicate that sync may be> skipped under the same conditions as implied for dma-direct - i.e.
> >>>> dev_is_dma_coherent(dev) && !dev->dma_use_io_tlb - seems like it ought
> >>>> to suffice.    
> >>>
> >>> In my initial implementation, I used a new dma_map_ops flag, but then I
> >>> realized different DMA ops may require or not require syncing under
> >>> different conditions, not only dev_is_dma_coherent().
> >>> Or am I wrong and they would always be the same?    
> >>
> >> I think it's safe to assume that, as with P2P support, this will only 
> >> matter for dma-direct and iommu-dma for the foreseeable future, and 
> >> those do currently share the same conditions as above. Thus we may as 
> >> well keep things simple for now, and if anything ever does have cause to 
> >> change, it can be the future's problem to keep this mechanism working as 
> >> intended.  
> > 
> > Can we have a comment that states this assumption along with the flag?
> > Because when it breaks, it will keep someone cursing for days why DMA
> > sometimes fails on their device before they find out it's not synced.  
> 
> BTW, dma_skip_sync is set right before driver->probe(), so that if any
> problematic device appears, it could easily be fixed by adding one line
> to its probe callback.

Ah, perfect!

Petr T
  

Patch

diff --git a/include/linux/device.h b/include/linux/device.h
index 97c4b046c09d..f23e6a32bea0 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -686,6 +686,8 @@  struct device_physical_location {
  *		other devices probe successfully.
  * @dma_coherent: this particular device is dma coherent, even if the
  *		architecture supports non-coherent devices.
+ * @dma_skip_sync: DMA sync operations can be skipped for coherent non-SWIOTLB
+ *		buffers.
  * @dma_ops_bypass: If set to %true then the dma_ops are bypassed for the
  *		streaming DMA operations (->map_* / ->unmap_* / ->sync_*),
  *		and optionall (if the coherent mask is large enough) also
@@ -800,6 +802,9 @@  struct device {
     defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
 	bool			dma_coherent:1;
 #endif
+#ifdef CONFIG_DMA_NEED_SYNC
+	bool			dma_skip_sync:1;
+#endif
 #ifdef CONFIG_DMA_OPS_BYPASS
 	bool			dma_ops_bypass : 1;
 #endif
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 4abc60f04209..937c295e9da8 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -78,6 +78,7 @@  struct dma_map_ops {
 			int nents, enum dma_data_direction dir);
 	void (*cache_sync)(struct device *dev, void *vaddr, size_t size,
 			enum dma_data_direction direction);
+	bool (*can_skip_sync)(struct device *dev);
 	int (*dma_supported)(struct device *dev, u64 mask);
 	u64 (*get_required_mask)(struct device *dev);
 	size_t (*max_mapping_size)(struct device *dev);
@@ -111,6 +112,22 @@  static inline void set_dma_ops(struct device *dev,
 }
 #endif /* CONFIG_DMA_OPS */
 
+#ifdef CONFIG_DMA_NEED_SYNC
+
+static inline void dma_set_skip_sync(struct device *dev, bool skip)
+{
+	dev->dma_skip_sync = skip;
+}
+
+void dma_setup_skip_sync(struct device *dev);
+
+#else /* !CONFIG_DMA_NEED_SYNC */
+
+#define dma_set_skip_sync(dev, skip)		do { } while (0)
+#define dma_setup_skip_sync(dev)		do { } while (0)
+
+#endif /* !CONFIG_DMA_NEED_SYNC */
+
 #ifdef CONFIG_DMA_CMA
 extern struct cma *dma_contiguous_default_area;
 
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 9dd7e1578bf6..bc9f67e0c139 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -365,9 +365,17 @@  __dma_sync_single_range_for_device(struct device *dev, dma_addr_t addr,
 
 #ifdef CONFIG_DMA_NEED_SYNC
 
-#define dma_skip_sync(dev)			false
+static inline bool dma_skip_sync(const struct device *dev)
+{
+	return dev->dma_skip_sync;
+}
+
+bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr);
 
-bool dma_need_sync(struct device *dev, dma_addr_t dma_addr);
+static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
+{
+	return dma_skip_sync(dev) ? false : __dma_need_sync(dev, dma_addr);
+}
 
 #else /* !CONFIG_DMA_NEED_SYNC */
 
diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 85152537dbf1..67ad3e1d51f6 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -642,6 +642,8 @@  static int really_probe(struct device *dev, struct device_driver *drv)
 			goto pinctrl_bind_failed;
 	}
 
+	dma_setup_skip_sync(dev);
+
 	ret = driver_sysfs_add(dev);
 	if (ret) {
 		pr_err("%s: driver_sysfs_add(%s) failed\n",
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index a30f37f9d4db..8fa464b3954e 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -842,15 +842,43 @@  size_t dma_opt_mapping_size(struct device *dev)
 EXPORT_SYMBOL_GPL(dma_opt_mapping_size);
 
 #ifdef CONFIG_DMA_NEED_SYNC
-bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
+bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	if (dma_map_direct(dev, ops))
+		/*
+		 * dma_skip_sync could've been set to false on first SWIOTLB
+		 * buffer mapping, but @dma_addr is not necessary an SWIOTLB
+		 * buffer. In this case, fall back to more granular check.
+		 */
 		return dma_direct_need_sync(dev, dma_addr);
-	return ops->sync_single_for_cpu || ops->sync_single_for_device;
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(__dma_need_sync);
+
+void dma_setup_skip_sync(struct device *dev)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+	bool skip;
+
+	if (dma_map_direct(dev, ops))
+		/*
+		 * dma_skip_sync will be set to false on first SWIOTLB buffer
+		 * mapping, if any. During the device initialization, it's
+		 * enough to check only for DMA coherence.
+		 */
+		skip = dev_is_dma_coherent(dev);
+	else if (!ops->sync_single_for_device && !ops->sync_single_for_cpu)
+		skip = true;
+	else if (ops->can_skip_sync)
+		skip = ops->can_skip_sync(dev);
+	else
+		skip = false;
+
+	dma_set_skip_sync(dev, skip);
 }
-EXPORT_SYMBOL_GPL(dma_need_sync);
 #endif /* CONFIG_DMA_NEED_SYNC */
 
 unsigned long dma_get_merge_boundary(struct device *dev)
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index b079a9a8e087..b62ea0a4f106 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -1286,6 +1286,16 @@  static unsigned long mem_used(struct io_tlb_mem *mem)
 
 #endif /* CONFIG_DEBUG_FS */
 
+static inline void swiotlb_disable_dma_skip_sync(struct device *dev)
+{
+	/*
+	 * If dma_skip_sync was set, reset it to false on first SWIOTLB buffer
+	 * mapping/allocation to always sync SWIOTLB buffers.
+	 */
+	if (unlikely(dma_skip_sync(dev)))
+		dma_set_skip_sync(dev, false);
+}
+
 phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 		size_t mapping_size, size_t alloc_size,
 		unsigned int alloc_align_mask, enum dma_data_direction dir,
@@ -1323,6 +1333,8 @@  phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 		return (phys_addr_t)DMA_MAPPING_ERROR;
 	}
 
+	swiotlb_disable_dma_skip_sync(dev);
+
 	/*
 	 * Save away the mapping from the original address to the DMA address.
 	 * This is needed when we sync the memory.  Then we sync the buffer if
@@ -1640,6 +1652,8 @@  struct page *swiotlb_alloc(struct device *dev, size_t size)
 	if (index == -1)
 		return NULL;
 
+	swiotlb_disable_dma_skip_sync(dev);
+
 	tlb_addr = slot_addr(pool->start, index);
 
 	return pfn_to_page(PFN_DOWN(tlb_addr));