[v2] dma-coherent: add support for multi coherent rmems per dev

Message ID 20240205072312.2342188-1-howardyen@google.com
State New
Headers
Series [v2] dma-coherent: add support for multi coherent rmems per dev |

Commit Message

Howard Yen Feb. 5, 2024, 7:23 a.m. UTC
  Add support for multiple coherent rmems per device. This patch replaces the
original dma_mem pointer with a dma_mems list in the device structure to store
multiple rmems.

These rmems can be assigned to the device one by one using
of_reserved_mem_device_init_by_idx(), with the memory-region
declaration in the device tree as shown below; each rmem is stored in the
dma_mems list.

	device1@0 {
		...
		memory-region = <&reserved_mem0>, <&reserved_mem1>;
		...
	};

When the driver tries to allocate memory from the rmems, the allocator looks
for the first available rmem and allocates the memory from that rmem.

Then, if the driver is removed, of_reserved_mem_device_release() needs to be
invoked to release all the rmems assigned to the device.

Signed-off-by: Howard Yen <howardyen@google.com>
---
Changelog since v1:
(suggested by Robin Murphy <robin.murphy@arm.com>)
- Replace the pointer (dma_mem) with a list_head (dma_mems) in the device
  structure and initialize the list_head in device_initialize().
- Modify the required changes in coherent.c.

 drivers/base/core.c    |  3 ++
 include/linux/device.h |  5 ++--
 kernel/dma/coherent.c  | 66 ++++++++++++++++++++++++++++--------------
 3 files changed, 50 insertions(+), 24 deletions(-)
  

Comments

Andy Shevchenko Feb. 5, 2024, 12:08 p.m. UTC | #1
On Mon, Feb 05, 2024 at 07:23:00AM +0000, Howard Yen wrote:
> Add support for multiple coherent rmems per device. This patch replaces
> original dma_mem with dma_mems list in device structure to store multiple
> rmems.
> 
> These multiple rmems can be assigned to the device one by one by
> of_reserved_mem_device_init_by_idx() with the memory-region
> declaration in device tree as below and store the rmem to the dma_mems
> list.
> 
> 	device1@0 {
> 		...
> 		memory-region = <&reserved_mem0>, <&reserved_mem1>;
> 		...
> 	};
> 
> When driver tries to allocate memory from the rmems, looks for the first
> available rmem and allocates the memory from this rmem.
> 
> Then if driver removed, of_reserved_mem_device_release() needs to be
> invoked to release all the rmems assigned to the device.

..

> --- a/kernel/dma/coherent.c
> +++ b/kernel/dma/coherent.c
> @@ -18,15 +18,9 @@ struct dma_coherent_mem {
>  	unsigned long	*bitmap;
>  	spinlock_t	spinlock;
>  	bool		use_dev_dma_pfn_offset;
> +	struct list_head	node;

Have you run `pahole`? Here I see wasted bytes for nothing.

>  };

..

>  void dma_release_coherent_memory(struct device *dev)
>  {
> +	struct dma_coherent_mem *mem_tmp, *q;
> +
>  	if (dev) {

While at it, perhaps

	if (!dev)
		return;

?

> -		_dma_release_coherent_memory(dev->dma_mem);
> -		dev->dma_mem = NULL;
> +		list_for_each_entry_safe(mem_tmp, q, &dev->dma_mems, node) {
> +			list_del_init(&mem_tmp->node);
> +			_dma_release_coherent_memory(mem_tmp);
> +		}
>  	}
>  }

..

>  int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr)
>  {
> -	struct dma_coherent_mem *mem = dev_get_coherent_memory(dev);
> +	struct dma_coherent_mem *mem_tmp;
> +	int ret = 0;

'ret' (1)

> -	return __dma_release_from_coherent(mem, order, vaddr);
> +	list_for_each_entry(mem_tmp, &dev->dma_mems, node) {
> +		ret = __dma_release_from_coherent(mem_tmp, order, vaddr);
> +		if (ret == 1)
> +			break;
> +	}
> +
> +	return ret;
>  }

..

>  int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma,
>  			   void *vaddr, size_t size, int *ret)
>  {
> -	struct dma_coherent_mem *mem = dev_get_coherent_memory(dev);
> +	struct dma_coherent_mem *mem_tmp;
> +	int retval = 0;

'retval' (2)

Can we be consistent, please? (See (1) and (2) above.)

> +	list_for_each_entry(mem_tmp, &dev->dma_mems, node) {
> +		retval = __dma_mmap_from_coherent(mem_tmp, vma, vaddr, size, ret);
> +		if (retval == 1)
> +			break;
> +	}
>  
> -	return __dma_mmap_from_coherent(mem, vma, vaddr, size, ret);
> +	return retval;
>  }

..

>  static void rmem_dma_device_release(struct reserved_mem *rmem,
>  				    struct device *dev)
>  {
> -	if (dev)
> -		dev->dma_mem = NULL;
> +	struct dma_coherent_mem *mem_tmp, *q;

> +	if (dev) {

As per above, esp. taking into account that you touch this line. With the
proposed modification you won't need to.

> +		list_for_each_entry_safe(mem_tmp, q, &dev->dma_mems, node) {
> +			if (mem_tmp == rmem->priv) {
> +				list_del_init(&mem_tmp->node);
> +				break;
> +			}
> +		}
> +	}
>  }

Better question, do we really need the dev check (at least in static functions)
or can it be omitted?
  
Andy Shevchenko Feb. 6, 2024, 3:43 p.m. UTC | #2
On Mon, Feb 05, 2024 at 02:08:00PM +0200, Andy Shevchenko wrote:
> On Mon, Feb 05, 2024 at 07:23:00AM +0000, Howard Yen wrote:

..

> > @@ -18,15 +18,9 @@ struct dma_coherent_mem {
> >  	unsigned long	*bitmap;
> >  	spinlock_t	spinlock;
> >  	bool		use_dev_dma_pfn_offset;
> > +	struct list_head	node;
> 
> Have you run `pahole`? Here I see wasted bytes for nothing.

On top of that, one may make container_of() a no-op by placing this member
first. But double check this with bloat-o-meter (that it indeed results in
better code generation) and, on the other hand, check that the current first
member is not performance critical and that the additional pointer arithmetic
is okay.

> >  };
  
Howard Yen Feb. 8, 2024, 7:53 a.m. UTC | #3
On Tue, Feb 6, 2024 at 11:43 PM Andy Shevchenko
<andriy.shevchenko@linux.intel.com> wrote:
>
> On Mon, Feb 05, 2024 at 02:08:00PM +0200, Andy Shevchenko wrote:
> > On Mon, Feb 05, 2024 at 07:23:00AM +0000, Howard Yen wrote:
>
> ...
>
> > > @@ -18,15 +18,9 @@ struct dma_coherent_mem {
> > >     unsigned long   *bitmap;
> > >     spinlock_t      spinlock;
> > >     bool            use_dev_dma_pfn_offset;
> > > +   struct list_head        node;
> >
> > Have you run `pahole`? Here I see wasted bytes for nothing.
>
> On top of that one may make container_of() to be no-op, by placing this member
> to be the first one. But, double check this with bloat-o-meter (that it indeed
> does better code generation) and on the other hand check if the current first
> member is not performance critical and having additional pointer arithmetics is
> okay.
>
> > >  };
>
>
> --
> With Best Regards,
> Andy Shevchenko
>
>

I'm trying to re-org the members as below

from ===>

struct dma_coherent_mem {
void *                     virt_base;            /*     0     8 */
dma_addr_t                 device_base;          /*     8     8 */
unsigned long              pfn_base;             /*    16     8 */
int                        size;                 /*    24     4 */

/* XXX 4 bytes hole, try to pack */

unsigned long *            bitmap;               /*    32     8 */
spinlock_t                 spinlock;             /*    40     4 */
bool                       use_dev_dma_pfn_offset; /*    44     1 */

/* XXX 3 bytes hole, try to pack */

struct list_head           node;                 /*    48    16 */

/* size: 64, cachelines: 1, members: 8 */
/* sum members: 57, holes: 2, sum holes: 7 */
};


to ===>

struct dma_coherent_mem {
struct list_head           node;                 /*     0    16 */
void *                     virt_base;            /*    16     8 */
dma_addr_t                 device_base;          /*    24     8 */
unsigned long              pfn_base;             /*    32     8 */
int                        size;                 /*    40     4 */
spinlock_t                 spinlock;             /*    44     4 */
unsigned long *            bitmap;               /*    48     8 */
bool                       use_dev_dma_pfn_offset; /*    56     1 */

/* size: 64, cachelines: 1, members: 8 */
/* padding: 7 */
};

Looks like there are about 7 bytes of padding at the end of the structure.
Should I add __attribute__((__packed__)) to avoid the padding?
  
Andy Shevchenko Feb. 8, 2024, 10:56 a.m. UTC | #4
On Thu, Feb 08, 2024 at 03:53:37PM +0800, Howard Yen wrote:
> On Tue, Feb 6, 2024 at 11:43 PM Andy Shevchenko
> <andriy.shevchenko@linux.intel.com> wrote:
> > On Mon, Feb 05, 2024 at 02:08:00PM +0200, Andy Shevchenko wrote:
> > > On Mon, Feb 05, 2024 at 07:23:00AM +0000, Howard Yen wrote:

..

> > > > @@ -18,15 +18,9 @@ struct dma_coherent_mem {
> > > >     unsigned long   *bitmap;
> > > >     spinlock_t      spinlock;
> > > >     bool            use_dev_dma_pfn_offset;
> > > > +   struct list_head        node;
> > >
> > > Have you run `pahole`? Here I see wasted bytes for nothing.
> >
> > On top of that one may make container_of() to be no-op, by placing this member
> > to be the first one. But, double check this with bloat-o-meter (that it indeed
> > does better code generation) and on the other hand check if the current first
> > member is not performance critical and having additional pointer arithmetics is
> > okay.
> >
> > > >  };
> 
> I'm trying to re-org the members as below
> 
> from ===>
> 
> struct dma_coherent_mem {
> void *                     virt_base;            /*     0     8 */
> dma_addr_t                 device_base;          /*     8     8 */
> unsigned long              pfn_base;             /*    16     8 */
> int                        size;                 /*    24     4 */
> 
> /* XXX 4 bytes hole, try to pack */
> 
> unsigned long *            bitmap;               /*    32     8 */
> spinlock_t                 spinlock;             /*    40     4 */
> bool                       use_dev_dma_pfn_offset; /*    44     1 */
> 
> /* XXX 3 bytes hole, try to pack */
> 
> struct list_head           node;                 /*    48    16 */
> 
> /* size: 64, cachelines: 1, members: 8 */
> /* sum members: 57, holes: 2, sum holes: 7 */
> };
> 
> 
> to ===>
> 
> struct dma_coherent_mem {
> struct list_head           node;                 /*     0    16 */
> void *                     virt_base;            /*    16     8 */
> dma_addr_t                 device_base;          /*    24     8 */
> unsigned long              pfn_base;             /*    32     8 */
> int                        size;                 /*    40     4 */
> spinlock_t                 spinlock;             /*    44     4 */
> unsigned long *            bitmap;               /*    48     8 */
> bool                       use_dev_dma_pfn_offset; /*    56     1 */
> 
> /* size: 64, cachelines: 1, members: 8 */
> /* padding: 7 */

Which seems better than above, right?

> };
> 
> Looks like there is about 7 bytes padding at the end of the structure.
> Should I add __attribute__((__packed__)) to not add the padding?

No, __packed is about alignment and may give you much worse code generation.
With the list_head member first you might get better code than the original;
check it with bloat-o-meter.
  
Howard Yen Feb. 8, 2024, 2:57 p.m. UTC | #5
On Thu, Feb 8, 2024 at 6:56 PM Andy Shevchenko
<andriy.shevchenko@linux.intel.com> wrote:
>
> On Thu, Feb 08, 2024 at 03:53:37PM +0800, Howard Yen wrote:
> > On Tue, Feb 6, 2024 at 11:43 PM Andy Shevchenko
> > <andriy.shevchenko@linux.intel.com> wrote:
> > > On Mon, Feb 05, 2024 at 02:08:00PM +0200, Andy Shevchenko wrote:
> > > > On Mon, Feb 05, 2024 at 07:23:00AM +0000, Howard Yen wrote:
>
> ...
>
> > > > > @@ -18,15 +18,9 @@ struct dma_coherent_mem {
> > > > >     unsigned long   *bitmap;
> > > > >     spinlock_t      spinlock;
> > > > >     bool            use_dev_dma_pfn_offset;
> > > > > +   struct list_head        node;
> > > >
> > > > Have you run `pahole`? Here I see wasted bytes for nothing.
> > >
> > > On top of that one may make container_of() to be no-op, by placing this member
> > > to be the first one. But, double check this with bloat-o-meter (that it indeed
> > > does better code generation) and on the other hand check if the current first
> > > member is not performance critical and having additional pointer arithmetics is
> > > okay.
> > >
> > > > >  };
> >
> > I'm trying to re-org the members as below
> >
> > from ===>
> >
> > struct dma_coherent_mem {
> > void *                     virt_base;            /*     0     8 */
> > dma_addr_t                 device_base;          /*     8     8 */
> > unsigned long              pfn_base;             /*    16     8 */
> > int                        size;                 /*    24     4 */
> >
> > /* XXX 4 bytes hole, try to pack */
> >
> > unsigned long *            bitmap;               /*    32     8 */
> > spinlock_t                 spinlock;             /*    40     4 */
> > bool                       use_dev_dma_pfn_offset; /*    44     1 */
> >
> > /* XXX 3 bytes hole, try to pack */
> >
> > struct list_head           node;                 /*    48    16 */
> >
> > /* size: 64, cachelines: 1, members: 8 */
> > /* sum members: 57, holes: 2, sum holes: 7 */
> > };
> >
> >
> > to ===>
> >
> > struct dma_coherent_mem {
> > struct list_head           node;                 /*     0    16 */
> > void *                     virt_base;            /*    16     8 */
> > dma_addr_t                 device_base;          /*    24     8 */
> > unsigned long              pfn_base;             /*    32     8 */
> > int                        size;                 /*    40     4 */
> > spinlock_t                 spinlock;             /*    44     4 */
> > unsigned long *            bitmap;               /*    48     8 */
> > bool                       use_dev_dma_pfn_offset; /*    56     1 */
> >
> > /* size: 64, cachelines: 1, members: 8 */
> > /* padding: 7 */
>
> Which seems better that above, right?
>
> > };
> >
> > Looks like there is about 7 bytes padding at the end of the structure.
> > Should I add __attribute__((__packed__)) to not add the padding?
>
> No, __packed is about alignment, may give you much worse code generation.
> With list_head member first you might get better code from the original,
> check it with bloat-o-meter.
>
> --
> With Best Regards,
> Andy Shevchenko
>
>

According to the bloat-o-meter result, there is a total reduction of about
3.38% compared to the original version. Thanks for the suggestion!

add/remove: 0/0 grow/shrink: 0/7 up/down: 0/-60 (-60)
Function                                     old     new   delta
rmem_dma_device_release                      104     100      -4
dma_release_from_dev_coherent                184     180      -4
dma_release_coherent_memory                  144     140      -4
dma_mmap_from_dev_coherent                   228     224      -4
dma_init_coherent_memory                     292     284      -8
rmem_dma_device_init                         168     152     -16
dma_declare_coherent_memory                  184     164     -20
Total: Before=1776, After=1716, chg -3.38%
add/remove: 0/0 grow/shrink: 0/0 up/down: 0/0 (0)
Data                                         old     new   delta
Total: Before=0, After=0, chg +0.00%
add/remove: 0/0 grow/shrink: 0/0 up/down: 0/0 (0)
RO Data                                      old     new   delta
Total: Before=216, After=216, chg +0.00%

As for the dev check raised in the previous comment: the functions are static,
but they are assigned to ops function pointers, so I think the check is
required because they might be invoked from elsewhere.

I'll submit v3 with the member reorganization, the return variable naming
changes and the `if (!dev) return;` change.
  

Patch

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 14d46af40f9a..d9af38d7b870 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -3108,6 +3108,9 @@  void device_initialize(struct device *dev)
     defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
     defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
 	dev->dma_coherent = dma_default_coherent;
+#endif
+#ifdef CONFIG_DMA_DECLARE_COHERENT
+	INIT_LIST_HEAD(&dev->dma_mems);
 #endif
 	swiotlb_dev_init(dev);
 }
diff --git a/include/linux/device.h b/include/linux/device.h
index 97c4b046c09d..5fa15e5adbdc 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -648,7 +648,7 @@  struct device_physical_location {
  * @dma_parms:	A low level driver may set these to teach IOMMU code about
  * 		segment limitations.
  * @dma_pools:	Dma pools (if dma'ble device).
- * @dma_mem:	Internal for coherent mem override.
+ * @dma_mems:	Internal for coherent mems.
  * @cma_area:	Contiguous memory area for dma allocations
  * @dma_io_tlb_mem: Software IO TLB allocator.  Not for driver use.
  * @dma_io_tlb_pools:	List of transient swiotlb memory pools.
@@ -749,8 +749,7 @@  struct device {
 	struct list_head	dma_pools;	/* dma pools (if dma'ble) */
 
 #ifdef CONFIG_DMA_DECLARE_COHERENT
-	struct dma_coherent_mem	*dma_mem; /* internal for coherent mem
-					     override */
+	struct list_head	dma_mems; /* Internal for coherent mems */
 #endif
 #ifdef CONFIG_DMA_CMA
 	struct cma *cma_area;		/* contiguous memory area for dma
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index ff5683a57f77..91aa63d3327b 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -18,15 +18,9 @@  struct dma_coherent_mem {
 	unsigned long	*bitmap;
 	spinlock_t	spinlock;
 	bool		use_dev_dma_pfn_offset;
+	struct list_head	node;
 };
 
-static inline struct dma_coherent_mem *dev_get_coherent_memory(struct device *dev)
-{
-	if (dev && dev->dma_mem)
-		return dev->dma_mem;
-	return NULL;
-}
-
 static inline dma_addr_t dma_get_device_base(struct device *dev,
 					     struct dma_coherent_mem * mem)
 {
@@ -61,6 +55,7 @@  static struct dma_coherent_mem *dma_init_coherent_memory(phys_addr_t phys_addr,
 	dma_mem->pfn_base = PFN_DOWN(phys_addr);
 	dma_mem->size = pages;
 	dma_mem->use_dev_dma_pfn_offset = use_dma_pfn_offset;
+	INIT_LIST_HEAD(&dma_mem->node);
 	spin_lock_init(&dma_mem->spinlock);
 
 	return dma_mem;
@@ -90,10 +85,8 @@  static int dma_assign_coherent_memory(struct device *dev,
 	if (!dev)
 		return -ENODEV;
 
-	if (dev->dma_mem)
-		return -EBUSY;
+	list_add_tail(&mem->node, &dev->dma_mems);
 
-	dev->dma_mem = mem;
 	return 0;
 }
 
@@ -132,9 +125,13 @@  int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
 
 void dma_release_coherent_memory(struct device *dev)
 {
+	struct dma_coherent_mem *mem_tmp, *q;
+
 	if (dev) {
-		_dma_release_coherent_memory(dev->dma_mem);
-		dev->dma_mem = NULL;
+		list_for_each_entry_safe(mem_tmp, q, &dev->dma_mems, node) {
+			list_del_init(&mem_tmp->node);
+			_dma_release_coherent_memory(mem_tmp);
+		}
 	}
 }
 
@@ -187,12 +184,17 @@  static void *__dma_alloc_from_coherent(struct device *dev,
 int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size,
 		dma_addr_t *dma_handle, void **ret)
 {
-	struct dma_coherent_mem *mem = dev_get_coherent_memory(dev);
+	struct dma_coherent_mem *mem_tmp;
 
-	if (!mem)
+	if (list_empty(&dev->dma_mems))
 		return 0;
 
-	*ret = __dma_alloc_from_coherent(dev, mem, size, dma_handle);
+	list_for_each_entry(mem_tmp, &dev->dma_mems, node) {
+		*ret = __dma_alloc_from_coherent(dev, mem_tmp, size, dma_handle);
+		if (*ret)
+			break;
+	}
+
 	return 1;
 }
 
@@ -226,9 +228,16 @@  static int __dma_release_from_coherent(struct dma_coherent_mem *mem,
  */
 int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr)
 {
-	struct dma_coherent_mem *mem = dev_get_coherent_memory(dev);
+	struct dma_coherent_mem *mem_tmp;
+	int ret = 0;
 
-	return __dma_release_from_coherent(mem, order, vaddr);
+	list_for_each_entry(mem_tmp, &dev->dma_mems, node) {
+		ret = __dma_release_from_coherent(mem_tmp, order, vaddr);
+		if (ret == 1)
+			break;
+	}
+
+	return ret;
 }
 
 static int __dma_mmap_from_coherent(struct dma_coherent_mem *mem,
@@ -271,9 +280,16 @@  static int __dma_mmap_from_coherent(struct dma_coherent_mem *mem,
 int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma,
 			   void *vaddr, size_t size, int *ret)
 {
-	struct dma_coherent_mem *mem = dev_get_coherent_memory(dev);
+	struct dma_coherent_mem *mem_tmp;
+	int retval = 0;
+
+	list_for_each_entry(mem_tmp, &dev->dma_mems, node) {
+		retval = __dma_mmap_from_coherent(mem_tmp, vma, vaddr, size, ret);
+		if (retval == 1)
+			break;
+	}
 
-	return __dma_mmap_from_coherent(mem, vma, vaddr, size, ret);
+	return retval;
 }
 
 #ifdef CONFIG_DMA_GLOBAL_POOL
@@ -351,8 +367,16 @@  static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev)
 static void rmem_dma_device_release(struct reserved_mem *rmem,
 				    struct device *dev)
 {
-	if (dev)
-		dev->dma_mem = NULL;
+	struct dma_coherent_mem *mem_tmp, *q;
+
+	if (dev) {
+		list_for_each_entry_safe(mem_tmp, q, &dev->dma_mems, node) {
+			if (mem_tmp == rmem->priv) {
+				list_del_init(&mem_tmp->node);
+				break;
+			}
+		}
+	}
 }
 
 static const struct reserved_mem_ops rmem_dma_ops = {