[v2] dma-coherent: add support for multi coherent rmems per dev
Commit Message
Add support for multiple coherent rmems per device. This patch replaces
the original dma_mem pointer with a dma_mems list in the device structure
so that multiple rmems can be stored.
These rmems can be assigned to the device one by one with
of_reserved_mem_device_init_by_idx(), using a memory-region
declaration in the device tree as shown below; each assigned rmem is
added to the dma_mems list.
device1@0 {
...
memory-region = <&reserved_mem0>, <&reserved_mem1>;
...
};
When a driver tries to allocate memory from the rmems, the allocator looks
for the first available rmem and allocates the memory from that rmem.
When the driver is removed, of_reserved_mem_device_release() needs to be
invoked to release all the rmems assigned to the device.
Signed-off-by: Howard Yen <howardyen@google.com>
---
Changelog since v1:
(suggested by Robin Murphy <robin.murphy@arm.com>)
- Replace the pointer (dma_mem) with a list_head (dma_mems) in the device
structure and initialize the list_head in device_initialize().
- Modify the required changes in coherent.c.
drivers/base/core.c | 3 ++
include/linux/device.h | 5 ++--
kernel/dma/coherent.c | 66 ++++++++++++++++++++++++++++--------------
3 files changed, 50 insertions(+), 24 deletions(-)
Comments
On Mon, Feb 05, 2024 at 07:23:00AM +0000, Howard Yen wrote:
> Add support for multiple coherent rmems per device. This patch replaces
> original dma_mem with dma_mems list in device structure to store multiple
> rmems.
>
> These multiple rmems can be assigned to the device one by one by
> of_reserved_mem_device_init_by_idx() with the memory-region
> declaration in device tree as below and store the rmem to the dma_mems
> list.
>
> device1@0 {
> ...
> memory-region = <&reserved_mem0>, <&reserved_mem1>;
> ...
> };
>
> When driver tries to allocate memory from the rmems, looks for the first
> available rmem and allocates the memory from this rmem.
>
> Then if driver removed, of_reserved_mem_device_release() needs to be
> invoked to release all the rmems assigned to the device.
..
> --- a/kernel/dma/coherent.c
> +++ b/kernel/dma/coherent.c
> @@ -18,15 +18,9 @@ struct dma_coherent_mem {
> unsigned long *bitmap;
> spinlock_t spinlock;
> bool use_dev_dma_pfn_offset;
> + struct list_head node;
Have you run `pahole`? Here I see wasted bytes for nothing.
> };
..
> void dma_release_coherent_memory(struct device *dev)
> {
> + struct dma_coherent_mem *mem_tmp, *q;
> +
> if (dev) {
While at it, perhaps
if (!dev)
return;
?
> - _dma_release_coherent_memory(dev->dma_mem);
> - dev->dma_mem = NULL;
> + list_for_each_entry_safe(mem_tmp, q, &dev->dma_mems, node) {
> + list_del_init(&mem_tmp->node);
> + _dma_release_coherent_memory(mem_tmp);
> + }
> }
> }
..
> int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr)
> {
> - struct dma_coherent_mem *mem = dev_get_coherent_memory(dev);
> + struct dma_coherent_mem *mem_tmp;
> + int ret = 0;
'ret' (1)
> - return __dma_release_from_coherent(mem, order, vaddr);
> + list_for_each_entry(mem_tmp, &dev->dma_mems, node) {
> + ret = __dma_release_from_coherent(mem_tmp, order, vaddr);
> + if (ret == 1)
> + break;
> + }
> +
> + return ret;
> }
..
> int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma,
> void *vaddr, size_t size, int *ret)
> {
> - struct dma_coherent_mem *mem = dev_get_coherent_memory(dev);
> + struct dma_coherent_mem *mem_tmp;
> + int retval = 0;
'retval' (2)
Can we be consistent, please? (See (1) and (2) above.)
> + list_for_each_entry(mem_tmp, &dev->dma_mems, node) {
> + retval = __dma_mmap_from_coherent(mem_tmp, vma, vaddr, size, ret);
> + if (retval == 1)
> + break;
> + }
>
> - return __dma_mmap_from_coherent(mem, vma, vaddr, size, ret);
> + return retval;
> }
..
> static void rmem_dma_device_release(struct reserved_mem *rmem,
> struct device *dev)
> {
> - if (dev)
> - dev->dma_mem = NULL;
> + struct dma_coherent_mem *mem_tmp, *q;
> + if (dev) {
As per above, esp. taking into account that you touch this line. With proposed
modification you won't need to.
> + list_for_each_entry_safe(mem_tmp, q, &dev->dma_mems, node) {
> + if (mem_tmp == rmem->priv) {
> + list_del_init(&mem_tmp->node);
> + break;
> + }
> + }
> + }
> }
Better question: do we really need the dev check (at least in static functions),
or can it be omitted?
On Mon, Feb 05, 2024 at 02:08:00PM +0200, Andy Shevchenko wrote:
> On Mon, Feb 05, 2024 at 07:23:00AM +0000, Howard Yen wrote:
..
> > @@ -18,15 +18,9 @@ struct dma_coherent_mem {
> > unsigned long *bitmap;
> > spinlock_t spinlock;
> > bool use_dev_dma_pfn_offset;
> > + struct list_head node;
>
> Have you run `pahole`? Here I see wasted bytes for nothing.
On top of that, one may make container_of() a no-op by placing this member
first. But double-check this with bloat-o-meter (that it indeed
gives better code generation) and, on the other hand, check that the current first
member is not performance critical and that the additional pointer arithmetic is
okay.
> > };
On Tue, Feb 6, 2024 at 11:43 PM Andy Shevchenko
<andriy.shevchenko@linux.intel.com> wrote:
>
> On Mon, Feb 05, 2024 at 02:08:00PM +0200, Andy Shevchenko wrote:
> > On Mon, Feb 05, 2024 at 07:23:00AM +0000, Howard Yen wrote:
>
> ...
>
> > > @@ -18,15 +18,9 @@ struct dma_coherent_mem {
> > > unsigned long *bitmap;
> > > spinlock_t spinlock;
> > > bool use_dev_dma_pfn_offset;
> > > + struct list_head node;
> >
> > Have you run `pahole`? Here I see wasted bytes for nothing.
>
> On top of that one may make container_of() to be no-op, by placing this member
> to be the first one. But, double check this with bloat-o-meter (that it indeed
> does better code generation) and on the other hand check if the current first
> member is not performance critical and having additional pointer arithmetics is
> okay.
>
> > > };
>
>
> --
> With Best Regards,
> Andy Shevchenko
>
>
I'm trying to re-org the members as below
from ===>
struct dma_coherent_mem {
void * virt_base; /* 0 8 */
dma_addr_t device_base; /* 8 8 */
unsigned long pfn_base; /* 16 8 */
int size; /* 24 4 */
/* XXX 4 bytes hole, try to pack */
unsigned long * bitmap; /* 32 8 */
spinlock_t spinlock; /* 40 4 */
bool use_dev_dma_pfn_offset; /* 44 1 */
/* XXX 3 bytes hole, try to pack */
struct list_head node; /* 48 16 */
/* size: 64, cachelines: 1, members: 8 */
/* sum members: 57, holes: 2, sum holes: 7 */
};
to ===>
struct dma_coherent_mem {
struct list_head node; /* 0 16 */
void * virt_base; /* 16 8 */
dma_addr_t device_base; /* 24 8 */
unsigned long pfn_base; /* 32 8 */
int size; /* 40 4 */
spinlock_t spinlock; /* 44 4 */
unsigned long * bitmap; /* 48 8 */
bool use_dev_dma_pfn_offset; /* 56 1 */
/* size: 64, cachelines: 1, members: 8 */
/* padding: 7 */
};
It looks like there are 7 bytes of padding at the end of the structure.
Should I add __attribute__((__packed__)) to remove the padding?
On Thu, Feb 08, 2024 at 03:53:37PM +0800, Howard Yen wrote:
> On Tue, Feb 6, 2024 at 11:43 PM Andy Shevchenko
> <andriy.shevchenko@linux.intel.com> wrote:
> > On Mon, Feb 05, 2024 at 02:08:00PM +0200, Andy Shevchenko wrote:
> > > On Mon, Feb 05, 2024 at 07:23:00AM +0000, Howard Yen wrote:
..
> > > > @@ -18,15 +18,9 @@ struct dma_coherent_mem {
> > > > unsigned long *bitmap;
> > > > spinlock_t spinlock;
> > > > bool use_dev_dma_pfn_offset;
> > > > + struct list_head node;
> > >
> > > Have you run `pahole`? Here I see wasted bytes for nothing.
> >
> > On top of that one may make container_of() to be no-op, by placing this member
> > to be the first one. But, double check this with bloat-o-meter (that it indeed
> > does better code generation) and on the other hand check if the current first
> > member is not performance critical and having additional pointer arithmetics is
> > okay.
> >
> > > > };
>
> I'm trying to re-org the members as below
>
> from ===>
>
> struct dma_coherent_mem {
> void * virt_base; /* 0 8 */
> dma_addr_t device_base; /* 8 8 */
> unsigned long pfn_base; /* 16 8 */
> int size; /* 24 4 */
>
> /* XXX 4 bytes hole, try to pack */
>
> unsigned long * bitmap; /* 32 8 */
> spinlock_t spinlock; /* 40 4 */
> bool use_dev_dma_pfn_offset; /* 44 1 */
>
> /* XXX 3 bytes hole, try to pack */
>
> struct list_head node; /* 48 16 */
>
> /* size: 64, cachelines: 1, members: 8 */
> /* sum members: 57, holes: 2, sum holes: 7 */
> };
>
>
> to ===>
>
> struct dma_coherent_mem {
> struct list_head node; /* 0 16 */
> void * virt_base; /* 16 8 */
> dma_addr_t device_base; /* 24 8 */
> unsigned long pfn_base; /* 32 8 */
> int size; /* 40 4 */
> spinlock_t spinlock; /* 44 4 */
> unsigned long * bitmap; /* 48 8 */
> bool use_dev_dma_pfn_offset; /* 56 1 */
>
> /* size: 64, cachelines: 1, members: 8 */
> /* padding: 7 */
Which seems better than the above, right?
> };
>
> Looks like there is about 7 bytes padding at the end of the structure.
> Should I add __attribute__((__packed__)) to not add the padding?
No, __packed is about alignment and may give you much worse code generation.
With the list_head member first you might get better code than the original;
check it with bloat-o-meter.
On Thu, Feb 8, 2024 at 6:56 PM Andy Shevchenko
<andriy.shevchenko@linux.intel.com> wrote:
>
> On Thu, Feb 08, 2024 at 03:53:37PM +0800, Howard Yen wrote:
> > On Tue, Feb 6, 2024 at 11:43 PM Andy Shevchenko
> > <andriy.shevchenko@linux.intel.com> wrote:
> > > On Mon, Feb 05, 2024 at 02:08:00PM +0200, Andy Shevchenko wrote:
> > > > On Mon, Feb 05, 2024 at 07:23:00AM +0000, Howard Yen wrote:
>
> ...
>
> > > > > @@ -18,15 +18,9 @@ struct dma_coherent_mem {
> > > > > unsigned long *bitmap;
> > > > > spinlock_t spinlock;
> > > > > bool use_dev_dma_pfn_offset;
> > > > > + struct list_head node;
> > > >
> > > > Have you run `pahole`? Here I see wasted bytes for nothing.
> > >
> > > On top of that one may make container_of() to be no-op, by placing this member
> > > to be the first one. But, double check this with bloat-o-meter (that it indeed
> > > does better code generation) and on the other hand check if the current first
> > > member is not performance critical and having additional pointer arithmetics is
> > > okay.
> > >
> > > > > };
> >
> > I'm trying to re-org the members as below
> >
> > from ===>
> >
> > struct dma_coherent_mem {
> > void * virt_base; /* 0 8 */
> > dma_addr_t device_base; /* 8 8 */
> > unsigned long pfn_base; /* 16 8 */
> > int size; /* 24 4 */
> >
> > /* XXX 4 bytes hole, try to pack */
> >
> > unsigned long * bitmap; /* 32 8 */
> > spinlock_t spinlock; /* 40 4 */
> > bool use_dev_dma_pfn_offset; /* 44 1 */
> >
> > /* XXX 3 bytes hole, try to pack */
> >
> > struct list_head node; /* 48 16 */
> >
> > /* size: 64, cachelines: 1, members: 8 */
> > /* sum members: 57, holes: 2, sum holes: 7 */
> > };
> >
> >
> > to ===>
> >
> > struct dma_coherent_mem {
> > struct list_head node; /* 0 16 */
> > void * virt_base; /* 16 8 */
> > dma_addr_t device_base; /* 24 8 */
> > unsigned long pfn_base; /* 32 8 */
> > int size; /* 40 4 */
> > spinlock_t spinlock; /* 44 4 */
> > unsigned long * bitmap; /* 48 8 */
> > bool use_dev_dma_pfn_offset; /* 56 1 */
> >
> > /* size: 64, cachelines: 1, members: 8 */
> > /* padding: 7 */
>
> Which seems better that above, right?
>
> > };
> >
> > Looks like there is about 7 bytes padding at the end of the structure.
> > Should I add __attribute__((__packed__)) to not add the padding?
>
> No, __packed is about alignment, may give you much worse code generation.
> With list_head member first you might get better code from the original,
> check it with bloat-o-meter.
>
> --
> With Best Regards,
> Andy Shevchenko
>
>
According to the bloat-o-meter results, there is a total reduction of
about 3.38% compared to the
original version. Thanks for the suggestion!
add/remove: 0/0 grow/shrink: 0/7 up/down: 0/-60 (-60)
Function old new delta
rmem_dma_device_release 104 100 -4
dma_release_from_dev_coherent 184 180 -4
dma_release_coherent_memory 144 140 -4
dma_mmap_from_dev_coherent 228 224 -4
dma_init_coherent_memory 292 284 -8
rmem_dma_device_init 168 152 -16
dma_declare_coherent_memory 184 164 -20
Total: Before=1776, After=1716, chg -3.38%
add/remove: 0/0 grow/shrink: 0/0 up/down: 0/0 (0)
Data old new delta
Total: Before=0, After=0, chg +0.00%
add/remove: 0/0 grow/shrink: 0/0 up/down: 0/0 (0)
RO Data old new delta
Total: Before=216, After=216, chg +0.00%
Regarding the dev check raised in the previous comment: those checks are in
static functions that are assigned
to ops function pointers, so I think the check is required because the
functions might be invoked from
elsewhere.
I'll submit v3 with the member reorg, consistent return variable naming,
and the if (!dev) return; change.
@@ -3108,6 +3108,9 @@ void device_initialize(struct device *dev)
defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
dev->dma_coherent = dma_default_coherent;
+#endif
+#ifdef CONFIG_DMA_DECLARE_COHERENT
+ INIT_LIST_HEAD(&dev->dma_mems);
#endif
swiotlb_dev_init(dev);
}
@@ -648,7 +648,7 @@ struct device_physical_location {
* @dma_parms: A low level driver may set these to teach IOMMU code about
* segment limitations.
* @dma_pools: Dma pools (if dma'ble device).
- * @dma_mem: Internal for coherent mem override.
+ * @dma_mems: Internal for coherent mems.
* @cma_area: Contiguous memory area for dma allocations
* @dma_io_tlb_mem: Software IO TLB allocator. Not for driver use.
* @dma_io_tlb_pools: List of transient swiotlb memory pools.
@@ -749,8 +749,7 @@ struct device {
struct list_head dma_pools; /* dma pools (if dma'ble) */
#ifdef CONFIG_DMA_DECLARE_COHERENT
- struct dma_coherent_mem *dma_mem; /* internal for coherent mem
- override */
+ struct list_head dma_mems; /* Internal for coherent mems */
#endif
#ifdef CONFIG_DMA_CMA
struct cma *cma_area; /* contiguous memory area for dma
@@ -18,15 +18,9 @@ struct dma_coherent_mem {
unsigned long *bitmap;
spinlock_t spinlock;
bool use_dev_dma_pfn_offset;
+ struct list_head node;
};
-static inline struct dma_coherent_mem *dev_get_coherent_memory(struct device *dev)
-{
- if (dev && dev->dma_mem)
- return dev->dma_mem;
- return NULL;
-}
-
static inline dma_addr_t dma_get_device_base(struct device *dev,
struct dma_coherent_mem * mem)
{
@@ -61,6 +55,7 @@ static struct dma_coherent_mem *dma_init_coherent_memory(phys_addr_t phys_addr,
dma_mem->pfn_base = PFN_DOWN(phys_addr);
dma_mem->size = pages;
dma_mem->use_dev_dma_pfn_offset = use_dma_pfn_offset;
+ INIT_LIST_HEAD(&dma_mem->node);
spin_lock_init(&dma_mem->spinlock);
return dma_mem;
@@ -90,10 +85,8 @@ static int dma_assign_coherent_memory(struct device *dev,
if (!dev)
return -ENODEV;
- if (dev->dma_mem)
- return -EBUSY;
+ list_add_tail(&mem->node, &dev->dma_mems);
- dev->dma_mem = mem;
return 0;
}
@@ -132,9 +125,13 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
void dma_release_coherent_memory(struct device *dev)
{
+ struct dma_coherent_mem *mem_tmp, *q;
+
if (dev) {
- _dma_release_coherent_memory(dev->dma_mem);
- dev->dma_mem = NULL;
+ list_for_each_entry_safe(mem_tmp, q, &dev->dma_mems, node) {
+ list_del_init(&mem_tmp->node);
+ _dma_release_coherent_memory(mem_tmp);
+ }
}
}
@@ -187,12 +184,17 @@ static void *__dma_alloc_from_coherent(struct device *dev,
int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size,
dma_addr_t *dma_handle, void **ret)
{
- struct dma_coherent_mem *mem = dev_get_coherent_memory(dev);
+ struct dma_coherent_mem *mem_tmp;
- if (!mem)
+ if (list_empty(&dev->dma_mems))
return 0;
- *ret = __dma_alloc_from_coherent(dev, mem, size, dma_handle);
+ list_for_each_entry(mem_tmp, &dev->dma_mems, node) {
+ *ret = __dma_alloc_from_coherent(dev, mem_tmp, size, dma_handle);
+ if (*ret)
+ break;
+ }
+
return 1;
}
@@ -226,9 +228,16 @@ static int __dma_release_from_coherent(struct dma_coherent_mem *mem,
*/
int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr)
{
- struct dma_coherent_mem *mem = dev_get_coherent_memory(dev);
+ struct dma_coherent_mem *mem_tmp;
+ int ret = 0;
- return __dma_release_from_coherent(mem, order, vaddr);
+ list_for_each_entry(mem_tmp, &dev->dma_mems, node) {
+ ret = __dma_release_from_coherent(mem_tmp, order, vaddr);
+ if (ret == 1)
+ break;
+ }
+
+ return ret;
}
static int __dma_mmap_from_coherent(struct dma_coherent_mem *mem,
@@ -271,9 +280,16 @@ static int __dma_mmap_from_coherent(struct dma_coherent_mem *mem,
int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma,
void *vaddr, size_t size, int *ret)
{
- struct dma_coherent_mem *mem = dev_get_coherent_memory(dev);
+ struct dma_coherent_mem *mem_tmp;
+ int retval = 0;
+
+ list_for_each_entry(mem_tmp, &dev->dma_mems, node) {
+ retval = __dma_mmap_from_coherent(mem_tmp, vma, vaddr, size, ret);
+ if (retval == 1)
+ break;
+ }
- return __dma_mmap_from_coherent(mem, vma, vaddr, size, ret);
+ return retval;
}
#ifdef CONFIG_DMA_GLOBAL_POOL
@@ -351,8 +367,16 @@ static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev)
static void rmem_dma_device_release(struct reserved_mem *rmem,
struct device *dev)
{
- if (dev)
- dev->dma_mem = NULL;
+ struct dma_coherent_mem *mem_tmp, *q;
+
+ if (dev) {
+ list_for_each_entry_safe(mem_tmp, q, &dev->dma_mems, node) {
+ if (mem_tmp == rmem->priv) {
+ list_del_init(&mem_tmp->node);
+ break;
+ }
+ }
+ }
}
static const struct reserved_mem_ops rmem_dma_ops = {