[RFC,v3,07/12] page-pool: device memory support
Commit Message
Overload the LSB of struct page* to indicate that it's a page_pool_iov.
Refactor mm calls on struct page* into helpers, and add page_pool_iov
handling on those helpers. Modify callers of these mm APIs with calls to
these helpers instead.
In areas where struct page* is dereferenced, add a check for special
handling of page_pool_iov.
Signed-off-by: Mina Almasry <almasrymina@google.com>
---
include/net/page_pool/helpers.h | 74 ++++++++++++++++++++++++++++++++-
net/core/page_pool.c | 63 ++++++++++++++++++++--------
2 files changed, 118 insertions(+), 19 deletions(-)
Comments
On Sun, 2023-11-05 at 18:44 -0800, Mina Almasry wrote:
> Overload the LSB of struct page* to indicate that it's a page_pool_iov.
>
> Refactor mm calls on struct page* into helpers, and add page_pool_iov
> handling on those helpers. Modify callers of these mm APIs with calls to
> these helpers instead.
>
> In areas where struct page* is dereferenced, add a check for special
> handling of page_pool_iov.
>
> Signed-off-by: Mina Almasry <almasrymina@google.com>
>
> ---
> include/net/page_pool/helpers.h | 74 ++++++++++++++++++++++++++++++++-
> net/core/page_pool.c | 63 ++++++++++++++++++++--------
> 2 files changed, 118 insertions(+), 19 deletions(-)
>
> diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
> index b93243c2a640..08f1a2cc70d2 100644
> --- a/include/net/page_pool/helpers.h
> +++ b/include/net/page_pool/helpers.h
> @@ -151,6 +151,64 @@ static inline struct page_pool_iov *page_to_page_pool_iov(struct page *page)
> return NULL;
> }
>
> +static inline int page_pool_page_ref_count(struct page *page)
> +{
> + if (page_is_page_pool_iov(page))
> + return page_pool_iov_refcount(page_to_page_pool_iov(page));
> +
> + return page_ref_count(page);
> +}
> +
> +static inline void page_pool_page_get_many(struct page *page,
> + unsigned int count)
> +{
> + if (page_is_page_pool_iov(page))
> + return page_pool_iov_get_many(page_to_page_pool_iov(page),
> + count);
> +
> + return page_ref_add(page, count);
> +}
> +
> +static inline void page_pool_page_put_many(struct page *page,
> + unsigned int count)
> +{
> + if (page_is_page_pool_iov(page))
> + return page_pool_iov_put_many(page_to_page_pool_iov(page),
> + count);
> +
> + if (count > 1)
> + page_ref_sub(page, count - 1);
> +
> + put_page(page);
> +}
> +
> +static inline bool page_pool_page_is_pfmemalloc(struct page *page)
> +{
> + if (page_is_page_pool_iov(page))
> + return false;
> +
> + return page_is_pfmemalloc(page);
> +}
> +
> +static inline bool page_pool_page_is_pref_nid(struct page *page, int pref_nid)
> +{
> + /* Assume page_pool_iov are on the preferred node without actually
> + * checking...
> + *
> + * This check is only used to check for recycling memory in the page
> + * pool's fast paths. Currently the only implementation of page_pool_iov
> + * is dmabuf device memory. It's a deliberate decision by the user to
> + * bind a certain dmabuf to a certain netdev, and the netdev rx queue
> + * would not be able to reallocate memory from another dmabuf that
> + * exists on the preferred node, so, this check doesn't make much sense
> + * in this case. Assume all page_pool_iovs can be recycled for now.
> + */
> + if (page_is_page_pool_iov(page))
> + return true;
> +
> + return page_to_nid(page) == pref_nid;
> +}
> +
> /**
> * page_pool_dev_alloc_pages() - allocate a page.
> * @pool: pool from which to allocate
> @@ -301,6 +359,9 @@ static inline long page_pool_defrag_page(struct page *page, long nr)
> {
> long ret;
>
> + if (page_is_page_pool_iov(page))
> + return -EINVAL;
> +
> /* If nr == pp_frag_count then we have cleared all remaining
> * references to the page:
> * 1. 'n == 1': no need to actually overwrite it.
> @@ -431,7 +492,12 @@ static inline void page_pool_free_va(struct page_pool *pool, void *va,
> */
> static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
> {
> - dma_addr_t ret = page->dma_addr;
> + dma_addr_t ret;
> +
> + if (page_is_page_pool_iov(page))
> + return page_pool_iov_dma_addr(page_to_page_pool_iov(page));
Should the above conditional be guarded by the page_pool_mem_providers
static key? This looks like a fast path. Same question for the refcount
helper above.
Minor nit: possibly cache the result of 'page_is_page_pool_iov(page)' to
make the code more readable.
> +
> + ret = page->dma_addr;
>
> if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA)
> ret <<= PAGE_SHIFT;
> @@ -441,6 +507,12 @@ static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
>
> static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
> {
> + /* page_pool_iovs are mapped and their dma-addr can't be modified. */
> + if (page_is_page_pool_iov(page)) {
> + DEBUG_NET_WARN_ON_ONCE(true);
> + return false;
> + }
Quickly skimming over the page_pool code, it looks like
page_pool_set_dma_addr() usage is guarded by the PP_FLAG_DMA_MAP page
pool flag. Could the device mem provider enforce that this flag is
cleared on the page pool?
> +
> if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) {
> page->dma_addr = addr >> PAGE_SHIFT;
>
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 138ddea0b28f..d211996d423b 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -317,7 +317,7 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
> if (unlikely(!page))
> break;
>
> - if (likely(page_to_nid(page) == pref_nid)) {
> + if (likely(page_pool_page_is_pref_nid(page, pref_nid))) {
> pool->alloc.cache[pool->alloc.count++] = page;
> } else {
> /* NUMA mismatch;
> @@ -362,7 +362,15 @@ static void page_pool_dma_sync_for_device(struct page_pool *pool,
> struct page *page,
> unsigned int dma_sync_size)
> {
> - dma_addr_t dma_addr = page_pool_get_dma_addr(page);
> + dma_addr_t dma_addr;
> +
> + /* page_pool_iov memory provider do not support PP_FLAG_DMA_SYNC_DEV */
> + if (page_is_page_pool_iov(page)) {
> + DEBUG_NET_WARN_ON_ONCE(true);
> + return;
> + }
Similar to the above point, mutatis mutandis.
> +
> + dma_addr = page_pool_get_dma_addr(page);
>
> dma_sync_size = min(dma_sync_size, pool->p.max_len);
> dma_sync_single_range_for_device(pool->p.dev, dma_addr,
> @@ -374,6 +382,12 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
> {
> dma_addr_t dma;
>
> + if (page_is_page_pool_iov(page)) {
> + /* page_pool_iovs are already mapped */
> + DEBUG_NET_WARN_ON_ONCE(true);
> + return true;
> + }
Ditto.
Cheers,
Paolo
@@ -151,6 +151,64 @@ static inline struct page_pool_iov *page_to_page_pool_iov(struct page *page)
return NULL;
}
+static inline int page_pool_page_ref_count(struct page *page)
+{
+ if (page_is_page_pool_iov(page))
+ return page_pool_iov_refcount(page_to_page_pool_iov(page));
+
+ return page_ref_count(page);
+}
+
+static inline void page_pool_page_get_many(struct page *page,
+ unsigned int count)
+{
+ if (page_is_page_pool_iov(page))
+ return page_pool_iov_get_many(page_to_page_pool_iov(page),
+ count);
+
+ return page_ref_add(page, count);
+}
+
+static inline void page_pool_page_put_many(struct page *page,
+ unsigned int count)
+{
+ if (page_is_page_pool_iov(page))
+ return page_pool_iov_put_many(page_to_page_pool_iov(page),
+ count);
+
+ if (count > 1)
+ page_ref_sub(page, count - 1);
+
+ put_page(page);
+}
+
+static inline bool page_pool_page_is_pfmemalloc(struct page *page)
+{
+ if (page_is_page_pool_iov(page))
+ return false;
+
+ return page_is_pfmemalloc(page);
+}
+
+static inline bool page_pool_page_is_pref_nid(struct page *page, int pref_nid)
+{
+ /* Assume page_pool_iov are on the preferred node without actually
+ * checking...
+ *
+ * This check is only used to check for recycling memory in the page
+ * pool's fast paths. Currently the only implementation of page_pool_iov
+ * is dmabuf device memory. It's a deliberate decision by the user to
+ * bind a certain dmabuf to a certain netdev, and the netdev rx queue
+ * would not be able to reallocate memory from another dmabuf that
+ * exists on the preferred node, so, this check doesn't make much sense
+ * in this case. Assume all page_pool_iovs can be recycled for now.
+ */
+ if (page_is_page_pool_iov(page))
+ return true;
+
+ return page_to_nid(page) == pref_nid;
+}
+
/**
* page_pool_dev_alloc_pages() - allocate a page.
* @pool: pool from which to allocate
@@ -301,6 +359,9 @@ static inline long page_pool_defrag_page(struct page *page, long nr)
{
long ret;
+ if (page_is_page_pool_iov(page))
+ return -EINVAL;
+
/* If nr == pp_frag_count then we have cleared all remaining
* references to the page:
* 1. 'n == 1': no need to actually overwrite it.
@@ -431,7 +492,12 @@ static inline void page_pool_free_va(struct page_pool *pool, void *va,
*/
static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
{
- dma_addr_t ret = page->dma_addr;
+ dma_addr_t ret;
+
+ if (page_is_page_pool_iov(page))
+ return page_pool_iov_dma_addr(page_to_page_pool_iov(page));
+
+ ret = page->dma_addr;
if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA)
ret <<= PAGE_SHIFT;
@@ -441,6 +507,12 @@ static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
{
+ /* page_pool_iovs are mapped and their dma-addr can't be modified. */
+ if (page_is_page_pool_iov(page)) {
+ DEBUG_NET_WARN_ON_ONCE(true);
+ return false;
+ }
+
if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) {
page->dma_addr = addr >> PAGE_SHIFT;
@@ -317,7 +317,7 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
if (unlikely(!page))
break;
- if (likely(page_to_nid(page) == pref_nid)) {
+ if (likely(page_pool_page_is_pref_nid(page, pref_nid))) {
pool->alloc.cache[pool->alloc.count++] = page;
} else {
/* NUMA mismatch;
@@ -362,7 +362,15 @@ static void page_pool_dma_sync_for_device(struct page_pool *pool,
struct page *page,
unsigned int dma_sync_size)
{
- dma_addr_t dma_addr = page_pool_get_dma_addr(page);
+ dma_addr_t dma_addr;
+
+ /* page_pool_iov memory provider do not support PP_FLAG_DMA_SYNC_DEV */
+ if (page_is_page_pool_iov(page)) {
+ DEBUG_NET_WARN_ON_ONCE(true);
+ return;
+ }
+
+ dma_addr = page_pool_get_dma_addr(page);
dma_sync_size = min(dma_sync_size, pool->p.max_len);
dma_sync_single_range_for_device(pool->p.dev, dma_addr,
@@ -374,6 +382,12 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
{
dma_addr_t dma;
+ if (page_is_page_pool_iov(page)) {
+ /* page_pool_iovs are already mapped */
+ DEBUG_NET_WARN_ON_ONCE(true);
+ return true;
+ }
+
/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
* since dma_addr_t can be either 32 or 64 bits and does not always fit
* into page private data (i.e 32bit cpu with 64bit DMA caps)
@@ -405,22 +419,33 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
static void page_pool_set_pp_info(struct page_pool *pool,
struct page *page)
{
- page->pp = pool;
- page->pp_magic |= PP_SIGNATURE;
-
- /* Ensuring all pages have been split into one fragment initially:
- * page_pool_set_pp_info() is only called once for every page when it
- * is allocated from the page allocator and page_pool_fragment_page()
- * is dirtying the same cache line as the page->pp_magic above, so
- * the overhead is negligible.
- */
- page_pool_fragment_page(page, 1);
+ if (!page_is_page_pool_iov(page)) {
+ page->pp = pool;
+ page->pp_magic |= PP_SIGNATURE;
+
+ /* Ensuring all pages have been split into one fragment
+ * initially:
+ * page_pool_set_pp_info() is only called once for every page
+ * when it is allocated from the page allocator and
+ * page_pool_fragment_page() is dirtying the same cache line as
+ * the page->pp_magic above, so the overhead is negligible.
+ */
+ page_pool_fragment_page(page, 1);
+ } else {
+ page_to_page_pool_iov(page)->pp = pool;
+ }
+
if (pool->p.init_callback)
pool->p.init_callback(page, pool->p.init_arg);
}
static void page_pool_clear_pp_info(struct page *page)
{
+ if (page_is_page_pool_iov(page)) {
+ page_to_page_pool_iov(page)->pp = NULL;
+ return;
+ }
+
page->pp_magic = 0;
page->pp = NULL;
}
@@ -630,7 +655,7 @@ static bool page_pool_recycle_in_cache(struct page *page,
return false;
}
- /* Caller MUST have verified/know (page_ref_count(page) == 1) */
+ /* Caller MUST have verified/know (page_pool_page_ref_count(page) == 1) */
pool->alloc.cache[pool->alloc.count++] = page;
recycle_stat_inc(pool, cached);
return true;
@@ -655,9 +680,10 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
* refcnt == 1 means page_pool owns page, and can recycle it.
*
* page is NOT reusable when allocated when system is under
- * some pressure. (page_is_pfmemalloc)
+ * some pressure. (page_pool_page_is_pfmemalloc)
*/
- if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {
+ if (likely(page_pool_page_ref_count(page) == 1 &&
+ !page_pool_page_is_pfmemalloc(page))) {
/* Read barrier done in page_ref_count / READ_ONCE */
if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
@@ -772,7 +798,8 @@ static struct page *page_pool_drain_frag(struct page_pool *pool,
if (likely(page_pool_defrag_page(page, drain_count)))
return NULL;
- if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
+ if (page_pool_page_ref_count(page) == 1 &&
+ !page_pool_page_is_pfmemalloc(page)) {
if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
page_pool_dma_sync_for_device(pool, page, -1);
@@ -848,9 +875,9 @@ static void page_pool_empty_ring(struct page_pool *pool)
/* Empty recycle ring */
while ((page = ptr_ring_consume_bh(&pool->ring))) {
/* Verify the refcnt invariant of cached pages */
- if (!(page_ref_count(page) == 1))
+ if (!(page_pool_page_ref_count(page) == 1))
pr_crit("%s() page_pool refcnt %d violation\n",
- __func__, page_ref_count(page));
+ __func__, page_pool_page_ref_count(page));
page_pool_return_page(pool, page);
}