[RFC,10/10] memory-provider: add dmabuf devmem provider

Message ID 20230710223304.1174642-11-almasrymina@google.com
State New
Headers
Series Device Memory TCP |

Commit Message

Mina Almasry July 10, 2023, 10:33 p.m. UTC
  Use Jakub's memory provider PoC API
(https://github.com/kuba-moo/linux/tree/pp-providers)
to implement a dmabuf devmem memory provider. The provider allocates
NET_RX dmabuf pages to the page pool. This abstracts any custom memory
allocation or freeing changes for devmem TCP from drivers using the
page pool.

The memory provider allocates NET_RX pages from the
dmabuf pages provided by the driver. These pages are ZONE_DEVICE pages
with the sg dma_addrs stored in the zone_device_data entry in the page.
The page pool entries in struct page are in a union with the ZONE_DEVICE
entries, and - without special handling - the page pool would
accidentally overwrite the data in the ZONE_DEVICE fields.

To solve this, the memory provider converts the page from a ZONE_DEVICE
page to a ZONE_NORMAL page upon giving it to the page pool, and converts
it back to ZONE_DEVICE page upon getting it back from the page pool.
This is safe to do because the NET_RX pages are dmabuf pages created to
hold the dma_addr in the dma_buf_map_attachment() sg_table entries, and
are only used with code that handles them specifically.

However, since dmabuf pages can now also be page pool pages, we need
to update two places to detect this correctly:

1. is_dma_buf_page() needs to be updated to correctly detect dmabuf
   pages after they've been inserted into the pool.

2. dma_buf_page_to_dma_addr() needs to be updated. For page pool pages,
   the dma_addr exists in page->dma_addr. For non page pool pages, the
   dma_addr exists in page->zone_device_data.

Signed-off-by: Mina Almasry <almasrymina@google.com>
---
 include/linux/dma-buf.h |  29 ++++++++++-
 include/net/page_pool.h |  20 ++++++++
 net/core/page_pool.c    | 104 ++++++++++++++++++++++++++++++++++++----
 3 files changed, 143 insertions(+), 10 deletions(-)
  

Patch

diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 93228a2fec47..896359fa998d 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -692,15 +692,26 @@  static inline bool is_dma_buf_pages_file(struct file *file)
 
 struct page *dma_buf_pages_net_rx_alloc(struct dma_buf_pages *priv);
 
+static inline bool is_dma_buf_page_net_rx(struct page *page)
+{
+	struct dma_buf_pages *priv;	/* owning pool's mp_priv, if any */
+
+	priv = is_page_pool_page(page) ? page->pp->mp_priv : NULL;
+	return priv && priv->pgmap.ops == &dma_buf_pgmap_ops;
+}
+
 static inline bool is_dma_buf_page(struct page *page)
 {
 	return (is_zone_device_page(page) && page->pgmap &&
-		page->pgmap->ops == &dma_buf_pgmap_ops);
+		page->pgmap->ops == &dma_buf_pgmap_ops) ||
+	       is_dma_buf_page_net_rx(page);
 }
 
 static inline dma_addr_t dma_buf_page_to_dma_addr(struct page *page)
 {
-	return (dma_addr_t)page->zone_device_data;
+	return is_dma_buf_page_net_rx(page) ?
+		       (dma_addr_t)page->dma_addr :
+		       (dma_addr_t)page->zone_device_data;
 }
 
 static inline int dma_buf_map_sg(struct device *dev, struct scatterlist *sg,
@@ -718,6 +729,16 @@  static inline int dma_buf_map_sg(struct device *dev, struct scatterlist *sg,
 
 	return nents;
 }
+
+/* Return true if @ptr is a non-NULL struct dma_buf_pages, recognised by its
+ * pgmap using dma_buf_pgmap_ops.
+ */
+static inline bool is_dma_buf_pages_priv(void *ptr)
+{
+	struct dma_buf_pages *priv = ptr;
+
+	return priv && priv->pgmap.ops == &dma_buf_pgmap_ops;
+}
 #else
 static inline bool is_dma_buf_page(struct page *page)
 {
@@ -745,6 +766,10 @@  static inline struct page *dma_buf_pages_net_rx_alloc(struct dma_buf_pages *priv
 	return NULL;
 }
 
+static inline bool is_dma_buf_pages_priv(void *ptr)
+{
+	return false;	/* #else stub: nothing is a dma_buf_pages priv here */
+}
 #endif
 
 
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 7b6668479baf..a57757a13cc8 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -157,6 +157,7 @@  enum pp_memory_provider_type {
 	PP_MP_HUGE_SPLIT, /* 2MB, online page alloc */
 	PP_MP_HUGE, /* 2MB, all memory pre-allocated */
 	PP_MP_HUGE_1G, /* 1G pages, MEP, pre-allocated */
+	PP_MP_DMABUF_DEVMEM, /* dmabuf devmem provider */
 };
 
 struct pp_memory_provider_ops {
@@ -170,6 +171,7 @@  extern const struct pp_memory_provider_ops basic_ops;
 extern const struct pp_memory_provider_ops hugesp_ops;
 extern const struct pp_memory_provider_ops huge_ops;
 extern const struct pp_memory_provider_ops huge_1g_ops;
+extern const struct pp_memory_provider_ops dmabuf_devmem_ops;
 
 struct page_pool {
 	struct page_pool_params p;
@@ -420,4 +422,22 @@  static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid)
 		page_pool_update_nid(pool, new_nid);
 }
 
+/* Return true if @page is a page_pool page: its pp_magic carries
+ * PP_SIGNATURE and page->pp points at a pool.
+ */
+static inline bool is_page_pool_page(struct page *page)
+{
+	/* PP_SIGNATURE is OR'ed into pp_magic at allocation so that bit 0
+	 * (head page of a compound page) and bit 1 (pfmemalloc page) are
+	 * preserved; mask both off before comparing. pfmemalloc pages are
+	 * kept from being recycled by the page_is_pfmemalloc() check in
+	 * __page_pool_put_page().
+	 */
+	if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
+		return false;
+
+	/* The signature alone is not enough; a pool must be attached too. */
+	return page->pp != NULL;
+}
+
 #endif /* _NET_PAGE_POOL_H */
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index df3f431fcff3..e626d4e309c1 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -236,6 +236,9 @@  static int page_pool_init(struct page_pool *pool,
 	case PP_MP_HUGE_1G:
 		pool->mp_ops = &huge_1g_ops;
 		break;
+	case PP_MP_DMABUF_DEVMEM:
+		pool->mp_ops = &dmabuf_devmem_ops;
+		break;
 	default:
 		err = -EINVAL;
 		goto free_ptr_ring;
@@ -975,14 +978,7 @@  bool page_pool_return_skb_page(struct page *page, bool napi_safe)
 
 	page = compound_head(page);
 
-	/* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
-	 * in order to preserve any existing bits, such as bit 0 for the
-	 * head page of compound page and bit 1 for pfmemalloc page, so
-	 * mask those bits for freeing side when doing below checking,
-	 * and page_is_pfmemalloc() is checked in __page_pool_put_page()
-	 * to avoid recycling the pfmemalloc page.
-	 */
-	if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
+	if (!is_page_pool_page(page))
 		return false;
 
 	pp = page->pp;
@@ -1538,3 +1534,95 @@  const struct pp_memory_provider_ops huge_1g_ops = {
 	.alloc_pages		= mp_huge_1g_alloc_pages,
 	.release_page		= mp_huge_1g_release,
 };
+
+/*** "Dmabuf devmem page" ***/
+
+/* Dmabuf devmem memory provider allocates DMA_BUF_PAGES_NET_RX pages which are
+ * backing the dma_buf_map_attachment() from the NIC to the device memory.
+ *
+ * These pages are wrappers around the dma_addr of the sg entries in the
+ * sg_table returned from dma_buf_map_attachment(). They can be passed to the
+ * networking stack, which will generate devmem skbs from them and process them
+ * correctly.
+ */
+static int mp_dmabuf_devmem_init(struct page_pool *pool)
+{
+	/* This provider only works on top of a struct dma_buf_pages, so
+	 * reject any other (or missing) mp_priv.
+	 */
+	return is_dma_buf_pages_priv(pool->mp_priv) ? 0 : -EINVAL;
+}
+
+/* Nothing to tear down: mp_dmabuf_devmem_init() only validates mp_priv and
+ * acquires no resources.
+ */
+static void mp_dmabuf_devmem_destroy(struct page_pool *pool)
+{
+}
+
+/* Allocate one NET_RX dmabuf page and hand it to @pool as a page_pool page.
+ * Returns NULL if the allocation fails. @gfp is unused.
+ */
+static struct page *mp_dmabuf_devmem_alloc_pages(struct page_pool *pool,
+						 gfp_t gfp)
+{
+	struct dma_buf_pages *priv = pool->mp_priv;
+	dma_addr_t dma_addr;
+	struct page *page;
+
+	page = dma_buf_pages_net_rx_alloc(priv);
+	if (!page)
+		return NULL;
+
+	/* A page from another pgmap should be impossible; fail gracefully. */
+	if (WARN_ON_ONCE(page->pgmap != &priv->pgmap)) {
+		put_page(page);
+		return NULL;
+	}
+
+	/* zone_device_data (holding the dma_addr) shares storage with the
+	 * page_pool fields, so save it before converting the ZONE_DEVICE page
+	 * into a page_pool page. Only dmabuf-aware code handles these pages.
+	 */
+	dma_addr = (dma_addr_t)page->zone_device_data;
+
+	set_page_zone(page, ZONE_NORMAL);
+	page->pp_magic = 0;
+	page_pool_set_pp_info(pool, page);
+
+	/* Use the accessor so page_pool_get_dma_addr() callers see it too. */
+	page_pool_set_dma_addr(page, dma_addr);
+
+	return page;
+}
+
+/* Take @page back from @pool: strip the page_pool state, restore the
+ * ZONE_DEVICE fields and drop a page reference. Always returns false.
+ */
+static bool mp_dmabuf_devmem_release_page(struct page_pool *pool,
+					  struct page *page)
+{
+	struct dma_buf_pages *devmem = pool->mp_priv;
+	unsigned long saved_addr = page->dma_addr;
+
+	page_pool_clear_pp_info(page);
+
+	/* Turn the page back into a ZONE_DEVICE page so that it is freed
+	 * through page->pgmap->ops->page_free().
+	 */
+	set_page_zone(page, ZONE_DEVICE);
+	page->pgmap = &devmem->pgmap;
+	page->zone_device_data = (void *)saved_addr;
+	put_page(page);
+
+	/* false: the page pool must not touch the page from here on. */
+	return false;
+}
+
+const struct pp_memory_provider_ops dmabuf_devmem_ops = {
+	.init			= mp_dmabuf_devmem_init,
+	.destroy		= mp_dmabuf_devmem_destroy,	/* no-op */
+	.alloc_pages		= mp_dmabuf_devmem_alloc_pages,
+	.release_page		= mp_dmabuf_devmem_release_page,
+};
+EXPORT_SYMBOL(dmabuf_devmem_ops);