[net-next,v2,4/5] net: lan966x: Add support for XDP_TX

Message ID 20221115214456.1456856-5-horatiu.vultur@microchip.com
State New
Headers
Series net: lan966x: Extend xdp support |

Commit Message

Horatiu Vultur Nov. 15, 2022, 9:44 p.m. UTC
  Extend lan966x XDP support with the action XDP_TX. In this case when the
received buffer needs to execute XDP_TX, the buffer will be moved to the
TX buffers. So a new RX buffer will be allocated.
When the TX finishes with the frame, it will completely release this
buffer.

Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
---
 .../ethernet/microchip/lan966x/lan966x_fdma.c | 78 +++++++++++++++++--
 .../ethernet/microchip/lan966x/lan966x_main.c |  4 +-
 .../ethernet/microchip/lan966x/lan966x_main.h |  8 ++
 .../ethernet/microchip/lan966x/lan966x_xdp.c  |  8 ++
 4 files changed, 91 insertions(+), 7 deletions(-)
  

Comments

Alexander Lobakin Nov. 16, 2022, 3:34 p.m. UTC | #1
From: Horatiu Vultur <horatiu.vultur@microchip.com>
Date: Tue, 15 Nov 2022 22:44:55 +0100

Extend lan966x XDP support with the action XDP_TX. In this case when the
received buffer needs to execute XDP_TX, the buffer will be moved to the
TX buffers. So a new RX buffer will be allocated.
When the TX finish with the frame, it would release completely this
buffer.

Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
---
 .../ethernet/microchip/lan966x/lan966x_fdma.c | 78 +++++++++++++++++--
 .../ethernet/microchip/lan966x/lan966x_main.c |  4 +-
 .../ethernet/microchip/lan966x/lan966x_main.h |  8 ++
 .../ethernet/microchip/lan966x/lan966x_xdp.c  |  8 ++
 4 files changed, 91 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c b/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c
index 384ed34197d58..c2e56233a8da5 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c
@@ -394,13 +394,21 @@ static void lan966x_fdma_tx_clear_buf(struct lan966x *lan966x, int weight)
 		dcb_buf->dev->stats.tx_bytes += dcb_buf->len;
 
 		dcb_buf->used = false;
-		dma_unmap_single(lan966x->dev,
-				 dcb_buf->dma_addr,
-				 dcb_buf->len,
-				 DMA_TO_DEVICE);
-		if (!dcb_buf->ptp)
+		if (dcb_buf->skb)
+			dma_unmap_single(lan966x->dev,
+					 dcb_buf->dma_addr,
+					 dcb_buf->len,
+					 DMA_TO_DEVICE);
+
+		if (dcb_buf->skb && !dcb_buf->ptp)
 			dev_kfree_skb_any(dcb_buf->skb);
 
+		if (dcb_buf->page) {
+			page_pool_release_page(lan966x->rx.page_pool,
+					       dcb_buf->page);
+			put_page(dcb_buf->page);
+		}

Hmm, that's not really correct.

For skb, you need to unmap + free, true (BPW, just use
napi_consume_skb()).
For %XDP_TX, as you use Page Pool, you don't need to unmap, but you
need to do xdp_return_frame{,_bulk}. Plus, as Tx is being done here
directly from an Rx NAPI polling cycle, xdp_return_frame_rx_napi()
is usually used. Anyway, each of xdp_return_frame()'s variants will
call page_pool_put_full_page() for you.
For %XDP_REDIRECT, as you don't know the source of the XDP frame,
you need to unmap it (as it was previously mapped in
::ndo_xdp_xmit()), plus call xdp_return_frame{,_bulk} to free the
XDP frame. Note that _rx_napi() variant is not applicable here.

That description might be confusing, so you can take a look at the
already existing code[0] to get the idea. I think this piece shows
the expected logics rather well.

+
 		clear = true;
 	}
 
@@ -532,6 +540,9 @@ static int lan966x_fdma_napi_poll(struct napi_struct *napi, int weight)
 			lan966x_fdma_rx_free_page(rx);
 			lan966x_fdma_rx_advance_dcb(rx);
 			goto allocate_new;
+		case FDMA_TX:
+			lan966x_fdma_rx_advance_dcb(rx);
+			continue;
 		case FDMA_DROP:
 			lan966x_fdma_rx_free_page(rx);
 			lan966x_fdma_rx_advance_dcb(rx);
@@ -653,6 +664,62 @@ static void lan966x_fdma_tx_start(struct lan966x_tx *tx, int next_to_use)
 	tx->last_in_use = next_to_use;
 }
 
+int lan966x_fdma_xmit_xdpf(struct lan966x_port *port,
+			   struct xdp_frame *xdpf,
+			   struct page *page)
+{
+	struct lan966x *lan966x = port->lan966x;
+	struct lan966x_tx_dcb_buf *next_dcb_buf;
+	struct lan966x_tx *tx = &lan966x->tx;
+	dma_addr_t dma_addr;
+	int next_to_use;
+	__be32 *ifh;
+	int ret = 0;
+
+	spin_lock(&lan966x->tx_lock);
+
+	/* Get next index */
+	next_to_use = lan966x_fdma_get_next_dcb(tx);
+	if (next_to_use < 0) {
+		netif_stop_queue(port->dev);
+		ret = NETDEV_TX_BUSY;
+		goto out;
+	}
+
+	/* Generate new IFH */
+	ifh = page_address(page) + XDP_PACKET_HEADROOM;
+	memset(ifh, 0x0, sizeof(__be32) * IFH_LEN);
+	lan966x_ifh_set_bypass(ifh, 1);
+	lan966x_ifh_set_port(ifh, BIT_ULL(port->chip_port));
+
+	dma_addr = page_pool_get_dma_addr(page);
+	dma_sync_single_for_device(lan966x->dev, dma_addr + XDP_PACKET_HEADROOM,
+				   xdpf->len + IFH_LEN_BYTES,
+				   DMA_TO_DEVICE);

Also not correct. This page was mapped with %DMA_FROM_DEVICE in the
Rx code, now you sync it for the opposite.
Most drivers in case of XDP enabled create Page Pools with ::dma_dir
set to %DMA_BIDIRECTIONAL. Now you would need only to sync it here
with the same direction (bidir) and that's it.

+
+	/* Setup next dcb */
+	lan966x_fdma_tx_setup_dcb(tx, next_to_use, xdpf->len + IFH_LEN_BYTES,
+				  dma_addr + XDP_PACKET_HEADROOM);
+
+	/* Fill up the buffer */
+	next_dcb_buf = &tx->dcbs_buf[next_to_use];
+	next_dcb_buf->skb = NULL;
+	next_dcb_buf->page = page;
+	next_dcb_buf->len = xdpf->len + IFH_LEN_BYTES;
+	next_dcb_buf->dma_addr = dma_addr;
+	next_dcb_buf->used = true;
+	next_dcb_buf->ptp = false;
+	next_dcb_buf->dev = port->dev;
+
+	/* Start the transmission */
+	lan966x_fdma_tx_start(tx, next_to_use);
+
+out:
+	spin_unlock(&lan966x->tx_lock);
+
+	return ret;
+}
+
 int lan966x_fdma_xmit(struct sk_buff *skb, __be32 *ifh, struct net_device *dev)
 {
 	struct lan966x_port *port = netdev_priv(dev);
@@ -709,6 +776,7 @@ int lan966x_fdma_xmit(struct sk_buff *skb, __be32 *ifh, struct net_device *dev)
 	/* Fill up the buffer */
 	next_dcb_buf = &tx->dcbs_buf[next_to_use];
 	next_dcb_buf->skb = skb;
+	next_dcb_buf->page = NULL;
 	next_dcb_buf->len = skb->len;
 	next_dcb_buf->dma_addr = dma_addr;
 	next_dcb_buf->used = true;

[...]
  
Horatiu Vultur Nov. 16, 2022, 8:55 p.m. UTC | #2
The 11/16/2022 16:34, Alexander Lobakin wrote:
> 
> From: Horatiu Vultur <horatiu.vultur@microchip.com>
> Date: Tue, 15 Nov 2022 22:44:55 +0100

Hi Olek,

> 
> Extend lan966x XDP support with the action XDP_TX. In this case when the
> received buffer needs to execute XDP_TX, the buffer will be moved to the
> TX buffers. So a new RX buffer will be allocated.
> When the TX finish with the frame, it would release completely this
> buffer.
> 
> Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
> ---
>  .../ethernet/microchip/lan966x/lan966x_fdma.c | 78 +++++++++++++++++--
>  .../ethernet/microchip/lan966x/lan966x_main.c |  4 +-
>  .../ethernet/microchip/lan966x/lan966x_main.h |  8 ++
>  .../ethernet/microchip/lan966x/lan966x_xdp.c  |  8 ++
>  4 files changed, 91 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c b/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c
> index 384ed34197d58..c2e56233a8da5 100644
> --- a/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c
> +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c
> @@ -394,13 +394,21 @@ static void lan966x_fdma_tx_clear_buf(struct lan966x *lan966x, int weight)
>                 dcb_buf->dev->stats.tx_bytes += dcb_buf->len;
> 
>                 dcb_buf->used = false;
> -               dma_unmap_single(lan966x->dev,
> -                                dcb_buf->dma_addr,
> -                                dcb_buf->len,
> -                                DMA_TO_DEVICE);
> -               if (!dcb_buf->ptp)
> +               if (dcb_buf->skb)
> +                       dma_unmap_single(lan966x->dev,
> +                                        dcb_buf->dma_addr,
> +                                        dcb_buf->len,
> +                                        DMA_TO_DEVICE);
> +
> +               if (dcb_buf->skb && !dcb_buf->ptp)
>                         dev_kfree_skb_any(dcb_buf->skb);
> 
> +               if (dcb_buf->page) {
> +                       page_pool_release_page(lan966x->rx.page_pool,
> +                                              dcb_buf->page);
> +                       put_page(dcb_buf->page);
> +               }
> 
> Hmm, that's not really correct.
> 
> For skb, you need to unmap + free, true (BPW, just use
> napi_consume_skb()).

What does BPW stand for?
Yes, I can use napi_consume_skb instead of dev_kfree_skb_any();

> For %XDP_TX, as you use Page Pool, you don't need to unmap, but you
> need to do xdp_return_frame{,_bulk}. Plus, as Tx is being done here
> directly from an Rx NAPI polling cycle, xdp_return_frame_rx_napi()
> is usually used. Anyway, each of xdp_return_frame()'s variants will
> call page_pool_put_full_page() for you.

If I understand correctly this part that you describe, the page will
be added back in the page_pool cache. While in my case, I am giving
back the page to the page allocator. In this way the page_pool needs
to allocate more pages every time when the action XDP_TX is happening.

BTW, this shows that there is a missing xdp_rxq_info_reg_mem_model call,
because when calling xdp_return_frame_rx_napi, the frame was not going
to the page_pool but was simply freed because xdp_mem_info was the wrong
type.

> For %XDP_REDIRECT, as you don't know the source of the XDP frame,

Why I don't know the source?
Will it not be from an RX page that is allocated by Page Pool?

> you need to unmap it (as it was previously mapped in
> ::ndo_xdp_xmit()), plus call xdp_return_frame{,_bulk} to free the
> XDP frame. Note that _rx_napi() variant is not applicable here.
> 
> That description might be confusing, so you can take a look at the
> already existing code[0] to get the idea. I think this piece shows
> the expected logics rather well.

I think you forgot to write the link to the code.
I looked also at different drivers but I didn't figure it out why the
frame needed to be mapped and where is happening that.

> 
> +
>                 clear = true;
>         }
> 
> @@ -532,6 +540,9 @@ static int lan966x_fdma_napi_poll(struct napi_struct *napi, int weight)
>                         lan966x_fdma_rx_free_page(rx);
>                         lan966x_fdma_rx_advance_dcb(rx);
>                         goto allocate_new;
> +               case FDMA_TX:
> +                       lan966x_fdma_rx_advance_dcb(rx);
> +                       continue;
>                 case FDMA_DROP:
>                         lan966x_fdma_rx_free_page(rx);
>                         lan966x_fdma_rx_advance_dcb(rx);
> @@ -653,6 +664,62 @@ static void lan966x_fdma_tx_start(struct lan966x_tx *tx, int next_to_use)
>         tx->last_in_use = next_to_use;
>  }
> 
> +int lan966x_fdma_xmit_xdpf(struct lan966x_port *port,
> +                          struct xdp_frame *xdpf,
> +                          struct page *page)
> +{
> +       struct lan966x *lan966x = port->lan966x;
> +       struct lan966x_tx_dcb_buf *next_dcb_buf;
> +       struct lan966x_tx *tx = &lan966x->tx;
> +       dma_addr_t dma_addr;
> +       int next_to_use;
> +       __be32 *ifh;
> +       int ret = 0;
> +
> +       spin_lock(&lan966x->tx_lock);
> +
> +       /* Get next index */
> +       next_to_use = lan966x_fdma_get_next_dcb(tx);
> +       if (next_to_use < 0) {
> +               netif_stop_queue(port->dev);
> +               ret = NETDEV_TX_BUSY;
> +               goto out;
> +       }
> +
> +       /* Generate new IFH */
> +       ifh = page_address(page) + XDP_PACKET_HEADROOM;
> +       memset(ifh, 0x0, sizeof(__be32) * IFH_LEN);
> +       lan966x_ifh_set_bypass(ifh, 1);
> +       lan966x_ifh_set_port(ifh, BIT_ULL(port->chip_port));
> +
> +       dma_addr = page_pool_get_dma_addr(page);
> +       dma_sync_single_for_device(lan966x->dev, dma_addr + XDP_PACKET_HEADROOM,
> +                                  xdpf->len + IFH_LEN_BYTES,
> +                                  DMA_TO_DEVICE);
> 
> Also not correct. This page was mapped with %DMA_FROM_DEVICE in the
> Rx code, now you sync it for the opposite.
> Most drivers in case of XDP enabled create Page Pools with ::dma_dir
> set to %DMA_BIDIRECTIONAL. Now you would need only to sync it here
> with the same direction (bidir) and that's it.

That is a really good catch!
I was wondering why the things were working when I tested this. Because
definitely, I can see the right behaviour.

> 
> +
> +       /* Setup next dcb */
> +       lan966x_fdma_tx_setup_dcb(tx, next_to_use, xdpf->len + IFH_LEN_BYTES,
> +                                 dma_addr + XDP_PACKET_HEADROOM);
> +
> +       /* Fill up the buffer */
> +       next_dcb_buf = &tx->dcbs_buf[next_to_use];
> +       next_dcb_buf->skb = NULL;
> +       next_dcb_buf->page = page;
> +       next_dcb_buf->len = xdpf->len + IFH_LEN_BYTES;
> +       next_dcb_buf->dma_addr = dma_addr;
> +       next_dcb_buf->used = true;
> +       next_dcb_buf->ptp = false;
> +       next_dcb_buf->dev = port->dev;
> +
> +       /* Start the transmission */
> +       lan966x_fdma_tx_start(tx, next_to_use);
> +
> +out:
> +       spin_unlock(&lan966x->tx_lock);
> +
> +       return ret;
> +}
> +
>  int lan966x_fdma_xmit(struct sk_buff *skb, __be32 *ifh, struct net_device *dev)
>  {
>         struct lan966x_port *port = netdev_priv(dev);
> @@ -709,6 +776,7 @@ int lan966x_fdma_xmit(struct sk_buff *skb, __be32 *ifh, struct net_device *dev)
>         /* Fill up the buffer */
>         next_dcb_buf = &tx->dcbs_buf[next_to_use];
>         next_dcb_buf->skb = skb;
> +       next_dcb_buf->page = NULL;
>         next_dcb_buf->len = skb->len;
>         next_dcb_buf->dma_addr = dma_addr;
>         next_dcb_buf->used = true;
> 
> [...]
> 
> --
> 2.38.0
> 
> Thanks,
> Olek
  
Alexander Lobakin Nov. 17, 2022, 3:31 p.m. UTC | #3
From: Horatiu Vultur <horatiu.vultur@microchip.com>
Date: Wed, 16 Nov 2022 21:55:57 +0100

> The 11/16/2022 16:34, Alexander Lobakin wrote:
> > 
> > From: Horatiu Vultur <horatiu.vultur@microchip.com>
> > Date: Tue, 15 Nov 2022 22:44:55 +0100
> 
> Hi Olek,

Hi!

> 
> > 
> > Extend lan966x XDP support with the action XDP_TX. In this case when the
> > received buffer needs to execute XDP_TX, the buffer will be moved to the
> > TX buffers. So a new RX buffer will be allocated.
> > When the TX finish with the frame, it would release completely this
> > buffer.
> > 
> > Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
> > ---
> >  .../ethernet/microchip/lan966x/lan966x_fdma.c | 78 +++++++++++++++++--
> >  .../ethernet/microchip/lan966x/lan966x_main.c |  4 +-
> >  .../ethernet/microchip/lan966x/lan966x_main.h |  8 ++
> >  .../ethernet/microchip/lan966x/lan966x_xdp.c  |  8 ++
> >  4 files changed, 91 insertions(+), 7 deletions(-)
> > 
> > diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c b/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c
> > index 384ed34197d58..c2e56233a8da5 100644
> > --- a/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c
> > +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c
> > @@ -394,13 +394,21 @@ static void lan966x_fdma_tx_clear_buf(struct lan966x *lan966x, int weight)
> >                 dcb_buf->dev->stats.tx_bytes += dcb_buf->len;
> > 
> >                 dcb_buf->used = false;
> > -               dma_unmap_single(lan966x->dev,
> > -                                dcb_buf->dma_addr,
> > -                                dcb_buf->len,
> > -                                DMA_TO_DEVICE);
> > -               if (!dcb_buf->ptp)
> > +               if (dcb_buf->skb)
> > +                       dma_unmap_single(lan966x->dev,
> > +                                        dcb_buf->dma_addr,
> > +                                        dcb_buf->len,
> > +                                        DMA_TO_DEVICE);
> > +
> > +               if (dcb_buf->skb && !dcb_buf->ptp)
> >                         dev_kfree_skb_any(dcb_buf->skb);
> > 
> > +               if (dcb_buf->page) {
> > +                       page_pool_release_page(lan966x->rx.page_pool,
> > +                                              dcb_buf->page);
> > +                       put_page(dcb_buf->page);
> > +               }
> > 
> > Hmm, that's not really correct.
> > 
> > For skb, you need to unmap + free, true (BPW, just use
> > napi_consume_skb()).
> 
> What does BPW stand for?

Sorry, it was a typo <O> I meant BTW / "by the word" (or "by the
way").

> Yes, I can use napi_consume_skb instead of dev_kfree_skb_any();
> 
> > For %XDP_TX, as you use Page Pool, you don't need to unmap, but you
> > need to do xdp_return_frame{,_bulk}. Plus, as Tx is being done here
> > directly from an Rx NAPI polling cycle, xdp_return_frame_rx_napi()
> > is usually used. Anyway, each of xdp_return_frame()'s variants will
> > call page_pool_put_full_page() for you.
> 
> If I understand correctly this part that you describe, the page will
> be added back in the page_pool cache. While in my case, I am giving
> back the page to the page allocator. In this way the page_pool needs
> to allocate more pages every time when the action XDP_TX is happening.
> 
> BTW, this shows that there is a missing xdp_rxq_info_reg_mem_model call,
> because when calling xdp_return_frame_rx_napi, the frame was not going
> to page_pool but the was simply freed because xdp_mem_info was the wrong
> type.

Correct!

> 
> > For %XDP_REDIRECT, as you don't know the source of the XDP frame,
> 
> Why I don't know the source?
> Will it not be from an RX page that is allocated by Page Pool?

Imagine some NIC which does not use Page Pool, for example, it does
its own page allocation / splitting / recycling techniques, gets
%XDP_REDIRECT when running XDP prog on Rx. devmap says it must
redirect the frame to your NIC.
Then, your ::ndo_xdp_xmit() will be run on a frame/page not
belonging to any Page Pool.
The example can be any of Intel drivers (there are plans to switch
at least i40e and ice to Page Pool, but they're always deeply in
the backlogs (clownface)).

> 
> > you need to unmap it (as it was previously mapped in
> > ::ndo_xdp_xmit()), plus call xdp_return_frame{,_bulk} to free the
> > XDP frame. Note that _rx_napi() variant is not applicable here.
> > 
> > That description might be confusing, so you can take a look at the
> > already existing code[0] to get the idea. I think this piece shows
> > the expected logics rather well.
> 
> I think you forgot to write the link to the code.
> I looked also at different drivers but I didn't figure it out why the
> frame needed to be mapped and where is happening that.

Ooof, really. Pls look at the end of this reply :D
On ::ndo_xdp_xmit(), as I explained above, you can receive a frame
from any driver or BPF core code (such as cpumap), and BPF prog
there could be run on buffer of any kind: Page Pool page, just a
page, a kmalloc() chunk and so on.

So, in the code[0], you can see the following set of operations:

* DMA unmap in all cases excluding frame coming from %XDP_TX (then
  it was only synced);
* updating statistics and freeing skb for skb cases;
* xdp_return_frame_rx_napi() for %XDP_TX cases;
* xdp_return_frame_bulk() for ::ndo_xdp_xmit() cases.

> 
> > 
> > +
> >                 clear = true;
> >         }
> > 
> > @@ -532,6 +540,9 @@ static int lan966x_fdma_napi_poll(struct napi_struct *napi, int weight)
> >                         lan966x_fdma_rx_free_page(rx);
> >                         lan966x_fdma_rx_advance_dcb(rx);
> >                         goto allocate_new;
> > +               case FDMA_TX:
> > +                       lan966x_fdma_rx_advance_dcb(rx);
> > +                       continue;
> >                 case FDMA_DROP:
> >                         lan966x_fdma_rx_free_page(rx);
> >                         lan966x_fdma_rx_advance_dcb(rx);
> > @@ -653,6 +664,62 @@ static void lan966x_fdma_tx_start(struct lan966x_tx *tx, int next_to_use)
> >         tx->last_in_use = next_to_use;
> >  }
> > 
> > +int lan966x_fdma_xmit_xdpf(struct lan966x_port *port,
> > +                          struct xdp_frame *xdpf,
> > +                          struct page *page)
> > +{
> > +       struct lan966x *lan966x = port->lan966x;
> > +       struct lan966x_tx_dcb_buf *next_dcb_buf;
> > +       struct lan966x_tx *tx = &lan966x->tx;
> > +       dma_addr_t dma_addr;
> > +       int next_to_use;
> > +       __be32 *ifh;
> > +       int ret = 0;
> > +
> > +       spin_lock(&lan966x->tx_lock);
> > +
> > +       /* Get next index */
> > +       next_to_use = lan966x_fdma_get_next_dcb(tx);
> > +       if (next_to_use < 0) {
> > +               netif_stop_queue(port->dev);
> > +               ret = NETDEV_TX_BUSY;
> > +               goto out;
> > +       }
> > +
> > +       /* Generate new IFH */
> > +       ifh = page_address(page) + XDP_PACKET_HEADROOM;
> > +       memset(ifh, 0x0, sizeof(__be32) * IFH_LEN);
> > +       lan966x_ifh_set_bypass(ifh, 1);
> > +       lan966x_ifh_set_port(ifh, BIT_ULL(port->chip_port));
> > +
> > +       dma_addr = page_pool_get_dma_addr(page);
> > +       dma_sync_single_for_device(lan966x->dev, dma_addr + XDP_PACKET_HEADROOM,
> > +                                  xdpf->len + IFH_LEN_BYTES,
> > +                                  DMA_TO_DEVICE);
> > 
> > Also not correct. This page was mapped with %DMA_FROM_DEVICE in the
> > Rx code, now you sync it for the opposite.
> > Most drivers in case of XDP enabled create Page Pools with ::dma_dir
> > set to %DMA_BIDIRECTIONAL. Now you would need only to sync it here
> > with the same direction (bidir) and that's it.
> 
> That is a really good catch!
> I was wondering why the things were working when I tested this. Because
> definitely, I can see the right behaviour.

The reasons can be:

1) your platform might have a DMA coherence engine, so that all
   those DMA sync calls are no-ops;
2) on your platform, DMA writeback (TO_DEVICE) and DMA invalidate
   (FROM_DEVICE) invoke the same operation/instruction. Some
   hardware is designed that way, that any DMA sync is in fact a
   bidir synchronization;
3) if there were no frame modification from the kernel, e.g. you
   received it and immediately sent, cache was not polluted with
   some pending modifications, so there was no work for writeback;
4) probably something else I might've missed.

> 
> > 
> > +
> > +       /* Setup next dcb */
> > +       lan966x_fdma_tx_setup_dcb(tx, next_to_use, xdpf->len + IFH_LEN_BYTES,
> > +                                 dma_addr + XDP_PACKET_HEADROOM);
> > +
> > +       /* Fill up the buffer */
> > +       next_dcb_buf = &tx->dcbs_buf[next_to_use];
> > +       next_dcb_buf->skb = NULL;
> > +       next_dcb_buf->page = page;
> > +       next_dcb_buf->len = xdpf->len + IFH_LEN_BYTES;
> > +       next_dcb_buf->dma_addr = dma_addr;
> > +       next_dcb_buf->used = true;
> > +       next_dcb_buf->ptp = false;
> > +       next_dcb_buf->dev = port->dev;
> > +
> > +       /* Start the transmission */
> > +       lan966x_fdma_tx_start(tx, next_to_use);
> > +
> > +out:
> > +       spin_unlock(&lan966x->tx_lock);
> > +
> > +       return ret;
> > +}
> > +
> >  int lan966x_fdma_xmit(struct sk_buff *skb, __be32 *ifh, struct net_device *dev)
> >  {
> >         struct lan966x_port *port = netdev_priv(dev);
> > @@ -709,6 +776,7 @@ int lan966x_fdma_xmit(struct sk_buff *skb, __be32 *ifh, struct net_device *dev)
> >         /* Fill up the buffer */
> >         next_dcb_buf = &tx->dcbs_buf[next_to_use];
> >         next_dcb_buf->skb = skb;
> > +       next_dcb_buf->page = NULL;
> >         next_dcb_buf->len = skb->len;
> >         next_dcb_buf->dma_addr = dma_addr;
> >         next_dcb_buf->used = true;
> > 
> > [...]
> > 
> > --
> > 2.38.0
> > 
> > Thanks,
> > Olek
> 
> -- 
> /Horatiu

[0] https://elixir.bootlin.com/linux/v6.1-rc5/source/drivers/net/ethernet/marvell/mvneta.c#L1882

Thanks,
Olek
  
Horatiu Vultur Nov. 18, 2022, 3:50 p.m. UTC | #4
The 11/17/2022 16:31, Alexander Lobakin wrote:
> EXTERNAL EMAIL: Do not click links or open attachments unless you know the content is safe
> 
> From: Horatiu Vultur <horatiu.vultur@microchip.com>
> Date: Wed, 16 Nov 2022 21:55:57 +0100
> 
> > The 11/16/2022 16:34, Alexander Lobakin wrote:
> > >
> > > From: Horatiu Vultur <horatiu.vultur@microchip.com>
> > > Date: Tue, 15 Nov 2022 22:44:55 +0100
> >
> > Hi Olek,
> 
> Hi!
> 
> > > For %XDP_REDIRECT, as you don't know the source of the XDP frame,
> >
> > Why I don't know the source?
> > Will it not be from an RX page that is allocated by Page Pool?
> 
> Imagine some NIC which does not use Page Pool, for example, it does
> its own page allocation / splitting / recycling techniques, gets
> %XDP_REDIRECT when running XDP prog on Rx. devmap says it must
> redirect the frame to your NIC.
> Then, your ::ndo_xdp_xmit() will be run on a frame/page not
> belonging to any Page Pool.
> The example can be any of Intel drivers (there are plans to switch
> at least i40e and ice to Page Pool, but they're always deeply in
> the backlogs (clownface)).

Silly me, I was always thinking and trying only from one port of lan966x
to another port of lan966x. Of course it can come from other NICs.

> 
> >
> > > you need to unmap it (as it was previously mapped in
> > > ::ndo_xdp_xmit()), plus call xdp_return_frame{,_bulk} to free the
> > > XDP frame. Note that _rx_napi() variant is not applicable here.
> > >
> > > That description might be confusing, so you can take a look at the
> > > already existing code[0] to get the idea. I think this piece shows
> > > the expected logics rather well.
> >
> > I think you forgot to write the link to the code.
> > I looked also at different drivers but I didn't figure it out why the
> > frame needed to be mapped and where is happening that.
> 
> Ooof, really. Pls look at the end of this reply :D
> On ::ndo_xdp_xmit(), as I explained above, you can receive a frame
> from any driver or BPF core code (such as cpumap), and BPF prog
> there could be run on buffer of any kind: Page Pool page, just a
> page, a kmalloc() chunk and so on.
> 
> So, in the code[0], you can see the following set of operations:
> 
> * DMA unmap in all cases excluding frame coming from %XDP_TX (then
>   it was only synced);
> * updating statistics and freeing skb for skb cases;
> * xdp_return_frame_rx_napi() for %XDP_TX cases;
> * xdp_return_frame_bulk() for ::ndo_xdp_xmit() cases.

Thanks for the detailed explanation and for the link :D
I will update all this in the next version.

> 
> > > +       ifh = page_address(page) + XDP_PACKET_HEADROOM;
> > > +       memset(ifh, 0x0, sizeof(__be32) * IFH_LEN);
> > > +       lan966x_ifh_set_bypass(ifh, 1);
> > > +       lan966x_ifh_set_port(ifh, BIT_ULL(port->chip_port));
> > > +
> > > +       dma_addr = page_pool_get_dma_addr(page);
> > > +       dma_sync_single_for_device(lan966x->dev, dma_addr + XDP_PACKET_HEADROOM,
> > > +                                  xdpf->len + IFH_LEN_BYTES,
> > > +                                  DMA_TO_DEVICE);
> > >
> > > Also not correct. This page was mapped with %DMA_FROM_DEVICE in the
> > > Rx code, now you sync it for the opposite.
> > > Most drivers in case of XDP enabled create Page Pools with ::dma_dir
> > > set to %DMA_BIDIRECTIONAL. Now you would need only to sync it here
> > > with the same direction (bidir) and that's it.
> >
> > That is a really good catch!
> > I was wondering why the things were working when I tested this. Because
> > definitely, I can see the right behaviour.
> 
> The reasons can be:
> 
> 1) your platform might have a DMA coherence engine, so that all
>    those DMA sync calls are no-ops;
> 2) on your platform, DMA writeback (TO_DEVICE) and DMA invalidate
>    (FROM_DEVICE) invoke the same operation/instruction. Some
>    hardware is designed that way, that any DMA sync is in fact a
>    bidir synchronization;
> 3) if there were no frame modification from the kernel, e.g. you
>    received it and immediately sent, cache was not polluted with
>    some pending modifications, so there was no work for writeback;
> 4) probably something else I might've missed.
> 
> >
> > >
> > > +
> > > +       /* Setup next dcb */
> > > +       lan966x_fdma_tx_setup_dcb(tx, next_to_use, xdpf->len + IFH_LEN_BYTES,
> > > +                                 dma_addr + XDP_PACKET_HEADROOM);
> > > +
> > > +       /* Fill up the buffer */
> > > +       next_dcb_buf = &tx->dcbs_buf[next_to_use];
> > > +       next_dcb_buf->skb = NULL;
> > > +       next_dcb_buf->page = page;
> > > +       next_dcb_buf->len = xdpf->len + IFH_LEN_BYTES;
> > > +       next_dcb_buf->dma_addr = dma_addr;
> > > +       next_dcb_buf->used = true;
> > > +       next_dcb_buf->ptp = false;
> > > +       next_dcb_buf->dev = port->dev;
> > > +
> > > +       /* Start the transmission */
> > > +       lan966x_fdma_tx_start(tx, next_to_use);
> > > +
> > > +out:
> > > +       spin_unlock(&lan966x->tx_lock);
> > > +
> > > +       return ret;
> > > +}
> > > +
> > >  int lan966x_fdma_xmit(struct sk_buff *skb, __be32 *ifh, struct net_device *dev)
> > >  {
> > >         struct lan966x_port *port = netdev_priv(dev);
> > > @@ -709,6 +776,7 @@ int lan966x_fdma_xmit(struct sk_buff *skb, __be32 *ifh, struct net_device *dev)
> > >         /* Fill up the buffer */
> > >         next_dcb_buf = &tx->dcbs_buf[next_to_use];
> > >         next_dcb_buf->skb = skb;
> > > +       next_dcb_buf->page = NULL;
> > >         next_dcb_buf->len = skb->len;
> > >         next_dcb_buf->dma_addr = dma_addr;
> > >         next_dcb_buf->used = true;
> > >
> > > [...]
> > >
> > > --
> > > 2.38.0
> > >
> > > Thanks,
> > > Olek
> >
> > --
> > /Horatiu
> 
> [0] https://elixir.bootlin.com/linux/v6.1-rc5/source/drivers/net/ethernet/marvell/mvneta.c#L1882
> 
> Thanks,
> Olek
  

Patch

diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c b/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c
index 384ed34197d58..c2e56233a8da5 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c
@@ -394,13 +394,21 @@  static void lan966x_fdma_tx_clear_buf(struct lan966x *lan966x, int weight)
 		dcb_buf->dev->stats.tx_bytes += dcb_buf->len;
 
 		dcb_buf->used = false;
-		dma_unmap_single(lan966x->dev,
-				 dcb_buf->dma_addr,
-				 dcb_buf->len,
-				 DMA_TO_DEVICE);
-		if (!dcb_buf->ptp)
+		if (dcb_buf->skb)
+			dma_unmap_single(lan966x->dev,
+					 dcb_buf->dma_addr,
+					 dcb_buf->len,
+					 DMA_TO_DEVICE);
+
+		if (dcb_buf->skb && !dcb_buf->ptp)
 			dev_kfree_skb_any(dcb_buf->skb);
 
+		if (dcb_buf->page) {
+			page_pool_release_page(lan966x->rx.page_pool,
+					       dcb_buf->page);
+			put_page(dcb_buf->page);
+		}
+
 		clear = true;
 	}
 
@@ -532,6 +540,9 @@  static int lan966x_fdma_napi_poll(struct napi_struct *napi, int weight)
 			lan966x_fdma_rx_free_page(rx);
 			lan966x_fdma_rx_advance_dcb(rx);
 			goto allocate_new;
+		case FDMA_TX:
+			lan966x_fdma_rx_advance_dcb(rx);
+			continue;
 		case FDMA_DROP:
 			lan966x_fdma_rx_free_page(rx);
 			lan966x_fdma_rx_advance_dcb(rx);
@@ -653,6 +664,62 @@  static void lan966x_fdma_tx_start(struct lan966x_tx *tx, int next_to_use)
 	tx->last_in_use = next_to_use;
 }
 
+int lan966x_fdma_xmit_xdpf(struct lan966x_port *port,
+			   struct xdp_frame *xdpf,
+			   struct page *page)
+{
+	struct lan966x *lan966x = port->lan966x;
+	struct lan966x_tx_dcb_buf *next_dcb_buf;
+	struct lan966x_tx *tx = &lan966x->tx;
+	dma_addr_t dma_addr;
+	int next_to_use;
+	__be32 *ifh;
+	int ret = 0;
+
+	spin_lock(&lan966x->tx_lock);
+
+	/* Get next index */
+	next_to_use = lan966x_fdma_get_next_dcb(tx);
+	if (next_to_use < 0) {
+		netif_stop_queue(port->dev);
+		ret = NETDEV_TX_BUSY;
+		goto out;
+	}
+
+	/* Generate new IFH */
+	ifh = page_address(page) + XDP_PACKET_HEADROOM;
+	memset(ifh, 0x0, sizeof(__be32) * IFH_LEN);
+	lan966x_ifh_set_bypass(ifh, 1);
+	lan966x_ifh_set_port(ifh, BIT_ULL(port->chip_port));
+
+	dma_addr = page_pool_get_dma_addr(page);
+	dma_sync_single_for_device(lan966x->dev, dma_addr + XDP_PACKET_HEADROOM,
+				   xdpf->len + IFH_LEN_BYTES,
+				   DMA_TO_DEVICE);
+
+	/* Setup next dcb */
+	lan966x_fdma_tx_setup_dcb(tx, next_to_use, xdpf->len + IFH_LEN_BYTES,
+				  dma_addr + XDP_PACKET_HEADROOM);
+
+	/* Fill up the buffer */
+	next_dcb_buf = &tx->dcbs_buf[next_to_use];
+	next_dcb_buf->skb = NULL;
+	next_dcb_buf->page = page;
+	next_dcb_buf->len = xdpf->len + IFH_LEN_BYTES;
+	next_dcb_buf->dma_addr = dma_addr;
+	next_dcb_buf->used = true;
+	next_dcb_buf->ptp = false;
+	next_dcb_buf->dev = port->dev;
+
+	/* Start the transmission */
+	lan966x_fdma_tx_start(tx, next_to_use);
+
+out:
+	spin_unlock(&lan966x->tx_lock);
+
+	return ret;
+}
+
 int lan966x_fdma_xmit(struct sk_buff *skb, __be32 *ifh, struct net_device *dev)
 {
 	struct lan966x_port *port = netdev_priv(dev);
@@ -709,6 +776,7 @@  int lan966x_fdma_xmit(struct sk_buff *skb, __be32 *ifh, struct net_device *dev)
 	/* Fill up the buffer */
 	next_dcb_buf = &tx->dcbs_buf[next_to_use];
 	next_dcb_buf->skb = skb;
+	next_dcb_buf->page = NULL;
 	next_dcb_buf->len = skb->len;
 	next_dcb_buf->dma_addr = dma_addr;
 	next_dcb_buf->used = true;
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c
index 42be5d0f1f015..0b7707306da26 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c
@@ -302,13 +302,13 @@  static int lan966x_port_ifh_xmit(struct sk_buff *skb,
 	return NETDEV_TX_BUSY;
 }
 
-static void lan966x_ifh_set_bypass(void *ifh, u64 bypass)
+void lan966x_ifh_set_bypass(void *ifh, u64 bypass)
 {
 	packing(ifh, &bypass, IFH_POS_BYPASS + IFH_WID_BYPASS - 1,
 		IFH_POS_BYPASS, IFH_LEN * 4, PACK, 0);
 }
 
-static void lan966x_ifh_set_port(void *ifh, u64 bypass)
+void lan966x_ifh_set_port(void *ifh, u64 bypass)
 {
 	packing(ifh, &bypass, IFH_POS_DSTS + IFH_WID_DSTS - 1,
 		IFH_POS_DSTS, IFH_LEN * 4, PACK, 0);
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.h b/drivers/net/ethernet/microchip/lan966x/lan966x_main.h
index 7bb9098496f60..df7fec361962b 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.h
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.h
@@ -105,11 +105,13 @@  enum macaccess_entry_type {
  * FDMA_PASS, frame is valid and can be used
  * FDMA_ERROR, something went wrong, stop getting more frames
  * FDMA_DROP, frame is dropped, but continue to get more frames
+ * FDMA_TX, frame's buffer was handed to the TX path (XDP_TX), but continue to get more frames
  */
 enum lan966x_fdma_action {
 	FDMA_PASS = 0,
 	FDMA_ERROR,
 	FDMA_DROP,
+	FDMA_TX,
 };
 
 struct lan966x_port;
@@ -175,6 +177,7 @@  struct lan966x_rx {
 struct lan966x_tx_dcb_buf {
 	struct net_device *dev;
 	struct sk_buff *skb;
+	struct page *page;
 	int len;
 	dma_addr_t dma_addr;
 	bool used;
@@ -360,6 +363,8 @@  bool lan966x_hw_offload(struct lan966x *lan966x, u32 port, struct sk_buff *skb);
 
 void lan966x_ifh_get_src_port(void *ifh, u64 *src_port);
 void lan966x_ifh_get_timestamp(void *ifh, u64 *timestamp);
+void lan966x_ifh_set_bypass(void *ifh, u64 bypass);
+void lan966x_ifh_set_port(void *ifh, u64 bypass);
 
 void lan966x_stats_get(struct net_device *dev,
 		       struct rtnl_link_stats64 *stats);
@@ -460,6 +465,9 @@  u32 lan966x_ptp_get_period_ps(void);
 int lan966x_ptp_gettime64(struct ptp_clock_info *ptp, struct timespec64 *ts);
 
 int lan966x_fdma_xmit(struct sk_buff *skb, __be32 *ifh, struct net_device *dev);
+int lan966x_fdma_xmit_xdpf(struct lan966x_port *port,
+			   struct xdp_frame *frame,
+			   struct page *page);
 int lan966x_fdma_change_mtu(struct lan966x *lan966x);
 void lan966x_fdma_netdev_init(struct lan966x *lan966x, struct net_device *dev);
 void lan966x_fdma_netdev_deinit(struct lan966x *lan966x, struct net_device *dev);
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_xdp.c b/drivers/net/ethernet/microchip/lan966x/lan966x_xdp.c
index 8ebde1eb6a09c..9b0ba3179df62 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_xdp.c
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_xdp.c
@@ -39,6 +39,7 @@  int lan966x_xdp_run(struct lan966x_port *port, struct page *page, u32 data_len)
 {
 	struct bpf_prog *xdp_prog = port->xdp_prog;
 	struct lan966x *lan966x = port->lan966x;
+	struct xdp_frame *xdpf;
 	struct xdp_buff xdp;
 	u32 act;
 
@@ -51,6 +52,13 @@  int lan966x_xdp_run(struct lan966x_port *port, struct page *page, u32 data_len)
 	switch (act) {
 	case XDP_PASS:
 		return FDMA_PASS;
+	case XDP_TX:
+		xdpf = xdp_convert_buff_to_frame(&xdp);
+		if (!xdpf)
+			return FDMA_DROP;
+
+		return lan966x_fdma_xmit_xdpf(port, xdpf, page) ?
+		       FDMA_DROP : FDMA_TX;
 	default:
 		bpf_warn_invalid_xdp_action(port->dev, xdp_prog, act);
 		fallthrough;