[v2,1/1] net: fec: add initial XDP support

Message ID 20221031162200.1997788-1-shenwei.wang@nxp.com
State New
Headers
Series [v2,1/1] net: fec: add initial XDP support |

Commit Message

Shenwei Wang Oct. 31, 2022, 4:22 p.m. UTC
  This patch adds the initial XDP support to Freescale driver. It supports
XDP_PASS, XDP_DROP and XDP_REDIRECT actions. Upcoming patches will add
support for XDP_TX and Zero Copy features.

As the patch is rather large, the part of codes to collect the
statistics is separated and will prepare a dedicated patch for that
part.

The driver has a macro RX_RING_SIZE to configure the RX ring size. After
testing with the RX ring size, it turned out the small the rign size the
better the better performance for XDP mode. So the different ring size is
selected for XDP mode and normal mode in this patch.

I just tested with the application of xdpsock.
  -- Native here means running command of "xdpsock -i eth0"
  -- SKB-Mode means running command of "xdpsock -S -i eth0"

RX Ring Size       16       32        64       128
      Native      230K     227K      196K      160K
    SKB-Mode      207K     208K      203K      204K

Normal mode performance by iperf.

RX Ring Size         16         64       128
iperf              300Mbps    830Mbps   933Mbps

The following are the testing result relating to XDP mode:

root@imx8qxpc0mek:~/bpf# ./xdpsock -i eth0
 sock0@eth0:0 rxdrop xdp-drv
                   pps            pkts           1.00
rx                 384397         8048130
tx                 0              0

root@imx8qxpc0mek:~/bpf# ./xdpsock -S -i eth0
 sock0@eth0:0 rxdrop xdp-skb
                   pps            pkts           1.00
rx                 204151         204295
tx                 0              0

root@imx8qxpc0mek:~/bpf# ./xdp2 eth0
proto 0:     499359 pkt/s
proto 0:     508270 pkt/s
proto 0:     508160 pkt/s
proto 0:     508264 pkt/s

root@imx8qxpc0mek:~/bpf# ./xdp2 -S eth0
proto 0:          0 pkt/s
proto 17:     119155 pkt/s
proto 0:          1 pkt/s
proto 17:     119518 pkt/s
proto 17:     119363 pkt/s
proto 0:          0 pkt/s
proto 17:     119479 pkt/s
proto 17:     119484 pkt/s

Signed-off-by: Shenwei Wang <shenwei.wang@nxp.com>
Reported-by: kernel test robot <lkp@intel.com>
---
 changes in V2:
 - Get rid of the expensive fec_net_close/open function calls during
   XDP/Normal Mode switch.
 - Update the testing data on i.mx8qxp mek board.
 - fix the compile issue reported by kernel_test_robot

 drivers/net/ethernet/freescale/fec.h      |   4 +
 drivers/net/ethernet/freescale/fec_main.c | 245 +++++++++++++++++++++-
 2 files changed, 248 insertions(+), 1 deletion(-)

--
2.34.1
  

Comments

Andrew Lunn Oct. 31, 2022, 5:10 p.m. UTC | #1
> +static int fec_enet_bpf(struct net_device *dev, struct netdev_bpf *bpf)
> +{
> +	struct fec_enet_private *fep = netdev_priv(dev);
> +	bool is_run = netif_running(dev);
> +	struct bpf_prog *old_prog;
> +	unsigned int dsize;
> +	int i;
> +
> +	switch (bpf->command) {
> +	case XDP_SETUP_PROG:
> +		if (is_run) {
> +			napi_disable(&fep->napi);
> +			netif_tx_disable(dev);
> +		}
> +
> +		old_prog = xchg(&fep->xdp_prog, bpf->prog);
> +
> +		/* Update RX ring size */
> +		dsize = fep->bufdesc_ex ? sizeof(struct bufdesc_ex) :
> +			sizeof(struct bufdesc);
> +		for (i = 0; i < fep->num_rx_queues; i++) {
> +			struct fec_enet_priv_rx_q *rxq = fep->rx_queue[i];
> +			struct bufdesc *cbd_base;
> +			unsigned int size;
> +
> +			cbd_base = rxq->bd.base;
> +			if (bpf->prog)
> +				rxq->bd.ring_size = XDP_RX_RING_SIZE;
> +			else
> +				rxq->bd.ring_size = RX_RING_SIZE;
> +			size = dsize * rxq->bd.ring_size;
> +			cbd_base = (struct bufdesc *)(((void *)cbd_base) + size);
> +			rxq->bd.last = (struct bufdesc *)(((void *)cbd_base) - dsize);

This does not look safe. netif_tx_disable(dev) will stop new
transmissions, but the hardware can be busy receiving, DMAing frames,
using the descriptors, etc. Modifying rxq->bd.last in particular seems
risky. I think you need to disable the receiver, wait for it to
indicate it really has stopped, and only then make these
modifications.

	Andrew
  
Shenwei Wang Oct. 31, 2022, 6:34 p.m. UTC | #2
> -----Original Message-----
> From: Andrew Lunn <andrew@lunn.ch>
> Sent: Monday, October 31, 2022 12:10 PM
> To: Shenwei Wang <shenwei.wang@nxp.com>
> > +                     cbd_base = rxq->bd.base;
> > +                     if (bpf->prog)
> > +                             rxq->bd.ring_size = XDP_RX_RING_SIZE;
> > +                     else
> > +                             rxq->bd.ring_size = RX_RING_SIZE;
> > +                     size = dsize * rxq->bd.ring_size;
> > +                     cbd_base = (struct bufdesc *)(((void *)cbd_base) + size);
> > +                     rxq->bd.last = (struct bufdesc *)(((void
> > + *)cbd_base) - dsize);
> 
> This does not look safe. netif_tx_disable(dev) will stop new transmissions, but
> the hardware can be busy receiving, DMAing frames, using the descriptors, etc.
> Modifying rxq->bd.last in particular seems risky. I think you need to disable the
> receiver, wait for it to indicate it really has stopped, and only then make these
> modifications.
> 

Sounds reasonable. How about moving the codes of updating ring size to the place
right after the enet reset inside the fec_restart? This should clear those risky corner
cases.

Thanks,
Shenwei

>         Andrew
  
Andrew Lunn Oct. 31, 2022, 8:17 p.m. UTC | #3
> > > +                     cbd_base = rxq->bd.base;
> > > +                     if (bpf->prog)
> > > +                             rxq->bd.ring_size = XDP_RX_RING_SIZE;
> > > +                     else
> > > +                             rxq->bd.ring_size = RX_RING_SIZE;
> > > +                     size = dsize * rxq->bd.ring_size;
> > > +                     cbd_base = (struct bufdesc *)(((void *)cbd_base) + size);
> > > +                     rxq->bd.last = (struct bufdesc *)(((void
> > > + *)cbd_base) - dsize);
> > 
> > This does not look safe. netif_tx_disable(dev) will stop new transmissions, but
> > the hardware can be busy receiving, DMAing frames, using the descriptors, etc.
> > Modifying rxq->bd.last in particular seems risky. I think you need to disable the
> > receiver, wait for it to indicate it really has stopped, and only then make these
> > modifications.
> > 
> 
> Sounds reasonable. How about moving the codes of updating ring size to the place
> right after the enet reset inside the fec_restart? This should clear those risky corner
> cases.

That sounds reasonable. But please add some comments. The driver has
RX_RING_SIZE elements allocated, but you are only using a subset. This
needs to be clear for when somebody implements the ethtool --rings
option.

And i still think it would be good to implement that code. As your
numbers show, the ring size does affect performance, and it is hard to
know if your hard coded XDP_RX_RING_SIZE is the right value, depending
on what the eBPF program is doing. If the ethtool option was provided,
it allows users to benchmark different ring sizes for their workload.

   Andrew
  
Shenwei Wang Oct. 31, 2022, 9:17 p.m. UTC | #4
> -----Original Message-----
> From: Andrew Lunn <andrew@lunn.ch>
> Sent: Monday, October 31, 2022 3:17 PM
> > > to disable the receiver, wait for it to indicate it really has
> > > stopped, and only then make these modifications.
> > >
> >
> > Sounds reasonable. How about moving the codes of updating ring size to
> > the place right after the enet reset inside the fec_restart? This
> > should clear those risky corner cases.
> 
> That sounds reasonable. But please add some comments. The driver has
> RX_RING_SIZE elements allocated, but you are only using a subset. This needs to
> be clear for when somebody implements the ethtool --rings option.
> 

The latest performance data regarding the native XDP has got improved a lot. The ring 
size change doesn't seem necessary any more. The v3 patch has removed that part
of codes, and that logic can be added back in future if necessary as well as the ethtool -rings option.

Thanks,
Shenwei

> And i still think it would be good to implement that code. As your numbers show,
> the ring size does affect performance, and it is hard to know if your hard coded
> XDP_RX_RING_SIZE is the right value, depending on what the eBPF program is
> doing. If the ethtool option was provided, it allows users to benchmark different
> ring sizes for their workload.
> 
>    Andrew
  

Patch

diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h
index 476e3863a310..07e85fc3d7ba 100644
--- a/drivers/net/ethernet/freescale/fec.h
+++ b/drivers/net/ethernet/freescale/fec.h
@@ -348,6 +348,7 @@  struct bufdesc_ex {
  */

 #define FEC_ENET_XDP_HEADROOM	(XDP_PACKET_HEADROOM)
+#define XDP_RX_RING_SIZE	16

 #define FEC_ENET_RX_PAGES	256
 #define FEC_ENET_RX_FRSIZE	(PAGE_SIZE - FEC_ENET_XDP_HEADROOM \
@@ -663,6 +664,9 @@  struct fec_enet_private {

 	struct imx_sc_ipc *ipc_handle;

+	/* XDP BPF Program */
+	struct bpf_prog *xdp_prog;
+
 	u64 ethtool_stats[];
 };

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 6986b74fb8af..d05f0bb1a60c 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -89,6 +89,11 @@  static const u16 fec_enet_vlan_pri_to_queue[8] = {0, 0, 1, 1, 1, 2, 2, 2};
 #define FEC_ENET_OPD_V	0xFFF0
 #define FEC_MDIO_PM_TIMEOUT  100 /* ms */

+#define FEC_ENET_XDP_PASS          0
+#define FEC_ENET_XDP_CONSUMED      BIT(0)
+#define FEC_ENET_XDP_TX            BIT(1)
+#define FEC_ENET_XDP_REDIR         BIT(2)
+
 struct fec_devinfo {
 	u32 quirks;
 };
@@ -418,13 +423,14 @@  static int
 fec_enet_create_page_pool(struct fec_enet_private *fep,
 			  struct fec_enet_priv_rx_q *rxq, int size)
 {
+	struct bpf_prog *xdp_prog = READ_ONCE(fep->xdp_prog);
 	struct page_pool_params pp_params = {
 		.order = 0,
 		.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 		.pool_size = size,
 		.nid = dev_to_node(&fep->pdev->dev),
 		.dev = &fep->pdev->dev,
-		.dma_dir = DMA_FROM_DEVICE,
+		.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE,
 		.offset = FEC_ENET_XDP_HEADROOM,
 		.max_len = FEC_ENET_RX_FRSIZE,
 	};
@@ -1499,6 +1505,59 @@  static void fec_enet_update_cbd(struct fec_enet_priv_rx_q *rxq,
 	bdp->cbd_bufaddr = cpu_to_fec32(phys_addr);
 }

+static u32
+fec_enet_run_xdp(struct fec_enet_private *fep, struct bpf_prog *prog,
+		 struct xdp_buff *xdp, struct fec_enet_priv_rx_q *rxq, int index)
+{
+	unsigned int sync, len = xdp->data_end - xdp->data;
+	u32 ret = FEC_ENET_XDP_PASS;
+	struct page *page;
+	int err;
+	u32 act;
+
+	act = bpf_prog_run_xdp(prog, xdp);
+
+	/* Due xdp_adjust_tail: DMA sync for_device cover max len CPU touch */
+	sync = xdp->data_end - xdp->data_hard_start - FEC_ENET_XDP_HEADROOM;
+	sync = max(sync, len);
+
+	switch (act) {
+	case XDP_PASS:
+		ret = FEC_ENET_XDP_PASS;
+		break;
+
+	case XDP_REDIRECT:
+		err = xdp_do_redirect(fep->netdev, xdp, prog);
+		if (!err) {
+			ret = FEC_ENET_XDP_REDIR;
+		} else {
+			ret = FEC_ENET_XDP_CONSUMED;
+			page = virt_to_head_page(xdp->data);
+			page_pool_put_page(rxq->page_pool, page, sync, true);
+		}
+		break;
+
+	default:
+		bpf_warn_invalid_xdp_action(fep->netdev, prog, act);
+		fallthrough;
+
+	case XDP_TX:
+		bpf_warn_invalid_xdp_action(fep->netdev, prog, act);
+		fallthrough;
+
+	case XDP_ABORTED:
+		fallthrough;    /* handle aborts by dropping packet */
+
+	case XDP_DROP:
+		ret = FEC_ENET_XDP_CONSUMED;
+		page = virt_to_head_page(xdp->data);
+		page_pool_put_page(rxq->page_pool, page, sync, true);
+		break;
+	}
+
+	return ret;
+}
+
 /* During a receive, the bd_rx.cur points to the current incoming buffer.
  * When we update through the ring, if the next incoming buffer has
  * not been given to the system, we just set the empty indicator,
@@ -1520,6 +1579,9 @@  fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
 	u16	vlan_tag;
 	int	index = 0;
 	bool	need_swap = fep->quirks & FEC_QUIRK_SWAP_FRAME;
+	struct bpf_prog *xdp_prog = READ_ONCE(fep->xdp_prog);
+	u32 ret, xdp_result = FEC_ENET_XDP_PASS;
+	struct xdp_buff xdp;
 	struct page *page;

 #ifdef CONFIG_M532x
@@ -1531,6 +1593,7 @@  fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
 	 * These get messed up if we get called due to a busy condition.
 	 */
 	bdp = rxq->bd.cur;
+	xdp_init_buff(&xdp, PAGE_SIZE, &rxq->xdp_rxq);

 	while (!((status = fec16_to_cpu(bdp->cbd_sc)) & BD_ENET_RX_EMPTY)) {

@@ -1580,6 +1643,17 @@  fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
 		prefetch(page_address(page));
 		fec_enet_update_cbd(rxq, bdp, index);

+		if (xdp_prog) {
+			xdp_buff_clear_frags_flag(&xdp);
+			xdp_prepare_buff(&xdp, page_address(page),
+					 FEC_ENET_XDP_HEADROOM, pkt_len, false);
+
+			ret = fec_enet_run_xdp(fep, xdp_prog, &xdp, rxq, index);
+			xdp_result |= ret;
+			if (ret != FEC_ENET_XDP_PASS)
+				goto rx_processing_done;
+		}
+
 		/* The packet length includes FCS, but we don't want to
 		 * include that when passing upstream as it messes up
 		 * bridging applications.
@@ -1675,6 +1749,10 @@  fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
 		writel(0, rxq->bd.reg_desc_active);
 	}
 	rxq->bd.cur = bdp;
+
+	if (xdp_result & FEC_ENET_XDP_REDIR)
+		xdp_do_flush_map();
+
 	return pkt_received;
 }

@@ -3476,6 +3554,169 @@  static u16 fec_enet_select_queue(struct net_device *ndev, struct sk_buff *skb,
 	return fec_enet_vlan_pri_to_queue[vlan_tag >> 13];
 }

+static int fec_enet_bpf(struct net_device *dev, struct netdev_bpf *bpf)
+{
+	struct fec_enet_private *fep = netdev_priv(dev);
+	bool is_run = netif_running(dev);
+	struct bpf_prog *old_prog;
+	unsigned int dsize;
+	int i;
+
+	switch (bpf->command) {
+	case XDP_SETUP_PROG:
+		if (is_run) {
+			napi_disable(&fep->napi);
+			netif_tx_disable(dev);
+		}
+
+		old_prog = xchg(&fep->xdp_prog, bpf->prog);
+
+		/* Update RX ring size */
+		dsize = fep->bufdesc_ex ? sizeof(struct bufdesc_ex) :
+			sizeof(struct bufdesc);
+		for (i = 0; i < fep->num_rx_queues; i++) {
+			struct fec_enet_priv_rx_q *rxq = fep->rx_queue[i];
+			struct bufdesc *cbd_base;
+			unsigned int size;
+
+			cbd_base = rxq->bd.base;
+			if (bpf->prog)
+				rxq->bd.ring_size = XDP_RX_RING_SIZE;
+			else
+				rxq->bd.ring_size = RX_RING_SIZE;
+			size = dsize * rxq->bd.ring_size;
+			cbd_base = (struct bufdesc *)(((void *)cbd_base) + size);
+			rxq->bd.last = (struct bufdesc *)(((void *)cbd_base) - dsize);
+		}
+
+		fec_restart(dev);
+
+		if (is_run) {
+			napi_enable(&fep->napi);
+			netif_tx_start_all_queues(dev);
+		}
+
+		if (old_prog)
+			bpf_prog_put(old_prog);
+
+		return 0;
+
+	case XDP_SETUP_XSK_POOL:
+		return -EOPNOTSUPP;
+
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int
+fec_enet_xdp_get_tx_queue(struct fec_enet_private *fep, int cpu)
+{
+	int index = cpu;
+
+	if (unlikely(index < 0))
+		index = 0;
+
+	while (index >= fep->num_tx_queues)
+		index -= fep->num_tx_queues;
+
+	return index;
+}
+
+static int fec_enet_txq_xmit_frame(struct fec_enet_private *fep,
+				   struct fec_enet_priv_tx_q *txq,
+				   struct xdp_frame *frame)
+{
+	unsigned int index, status, estatus;
+	struct bufdesc *bdp, *last_bdp;
+	dma_addr_t dma_addr;
+	int entries_free;
+
+	entries_free = fec_enet_get_free_txdesc_num(txq);
+	if (entries_free < MAX_SKB_FRAGS + 1) {
+		netdev_err(fep->netdev, "NOT enough BD for SG!\n");
+		return NETDEV_TX_OK;
+	}
+
+	/* Fill in a Tx ring entry */
+	bdp = txq->bd.cur;
+	last_bdp = bdp;
+	status = fec16_to_cpu(bdp->cbd_sc);
+	status &= ~BD_ENET_TX_STATS;
+
+	index = fec_enet_get_bd_index(bdp, &txq->bd);
+
+	dma_addr = dma_map_single(&fep->pdev->dev, frame->data,
+				  frame->len, DMA_TO_DEVICE);
+	if (dma_mapping_error(&fep->pdev->dev, dma_addr))
+		return FEC_ENET_XDP_CONSUMED;
+
+	status |= (BD_ENET_TX_INTR | BD_ENET_TX_LAST);
+	if (fep->bufdesc_ex)
+		estatus = BD_ENET_TX_INT;
+
+	bdp->cbd_bufaddr = cpu_to_fec32(dma_addr);
+	bdp->cbd_datlen = cpu_to_fec16(frame->len);
+
+	if (fep->bufdesc_ex) {
+		struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp;
+
+		if (fep->quirks & FEC_QUIRK_HAS_AVB)
+			estatus |= FEC_TX_BD_FTYPE(txq->bd.qid);
+
+		ebdp->cbd_bdu = 0;
+		ebdp->cbd_esc = cpu_to_fec32(estatus);
+	}
+
+	index = fec_enet_get_bd_index(last_bdp, &txq->bd);
+	txq->tx_skbuff[index] = NULL;
+
+	/* Send it on its way.  Tell FEC it's ready, interrupt when done,
+	 * it's the last BD of the frame, and to put the CRC on the end.
+	 */
+	status |= (BD_ENET_TX_READY | BD_ENET_TX_TC);
+	bdp->cbd_sc = cpu_to_fec16(status);
+
+	/* If this was the last BD in the ring, start at the beginning again. */
+	bdp = fec_enet_get_nextdesc(last_bdp, &txq->bd);
+
+	txq->bd.cur = bdp;
+
+	return 0;
+}
+
+static int fec_enet_xdp_xmit(struct net_device *dev,
+			     int num_frames,
+			     struct xdp_frame **frames,
+			     u32 flags)
+{
+	struct fec_enet_private *fep = netdev_priv(dev);
+	struct fec_enet_priv_tx_q *txq;
+	int cpu = smp_processor_id();
+	struct netdev_queue *nq;
+	unsigned int queue;
+	int i;
+
+	queue = fec_enet_xdp_get_tx_queue(fep, cpu);
+	txq = fep->tx_queue[queue];
+	nq = netdev_get_tx_queue(fep->netdev, queue);
+
+	__netif_tx_lock(nq, cpu);
+
+	for (i = 0; i < num_frames; i++)
+		fec_enet_txq_xmit_frame(fep, txq, frames[i]);
+
+	/* Make sure the update to bdp and tx_skbuff are performed. */
+	wmb();
+
+	/* Trigger transmission start */
+	writel(0, txq->bd.reg_desc_active);
+
+	__netif_tx_unlock(nq);
+
+	return num_frames;
+}
+
 static const struct net_device_ops fec_netdev_ops = {
 	.ndo_open		= fec_enet_open,
 	.ndo_stop		= fec_enet_close,
@@ -3490,6 +3731,8 @@  static const struct net_device_ops fec_netdev_ops = {
 	.ndo_poll_controller	= fec_poll_controller,
 #endif
 	.ndo_set_features	= fec_set_features,
+	.ndo_bpf		= fec_enet_bpf,
+	.ndo_xdp_xmit		= fec_enet_xdp_xmit,
 };

 static const unsigned short offset_des_active_rxq[] = {