On Thu, Aug 03, 2023 at 10:04:35PM +0800, huangjie.albert wrote:
...
> +static struct sk_buff *veth_build_skb_zerocopy(struct net_device *dev, struct xsk_buff_pool *pool,
> + struct xdp_desc *desc)
> +{
> + struct veth_seg_info *seg_info;
> + struct sk_buff *skb;
> + struct page *page;
> + void *hard_start;
> + u32 len, ts;
> + void *buffer;
> + int headroom;
> + u64 addr;
> + u32 index;
> +
> + addr = desc->addr;
> + len = desc->len;
> + buffer = xsk_buff_raw_get_data(pool, addr);
> + ts = pool->unaligned ? len : pool->chunk_size;
> +
> + headroom = offset_in_page(buffer);
> +
> + /* offset in umem pool buffer */
> + addr = buffer - pool->addrs;
> +
> + /* get the page of the desc */
> + page = pool->umem->pgs[addr >> PAGE_SHIFT];
> +
> + /* in order to avoid to get freed by kfree_skb */
> + get_page(page);
> +
> + hard_start = page_to_virt(page);
> +
> + skb = veth_build_skb(hard_start, headroom, len, ts);
> + seg_info = (struct veth_seg_info *)kmalloc(struct_size(seg_info, desc, MAX_SKB_FRAGS), GFP_KERNEL);
There is no need to explicitly cast the return value of kmalloc,
as it returns void *.
seg_info = kmalloc(struct_size(seg_info, desc, MAX_SKB_FRAGS),
GFP_KERNEL);
...
@@ -103,6 +103,11 @@ struct veth_xdp_tx_bq {
unsigned int count;
};
+/*
+ * Record of the AF_XDP TX descriptor addresses attached to a zero-copy skb.
+ * veth_build_skb_zerocopy() allocates it (sized for MAX_SKB_FRAGS entries)
+ * and stores it in skb_shinfo(skb)->destructor_arg; veth_xsk_destruct_skb()
+ * completes every recorded address and frees it.
+ */
+struct veth_seg_info {
+ /* number of valid entries in desc[] */
+ u32 segs;
+ /* descriptor addresses (xdp_desc->addr) handed to xsk_tx_completed_addr() */
+ u64 desc[] ____cacheline_aligned_in_smp;
+};
+
/*
* ethtool interface
*/
@@ -645,6 +650,100 @@ static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp,
return 0;
}
+/*
+ * Wrap an already-populated buffer in an skb without copying.
+ *
+ * @head:     start of the buffer handed to build_skb()
+ * @headroom: bytes to reserve in front of the packet data
+ * @len:      length of the packet data
+ * @buflen:   buffer size passed to build_skb()
+ *
+ * Returns the new skb, or NULL when build_skb() fails.
+ */
+static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
+				      int buflen)
+{
+	struct sk_buff *new_skb = build_skb(head, buflen);
+
+	if (new_skb) {
+		/* data starts @headroom bytes into the buffer and spans @len */
+		skb_reserve(new_skb, headroom);
+		skb_put(new_skb, len);
+	}
+
+	return new_skb;
+}
+
+/*
+ * skb destructor for zero-copy AF_XDP skbs.
+ *
+ * Reads the veth_seg_info and the xsk_buff_pool stashed in the skb shared
+ * info, reports every recorded descriptor address as completed under the
+ * pool's cq_lock, then frees the seg_info and clears both pointers.
+ */
+static void veth_xsk_destruct_skb(struct sk_buff *skb)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	struct veth_seg_info *seg_info = shinfo->destructor_arg;
+	struct xsk_buff_pool *pool = shinfo->destructor_arg_xsk_pool;
+	unsigned long flags;
+	u32 index;
+
+	/* release cq */
+	spin_lock_irqsave(&pool->cq_lock, flags);
+	for (index = 0; index < seg_info->segs; index++) {
+		/*
+		 * desc[] already holds u64 addresses; pass them through
+		 * untouched. Casting via (long) would truncate them on
+		 * 32-bit builds.
+		 */
+		xsk_tx_completed_addr(pool, seg_info->desc[index]);
+	}
+	spin_unlock_irqrestore(&pool->cq_lock, flags);
+
+	kfree(seg_info);
+	shinfo->destructor_arg = NULL;
+	shinfo->destructor_arg_xsk_pool = NULL;
+}
+
+/*
+ * Build an skb that points directly at the umem page backing @desc.
+ *
+ * @dev:  device the skb is attributed to (skb->dev and eth_type_trans())
+ * @pool: xsk buffer pool the descriptor belongs to
+ * @desc: TX descriptor to transmit
+ *
+ * An extra reference is taken on the umem page so that freeing the skb does
+ * not release it. The descriptor address is recorded in a veth_seg_info that
+ * veth_xsk_destruct_skb() uses to complete the descriptor when the skb is
+ * destroyed.
+ *
+ * Returns the skb, or NULL on allocation failure. On failure nothing is
+ * leaked and the descriptor has NOT been completed; the caller must do that.
+ */
+static struct sk_buff *veth_build_skb_zerocopy(struct net_device *dev, struct xsk_buff_pool *pool,
+					       struct xdp_desc *desc)
+{
+	struct veth_seg_info *seg_info;
+	struct sk_buff *skb;
+	struct page *page;
+	void *buffer;
+	int headroom;
+	u32 len, ts;
+	u32 index;
+	u64 addr;
+
+	len = desc->len;
+	buffer = xsk_buff_raw_get_data(pool, desc->addr);
+	ts = pool->unaligned ? len : pool->chunk_size;
+
+	headroom = offset_in_page(buffer);
+
+	/* offset in umem pool buffer */
+	addr = buffer - pool->addrs;
+
+	/*
+	 * Allocate the bookkeeping first so a failure needs no unwinding.
+	 * NOTE(review): the caller passes the skb straight to
+	 * napi_gro_receive(), so this presumably runs in NAPI context where
+	 * a sleeping GFP_KERNEL allocation is not allowed; GFP_ATOMIC is
+	 * valid in either context.
+	 */
+	seg_info = kmalloc(struct_size(seg_info, desc, MAX_SKB_FRAGS),
+			   GFP_ATOMIC);
+	if (!seg_info)
+		return NULL;
+
+	/* get the page of the desc */
+	page = pool->umem->pgs[addr >> PAGE_SHIFT];
+
+	/* in order to avoid to get freed by kfree_skb */
+	get_page(page);
+
+	skb = veth_build_skb(page_to_virt(page), headroom, len, ts);
+	if (!skb) {
+		/* undo the reference taken above and drop the bookkeeping */
+		put_page(page);
+		kfree(seg_info);
+		return NULL;
+	}
+
+	/* later we will support gso for this */
+	index = skb_shinfo(skb)->gso_segs;
+	seg_info->desc[index] = desc->addr;
+	seg_info->segs = ++index;
+
+	skb->truesize += ts;
+	skb->dev = dev;
+	skb_shinfo(skb)->destructor_arg = seg_info;
+	skb_shinfo(skb)->destructor_arg_xsk_pool = pool;
+	skb->destructor = veth_xsk_destruct_skb;
+
+	/* set the mac header */
+	skb->protocol = eth_type_trans(skb, dev);
+
+	/* to do, add skb to sock. may be there is no need to do for this
+	 * refcount_add(ts, &xs->sk.sk_wmem_alloc);
+	 */
+	return skb;
+}
+
static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
struct xdp_frame *frame,
struct veth_xdp_tx_bq *bq,
@@ -1063,6 +1162,20 @@ static int veth_poll(struct napi_struct *napi, int budget)
return done;
}
+/*
+ * Return true when the @len bytes starting at @buffer do not run past the
+ * end of the page that @buffer lies in.
+ */
+static inline bool buffer_in_page(void *buffer, u32 len)
+{
+	/* bytes left from @buffer up to the end of its page */
+	unsigned long room = PAGE_SIZE - offset_in_page(buffer);
+
+	return room >= len;
+}
+
static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool, int budget)
{
struct veth_priv *priv, *peer_priv;
@@ -1073,6 +1186,9 @@ static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
struct veth_xdp_tx_bq bq;
struct xdp_desc desc;
void *xdpf;
+ struct sk_buff *skb = NULL;
+ bool zc = xsk_pool->umem->zc;
+ u32 xsk_headroom = xsk_pool->headroom;
int done = 0;
bq.count = 0;
@@ -1102,12 +1218,6 @@ static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
break;
}
- /*
- * Get a xmit addr
- * desc.addr is a offset, so we should to convert to real virtual address
- */
- addr = xsk_buff_raw_get_data(xsk_pool, desc.addr);
-
/* can not hold all data in a page */
truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) + desc.len + sizeof(struct xdp_frame);
if (truesize > PAGE_SIZE) {
@@ -1116,16 +1226,39 @@ static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
continue;
}
- page = dev_alloc_page();
- if (!page) {
- /*
- * error , release xdp frame and increase drops
- */
- xsk_tx_completed_addr(xsk_pool, desc.addr);
- stats.xdp_drops++;
- break;
+ /*
+ * Get a xmit addr
+ * desc.addr is a offset, so we should to convert to real virtual address
+ */
+ addr = xsk_buff_raw_get_data(xsk_pool, desc.addr);
+
+ /*
+ * in order to support zero copy, headroom must have enough space to hold xdp_frame
+ */
+ if (zc && (xsk_headroom < sizeof(struct xdp_frame)))
+ zc = false;
+
+ /*
+ * if desc not contain in a page, also do not support zero copy
+ */
+ if (!buffer_in_page(addr, desc.len))
+ zc = false;
+
+ if (zc) {
+ /* headroom is reserved for xdp_frame */
+ new_addr = addr - sizeof(struct xdp_frame);
+ } else {
+ page = dev_alloc_page();
+ if (!page) {
+ /*
+ * error , release xdp frame and increase drops
+ */
+ xsk_tx_completed_addr(xsk_pool, desc.addr);
+ stats.xdp_drops++;
+ break;
+ }
+ new_addr = page_to_virt(page);
}
- new_addr = page_to_virt(page);
p_frame = new_addr;
new_addr += sizeof(struct xdp_frame);
@@ -1137,19 +1270,37 @@ static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
*/
p_frame->headroom = 0;
p_frame->metasize = 0;
- p_frame->frame_sz = PAGE_SIZE;
p_frame->flags = 0;
- p_frame->mem.type = MEM_TYPE_PAGE_SHARED;
- memcpy(p_frame->data, addr, p_frame->len);
- xsk_tx_completed_addr(xsk_pool, desc.addr);
-
- /* if peer have xdp prog, if it has ,just send to peer */
- p_frame = veth_xdp_rcv_one(peer_rq, p_frame, &bq, &peer_stats);
- /* if no xdp with this queue, convert to skb to xmit*/
- if (p_frame) {
- xdpf = p_frame;
- veth_xdp_rcv_bulk_skb(peer_rq, &xdpf, 1, &bq, &peer_stats);
- p_frame = NULL;
+
+ if (zc) {
+ p_frame->frame_sz = xsk_pool->frame_len;
+ /* to do: if there is a xdp, how to recycle the tx desc */
+ p_frame->mem.type = MEM_TYPE_XSK_BUFF_POOL_TX;
+ /* no need to copy address for af+xdp */
+ p_frame = veth_xdp_rcv_one(peer_rq, p_frame, &bq, &peer_stats);
+ if (p_frame) {
+ skb = veth_build_skb_zerocopy(peer_dev, xsk_pool, &desc);
+ if (skb) {
+ napi_gro_receive(&peer_rq->xdp_napi, skb);
+ skb = NULL;
+ } else {
+ xsk_tx_completed_addr(xsk_pool, desc.addr);
+ }
+ }
+ } else {
+ p_frame->frame_sz = PAGE_SIZE;
+ p_frame->mem.type = MEM_TYPE_PAGE_SHARED;
+ memcpy(p_frame->data, addr, p_frame->len);
+ xsk_tx_completed_addr(xsk_pool, desc.addr);
+
+ /* if peer have xdp prog, if it has ,just send to peer */
+ p_frame = veth_xdp_rcv_one(peer_rq, p_frame, &bq, &peer_stats);
+ /* if no xdp with this queue, convert to skb to xmit*/
+ if (p_frame) {
+ xdpf = p_frame;
+ veth_xdp_rcv_bulk_skb(peer_rq, &xdpf, 1, &bq, &peer_stats);
+ p_frame = NULL;
+ }
}
stats.xdp_bytes += desc.len;
@@ -1163,8 +1314,6 @@ static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
xsk_tx_release(xsk_pool);
}
-
-
/* just for peer rq */
if (peer_stats.xdp_tx > 0)
veth_xdp_flush(peer_rq, &bq);