[net-next,v1,13/16] tcp: RX path for devmem TCP
Commit Message
In tcp_recvmsg_locked(), detect if the skb being received by the user
is a devmem skb. In this case - if the user provided the MSG_SOCK_DEVMEM
flag - pass it to tcp_recvmsg_dmabuf() for custom handling.

tcp_recvmsg_dmabuf() copies any data in the skb header to the linear
buffer, and returns a cmsg to the user indicating the number of bytes
returned in the linear buffer.

tcp_recvmsg_dmabuf() then loops over the inaccessible devmem skb frags,
and returns to the user a dmabuf_cmsg indicating the location of the
data in the dmabuf device memory. dmabuf_cmsg contains this information:
1. 'frag_offset': the offset into the dmabuf where the payload starts.
2. 'frag_size': the size of the frag.
3. 'frag_token': an opaque token to return to the kernel when the buffer
is to be released.
4. 'dmabuf_id': the id of the dmabuf the frag belongs to.

The frags awaiting freeing are stored in the newly added
sk->sk_user_pages xarray, and each frag passed to userspace has its
reference taken with __skb_frag_ref(). This reference is dropped once
userspace indicates that it is done reading the frag. All outstanding
frags are released when the socket is destroyed.
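
For illustration only (not part of this patch): a minimal userspace
sketch of the receive flow this enables, assuming the uapi additions
below (SCM_DEVMEM_LINEAR/SCM_DEVMEM_DMABUF and struct dmabuf_cmsg) are
visible through updated kernel headers. MSG_SOCK_DEVMEM is defined in
kernel-internal socket.h, so the sketch mirrors its value locally. The
token-release step refers to the SO_DEVMEM_DONTNEED mechanism added
later in this series.

#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/uio.h>          /* struct dmabuf_cmsg (added by this patch) */

#ifndef MSG_SOCK_DEVMEM
#define MSG_SOCK_DEVMEM 0x2000000       /* value from this patch */
#endif

static void recv_devmem_once(int fd)
{
        char iobuf[4096];
        char ctrl[CMSG_SPACE(sizeof(struct dmabuf_cmsg)) * 16];
        struct iovec iov = { .iov_base = iobuf, .iov_len = sizeof(iobuf) };
        struct msghdr msg = {
                .msg_iov = &iov, .msg_iovlen = 1,
                .msg_control = ctrl, .msg_controllen = sizeof(ctrl),
        };
        struct cmsghdr *cm;
        ssize_t n = recvmsg(fd, &msg, MSG_SOCK_DEVMEM);

        if (n <= 0)
                return;

        for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
                struct dmabuf_cmsg *dc = (struct dmabuf_cmsg *)CMSG_DATA(cm);

                if (cm->cmsg_level != SOL_SOCKET)
                        continue;
                if (cm->cmsg_type == SCM_DEVMEM_LINEAR) {
                        /* dc->frag_size bytes (typically protocol headers)
                         * were copied into iobuf. */
                } else if (cm->cmsg_type == SCM_DEVMEM_DMABUF) {
                        /* Payload sits in the bound dmabuf dc->dmabuf_id at
                         * byte dc->frag_offset, length dc->frag_size; hand
                         * it to the device, then return dc->frag_token to
                         * the kernel to release the frag. */
                }
        }
}

Note that a single recvmsg() call returns data from either linear or
devmem skbs, never both: tcp_recvmsg_locked() below breaks out of its
loop when skb->dmabuf changes, so readers must tolerate short reads at
such boundaries.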
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Kaiyuan Zhang <kaiyuanz@google.com>
Signed-off-by: Mina Almasry <almasrymina@google.com>
---
Changes in v1:
- Added dmabuf_id to dmabuf_cmsg (David/Stan).
- Devmem -> dmabuf (David).
- Change tcp_recvmsg_dmabuf() check to skb->dmabuf (Paolo).
- Use __skb_frag_ref() & napi_pp_put_page() for refcounting (Yunsheng).
RFC v3:
- Fixed issue with put_cmsg() failing silently.
---
include/linux/socket.h | 1 +
include/net/page_pool/helpers.h | 9 ++
include/net/sock.h | 2 +
include/uapi/asm-generic/socket.h | 5 +
include/uapi/linux/uio.h | 10 ++
net/ipv4/tcp.c | 190 +++++++++++++++++++++++++++++-
net/ipv4/tcp_ipv4.c | 8 ++
7 files changed, 220 insertions(+), 5 deletions(-)
Comments
Hi Mina,
kernel test robot noticed the following build errors:
[auto build test ERROR on net-next/main]
url: https://github.com/intel-lab-lkp/linux/commits/Mina-Almasry/net-page_pool-factor-out-releasing-DMA-from-releasing-the-page/20231208-085531
base: net-next/main
patch link: https://lore.kernel.org/r/20231208005250.2910004-14-almasrymina%40google.com
patch subject: [net-next v1 13/16] tcp: RX path for devmem TCP
config: alpha-defconfig (https://download.01.org/0day-ci/archive/20231208/202312082353.lFKTtexo-lkp@intel.com/config)
compiler: alpha-linux-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231208/202312082353.lFKTtexo-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags:
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202312082353.lFKTtexo-lkp@intel.com/
All errors (new ones prefixed by >>):
net/ipv4/tcp.c: In function 'tcp_recvmsg_dmabuf':
>> net/ipv4/tcp.c:2348:57: error: 'SO_DEVMEM_LINEAR' undeclared (first use in this function)
2348 | err = put_cmsg(msg, SOL_SOCKET, SO_DEVMEM_LINEAR,
| ^~~~~~~~~~~~~~~~
net/ipv4/tcp.c:2348:57: note: each undeclared identifier is reported only once for each function it appears in
>> net/ipv4/tcp.c:2411:48: error: 'SO_DEVMEM_DMABUF' undeclared (first use in this function)
2411 | SO_DEVMEM_DMABUF,
| ^~~~~~~~~~~~~~~~
vim +/SO_DEVMEM_LINEAR +2348 net/ipv4/tcp.c
2306
2307 /* On error, returns the -errno. On success, returns number of bytes sent to the
2308 * user. May not consume all of @remaining_len.
2309 */
2310 static int tcp_recvmsg_dmabuf(const struct sock *sk, const struct sk_buff *skb,
2311 unsigned int offset, struct msghdr *msg,
2312 int remaining_len)
2313 {
2314 struct dmabuf_cmsg dmabuf_cmsg = { 0 };
2315 unsigned int start;
2316 int i, copy, n;
2317 int sent = 0;
2318 int err = 0;
2319
2320 do {
2321 start = skb_headlen(skb);
2322
2323 if (!skb->dmabuf) {
2324 err = -ENODEV;
2325 goto out;
2326 }
2327
2328 /* Copy header. */
2329 copy = start - offset;
2330 if (copy > 0) {
2331 copy = min(copy, remaining_len);
2332
2333 n = copy_to_iter(skb->data + offset, copy,
2334 &msg->msg_iter);
2335 if (n != copy) {
2336 err = -EFAULT;
2337 goto out;
2338 }
2339
2340 offset += copy;
2341 remaining_len -= copy;
2342
2343 /* First a dmabuf_cmsg for # bytes copied to user
2344 * buffer.
2345 */
2346 memset(&dmabuf_cmsg, 0, sizeof(dmabuf_cmsg));
2347 dmabuf_cmsg.frag_size = copy;
> 2348 err = put_cmsg(msg, SOL_SOCKET, SO_DEVMEM_LINEAR,
2349 sizeof(dmabuf_cmsg), &dmabuf_cmsg);
2350 if (err || msg->msg_flags & MSG_CTRUNC) {
2351 msg->msg_flags &= ~MSG_CTRUNC;
2352 if (!err)
2353 err = -ETOOSMALL;
2354 goto out;
2355 }
2356
2357 sent += copy;
2358
2359 if (remaining_len == 0)
2360 goto out;
2361 }
2362
2363 /* after that, send information of dmabuf pages through a
2364 * sequence of cmsg
2365 */
2366 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2367 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2368 struct page_pool_iov *ppiov;
2369 u64 frag_offset;
2370 u32 user_token;
2371 int end;
2372
2373 /* skb->dmabuf should indicate that ALL the frags in
2374 * this skb are dmabuf page_pool_iovs. We're checking
2375 * for that flag above, but also check individual frags
2376 * here. If the tcp stack is not setting skb->dmabuf
2377 * correctly, we still don't want to crash here when
2378 * accessing pgmap or priv below.
2379 */
2380 if (!skb_frag_page_pool_iov(frag)) {
2381 net_err_ratelimited("Found non-dmabuf skb with page_pool_iov");
2382 err = -ENODEV;
2383 goto out;
2384 }
2385
2386 ppiov = skb_frag_page_pool_iov(frag);
2387 end = start + skb_frag_size(frag);
2388 copy = end - offset;
2389
2390 if (copy > 0) {
2391 copy = min(copy, remaining_len);
2392
2393 frag_offset = page_pool_iov_virtual_addr(ppiov) +
2394 skb_frag_off(frag) + offset -
2395 start;
2396 dmabuf_cmsg.frag_offset = frag_offset;
2397 dmabuf_cmsg.frag_size = copy;
2398 err = xa_alloc((struct xarray *)&sk->sk_user_pages,
2399 &user_token, frag->bv_page,
2400 xa_limit_31b, GFP_KERNEL);
2401 if (err)
2402 goto out;
2403
2404 dmabuf_cmsg.frag_token = user_token;
2405 dmabuf_cmsg.dmabuf_id = page_pool_iov_binding_id(ppiov);
2406
2407 offset += copy;
2408 remaining_len -= copy;
2409
2410 err = put_cmsg(msg, SOL_SOCKET,
> 2411 SO_DEVMEM_DMABUF,
2412 sizeof(dmabuf_cmsg),
2413 &dmabuf_cmsg);
2414 if (err || msg->msg_flags & MSG_CTRUNC) {
2415 msg->msg_flags &= ~MSG_CTRUNC;
2416 xa_erase((struct xarray *)&sk->sk_user_pages,
2417 user_token);
2418 if (!err)
2419 err = -ETOOSMALL;
2420 goto out;
2421 }
2422
2423 __skb_frag_ref(frag);
2424
2425 sent += copy;
2426
2427 if (remaining_len == 0)
2428 goto out;
2429 }
2430 start = end;
2431 }
2432
2433 if (!remaining_len)
2434 goto out;
2435
2436 /* if remaining_len is not satisfied yet, we need to go to the
2437 * next frag in the frag_list to satisfy remaining_len.
2438 */
2439 skb = skb_shinfo(skb)->frag_list ?: skb->next;
2440
2441 offset = offset - start;
2442 } while (skb);
2443
2444 if (remaining_len) {
2445 err = -EFAULT;
2446 goto out;
2447 }
2448
2449 out:
2450 if (!sent)
2451 sent = err;
2452
2453 return sent;
2454 }
2455
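The undeclared-identifier errors are plausibly an arch-header gap rather
than a bug in the new code: alpha (like mips, parisc and sparc) carries
its own uapi asm/socket.h and does not inherit new defines from
asm-generic/socket.h, so the SO_DEVMEM_* values added there are
invisible on an alpha build. A hedged sketch of the likely fix
(mirroring the asm-generic values is an assumption):

/* arch/alpha/include/uapi/asm/socket.h (sketch, values assumed to
 * mirror the asm-generic additions in this patch) */
#define SO_DEVMEM_LINEAR        98
#define SCM_DEVMEM_LINEAR       SO_DEVMEM_LINEAR
#define SO_DEVMEM_DMABUF        99
#define SCM_DEVMEM_DMABUF       SO_DEVMEM_DMABUF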
On 12/7/23 5:52 PM, Mina Almasry wrote:
> [snip commit message]
>
What happens if a retransmitted packet is received or an rx window is
closed and a probe is received where the kernel drops the skb - is the
iov reference(s) in the skb returned to the pool by the stack and ready
for use again?
On Fri, Dec 8, 2023 at 9:55 AM David Ahern <dsahern@kernel.org> wrote:
>
> On 12/7/23 5:52 PM, Mina Almasry wrote:
> > [snip commit message]
>
> What happens if a retransmitted packet is received or an rx window is
> closed and a probe is received where the kernel drops the skb - is the
> iov reference(s) in the skb returned to the pool by the stack and ready
> for use again?
When an skb is dropped, skb_frag_unref() is called on the frags, which
calls napi_pp_put_page(), drops the references, and the iov is
recycled, yes.
diff --git a/include/linux/socket.h b/include/linux/socket.h
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -326,6 +326,7 @@ struct ucred {
* plain text and require encryption
*/
+#define MSG_SOCK_DEVMEM 0x2000000 /* Receive devmem skbs as cmsg */
#define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */
#define MSG_SPLICE_PAGES 0x8000000 /* Splice the pages from the iterator in sendmsg() */
#define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */
diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -108,6 +108,15 @@ page_pool_iov_dma_addr(const struct page_pool_iov *ppiov)
((dma_addr_t)page_pool_iov_idx(ppiov) << PAGE_SHIFT);
}
+static inline unsigned long
+page_pool_iov_virtual_addr(const struct page_pool_iov *ppiov)
+{
+ struct dmabuf_genpool_chunk_owner *owner = page_pool_iov_owner(ppiov);
+
+ return owner->base_virtual +
+ ((unsigned long)page_pool_iov_idx(ppiov) << PAGE_SHIFT);
+}
+
static inline struct netdev_dmabuf_binding *
page_pool_iov_binding(const struct page_pool_iov *ppiov)
{
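
To make the offset arithmetic concrete, a worked example with
illustrative numbers (4 KiB pages assumed):

/*
 * A chunk owner whose pages start at base_virtual == 0 holds the ppiov
 * at index 3, so page_pool_iov_virtual_addr() returns 3 << 12 == 12288.
 * If the frag begins 256 bytes into that page (skb_frag_off() == 256)
 * and the reader has already consumed 100 bytes of it (offset - start
 * == 100), tcp_recvmsg_dmabuf() reports frag_offset == 12288 + 256 +
 * 100 == 12644, i.e. 12644 bytes into the dmabuf.
 */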
diff --git a/include/net/sock.h b/include/net/sock.h
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -353,6 +353,7 @@ struct sk_filter;
* @sk_txtime_unused: unused txtime flags
* @ns_tracker: tracker for netns reference
* @sk_bind2_node: bind node in the bhash2 table
+ * @sk_user_pages: xarray of pages the user is holding a reference on.
*/
struct sock {
/*
@@ -545,6 +546,7 @@ struct sock {
struct rcu_head sk_rcu;
netns_tracker ns_tracker;
struct hlist_node sk_bind2_node;
+ struct xarray sk_user_pages;
};
enum sk_pacing {
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -135,6 +135,11 @@
#define SO_PASSPIDFD 76
#define SO_PEERPIDFD 77
+#define SO_DEVMEM_LINEAR 98
+#define SCM_DEVMEM_LINEAR SO_DEVMEM_LINEAR
+#define SO_DEVMEM_DMABUF 99
+#define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF
+
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/include/uapi/linux/uio.h b/include/uapi/linux/uio.h
--- a/include/uapi/linux/uio.h
+++ b/include/uapi/linux/uio.h
@@ -20,6 +20,16 @@ struct iovec
__kernel_size_t iov_len; /* Must be size_t (1003.1g) */
};
+struct dmabuf_cmsg {
+ __u64 frag_offset; /* offset into the dmabuf where the frag starts.
+ */
+ __u32 frag_size; /* size of the frag. */
+ __u32 frag_token; /* token representing this frag for
+ * DEVMEM_DONTNEED.
+ */
+ __u32 dmabuf_id; /* dmabuf id this frag belongs to. */
+};
+
/*
* UIO_MAXIOV shall be at least 16 1003.1g (5.4.1.1)
*/
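
The frag_token field implies a release path; in this series that is the
SO_DEVMEM_DONTNEED setsockopt added by a later patch. A hedged
userspace sketch follows (the struct dmabuf_token layout is an
assumption based on that later patch, not defined here):

struct dmabuf_token {
        __u32 token_start;      /* first frag_token to release */
        __u32 token_count;      /* number of consecutive tokens */
};

static int devmem_release_one(int fd, __u32 token)
{
        struct dmabuf_token tok = { .token_start = token, .token_count = 1 };

        return setsockopt(fd, SOL_SOCKET, SO_DEVMEM_DONTNEED,
                          &tok, sizeof(tok));
}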
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -461,6 +461,7 @@ void tcp_init_sock(struct sock *sk)
set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
sk_sockets_allocated_inc(sk);
+ xa_init_flags(&sk->sk_user_pages, XA_FLAGS_ALLOC1);
}
EXPORT_SYMBOL(tcp_init_sock);
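
A hedged note on the xarray flags, consistent with the xarray API:
XA_FLAGS_ALLOC1 makes xa_alloc() hand out indices starting at 1, so a
frag_token of 0 is never produced and userspace can treat 0 as "no
token". A standalone sketch (not from the patch):

static u32 token_demo(struct xarray *xa, struct page *page)
{
        u32 id;

        /* With XA_FLAGS_ALLOC1 the first successful xa_alloc() yields
         * id == 1; xa_limit_31b keeps ids within 31 bits, so they fit
         * the u32 frag_token. */
        if (xa_alloc(xa, &id, page, xa_limit_31b, GFP_KERNEL))
                return 0;       /* failure; 0 doubles as "no token" */
        return id;
}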
@@ -2303,6 +2304,155 @@ static int tcp_inq_hint(struct sock *sk)
return inq;
}
+/* On error, returns the -errno. On success, returns number of bytes sent to the
+ * user. May not consume all of @remaining_len.
+ */
+static int tcp_recvmsg_dmabuf(const struct sock *sk, const struct sk_buff *skb,
+ unsigned int offset, struct msghdr *msg,
+ int remaining_len)
+{
+ struct dmabuf_cmsg dmabuf_cmsg = { 0 };
+ unsigned int start;
+ int i, copy, n;
+ int sent = 0;
+ int err = 0;
+
+ do {
+ start = skb_headlen(skb);
+
+ if (!skb->dmabuf) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ /* Copy header. */
+ copy = start - offset;
+ if (copy > 0) {
+ copy = min(copy, remaining_len);
+
+ n = copy_to_iter(skb->data + offset, copy,
+ &msg->msg_iter);
+ if (n != copy) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ offset += copy;
+ remaining_len -= copy;
+
+ /* First a dmabuf_cmsg for # bytes copied to user
+ * buffer.
+ */
+ memset(&dmabuf_cmsg, 0, sizeof(dmabuf_cmsg));
+ dmabuf_cmsg.frag_size = copy;
+ err = put_cmsg(msg, SOL_SOCKET, SO_DEVMEM_LINEAR,
+ sizeof(dmabuf_cmsg), &dmabuf_cmsg);
+ if (err || msg->msg_flags & MSG_CTRUNC) {
+ msg->msg_flags &= ~MSG_CTRUNC;
+ if (!err)
+ err = -ETOOSMALL;
+ goto out;
+ }
+
+ sent += copy;
+
+ if (remaining_len == 0)
+ goto out;
+ }
+
+ /* after that, send information of dmabuf pages through a
+ * sequence of cmsg
+ */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ struct page_pool_iov *ppiov;
+ u64 frag_offset;
+ u32 user_token;
+ int end;
+
+ /* skb->dmabuf should indicate that ALL the frags in
+ * this skb are dmabuf page_pool_iovs. We're checking
+ * for that flag above, but also check individual frags
+ * here. If the tcp stack is not setting skb->dmabuf
+ * correctly, we still don't want to crash here when
+ * accessing pgmap or priv below.
+ */
+ if (!skb_frag_page_pool_iov(frag)) {
+ net_err_ratelimited("Found non-dmabuf skb with page_pool_iov");
+ err = -ENODEV;
+ goto out;
+ }
+
+ ppiov = skb_frag_page_pool_iov(frag);
+ end = start + skb_frag_size(frag);
+ copy = end - offset;
+
+ if (copy > 0) {
+ copy = min(copy, remaining_len);
+
+ frag_offset = page_pool_iov_virtual_addr(ppiov) +
+ skb_frag_off(frag) + offset -
+ start;
+ dmabuf_cmsg.frag_offset = frag_offset;
+ dmabuf_cmsg.frag_size = copy;
+ err = xa_alloc((struct xarray *)&sk->sk_user_pages,
+ &user_token, frag->bv_page,
+ xa_limit_31b, GFP_KERNEL);
+ if (err)
+ goto out;
+
+ dmabuf_cmsg.frag_token = user_token;
+ dmabuf_cmsg.dmabuf_id = page_pool_iov_binding_id(ppiov);
+
+ offset += copy;
+ remaining_len -= copy;
+
+ err = put_cmsg(msg, SOL_SOCKET,
+ SO_DEVMEM_DMABUF,
+ sizeof(dmabuf_cmsg),
+ &dmabuf_cmsg);
+ if (err || msg->msg_flags & MSG_CTRUNC) {
+ msg->msg_flags &= ~MSG_CTRUNC;
+ xa_erase((struct xarray *)&sk->sk_user_pages,
+ user_token);
+ if (!err)
+ err = -ETOOSMALL;
+ goto out;
+ }
+
+ __skb_frag_ref(frag);
+
+ sent += copy;
+
+ if (remaining_len == 0)
+ goto out;
+ }
+ start = end;
+ }
+
+ if (!remaining_len)
+ goto out;
+
+ /* if remaining_len is not satisfied yet, we need to go to the
+ * next frag in the frag_list to satisfy remaining_len.
+ */
+ skb = skb_shinfo(skb)->frag_list ?: skb->next;
+
+ offset = offset - start;
+ } while (skb);
+
+ if (remaining_len) {
+ err = -EFAULT;
+ goto out;
+ }
+
+out:
+ if (!sent)
+ sent = err;
+
+ return sent;
+}
+
/*
* This routine copies from a sock struct into the user buffer.
*
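
Putting the pieces together, a worked example of the cmsg stream
tcp_recvmsg_dmabuf() emits (illustrative: dmabuf offsets of consecutive
frags need not be contiguous, and the token values assume a fresh
socket):

/*
 * An skb with 100 linear header bytes plus two full-page dmabuf frags,
 * read with ample iov and control space, produces:
 *
 *   cmsg 1: SO_DEVMEM_LINEAR { .frag_size = 100 }
 *           (those bytes were copy_to_iter()'d into the user buffer)
 *   cmsg 2: SO_DEVMEM_DMABUF { .frag_offset = off0, .frag_size = 4096,
 *                              .frag_token = 1, .dmabuf_id = id }
 *   cmsg 3: SO_DEVMEM_DMABUF { .frag_offset = off1, .frag_size = 4096,
 *                              .frag_token = 2, .dmabuf_id = id }
 *
 * The call returns 100 + 4096 + 4096 = 8292, and tokens 1 and 2 stay
 * pinned in sk->sk_user_pages (each frag __skb_frag_ref()'d) until
 * userspace releases them.
 */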
@@ -2316,6 +2466,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
int *cmsg_flags)
{
struct tcp_sock *tp = tcp_sk(sk);
+ int last_copied_dmabuf = -1; /* uninitialized */
int copied = 0;
u32 peek_seq;
u32 *seq;
@@ -2493,15 +2644,44 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
}
if (!(flags & MSG_TRUNC)) {
- err = skb_copy_datagram_msg(skb, offset, msg, used);
- if (err) {
- /* Exception. Bailout! */
- if (!copied)
- copied = -EFAULT;
+ if (last_copied_dmabuf != -1 &&
+ last_copied_dmabuf != skb->dmabuf)
break;
+
+ if (!skb->dmabuf) {
+ err = skb_copy_datagram_msg(skb, offset, msg,
+ used);
+ if (err) {
+ /* Exception. Bailout! */
+ if (!copied)
+ copied = -EFAULT;
+ break;
+ }
+ } else {
+ if (!(flags & MSG_SOCK_DEVMEM)) {
+ /* skb->dmabuf skbs can only be received
+ * with the MSG_SOCK_DEVMEM flag.
+ */
+ if (!copied)
+ copied = -EFAULT;
+
+ break;
+ }
+
+ err = tcp_recvmsg_dmabuf(sk, skb, offset, msg,
+ used);
+ if (err <= 0) {
+ if (!copied)
+ copied = -EFAULT;
+
+ break;
+ }
+ used = err;
}
}
+ last_copied_dmabuf = skb->dmabuf;
+
WRITE_ONCE(*seq, *seq + used);
copied += used;
len -= used;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2501,6 +2501,14 @@ static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
void tcp_v4_destroy_sock(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct page *page;
+ unsigned long index;
+
+ xa_for_each(&sk->sk_user_pages, index, page)
+ if (WARN_ON_ONCE(!napi_pp_put_page(page, false)))
+ page_pool_page_put_many(page, 1);
+
+ xa_destroy(&sk->sk_user_pages);
trace_tcp_destroy_sock(sk);