[net-next,v6,07/18] tcp: Support MSG_SPLICE_PAGES
Commit Message
Make TCP's sendmsg() support MSG_SPLICE_PAGES. This causes pages to be
spliced or copied (if it cannot be spliced) from the source iterator.
This allows ->sendpage() to be replaced by something that can handle
multiple multipage folios in a single transaction.
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Eric Dumazet <edumazet@google.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: David Ahern <dsahern@kernel.org>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
cc: netdev@vger.kernel.org
---
Notes:
ver #6)
- Use common helper.
net/ipv4/tcp.c | 43 ++++++++++++++++++++++++++++++++++++-------
1 file changed, 36 insertions(+), 7 deletions(-)
Comments
On Tue, Apr 11, 2023 at 6:09 PM David Howells <dhowells@redhat.com> wrote:
>
> Make TCP's sendmsg() support MSG_SPLICE_PAGES. This causes pages to be
> spliced or copied (if it cannot be spliced) from the source iterator.
>
> This allows ->sendpage() to be replaced by something that can handle
> multiple multipage folios in a single transaction.
>
> Signed-off-by: David Howells <dhowells@redhat.com>
> cc: Eric Dumazet <edumazet@google.com>
> cc: "David S. Miller" <davem@davemloft.net>
> cc: David Ahern <dsahern@kernel.org>
> cc: Jakub Kicinski <kuba@kernel.org>
> cc: Paolo Abeni <pabeni@redhat.com>
> cc: Jens Axboe <axboe@kernel.dk>
> cc: Matthew Wilcox <willy@infradead.org>
> cc: netdev@vger.kernel.org
> ---
>
> Notes:
> ver #6)
> - Use common helper.
>
> net/ipv4/tcp.c | 43 ++++++++++++++++++++++++++++++++++++-------
> 1 file changed, 36 insertions(+), 7 deletions(-)
>
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index fd68d49490f2..0b2213da5aaf 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -1221,7 +1221,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
> int flags, err, copied = 0;
> int mss_now = 0, size_goal, copied_syn = 0;
> int process_backlog = 0;
> - bool zc = false;
> + int zc = 0;
> long timeo;
>
> flags = msg->msg_flags;
> @@ -1232,17 +1232,22 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
> if (msg->msg_ubuf) {
> uarg = msg->msg_ubuf;
> net_zcopy_get(uarg);
> - zc = sk->sk_route_caps & NETIF_F_SG;
> + if (sk->sk_route_caps & NETIF_F_SG)
> + zc = 1;
zc is set to 0, 1, MSG_ZEROCOPY , MSG_SPLICE_PAGES
I find this a bit confusing. Maybe use a private enum ?
> } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
> uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
> if (!uarg) {
> err = -ENOBUFS;
> goto out_err;
> }
> - zc = sk->sk_route_caps & NETIF_F_SG;
> - if (!zc)
> + if (sk->sk_route_caps & NETIF_F_SG)
> + zc = MSG_ZEROCOPY;
> + else
> uarg_to_msgzc(uarg)->zerocopy = 0;
> }
> + } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) {
> + if (sk->sk_route_caps & NETIF_F_SG)
> + zc = MSG_SPLICE_PAGES;
Eric Dumazet <edumazet@google.com> wrote:
> > - zc = sk->sk_route_caps & NETIF_F_SG;
> > + if (sk->sk_route_caps & NETIF_F_SG)
> > + zc = 1;
>
> zc is set to 0, 1, MSG_ZEROCOPY , MSG_SPLICE_PAGES
>
> I find this a bit confusing. Maybe use a private enum ?
Meh. That should be "zc = MSG_ZEROCOPY;". Will fix.
David
@@ -1221,7 +1221,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
int flags, err, copied = 0;
int mss_now = 0, size_goal, copied_syn = 0;
int process_backlog = 0;
- bool zc = false;
+ int zc = 0;
long timeo;
flags = msg->msg_flags;
@@ -1232,17 +1232,22 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
if (msg->msg_ubuf) {
uarg = msg->msg_ubuf;
net_zcopy_get(uarg);
- zc = sk->sk_route_caps & NETIF_F_SG;
+ if (sk->sk_route_caps & NETIF_F_SG)
+ zc = 1;
} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
if (!uarg) {
err = -ENOBUFS;
goto out_err;
}
- zc = sk->sk_route_caps & NETIF_F_SG;
- if (!zc)
+ if (sk->sk_route_caps & NETIF_F_SG)
+ zc = MSG_ZEROCOPY;
+ else
uarg_to_msgzc(uarg)->zerocopy = 0;
}
+ } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) {
+ if (sk->sk_route_caps & NETIF_F_SG)
+ zc = MSG_SPLICE_PAGES;
}
if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
@@ -1305,7 +1310,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
goto do_error;
while (msg_data_left(msg)) {
- int copy = 0;
+ ssize_t copy = 0;
skb = tcp_write_queue_tail(sk);
if (skb)
@@ -1346,7 +1351,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
if (copy > msg_data_left(msg))
copy = msg_data_left(msg);
- if (!zc) {
+ if (zc == 0) {
bool merge = true;
int i = skb_shinfo(skb)->nr_frags;
struct page_frag *pfrag = sk_page_frag(sk);
@@ -1391,7 +1396,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
page_ref_inc(pfrag->page);
}
pfrag->offset += copy;
- } else {
+ } else if (zc == MSG_ZEROCOPY) {
/* First append to a fragless skb builds initial
* pure zerocopy skb
*/
@@ -1412,6 +1417,30 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
if (err < 0)
goto do_error;
copy = err;
+ } else if (zc == MSG_SPLICE_PAGES) {
+ /* Splice in data if we can; copy if we can't. */
+ if (tcp_downgrade_zcopy_pure(sk, skb))
+ goto wait_for_space;
+ copy = tcp_wmem_schedule(sk, copy);
+ if (!copy)
+ goto wait_for_space;
+
+ err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
+ sk->sk_allocation);
+ if (err < 0) {
+ if (err == -EMSGSIZE) {
+ tcp_mark_push(tp, skb);
+ goto new_segment;
+ }
+ goto do_error;
+ }
+ copy = err;
+
+ if (!(flags & MSG_NO_SHARED_FRAGS))
+ skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
+
+ sk_wmem_queued_add(sk, copy);
+ sk_mem_charge(sk, copy);
}
if (!copied)