[net-next,v7,03/16] net: Add a function to splice pages into an skbuff for MSG_SPLICE_PAGES

Message ID 20230515093345.396978-4-dhowells@redhat.com
State New
Headers
Series splice, net: Replace sendpage with sendmsg(MSG_SPLICE_PAGES), part 1 |

Commit Message

David Howells May 15, 2023, 9:33 a.m. UTC
  Add a function to handle MSG_SPLICE_PAGES being passed internally to
sendmsg().  Pages are spliced into the given socket buffer if possible and
copied in if not (e.g. they're slab pages or have a zero refcount).

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Eric Dumazet <edumazet@google.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: David Ahern <dsahern@kernel.org>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Al Viro <viro@zeniv.linux.org.uk>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
cc: netdev@vger.kernel.org
---

Notes:
    ver #7)
     - Export function.
     - Never copy data, return -EIO if sendpage_ok() returns false.

 include/linux/skbuff.h |  3 ++
 net/core/skbuff.c      | 95 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 98 insertions(+)
  

Comments

Paolo Abeni May 18, 2023, 8:28 a.m. UTC | #1
On Mon, 2023-05-15 at 10:33 +0100, David Howells wrote:
> Add a function to handle MSG_SPLICE_PAGES being passed internally to
> sendmsg().  Pages are spliced into the given socket buffer if possible and
> copied in if not (e.g. they're slab pages or have a zero refcount).
> 
> Signed-off-by: David Howells <dhowells@redhat.com>
> cc: Eric Dumazet <edumazet@google.com>
> cc: "David S. Miller" <davem@davemloft.net>
> cc: David Ahern <dsahern@kernel.org>
> cc: Jakub Kicinski <kuba@kernel.org>
> cc: Paolo Abeni <pabeni@redhat.com>
> cc: Al Viro <viro@zeniv.linux.org.uk>
> cc: Jens Axboe <axboe@kernel.dk>
> cc: Matthew Wilcox <willy@infradead.org>
> cc: netdev@vger.kernel.org
> ---
> 
> Notes:
>     ver #7)
>      - Export function.
>      - Never copy data, return -EIO if sendpage_ok() returns false.
> 
>  include/linux/skbuff.h |  3 ++
>  net/core/skbuff.c      | 95 ++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 98 insertions(+)
> 
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 4c0ad48e38ca..1c5f0ac6f8c3 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -5097,5 +5097,8 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb)
>  #endif
>  }
>  
> +ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
> +			     ssize_t maxsize, gfp_t gfp);
> +
>  #endif	/* __KERNEL__ */
>  #endif	/* _LINUX_SKBUFF_H */
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 7f53dcb26ad3..56d629ea2f3d 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -6892,3 +6892,98 @@ nodefer:	__kfree_skb(skb);
>  	if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
>  		smp_call_function_single_async(cpu, &sd->defer_csd);
>  }
> +
> +static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
> +				 size_t offset, size_t len)
> +{
> +	const char *kaddr;
> +	__wsum csum;
> +
> +	kaddr = kmap_local_page(page);
> +	csum = csum_partial(kaddr + offset, len, 0);
> +	kunmap_local(kaddr);
> +	skb->csum = csum_block_add(skb->csum, csum, skb->len);
> +}
> +
> +/**
> + * skb_splice_from_iter - Splice (or copy) pages to skbuff
> + * @skb: The buffer to add pages to
> + * @iter: Iterator representing the pages to be added
> + * @maxsize: Maximum amount of pages to be added
> + * @gfp: Allocation flags
> + *
> + * This is a common helper function for supporting MSG_SPLICE_PAGES.  It
> + * extracts pages from an iterator and adds them to the socket buffer if
> + * possible, copying them to fragments if not possible (such as if they're slab
> + * pages).
> + *
> + * Returns the amount of data spliced/copied or -EMSGSIZE if there's
> + * insufficient space in the buffer to transfer anything.
> + */
> +ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
> +			     ssize_t maxsize, gfp_t gfp)
> +{
> +	struct page *pages[8], **ppages = pages;
> +	unsigned int i;
> +	ssize_t spliced = 0, ret = 0;
> +	size_t frag_limit = READ_ONCE(sysctl_max_skb_frags);

Minor nit: please respect the reverse x-mas tree order (there are a few
other occurrences around)

> +
> +	while (iter->count > 0) {
> +		ssize_t space, nr;
> +		size_t off, len;
> +
> +		ret = -EMSGSIZE;
> +		space = frag_limit - skb_shinfo(skb)->nr_frags;
> +		if (space < 0)
> +			break;
> +
> +		/* We might be able to coalesce without increasing nr_frags */
> +		nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages));
> +
> +		len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off);
> +		if (len <= 0) {
> +			ret = len ?: -EIO;
> +			break;
> +		}
> +
> +		if (space == 0 &&
> +		    !skb_can_coalesce(skb, skb_shinfo(skb)->nr_frags,
> +				      pages[0], off)) {
> +			iov_iter_revert(iter, len);
> +			break;
> +		}

It looks like the above condition/checks duplicate what the later
skb_append_pagefrags() will perform below. I guess the above chunk
could be removed?

> +
> +		i = 0;
> +		do {
> +			struct page *page = pages[i++];
> +			size_t part = min_t(size_t, PAGE_SIZE - off, len);
> +
> +			ret = -EIO;
> +			if (!sendpage_ok(page))
> +				goto out;

My (limited) understanding is that the current sendpage code assumes
that the caller provides/uses pages suitable for such use. The existing
sendpage_ok() check is in place as way to try to catch possible code
bug - via the WARN_ONCE().

I think the same could be done here?

Thanks!

Paolo

> +
> +			ret = skb_append_pagefrags(skb, page, off, part,
> +						   frag_limit);
> +			if (ret < 0) {
> +				iov_iter_revert(iter, len);
> +				goto out;
> +			}
> +
> +			if (skb->ip_summed == CHECKSUM_NONE)
> +				skb_splice_csum_page(skb, page, off, part);
> +
> +			off = 0;
> +			spliced += part;
> +			maxsize -= part;
> +			len -= part;
> +		} while (len > 0);
> +
> +		if (maxsize <= 0)
> +			break;
> +	}
> +
> +out:
> +	skb_len_add(skb, spliced);
> +	return spliced ?: ret;
> +}
> +EXPORT_SYMBOL(skb_splice_from_iter);
>
  
David Howells May 18, 2023, 9:53 a.m. UTC | #2
Paolo Abeni <pabeni@redhat.com> wrote:

> Minor nit: please respect the reverse x-mas tree order (there are a few
> other occurrences around)

I hadn't come across that.  Normally I only apply that to the types so that
the names aren't all over the place.  But whatever.

> > +		if (space == 0 &&
> > +		    !skb_can_coalesce(skb, skb_shinfo(skb)->nr_frags,
> > +				      pages[0], off)) {
> > +			iov_iter_revert(iter, len);
> > +			break;
> > +		}
> 
> It looks like the above condition/checks duplicate what the later
> skb_append_pagefrags() will perform below. I guess the above chunk
> could be removed?

Good point.  There used to be an allocation between in the case sendpage_ok()
failed and we wanted to copy the data.  I've removed that for the moment.

> > +			ret = -EIO;
> > +			if (!sendpage_ok(page))
> > +				goto out;
> 
> My (limited) understanding is that the current sendpage code assumes
> that the caller provides/uses pages suitable for such use. The existing
> sendpage_ok() check is in place as way to try to catch possible code
> bug - via the WARN_ONCE().
>
> I think the same could be done here?

Yeah.

Okay, I made the attached changes to this patch.

David
---
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 56d629ea2f3d..f4a5b51aed22 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -6923,10 +6923,10 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
 ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
 			     ssize_t maxsize, gfp_t gfp)
 {
+	size_t frag_limit = READ_ONCE(sysctl_max_skb_frags);
 	struct page *pages[8], **ppages = pages;
-	unsigned int i;
 	ssize_t spliced = 0, ret = 0;
-	size_t frag_limit = READ_ONCE(sysctl_max_skb_frags);
+	unsigned int i;
 
 	while (iter->count > 0) {
 		ssize_t space, nr;
@@ -6946,20 +6946,13 @@ ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
 			break;
 		}
 
-		if (space == 0 &&
-		    !skb_can_coalesce(skb, skb_shinfo(skb)->nr_frags,
-				      pages[0], off)) {
-			iov_iter_revert(iter, len);
-			break;
-		}
-
 		i = 0;
 		do {
 			struct page *page = pages[i++];
 			size_t part = min_t(size_t, PAGE_SIZE - off, len);
 
 			ret = -EIO;
-			if (!sendpage_ok(page))
+			if (WARN_ON_ONCE(!sendpage_ok(page)))
 				goto out;
 
 			ret = skb_append_pagefrags(skb, page, off, part,
  
Paolo Abeni May 18, 2023, 10:25 a.m. UTC | #3
On Thu, 2023-05-18 at 10:53 +0100, David Howells wrote:
> Paolo Abeni <pabeni@redhat.com> wrote:
> 
> > Minor nit: please respect the reverse x-mas tree order (there are a few
> > other occurrences around)
> 
> I hadn't come across that.  Normally I only apply that to the types so that
> the names aren't all over the place.  But whatever.
> 
> > > +		if (space == 0 &&
> > > +		    !skb_can_coalesce(skb, skb_shinfo(skb)->nr_frags,
> > > +				      pages[0], off)) {
> > > +			iov_iter_revert(iter, len);
> > > +			break;
> > > +		}
> > 
> > It looks like the above condition/checks duplicate what the later
> > skb_append_pagefrags() will perform below. I guess the above chunk
> > could be removed?
> 
> Good point.  There used to be an allocation between in the case sendpage_ok()
> failed and we wanted to copy the data.  I've removed that for the moment.
> 
> > > +			ret = -EIO;
> > > +			if (!sendpage_ok(page))
> > > +				goto out;
> > 
> > My (limited) understanding is that the current sendpage code assumes
> > that the caller provides/uses pages suitable for such use. The existing
> > sendpage_ok() check is in place as way to try to catch possible code
> > bug - via the WARN_ONCE().
> > 
> > I think the same could be done here?
> 
> Yeah.
> 
> Okay, I made the attached changes to this patch.
> 
> David
> ---
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 56d629ea2f3d..f4a5b51aed22 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -6923,10 +6923,10 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
>  ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
>  			     ssize_t maxsize, gfp_t gfp)
>  {
> +	size_t frag_limit = READ_ONCE(sysctl_max_skb_frags);
>  	struct page *pages[8], **ppages = pages;
> -	unsigned int i;
>  	ssize_t spliced = 0, ret = 0;
> -	size_t frag_limit = READ_ONCE(sysctl_max_skb_frags);
> +	unsigned int i;
>  
>  	while (iter->count > 0) {
>  		ssize_t space, nr;
> @@ -6946,20 +6946,13 @@ ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
>  			break;
>  		}
>  
> -		if (space == 0 &&
> -		    !skb_can_coalesce(skb, skb_shinfo(skb)->nr_frags,
> -				      pages[0], off)) {
> -			iov_iter_revert(iter, len);
> -			break;
> -		}
> -
>  		i = 0;
>  		do {
>  			struct page *page = pages[i++];
>  			size_t part = min_t(size_t, PAGE_SIZE - off, len);
>  
>  			ret = -EIO;
> -			if (!sendpage_ok(page))
> +			if (WARN_ON_ONCE(!sendpage_ok(page)))

FWIS the current TCP code also has a 'IS_ENABLED(CONFIG_DEBUG_VM) &&'
guard, but I guess the plain WARN_ON_ONCE should be ok.

Side node: we need the whole series alltogether, you need to repost
even the unmodified patches.

Thanks!

Paolo
  
David Howells May 18, 2023, 10:32 a.m. UTC | #4
Paolo Abeni <pabeni@redhat.com> wrote:

> Side node: we need the whole series alltogether, you need to repost
> even the unmodified patches.

Any other things to change before I do that?

David
  
Paolo Abeni May 18, 2023, 11:10 a.m. UTC | #5
On Thu, 2023-05-18 at 11:32 +0100, David Howells wrote:
> Paolo Abeni <pabeni@redhat.com> wrote:
> 
> > Side node: we need the whole series alltogether, you need to repost
> > even the unmodified patches.
> 
> Any other things to change before I do that?

I went through the series and don't have other comments.

Cheers,

Paolo
  
David Howells May 18, 2023, 11:34 a.m. UTC | #6
Paolo Abeni <pabeni@redhat.com> wrote:

> > Any other things to change before I do that?
> 
> I went through the series and don't have other comments.

Thanks!

David
  

Patch

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4c0ad48e38ca..1c5f0ac6f8c3 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -5097,5 +5097,8 @@  static inline void skb_mark_for_recycle(struct sk_buff *skb)
 #endif
 }
 
+ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
+			     ssize_t maxsize, gfp_t gfp);
+
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SKBUFF_H */
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7f53dcb26ad3..56d629ea2f3d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -6892,3 +6892,98 @@  nodefer:	__kfree_skb(skb);
 	if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
 		smp_call_function_single_async(cpu, &sd->defer_csd);
 }
+
+static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
+				 size_t offset, size_t len)
+{
+	const char *kaddr;
+	__wsum csum;
+
+	kaddr = kmap_local_page(page);
+	csum = csum_partial(kaddr + offset, len, 0);
+	kunmap_local(kaddr);
+	skb->csum = csum_block_add(skb->csum, csum, skb->len);
+}
+
+/**
+ * skb_splice_from_iter - Splice (or copy) pages to skbuff
+ * @skb: The buffer to add pages to
+ * @iter: Iterator representing the pages to be added
+ * @maxsize: Maximum amount of pages to be added
+ * @gfp: Allocation flags
+ *
+ * This is a common helper function for supporting MSG_SPLICE_PAGES.  It
+ * extracts pages from an iterator and adds them to the socket buffer if
+ * possible, copying them to fragments if not possible (such as if they're slab
+ * pages).
+ *
+ * Returns the amount of data spliced/copied or -EMSGSIZE if there's
+ * insufficient space in the buffer to transfer anything.
+ */
+ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
+			     ssize_t maxsize, gfp_t gfp)
+{
+	struct page *pages[8], **ppages = pages;
+	unsigned int i;
+	ssize_t spliced = 0, ret = 0;
+	size_t frag_limit = READ_ONCE(sysctl_max_skb_frags);
+
+	while (iter->count > 0) {
+		ssize_t space, nr;
+		size_t off, len;
+
+		ret = -EMSGSIZE;
+		space = frag_limit - skb_shinfo(skb)->nr_frags;
+		if (space < 0)
+			break;
+
+		/* We might be able to coalesce without increasing nr_frags */
+		nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages));
+
+		len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off);
+		if (len <= 0) {
+			ret = len ?: -EIO;
+			break;
+		}
+
+		if (space == 0 &&
+		    !skb_can_coalesce(skb, skb_shinfo(skb)->nr_frags,
+				      pages[0], off)) {
+			iov_iter_revert(iter, len);
+			break;
+		}
+
+		i = 0;
+		do {
+			struct page *page = pages[i++];
+			size_t part = min_t(size_t, PAGE_SIZE - off, len);
+
+			ret = -EIO;
+			if (!sendpage_ok(page))
+				goto out;
+
+			ret = skb_append_pagefrags(skb, page, off, part,
+						   frag_limit);
+			if (ret < 0) {
+				iov_iter_revert(iter, len);
+				goto out;
+			}
+
+			if (skb->ip_summed == CHECKSUM_NONE)
+				skb_splice_csum_page(skb, page, off, part);
+
+			off = 0;
+			spliced += part;
+			maxsize -= part;
+			len -= part;
+		} while (len > 0);
+
+		if (maxsize <= 0)
+			break;
+	}
+
+out:
+	skb_len_add(skb, spliced);
+	return spliced ?: ret;
+}
+EXPORT_SYMBOL(skb_splice_from_iter);