[net-next,v5,04/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage()
Commit Message
Use sendmsg() and MSG_SPLICE_PAGES rather than sendpage in ceph when
transmitting data. For the moment, this can only transmit one page at a
time because of the architecture of net/ceph/, but if
write_partial_message_data() can be given a bvec[] at a time by the
iteration code, this would allow pages to be sent in a batch.
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Xiubo Li <xiubli@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
cc: ceph-devel@vger.kernel.org
cc: netdev@vger.kernel.org
---
net/ceph/messenger_v2.c | 91 +++++++++--------------------------------
1 file changed, 19 insertions(+), 72 deletions(-)
Comments
On Sat, Jun 24, 2023 at 12:55 AM David Howells <dhowells@redhat.com> wrote:
>
> Use sendmsg() and MSG_SPLICE_PAGES rather than sendpage in ceph when
> transmitting data. For the moment, this can only transmit one page at a
> time because of the architecture of net/ceph/, but if
> write_partial_message_data() can be given a bvec[] at a time by the
Hi David,
write_partial_message_data() is net/ceph/messenger_v1.c specific, so it
doesn't apply here. I would suggest squashing the two net/ceph patches
into one since even the titles are the same.
Also, we tend to use "libceph: " prefix for net/ceph changes.
> iteration code, this would allow pages to be sent in a batch.
>
> Signed-off-by: David Howells <dhowells@redhat.com>
> cc: Ilya Dryomov <idryomov@gmail.com>
> cc: Xiubo Li <xiubli@redhat.com>
> cc: Jeff Layton <jlayton@kernel.org>
> cc: "David S. Miller" <davem@davemloft.net>
> cc: Eric Dumazet <edumazet@google.com>
> cc: Jakub Kicinski <kuba@kernel.org>
> cc: Paolo Abeni <pabeni@redhat.com>
> cc: Jens Axboe <axboe@kernel.dk>
> cc: Matthew Wilcox <willy@infradead.org>
> cc: ceph-devel@vger.kernel.org
> cc: netdev@vger.kernel.org
> ---
> net/ceph/messenger_v2.c | 91 +++++++++--------------------------------
> 1 file changed, 19 insertions(+), 72 deletions(-)
>
> diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c
> index 301a991dc6a6..87ac97073e75 100644
> --- a/net/ceph/messenger_v2.c
> +++ b/net/ceph/messenger_v2.c
> @@ -117,91 +117,38 @@ static int ceph_tcp_recv(struct ceph_connection *con)
> return ret;
> }
>
> -static int do_sendmsg(struct socket *sock, struct iov_iter *it)
> -{
> - struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
> - int ret;
> -
> - msg.msg_iter = *it;
> - while (iov_iter_count(it)) {
> - ret = sock_sendmsg(sock, &msg);
> - if (ret <= 0) {
> - if (ret == -EAGAIN)
> - ret = 0;
> - return ret;
> - }
> -
> - iov_iter_advance(it, ret);
> - }
> -
> - WARN_ON(msg_data_left(&msg));
> - return 1;
> -}
> -
> -static int do_try_sendpage(struct socket *sock, struct iov_iter *it)
> -{
> - struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
> - struct bio_vec bv;
> - int ret;
> -
> - if (WARN_ON(!iov_iter_is_bvec(it)))
> - return -EINVAL;
> -
> - while (iov_iter_count(it)) {
> - /* iov_iter_iovec() for ITER_BVEC */
> - bvec_set_page(&bv, it->bvec->bv_page,
> - min(iov_iter_count(it),
> - it->bvec->bv_len - it->iov_offset),
> - it->bvec->bv_offset + it->iov_offset);
> -
> - /*
> - * sendpage cannot properly handle pages with
> - * page_count == 0, we need to fall back to sendmsg if
> - * that's the case.
> - *
> - * Same goes for slab pages: skb_can_coalesce() allows
> - * coalescing neighboring slab objects into a single frag
> - * which triggers one of hardened usercopy checks.
> - */
> - if (sendpage_ok(bv.bv_page)) {
> - ret = sock->ops->sendpage(sock, bv.bv_page,
> - bv.bv_offset, bv.bv_len,
> - CEPH_MSG_FLAGS);
> - } else {
> - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, bv.bv_len);
> - ret = sock_sendmsg(sock, &msg);
> - }
> - if (ret <= 0) {
> - if (ret == -EAGAIN)
> - ret = 0;
> - return ret;
> - }
> -
> - iov_iter_advance(it, ret);
> - }
> -
> - return 1;
> -}
> -
> /*
> * Write as much as possible. The socket is expected to be corked,
> - * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here.
> + * so we don't bother with MSG_MORE here.
> *
> * Return:
> - * 1 - done, nothing (else) to write
> + * >0 - done, nothing (else) to write
It would be nice to avoid making tweaks like this to the outer
interface as part of switching to a new internal API.
> * 0 - socket is full, need to wait
> * <0 - error
> */
> static int ceph_tcp_send(struct ceph_connection *con)
> {
> + struct msghdr msg = {
> + .msg_iter = con->v2.out_iter,
> + .msg_flags = CEPH_MSG_FLAGS,
> + };
> int ret;
>
> + if (WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)))
> + return -EINVAL;
Previously, this WARN_ON + error applied only to the "try sendpage"
path. There is a ton of kvec usage in net/ceph/messenger_v2.c, so I'm
pretty sure that placing it here breaks everything.
> +
> + if (con->v2.out_iter_sendpage)
> + msg.msg_flags |= MSG_SPLICE_PAGES;
> +
> dout("%s con %p have %zu try_sendpage %d\n", __func__, con,
> iov_iter_count(&con->v2.out_iter), con->v2.out_iter_sendpage);
> - if (con->v2.out_iter_sendpage)
> - ret = do_try_sendpage(con->sock, &con->v2.out_iter);
> - else
> - ret = do_sendmsg(con->sock, &con->v2.out_iter);
> +
> + ret = sock_sendmsg(con->sock, &msg);
> + if (ret > 0)
> + iov_iter_advance(&con->v2.out_iter, ret);
> + else if (ret == -EAGAIN)
> + ret = 0;
Hrm, is sock_sendmsg() now guaranteed to exhaust the iterator (i.e.
a "short write" is no longer possible)? Unless that is the case, this
is not an equivalent transformation.
This is actually the reason for
> * Return:
> * 1 - done, nothing (else) to write
specification which you also tweaked. It doesn't make sense for
ceph_tcp_send() to return the number of bytes sent because the caller
expects everything to be sent when a positive number is returned.
Thanks,
Ilya
Ilya Dryomov <idryomov@gmail.com> wrote:
> write_partial_message_data() is net/ceph/messenger_v1.c specific, so it
> doesn't apply here. I would suggest squashing the two net/ceph patches
> into one since even the titles are the same.
I would, but they're now applied to net-next, so we need to patch that.
> > * Write as much as possible. The socket is expected to be corked,
> > - * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here.
> > + * so we don't bother with MSG_MORE here.
> > *
> > * Return:
> > - * 1 - done, nothing (else) to write
> > + * >0 - done, nothing (else) to write
>
> It would be nice to avoid making tweaks like this to the outer
> interface as part of switching to a new internal API.
Ok. I'll change that and wrap the sendmsg in a loop. Though, as I asked in
an earlier reply, why is MSG_DONTWAIT used here?
> > + if (WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)))
> > + return -EINVAL;
>
> Previously, this WARN_ON + error applied only to the "try sendpage"
> path. There is a ton of kvec usage in net/ceph/messenger_v2.c, so I'm
> pretty sure that placing it here breaks everything.
This should have been removed as MSG_SPLICE_PAGES now accepts KVEC and XARRAY
iterators also.
Btw, is it feasible to use con->v2.out_iter_sendpage to apply MSG_SPLICE_PAGES
to the iterator to be transmitted as a whole? It seems to be set depending on
iterator type.
David
On Mon, Jun 26, 2023 at 5:30 PM David Howells <dhowells@redhat.com> wrote:
>
> Ilya Dryomov <idryomov@gmail.com> wrote:
>
> > write_partial_message_data() is net/ceph/messenger_v1.c specific, so it
> > doesn't apply here. I would suggest squashing the two net/ceph patches
> > into one since even the titles are the same.
>
> I would, but they're now applied to net-next, so we need to patch that.
I don't see a problem with that given that the patches themselves have
major issues (i.e. it's not just a commit message/title nit).
>
> > > * Write as much as possible. The socket is expected to be corked,
> > > - * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here.
> > > + * so we don't bother with MSG_MORE here.
> > > *
> > > * Return:
> > > - * 1 - done, nothing (else) to write
> > > + * >0 - done, nothing (else) to write
> >
> > It would be nice to avoid making tweaks like this to the outer
> > interface as part of switching to a new internal API.
>
> Ok. I'll change that and wrap the sendmsg in a loop. Though, as I asked in
> an earlier reply, why is MSG_DONTWAIT used here?
See my reply there.
>
> > > + if (WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)))
> > > + return -EINVAL;
> >
> > Previously, this WARN_ON + error applied only to the "try sendpage"
> > path. There is a ton of kvec usage in net/ceph/messenger_v2.c, so I'm
> > pretty sure that placing it here breaks everything.
>
> This should have been removed as MSG_SPLICE_PAGES now accepts KVEC and XARRAY
> iterators also.
>
> Btw, is it feasible to use con->v2.out_iter_sendpage to apply MSG_SPLICE_PAGES
> to the iterator to be transmitted as a whole? It seems to be set depending on
> iterator type.
I'm not sure I understand what you mean by "transmitted as a whole".
con->v2.out_iter_sendpage is set only when zerocopy is desired. If the
underlying data is not guaranteed to remain stable, zerocopy behavior
is not safe.
Thanks,
Ilya
Ilya Dryomov <idryomov@gmail.com> wrote:
> > Btw, is it feasible to use con->v2.out_iter_sendpage to apply
> > MSG_SPLICE_PAGES to the iterator to be transmitted as a whole? It seems
> > to be set depending on iterator type.
>
> I'm not sure I understand what you mean by "transmitted as a whole".
> con->v2.out_iter_sendpage is set only when zerocopy is desired. If the
> underlying data is not guaranteed to remain stable, zerocopy behavior
> is not safe.
I think I need to reinstate the per-page sendpage_ok() check here also -
though Al pointed out it isn't sufficiently exhaustive. There are pages that
sendpage_ok() will return true on that you shouldn't be passing to sendpage().
I'll whip up a patch to partially revert this also.
David
@@ -117,91 +117,38 @@ static int ceph_tcp_recv(struct ceph_connection *con)
return ret;
}
-static int do_sendmsg(struct socket *sock, struct iov_iter *it)
-{
- struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
- int ret;
-
- msg.msg_iter = *it;
- while (iov_iter_count(it)) {
- ret = sock_sendmsg(sock, &msg);
- if (ret <= 0) {
- if (ret == -EAGAIN)
- ret = 0;
- return ret;
- }
-
- iov_iter_advance(it, ret);
- }
-
- WARN_ON(msg_data_left(&msg));
- return 1;
-}
-
-static int do_try_sendpage(struct socket *sock, struct iov_iter *it)
-{
- struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
- struct bio_vec bv;
- int ret;
-
- if (WARN_ON(!iov_iter_is_bvec(it)))
- return -EINVAL;
-
- while (iov_iter_count(it)) {
- /* iov_iter_iovec() for ITER_BVEC */
- bvec_set_page(&bv, it->bvec->bv_page,
- min(iov_iter_count(it),
- it->bvec->bv_len - it->iov_offset),
- it->bvec->bv_offset + it->iov_offset);
-
- /*
- * sendpage cannot properly handle pages with
- * page_count == 0, we need to fall back to sendmsg if
- * that's the case.
- *
- * Same goes for slab pages: skb_can_coalesce() allows
- * coalescing neighboring slab objects into a single frag
- * which triggers one of hardened usercopy checks.
- */
- if (sendpage_ok(bv.bv_page)) {
- ret = sock->ops->sendpage(sock, bv.bv_page,
- bv.bv_offset, bv.bv_len,
- CEPH_MSG_FLAGS);
- } else {
- iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, bv.bv_len);
- ret = sock_sendmsg(sock, &msg);
- }
- if (ret <= 0) {
- if (ret == -EAGAIN)
- ret = 0;
- return ret;
- }
-
- iov_iter_advance(it, ret);
- }
-
- return 1;
-}
-
/*
* Write as much as possible. The socket is expected to be corked,
- * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here.
+ * so we don't bother with MSG_MORE here.
*
* Return:
- * 1 - done, nothing (else) to write
+ * >0 - done, nothing (else) to write
* 0 - socket is full, need to wait
* <0 - error
*/
static int ceph_tcp_send(struct ceph_connection *con)
{
+ struct msghdr msg = {
+ .msg_iter = con->v2.out_iter,
+ .msg_flags = CEPH_MSG_FLAGS,
+ };
int ret;
+ if (WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)))
+ return -EINVAL;
+
+ if (con->v2.out_iter_sendpage)
+ msg.msg_flags |= MSG_SPLICE_PAGES;
+
dout("%s con %p have %zu try_sendpage %d\n", __func__, con,
iov_iter_count(&con->v2.out_iter), con->v2.out_iter_sendpage);
- if (con->v2.out_iter_sendpage)
- ret = do_try_sendpage(con->sock, &con->v2.out_iter);
- else
- ret = do_sendmsg(con->sock, &con->v2.out_iter);
+
+ ret = sock_sendmsg(con->sock, &msg);
+ if (ret > 0)
+ iov_iter_advance(&con->v2.out_iter, ret);
+ else if (ret == -EAGAIN)
+ ret = 0;
+
dout("%s con %p ret %d left %zu\n", __func__, con, ret,
iov_iter_count(&con->v2.out_iter));
return ret;