tcp/udp: add tracepoint for send recv length

Message ID 20221229080207.1029-1-cuiyunhui@bytedance.com
State New
Headers
Series tcp/udp: add tracepoint for send recv length |

Commit Message

yunhui cui Dec. 29, 2022, 8:02 a.m. UTC
  From: Xiongchun Duan <duanxiongchun@bytedance.com>

Add tracepoints that report the length of each TCP/UDP send and
receive. This makes it easy for user-mode tools, such as netatop,
to obtain the packet sending and receiving information of each process.

Signed-off-by: Xiongchun Duan <duanxiongchun@bytedance.com>
---
 include/trace/events/tcp.h | 41 ++++++++++++++++++++++++++++++++++++++
 include/trace/events/udp.h | 34 +++++++++++++++++++++++++++++++
 net/ipv4/tcp.c             |  7 +++++++
 net/ipv4/udp.c             | 11 ++++++++--
 4 files changed, 91 insertions(+), 2 deletions(-)
  

Comments

yunhui cui Dec. 29, 2022, 2 p.m. UTC | #1
Hi All,

The following content is a supplement to the commit log of this patch.
In v2, the following content will be added to the commit log.

The purpose of adding these 4 tracepoints is to monitor
TCP/UDP traffic per process and per cgroup.

Regarding monitoring the TCP/UDP traffic of each process,
the existing implementation is netatop (https://www.atoptool.nl/netatop.php).
That solution works by registering hook functions at
the hook points provided by the netfilter framework.

These hook functions may run in soft interrupt context,
where they cannot directly obtain the pid. Extra data structures
are therefore added to bind packets to processes,
for example struct taskinfobucket, struct taskinfo ...

Every time a process sends or receives a packet, multiple hashmaps have to be consulted,
resulting in low performance and inaccurate TCP/UDP
traffic statistics (for example, when multiple threads share a socket).

Based on these 4 tracepoints, we have optimized the monitoring and tested its performance,
using Time Per Request as the indicator:
without monitoring: 50.95 ms, netatop: 63.27 ms, hooking on these
tracepoints: 52.24 ms.
The monitoring overhead thus drops from 12.32 ms to 1.29 ms, i.e. it is roughly 10 times lower.
The TCP/UDP traffic of each process is also counted accurately.

We also used these 4 tracepoints to monitor the traffic of each cgroup.

Therefore, these 4 tracepoints are the basis for this monitoring. Thanks.

Yunhui Cui <cuiyunhui@bytedance.com> 于2022年12月29日周四 16:02写道:
>
> From: Xiongchun Duan <duanxiongchun@bytedance.com>
>
> Add a tracepoint for capturing TCP segments with
> a send or receive length. This makes it easy to obtain
> the packet sending and receiving information of each process
> in the user mode, such as the netatop tool.
>
> Signed-off-by: Xiongchun Duan <duanxiongchun@bytedance.com>
> ---
>  include/trace/events/tcp.h | 41 ++++++++++++++++++++++++++++++++++++++
>  include/trace/events/udp.h | 34 +++++++++++++++++++++++++++++++
>  net/ipv4/tcp.c             |  7 +++++++
>  net/ipv4/udp.c             | 11 ++++++++--
>  4 files changed, 91 insertions(+), 2 deletions(-)
>
> diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
> index 901b440238d5..d9973c8508d1 100644
> --- a/include/trace/events/tcp.h
> +++ b/include/trace/events/tcp.h
> @@ -187,6 +187,47 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust,
>         TP_ARGS(sk)
>  );
>
> +/*
> + * tcp send/recv stream length
> + *
> + * Note: this class requires positive integer
> + */
> +DECLARE_EVENT_CLASS(tcp_stream_length,
> +
> +       TP_PROTO(struct sock *sk, int length, int error, int flags),
> +
> +       TP_ARGS(sk, length, error, flags),
> +
> +       TP_STRUCT__entry(
> +               __field(void *, sk)
> +               __field(int, length)
> +               __field(int, error)
> +               __field(int, flags)
> +       ),
> +
> +       TP_fast_assign(
> +               __entry->sk = sk;
> +               __entry->length = length;
> +               __entry->error = error;
> +               __entry->flags = flags;
> +       ),
> +
> +       TP_printk("sk address = %p, length = %d, error = %d flags = %u ",
> +               __entry->sk, __entry->length, __entry->error, __entry->flags)
> +);
> +
> +DEFINE_EVENT(tcp_stream_length, tcp_send_length,
> +       TP_PROTO(struct sock *sk, int length, int error, int flags),
> +
> +       TP_ARGS(sk, length, error, flags)
> +);
> +
> +DEFINE_EVENT(tcp_stream_length, tcp_recv_length,
> +       TP_PROTO(struct sock *sk, int length, int error, int flags),
> +
> +       TP_ARGS(sk, length, error, flags)
> +);
> +
>  TRACE_EVENT(tcp_retransmit_synack,
>
>         TP_PROTO(const struct sock *sk, const struct request_sock *req),
> diff --git a/include/trace/events/udp.h b/include/trace/events/udp.h
> index 336fe272889f..22181c91c8e2 100644
> --- a/include/trace/events/udp.h
> +++ b/include/trace/events/udp.h
> @@ -27,6 +27,40 @@ TRACE_EVENT(udp_fail_queue_rcv_skb,
>         TP_printk("rc=%d port=%hu", __entry->rc, __entry->lport)
>  );
>
> +DECLARE_EVENT_CLASS(udp_stream_length,
> +
> +       TP_PROTO(struct sock *sk, int length, int error, int flags),
> +
> +       TP_ARGS(sk, length, error, flags),
> +
> +       TP_STRUCT__entry(
> +               __field(void *, sk)
> +               __field(int, length)
> +               __field(int, error)
> +               __field(int, flags)
> +       ),
> +
> +       TP_fast_assign(
> +               __entry->sk = sk;
> +               __entry->length = length;
> +               __entry->error = error;
> +               __entry->flags = flags;
> +       ),
> +
> +       TP_printk("sk address = %p, length = %d, error=%d, flags = %u ",
> +       __entry->sk, __entry->length, __entry->error, __entry->flags)
> +);
> +
> +DEFINE_EVENT(udp_stream_length, udp_send_length,
> +       TP_PROTO(struct sock *sk, int length, int error, int flags),
> +       TP_ARGS(sk, length, error, flags)
> +);
> +
> +DEFINE_EVENT(udp_stream_length, udp_recv_length,
> +       TP_PROTO(struct sock *sk, int length, int error, int flags),
> +       TP_ARGS(sk, length, error, flags)
> +);
> +
>  #endif /* _TRACE_UDP_H */
>
>  /* This part must be outside protection */
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index c567d5e8053e..5deb69e2d3e7 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -267,6 +267,7 @@
>  #include <linux/errqueue.h>
>  #include <linux/static_key.h>
>  #include <linux/btf.h>
> +#include <trace/events/tcp.h>
>
>  #include <net/icmp.h>
>  #include <net/inet_common.h>
> @@ -1150,6 +1151,7 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
>         lock_sock(sk);
>         ret = tcp_sendpage_locked(sk, page, offset, size, flags);
>         release_sock(sk);
> +       trace_tcp_send_length(sk, ret > 0 ? ret : 0, ret > 0 ? 0 : ret, 0);
>
>         return ret;
>  }
> @@ -1482,6 +1484,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
>         lock_sock(sk);
>         ret = tcp_sendmsg_locked(sk, msg, size);
>         release_sock(sk);
> +       trace_tcp_send_length(sk, ret > 0 ? ret : 0, ret > 0 ? 0 : ret, 0);
>
>         return ret;
>  }
> @@ -2647,6 +2650,10 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
>
>         /* Clean up data we have read: This will do ACK frames. */
>         tcp_cleanup_rbuf(sk, copied);
> +       trace_tcp_recv_length(sk, (copied > 0 && !(flags & MSG_PEEK)) ?
> +                                  copied : 0,
> +                             (copied > 0 &&
> +                              !(flags & MSG_PEEK)) ? 0 : copied, flags);
>         return copied;
>
>  out:
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index 9592fe3e444a..1b336af4df6d 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -1300,6 +1300,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>         release_sock(sk);
>
>  out:
> +       trace_udp_send_length(sk, err == 0 ? len : 0, err, 0);
>         ip_rt_put(rt);
>  out_free:
>         if (free)
> @@ -1364,8 +1365,10 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
>                              page, offset, size, flags);
>         if (ret == -EOPNOTSUPP) {
>                 release_sock(sk);
> -               return sock_no_sendpage(sk->sk_socket, page, offset,
> -                                       size, flags);
> +               ret = sock_no_sendpage(sk->sk_socket, page, offset,
> +                                      size, flags);
> +               trace_udp_send_length(sk, ret > 0 ? ret : 0, ret > 0 ? 0 : ret, 0);
> +               return ret;
>         }
>         if (ret < 0) {
>                 udp_flush_pending_frames(sk);
> @@ -1377,6 +1380,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
>                 ret = udp_push_pending_frames(sk);
>         if (!ret)
>                 ret = size;
> +       trace_udp_send_length(sk, ret > 0 ? ret : 0, ret > 0 ? 0 : ret, 0);
>  out:
>         release_sock(sk);
>         return ret;
> @@ -1935,6 +1939,9 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
>         if (flags & MSG_TRUNC)
>                 err = ulen;
>
> +       trace_udp_recv_length(sk, (err > 0 && !peeking) ? err : 0,
> +                             (err > 0 && !peeking) ? 0 : err, flags);
> +
>         skb_consume_udp(sk, skb, peeking ? -err : err);
>         return err;
>
> --
> 2.20.1
>
  
Cong Wang Jan. 1, 2023, 7:09 p.m. UTC | #2
On Thu, Dec 29, 2022 at 04:02:07PM +0800, Yunhui Cui wrote:
> From: Xiongchun Duan <duanxiongchun@bytedance.com>
> 
> Add a tracepoint for capturing TCP segments with
> a send or receive length. This makes it easy to obtain
> the packet sending and receiving information of each process
> in the user mode, such as the netatop tool.

You can obtain the same information with kretprobe:
https://www.gcardone.net/2020-07-31-per-process-bandwidth-monitoring-on-Linux-with-bpftrace/

Thanks.
  
yunhui cui Jan. 4, 2023, 3:34 a.m. UTC | #3
Cong Wang <xiyou.wangcong@gmail.com> 于2023年1月2日周一 03:10写道:
>
> On Thu, Dec 29, 2022 at 04:02:07PM +0800, Yunhui Cui wrote:
> > From: Xiongchun Duan <duanxiongchun@bytedance.com>
> >
> > Add a tracepoint for capturing TCP segments with
> > a send or receive length. This makes it easy to obtain
> > the packet sending and receiving information of each process
> > in the user mode, such as the netatop tool.
>
> You can obtain the same information with kretprobe:
> https://www.gcardone.net/2020-07-31-per-process-bandwidth-monitoring-on-Linux-with-bpftrace/

As we know, a kprobe obtains the result by trapping into an exception, which
costs more performance than a tracepoint.

We did a test for performance comparison. The results are as follows.

Time per request
sock_sendmsg(k,kr):  12.382ms, tcp_send_length(tracepoint): 11.887ms,
without hook:11.222ms

It can be seen that the performance loss of the tracepoint (0.665 ms) is only
about half of that of the kprobe (1.160 ms).
  

Patch

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 901b440238d5..d9973c8508d1 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -187,6 +187,47 @@  DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust,
 	TP_ARGS(sk)
 );
 
+/*
+ * tcp send/recv stream length
+ *
+ * Note: this class requires positive integer
+ */
+DECLARE_EVENT_CLASS(tcp_stream_length,
+
+	TP_PROTO(struct sock *sk, int length, int error, int flags),
+
+	TP_ARGS(sk, length, error, flags),
+
+	TP_STRUCT__entry(
+		__field(void *, sk)
+		__field(int, length)
+		__field(int, error)
+		__field(int, flags)
+	),
+
+	TP_fast_assign(
+		__entry->sk = sk;
+		__entry->length = length;
+		__entry->error = error;
+		__entry->flags = flags;
+	),
+
+	TP_printk("sk address = %p, length = %d, error = %d flags = %u ",
+		__entry->sk, __entry->length, __entry->error, __entry->flags)
+);
+
+DEFINE_EVENT(tcp_stream_length, tcp_send_length,
+	TP_PROTO(struct sock *sk, int length, int error, int flags),
+
+	TP_ARGS(sk, length, error, flags)
+);
+
+DEFINE_EVENT(tcp_stream_length, tcp_recv_length,
+	TP_PROTO(struct sock *sk, int length, int error, int flags),
+
+	TP_ARGS(sk, length, error, flags)
+);
+
 TRACE_EVENT(tcp_retransmit_synack,
 
 	TP_PROTO(const struct sock *sk, const struct request_sock *req),
diff --git a/include/trace/events/udp.h b/include/trace/events/udp.h
index 336fe272889f..22181c91c8e2 100644
--- a/include/trace/events/udp.h
+++ b/include/trace/events/udp.h
@@ -27,6 +27,40 @@  TRACE_EVENT(udp_fail_queue_rcv_skb,
 	TP_printk("rc=%d port=%hu", __entry->rc, __entry->lport)
 );
 
+DECLARE_EVENT_CLASS(udp_stream_length,
+
+	TP_PROTO(struct sock *sk, int length, int error, int flags),
+
+	TP_ARGS(sk, length, error, flags),
+
+	TP_STRUCT__entry(
+		__field(void *, sk)
+		__field(int, length)
+		__field(int, error)
+		__field(int, flags)
+	),
+
+	TP_fast_assign(
+		__entry->sk = sk;
+		__entry->length = length;
+		__entry->error = error;
+		__entry->flags = flags;
+	),
+
+	TP_printk("sk address = %p, length = %d, error=%d, flags = %u ",
+	__entry->sk, __entry->length, __entry->error, __entry->flags)
+);
+
+DEFINE_EVENT(udp_stream_length, udp_send_length,
+	TP_PROTO(struct sock *sk, int length, int error, int flags),
+	TP_ARGS(sk, length, error, flags)
+);
+
+DEFINE_EVENT(udp_stream_length, udp_recv_length,
+	TP_PROTO(struct sock *sk, int length, int error, int flags),
+	TP_ARGS(sk, length, error, flags)
+);
+
 #endif /* _TRACE_UDP_H */
 
 /* This part must be outside protection */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c567d5e8053e..5deb69e2d3e7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -267,6 +267,7 @@ 
 #include <linux/errqueue.h>
 #include <linux/static_key.h>
 #include <linux/btf.h>
+#include <trace/events/tcp.h>
 
 #include <net/icmp.h>
 #include <net/inet_common.h>
@@ -1150,6 +1151,7 @@  int tcp_sendpage(struct sock *sk, struct page *page, int offset,
 	lock_sock(sk);
 	ret = tcp_sendpage_locked(sk, page, offset, size, flags);
 	release_sock(sk);
+	trace_tcp_send_length(sk, ret > 0 ? ret : 0, ret > 0 ? 0 : ret, 0);
 
 	return ret;
 }
@@ -1482,6 +1484,7 @@  int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 	lock_sock(sk);
 	ret = tcp_sendmsg_locked(sk, msg, size);
 	release_sock(sk);
+	trace_tcp_send_length(sk, ret > 0 ? ret : 0, ret > 0 ? 0 : ret, 0);
 
 	return ret;
 }
@@ -2647,6 +2650,10 @@  static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
 
 	/* Clean up data we have read: This will do ACK frames. */
 	tcp_cleanup_rbuf(sk, copied);
+	trace_tcp_recv_length(sk, (copied > 0 && !(flags & MSG_PEEK)) ?
+				   copied : 0,
+			      (copied > 0 &&
+			       !(flags & MSG_PEEK)) ? 0 : copied, flags);
 	return copied;
 
 out:
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 9592fe3e444a..1b336af4df6d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1300,6 +1300,7 @@  int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	release_sock(sk);
 
 out:
+	trace_udp_send_length(sk, err == 0 ? len : 0, err, 0);
 	ip_rt_put(rt);
 out_free:
 	if (free)
@@ -1364,8 +1365,10 @@  int udp_sendpage(struct sock *sk, struct page *page, int offset,
 			     page, offset, size, flags);
 	if (ret == -EOPNOTSUPP) {
 		release_sock(sk);
-		return sock_no_sendpage(sk->sk_socket, page, offset,
-					size, flags);
+		ret = sock_no_sendpage(sk->sk_socket, page, offset,
+				       size, flags);
+		trace_udp_send_length(sk, ret > 0 ? ret : 0, ret > 0 ? 0 : ret, 0);
+		return ret;
 	}
 	if (ret < 0) {
 		udp_flush_pending_frames(sk);
@@ -1377,6 +1380,7 @@  int udp_sendpage(struct sock *sk, struct page *page, int offset,
 		ret = udp_push_pending_frames(sk);
 	if (!ret)
 		ret = size;
+	trace_udp_send_length(sk, ret > 0 ? ret : 0, ret > 0 ? 0 : ret, 0);
 out:
 	release_sock(sk);
 	return ret;
@@ -1935,6 +1939,9 @@  int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
 	if (flags & MSG_TRUNC)
 		err = ulen;
 
+	trace_udp_recv_length(sk, (err > 0 && !peeking) ? err : 0,
+			      (err > 0 && !peeking) ? 0 : err, flags);
+
 	skb_consume_udp(sk, skb, peeking ? -err : err);
 	return err;