[net-next,v2,1/3] net: tcp: send zero-window ACK when no memory

Message ID 20230807134547.2782227-2-imagedong@tencent.com
State New
Headers
Series net: tcp: support probing OOM |

Commit Message

Menglong Dong Aug. 7, 2023, 1:45 p.m. UTC
  From: Menglong Dong <imagedong@tencent.com>

For now, skb will be dropped when no memory, which makes client keep
retrans util timeout and it's not friendly to the users.

In this patch, we reply an ACK with zero-window in this case to update
the snd_wnd of the sender to 0. Therefore, the sender won't timeout the
connection and will probe the zero-window with the retransmits.

Signed-off-by: Menglong Dong <imagedong@tencent.com>
---
v2:
- send 0 rwin ACK for the receive queue empty case when necessary
- send the ACK immediately by using the ICSK_ACK_NOW flag
---
 include/net/inet_connection_sock.h |  3 ++-
 net/ipv4/tcp_input.c               | 14 +++++++++++---
 net/ipv4/tcp_output.c              | 14 +++++++++++---
 3 files changed, 24 insertions(+), 7 deletions(-)
  

Comments

Eric Dumazet Aug. 7, 2023, 2:05 p.m. UTC | #1
On Mon, Aug 7, 2023 at 3:47 PM <menglong8.dong@gmail.com> wrote:
>
> From: Menglong Dong <imagedong@tencent.com>
>
> For now, skb will be dropped when no memory, which makes client keep
> retrans util timeout and it's not friendly to the users.
>
> In this patch, we reply an ACK with zero-window in this case to update
> the snd_wnd of the sender to 0. Therefore, the sender won't timeout the
> connection and will probe the zero-window with the retransmits.
>
> Signed-off-by: Menglong Dong <imagedong@tencent.com>
> ---
> v2:
> - send 0 rwin ACK for the receive queue empty case when necessary
> - send the ACK immediately by using the ICSK_ACK_NOW flag
> ---
>  include/net/inet_connection_sock.h |  3 ++-
>  net/ipv4/tcp_input.c               | 14 +++++++++++---
>  net/ipv4/tcp_output.c              | 14 +++++++++++---
>  3 files changed, 24 insertions(+), 7 deletions(-)
>
> diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
> index c2b15f7e5516..be3c858a2ebb 100644
> --- a/include/net/inet_connection_sock.h
> +++ b/include/net/inet_connection_sock.h
> @@ -164,7 +164,8 @@ enum inet_csk_ack_state_t {
>         ICSK_ACK_TIMER  = 2,
>         ICSK_ACK_PUSHED = 4,
>         ICSK_ACK_PUSHED2 = 8,
> -       ICSK_ACK_NOW = 16       /* Send the next ACK immediately (once) */
> +       ICSK_ACK_NOW = 16,      /* Send the next ACK immediately (once) */
> +       ICSK_ACK_NOMEM = 32,
>  };
>
>  void inet_csk_init_xmit_timers(struct sock *sk,
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 8e96ebe373d7..aae485d0a3b6 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -5059,12 +5059,20 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
>
>                 /* Ok. In sequence. In window. */
>  queue_and_out:
> -               if (skb_queue_len(&sk->sk_receive_queue) == 0)
> -                       sk_forced_mem_schedule(sk, skb->truesize);
> -               else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
> +               if (skb_queue_len(&sk->sk_receive_queue) == 0) {
> +                       if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
> +                               sk_forced_mem_schedule(sk, skb->truesize);

I think we want sk->sk_data_ready() here, to let applications drain the queue,
regardless of sk->sk_rcvlowat value.


> +                               inet_csk(sk)->icsk_ack.pending |=
> +                                       (ICSK_ACK_NOMEM | ICSK_ACK_NOW);
> +                               inet_csk_schedule_ack(sk);
> +                       }
> +               } else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
>                         reason = SKB_DROP_REASON_PROTO_MEM;
>                         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
>                         sk->sk_data_ready(sk);

We also want to keep this sk->sk_data_ready(sk) call.

> +                       inet_csk(sk)->icsk_ack.pending |=
> +                               (ICSK_ACK_NOMEM | ICSK_ACK_NOW);
> +                       inet_csk_schedule_ack(sk);
>                         goto drop;
>                 }

This would suggest a different code refactoring, to avoid code duplication.

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 57c8af1859c16eba5e952a23ea959b628006f9c1..dde6c44f2c1e33dcf60c23b49cd99f270874ca96
100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5050,13 +5050,17 @@ static void tcp_data_queue(struct sock *sk,
struct sk_buff *skb)

                /* Ok. In sequence. In window. */
 queue_and_out:
-               if (skb_queue_len(&sk->sk_receive_queue) == 0)
-                       sk_forced_mem_schedule(sk, skb->truesize);
-               else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
-                       reason = SKB_DROP_REASON_PROTO_MEM;
-                       NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
+               if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
+                       /* TODO: maybe ratelimit these WIN 0 ACK ? */
+                       inet_csk(sk)->icsk_ack.pending |=
CSK_ACK_NOMEM | ICSK_ACK_NOW;
+                       inet_csk_schedule_ack(sk);
                        sk->sk_data_ready(sk);
-                       goto drop;
+                       if (skb_queue_len(&sk->sk_receive_queue)) {
+                               reason = SKB_DROP_REASON_PROTO_MEM;
+                               NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPRCVQDROP);
+                               goto drop;
+                       }
+                       sk_forced_mem_schedule(sk, skb->truesize);
                }

                eaten = tcp_queue_rcv(sk, skb, &fragstolen);
  
Menglong Dong Aug. 8, 2023, 1:58 a.m. UTC | #2
On Mon, Aug 7, 2023 at 10:05 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Mon, Aug 7, 2023 at 3:47 PM <menglong8.dong@gmail.com> wrote:
> >
> > From: Menglong Dong <imagedong@tencent.com>
> >
> > For now, skb will be dropped when no memory, which makes client keep
> > retrans util timeout and it's not friendly to the users.
> >
> > In this patch, we reply an ACK with zero-window in this case to update
> > the snd_wnd of the sender to 0. Therefore, the sender won't timeout the
> > connection and will probe the zero-window with the retransmits.
> >
> > Signed-off-by: Menglong Dong <imagedong@tencent.com>
> > ---
> > v2:
> > - send 0 rwin ACK for the receive queue empty case when necessary
> > - send the ACK immediately by using the ICSK_ACK_NOW flag
> > ---
> >  include/net/inet_connection_sock.h |  3 ++-
> >  net/ipv4/tcp_input.c               | 14 +++++++++++---
> >  net/ipv4/tcp_output.c              | 14 +++++++++++---
> >  3 files changed, 24 insertions(+), 7 deletions(-)
> >
> > diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
> > index c2b15f7e5516..be3c858a2ebb 100644
> > --- a/include/net/inet_connection_sock.h
> > +++ b/include/net/inet_connection_sock.h
> > @@ -164,7 +164,8 @@ enum inet_csk_ack_state_t {
> >         ICSK_ACK_TIMER  = 2,
> >         ICSK_ACK_PUSHED = 4,
> >         ICSK_ACK_PUSHED2 = 8,
> > -       ICSK_ACK_NOW = 16       /* Send the next ACK immediately (once) */
> > +       ICSK_ACK_NOW = 16,      /* Send the next ACK immediately (once) */
> > +       ICSK_ACK_NOMEM = 32,
> >  };
> >
> >  void inet_csk_init_xmit_timers(struct sock *sk,
> > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > index 8e96ebe373d7..aae485d0a3b6 100644
> > --- a/net/ipv4/tcp_input.c
> > +++ b/net/ipv4/tcp_input.c
> > @@ -5059,12 +5059,20 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
> >
> >                 /* Ok. In sequence. In window. */
> >  queue_and_out:
> > -               if (skb_queue_len(&sk->sk_receive_queue) == 0)
> > -                       sk_forced_mem_schedule(sk, skb->truesize);
> > -               else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
> > +               if (skb_queue_len(&sk->sk_receive_queue) == 0) {
> > +                       if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
> > +                               sk_forced_mem_schedule(sk, skb->truesize);
>
> I think we want sk->sk_data_ready() here, to let applications drain the queue,
> regardless of sk->sk_rcvlowat value.
>
>
> > +                               inet_csk(sk)->icsk_ack.pending |=
> > +                                       (ICSK_ACK_NOMEM | ICSK_ACK_NOW);
> > +                               inet_csk_schedule_ack(sk);
> > +                       }
> > +               } else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
> >                         reason = SKB_DROP_REASON_PROTO_MEM;
> >                         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
> >                         sk->sk_data_ready(sk);
>
> We also want to keep this sk->sk_data_ready(sk) call.
>
> > +                       inet_csk(sk)->icsk_ack.pending |=
> > +                               (ICSK_ACK_NOMEM | ICSK_ACK_NOW);
> > +                       inet_csk_schedule_ack(sk);
> >                         goto drop;
> >                 }
>
> This would suggest a different code refactoring, to avoid code duplication.
>
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 57c8af1859c16eba5e952a23ea959b628006f9c1..dde6c44f2c1e33dcf60c23b49cd99f270874ca96
> 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -5050,13 +5050,17 @@ static void tcp_data_queue(struct sock *sk,
> struct sk_buff *skb)
>
>                 /* Ok. In sequence. In window. */
>  queue_and_out:
> -               if (skb_queue_len(&sk->sk_receive_queue) == 0)
> -                       sk_forced_mem_schedule(sk, skb->truesize);
> -               else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
> -                       reason = SKB_DROP_REASON_PROTO_MEM;
> -                       NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
> +               if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
> +                       /* TODO: maybe ratelimit these WIN 0 ACK ? */
> +                       inet_csk(sk)->icsk_ack.pending |=
> CSK_ACK_NOMEM | ICSK_ACK_NOW;
> +                       inet_csk_schedule_ack(sk);
>                         sk->sk_data_ready(sk);
> -                       goto drop;
> +                       if (skb_queue_len(&sk->sk_receive_queue)) {
> +                               reason = SKB_DROP_REASON_PROTO_MEM;
> +                               NET_INC_STATS(sock_net(sk),
> LINUX_MIB_TCPRCVQDROP);
> +                               goto drop;
> +                       }
> +                       sk_forced_mem_schedule(sk, skb->truesize);
>                 }
>
>                 eaten = tcp_queue_rcv(sk, skb, &fragstolen);

Looks much better, thank you!
  

Patch

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index c2b15f7e5516..be3c858a2ebb 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -164,7 +164,8 @@  enum inet_csk_ack_state_t {
 	ICSK_ACK_TIMER  = 2,
 	ICSK_ACK_PUSHED = 4,
 	ICSK_ACK_PUSHED2 = 8,
-	ICSK_ACK_NOW = 16	/* Send the next ACK immediately (once) */
+	ICSK_ACK_NOW = 16,	/* Send the next ACK immediately (once) */
+	ICSK_ACK_NOMEM = 32,
 };
 
 void inet_csk_init_xmit_timers(struct sock *sk,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8e96ebe373d7..aae485d0a3b6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5059,12 +5059,20 @@  static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 
 		/* Ok. In sequence. In window. */
 queue_and_out:
-		if (skb_queue_len(&sk->sk_receive_queue) == 0)
-			sk_forced_mem_schedule(sk, skb->truesize);
-		else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
+		if (skb_queue_len(&sk->sk_receive_queue) == 0) {
+			if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
+				sk_forced_mem_schedule(sk, skb->truesize);
+				inet_csk(sk)->icsk_ack.pending |=
+					(ICSK_ACK_NOMEM | ICSK_ACK_NOW);
+				inet_csk_schedule_ack(sk);
+			}
+		} else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
 			reason = SKB_DROP_REASON_PROTO_MEM;
 			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
 			sk->sk_data_ready(sk);
+			inet_csk(sk)->icsk_ack.pending |=
+				(ICSK_ACK_NOMEM | ICSK_ACK_NOW);
+			inet_csk_schedule_ack(sk);
 			goto drop;
 		}
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c5412ee77fc8..769a558159ee 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -257,11 +257,19 @@  EXPORT_SYMBOL(tcp_select_initial_window);
 static u16 tcp_select_window(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 old_win = tp->rcv_wnd;
-	u32 cur_win = tcp_receive_window(tp);
-	u32 new_win = __tcp_select_window(sk);
 	struct net *net = sock_net(sk);
+	u32 old_win = tp->rcv_wnd;
+	u32 cur_win, new_win;
+
+	/* Make the window 0 if we failed to queue the data because we
+	 * are out of memory. The window is temporary, so we don't store
+	 * it on the socket.
+	 */
+	if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM))
+		return 0;
 
+	cur_win = tcp_receive_window(tp);
+	new_win = __tcp_select_window(sk);
 	if (new_win < cur_win) {
 		/* Danger Will Robinson!
 		 * Don't update rcv_wup/rcv_wnd here or else