[net,03/13] mptcp: fix lockless access in subflow ULP diag

Message ID 20240215-upstream-net-20240215-misc-fixes-v1-3-8c01a55d8f6a@kernel.org
State New
Headers
Series mptcp: misc. fixes for v6.8 |

Commit Message

Matthieu Baerts (NGI0) Feb. 15, 2024, 6:25 p.m. UTC
  From: Paolo Abeni <pabeni@redhat.com>

Since the introduction of the subflow ULP diag interface, the
dump callback accessed all the subflow data with lockless.

We need either to annotate all the read and write operation accordingly,
or acquire the subflow socket lock. Let's do latter, even if slower, to
avoid a diffstat havoc.

Fixes: 5147dfb50832 ("mptcp: allow dumping subflow context to userspace")
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Mat Martineau <martineau@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
---
Notes:
  - This patch modifies the existing ULP API. No better solutions have
    been found for -net, and there is some similar prior art, see
    commit 0df48c26d841 ("tcp: add tcpi_bytes_acked to tcp_info").

    Please also note that TLS ULP Diag has likely the same issue.
To: Boris Pismenny <borisp@nvidia.com>
To: John Fastabend <john.fastabend@gmail.com>
---
 include/net/tcp.h  | 2 +-
 net/mptcp/diag.c   | 6 +++++-
 net/tls/tls_main.c | 2 +-
 3 files changed, 7 insertions(+), 3 deletions(-)
  

Comments

Eric Dumazet Feb. 19, 2024, 5:21 p.m. UTC | #1
On Thu, Feb 15, 2024 at 7:25 PM Matthieu Baerts (NGI0)
<matttbe@kernel.org> wrote:
>
> From: Paolo Abeni <pabeni@redhat.com>
>
> Since the introduction of the subflow ULP diag interface, the
> dump callback accessed all the subflow data with lockless.
>
> We need either to annotate all the read and write operation accordingly,
> or acquire the subflow socket lock. Let's do latter, even if slower, to
> avoid a diffstat havoc.
>
> Fixes: 5147dfb50832 ("mptcp: allow dumping subflow context to userspace")
> Cc: stable@vger.kernel.org
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> Reviewed-by: Mat Martineau <martineau@kernel.org>
> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
> ---
> Notes:
>   - This patch modifies the existing ULP API. No better solutions have
>     been found for -net, and there is some similar prior art, see
>     commit 0df48c26d841 ("tcp: add tcpi_bytes_acked to tcp_info").
>
>     Please also note that TLS ULP Diag has likely the same issue.
> To: Boris Pismenny <borisp@nvidia.com>
> To: John Fastabend <john.fastabend@gmail.com>
> ---
>  include/net/tcp.h  | 2 +-
>  net/mptcp/diag.c   | 6 +++++-
>  net/tls/tls_main.c | 2 +-
>  3 files changed, 7 insertions(+), 3 deletions(-)
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index dd78a1181031..f6eba9652d01 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -2506,7 +2506,7 @@ struct tcp_ulp_ops {
>         /* cleanup ulp */
>         void (*release)(struct sock *sk);
>         /* diagnostic */
> -       int (*get_info)(const struct sock *sk, struct sk_buff *skb);
> +       int (*get_info)(struct sock *sk, struct sk_buff *skb);
>         size_t (*get_info_size)(const struct sock *sk);
>         /* clone ulp */
>         void (*clone)(const struct request_sock *req, struct sock *newsk,
> diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c
> index a536586742f2..e57c5f47f035 100644
> --- a/net/mptcp/diag.c
> +++ b/net/mptcp/diag.c
> @@ -13,17 +13,19 @@
>  #include <uapi/linux/mptcp.h>
>  #include "protocol.h"
>
> -static int subflow_get_info(const struct sock *sk, struct sk_buff *skb)
> +static int subflow_get_info(struct sock *sk, struct sk_buff *skb)
>  {
>         struct mptcp_subflow_context *sf;
>         struct nlattr *start;
>         u32 flags = 0;
> +       bool slow;
>         int err;
>
>         start = nla_nest_start_noflag(skb, INET_ULP_INFO_MPTCP);
>         if (!start)
>                 return -EMSGSIZE;
>
> +       slow = lock_sock_fast(sk);
>         rcu_read_lock();

I am afraid lockdep is not happy with this change.

Paolo, we probably need the READ_ONCE() annotations after all.
  
Eric Dumazet Feb. 19, 2024, 5:35 p.m. UTC | #2
On Mon, Feb 19, 2024 at 6:21 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Thu, Feb 15, 2024 at 7:25 PM Matthieu Baerts (NGI0)
> <matttbe@kernel.org> wrote:
> >
> > From: Paolo Abeni <pabeni@redhat.com>
> >
> > Since the introduction of the subflow ULP diag interface, the
> > dump callback accessed all the subflow data with lockless.
> >
> > We need either to annotate all the read and write operation accordingly,
> > or acquire the subflow socket lock. Let's do latter, even if slower, to
> > avoid a diffstat havoc.
> >
> > Fixes: 5147dfb50832 ("mptcp: allow dumping subflow context to userspace")
> > Cc: stable@vger.kernel.org
> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > Reviewed-by: Mat Martineau <martineau@kernel.org>
> > Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
> > ---
> > Notes:
> >   - This patch modifies the existing ULP API. No better solutions have
> >     been found for -net, and there is some similar prior art, see
> >     commit 0df48c26d841 ("tcp: add tcpi_bytes_acked to tcp_info").
> >
> >     Please also note that TLS ULP Diag has likely the same issue.
> > To: Boris Pismenny <borisp@nvidia.com>
> > To: John Fastabend <john.fastabend@gmail.com>
> > ---
> >  include/net/tcp.h  | 2 +-
> >  net/mptcp/diag.c   | 6 +++++-
> >  net/tls/tls_main.c | 2 +-
> >  3 files changed, 7 insertions(+), 3 deletions(-)
> >
> > diff --git a/include/net/tcp.h b/include/net/tcp.h
> > index dd78a1181031..f6eba9652d01 100644
> > --- a/include/net/tcp.h
> > +++ b/include/net/tcp.h
> > @@ -2506,7 +2506,7 @@ struct tcp_ulp_ops {
> >         /* cleanup ulp */
> >         void (*release)(struct sock *sk);
> >         /* diagnostic */
> > -       int (*get_info)(const struct sock *sk, struct sk_buff *skb);
> > +       int (*get_info)(struct sock *sk, struct sk_buff *skb);
> >         size_t (*get_info_size)(const struct sock *sk);
> >         /* clone ulp */
> >         void (*clone)(const struct request_sock *req, struct sock *newsk,
> > diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c
> > index a536586742f2..e57c5f47f035 100644
> > --- a/net/mptcp/diag.c
> > +++ b/net/mptcp/diag.c
> > @@ -13,17 +13,19 @@
> >  #include <uapi/linux/mptcp.h>
> >  #include "protocol.h"
> >
> > -static int subflow_get_info(const struct sock *sk, struct sk_buff *skb)
> > +static int subflow_get_info(struct sock *sk, struct sk_buff *skb)
> >  {
> >         struct mptcp_subflow_context *sf;
> >         struct nlattr *start;
> >         u32 flags = 0;
> > +       bool slow;
> >         int err;
> >
> >         start = nla_nest_start_noflag(skb, INET_ULP_INFO_MPTCP);
> >         if (!start)
> >                 return -EMSGSIZE;
> >
> > +       slow = lock_sock_fast(sk);
> >         rcu_read_lock();
>
> I am afraid lockdep is not happy with this change.
>
> Paolo, we probably need the READ_ONCE() annotations after all.

Or perhaps something like the following would be enough.

diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c
index 6ff6f14674aa2941bc04c680bacd9f79fc65060d..7017dd60659dc7133318c1c82e3f429bea3a5d57
100644
--- a/net/mptcp/diag.c
+++ b/net/mptcp/diag.c
@@ -21,6 +21,9 @@ static int subflow_get_info(struct sock *sk, struct
sk_buff *skb)
        bool slow;
        int err;

+       if (inet_sk_state_load(sk) == TCP_LISTEN)
+               return 0;
+
        start = nla_nest_start_noflag(skb, INET_ULP_INFO_MPTCP);
        if (!start)
                return -EMSGSIZE;
  
Paolo Abeni Feb. 19, 2024, 6:04 p.m. UTC | #3
On Mon, 2024-02-19 at 18:35 +0100, Eric Dumazet wrote:
> On Mon, Feb 19, 2024 at 6:21 PM Eric Dumazet <edumazet@google.com> wrote:
> > 
> > On Thu, Feb 15, 2024 at 7:25 PM Matthieu Baerts (NGI0)
> > <matttbe@kernel.org> wrote:
> > > 
> > > From: Paolo Abeni <pabeni@redhat.com>
> > > 
> > > Since the introduction of the subflow ULP diag interface, the
> > > dump callback accessed all the subflow data with lockless.
> > > 
> > > We need either to annotate all the read and write operation accordingly,
> > > or acquire the subflow socket lock. Let's do latter, even if slower, to
> > > avoid a diffstat havoc.
> > > 
> > > Fixes: 5147dfb50832 ("mptcp: allow dumping subflow context to userspace")
> > > Cc: stable@vger.kernel.org
> > > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > > Reviewed-by: Mat Martineau <martineau@kernel.org>
> > > Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
> > > ---
> > > Notes:
> > >   - This patch modifies the existing ULP API. No better solutions have
> > >     been found for -net, and there is some similar prior art, see
> > >     commit 0df48c26d841 ("tcp: add tcpi_bytes_acked to tcp_info").
> > > 
> > >     Please also note that TLS ULP Diag has likely the same issue.
> > > To: Boris Pismenny <borisp@nvidia.com>
> > > To: John Fastabend <john.fastabend@gmail.com>
> > > ---
> > >  include/net/tcp.h  | 2 +-
> > >  net/mptcp/diag.c   | 6 +++++-
> > >  net/tls/tls_main.c | 2 +-
> > >  3 files changed, 7 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/include/net/tcp.h b/include/net/tcp.h
> > > index dd78a1181031..f6eba9652d01 100644
> > > --- a/include/net/tcp.h
> > > +++ b/include/net/tcp.h
> > > @@ -2506,7 +2506,7 @@ struct tcp_ulp_ops {
> > >         /* cleanup ulp */
> > >         void (*release)(struct sock *sk);
> > >         /* diagnostic */
> > > -       int (*get_info)(const struct sock *sk, struct sk_buff *skb);
> > > +       int (*get_info)(struct sock *sk, struct sk_buff *skb);
> > >         size_t (*get_info_size)(const struct sock *sk);
> > >         /* clone ulp */
> > >         void (*clone)(const struct request_sock *req, struct sock *newsk,
> > > diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c
> > > index a536586742f2..e57c5f47f035 100644
> > > --- a/net/mptcp/diag.c
> > > +++ b/net/mptcp/diag.c
> > > @@ -13,17 +13,19 @@
> > >  #include <uapi/linux/mptcp.h>
> > >  #include "protocol.h"
> > > 
> > > -static int subflow_get_info(const struct sock *sk, struct sk_buff *skb)
> > > +static int subflow_get_info(struct sock *sk, struct sk_buff *skb)
> > >  {
> > >         struct mptcp_subflow_context *sf;
> > >         struct nlattr *start;
> > >         u32 flags = 0;
> > > +       bool slow;
> > >         int err;
> > > 
> > >         start = nla_nest_start_noflag(skb, INET_ULP_INFO_MPTCP);
> > >         if (!start)
> > >                 return -EMSGSIZE;
> > > 
> > > +       slow = lock_sock_fast(sk);
> > >         rcu_read_lock();
> > 
> > I am afraid lockdep is not happy with this change.
> > 
> > Paolo, we probably need the READ_ONCE() annotations after all.
> 
> Or perhaps something like the following would be enough.
> 
> diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c
> index 6ff6f14674aa2941bc04c680bacd9f79fc65060d..7017dd60659dc7133318c1c82e3f429bea3a5d57
> 100644
> --- a/net/mptcp/diag.c
> +++ b/net/mptcp/diag.c
> @@ -21,6 +21,9 @@ static int subflow_get_info(struct sock *sk, struct
> sk_buff *skb)
>         bool slow;
>         int err;
> 
> +       if (inet_sk_state_load(sk) == TCP_LISTEN)
> +               return 0;
> +
>         start = nla_nest_start_noflag(skb, INET_ULP_INFO_MPTCP);
>         if (!start)
>                 return -EMSGSIZE;

Thanks for the head-up. This later option looks preferable, to avoid
quit a bit of noise with _ONCE annotation. Is there a syzkaller splat I
could look at? if it landed on the ML, I missed it.

Thanks!

Paolo
  
Eric Dumazet Feb. 19, 2024, 6:33 p.m. UTC | #4
On Mon, Feb 19, 2024 at 7:04 PM Paolo Abeni <pabeni@redhat.com> wrote:
>
> On Mon, 2024-02-19 at 18:35 +0100, Eric Dumazet wrote:
> > On Mon, Feb 19, 2024 at 6:21 PM Eric Dumazet <edumazet@google.com> wrote:
> > >
> > > On Thu, Feb 15, 2024 at 7:25 PM Matthieu Baerts (NGI0)
> > > <matttbe@kernel.org> wrote:
> > > >
> > > > From: Paolo Abeni <pabeni@redhat.com>
> > > >
> > > > Since the introduction of the subflow ULP diag interface, the
> > > > dump callback accessed all the subflow data with lockless.
> > > >
> > > > We need either to annotate all the read and write operation accordingly,
> > > > or acquire the subflow socket lock. Let's do latter, even if slower, to
> > > > avoid a diffstat havoc.
> > > >
> > > > Fixes: 5147dfb50832 ("mptcp: allow dumping subflow context to userspace")
> > > > Cc: stable@vger.kernel.org
> > > > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > > > Reviewed-by: Mat Martineau <martineau@kernel.org>
> > > > Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
> > > > ---
> > > > Notes:
> > > >   - This patch modifies the existing ULP API. No better solutions have
> > > >     been found for -net, and there is some similar prior art, see
> > > >     commit 0df48c26d841 ("tcp: add tcpi_bytes_acked to tcp_info").
> > > >
> > > >     Please also note that TLS ULP Diag has likely the same issue.
> > > > To: Boris Pismenny <borisp@nvidia.com>
> > > > To: John Fastabend <john.fastabend@gmail.com>
> > > > ---
> > > >  include/net/tcp.h  | 2 +-
> > > >  net/mptcp/diag.c   | 6 +++++-
> > > >  net/tls/tls_main.c | 2 +-
> > > >  3 files changed, 7 insertions(+), 3 deletions(-)
> > > >
> > > > diff --git a/include/net/tcp.h b/include/net/tcp.h
> > > > index dd78a1181031..f6eba9652d01 100644
> > > > --- a/include/net/tcp.h
> > > > +++ b/include/net/tcp.h
> > > > @@ -2506,7 +2506,7 @@ struct tcp_ulp_ops {
> > > >         /* cleanup ulp */
> > > >         void (*release)(struct sock *sk);
> > > >         /* diagnostic */
> > > > -       int (*get_info)(const struct sock *sk, struct sk_buff *skb);
> > > > +       int (*get_info)(struct sock *sk, struct sk_buff *skb);
> > > >         size_t (*get_info_size)(const struct sock *sk);
> > > >         /* clone ulp */
> > > >         void (*clone)(const struct request_sock *req, struct sock *newsk,
> > > > diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c
> > > > index a536586742f2..e57c5f47f035 100644
> > > > --- a/net/mptcp/diag.c
> > > > +++ b/net/mptcp/diag.c
> > > > @@ -13,17 +13,19 @@
> > > >  #include <uapi/linux/mptcp.h>
> > > >  #include "protocol.h"
> > > >
> > > > -static int subflow_get_info(const struct sock *sk, struct sk_buff *skb)
> > > > +static int subflow_get_info(struct sock *sk, struct sk_buff *skb)
> > > >  {
> > > >         struct mptcp_subflow_context *sf;
> > > >         struct nlattr *start;
> > > >         u32 flags = 0;
> > > > +       bool slow;
> > > >         int err;
> > > >
> > > >         start = nla_nest_start_noflag(skb, INET_ULP_INFO_MPTCP);
> > > >         if (!start)
> > > >                 return -EMSGSIZE;
> > > >
> > > > +       slow = lock_sock_fast(sk);
> > > >         rcu_read_lock();
> > >
> > > I am afraid lockdep is not happy with this change.
> > >
> > > Paolo, we probably need the READ_ONCE() annotations after all.
> >
> > Or perhaps something like the following would be enough.
> >
> > diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c
> > index 6ff6f14674aa2941bc04c680bacd9f79fc65060d..7017dd60659dc7133318c1c82e3f429bea3a5d57
> > 100644
> > --- a/net/mptcp/diag.c
> > +++ b/net/mptcp/diag.c
> > @@ -21,6 +21,9 @@ static int subflow_get_info(struct sock *sk, struct
> > sk_buff *skb)
> >         bool slow;
> >         int err;
> >
> > +       if (inet_sk_state_load(sk) == TCP_LISTEN)
> > +               return 0;
> > +
> >         start = nla_nest_start_noflag(skb, INET_ULP_INFO_MPTCP);
> >         if (!start)
> >                 return -EMSGSIZE;
>
> Thanks for the head-up. This later option looks preferable, to avoid
> quit a bit of noise with _ONCE annotation. Is there a syzkaller splat I
> could look at? if it landed on the ML, I missed it.
>

Not landed yet, here is the splat :

======================================================
WARNING: possible circular locking dependency detected
6.8.0-rc4-syzkaller-00212-g40b9385dd8e6 #0 Not tainted
------------------------------------------------------
syz-executor.2/24141 is trying to acquire lock:
ffff888045870130 (k-sk_lock-AF_INET6){+.+.}-{0:0}, at:
tcp_diag_put_ulp net/ipv4/tcp_diag.c:100 [inline]
ffff888045870130 (k-sk_lock-AF_INET6){+.+.}-{0:0}, at:
tcp_diag_get_aux+0x738/0x830 net/ipv4/tcp_diag.c:137

but task is already holding lock:
ffffc9000135e488 (&h->lhash2[i].lock){+.+.}-{2:2}, at: spin_lock
include/linux/spinlock.h:351 [inline]
ffffc9000135e488 (&h->lhash2[i].lock){+.+.}-{2:2}, at:
inet_diag_dump_icsk+0x39f/0x1f80 net/ipv4/inet_diag.c:1038

which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

-> #1 (&h->lhash2[i].lock){+.+.}-{2:2}:
lock_acquire+0x1e3/0x530 kernel/locking/lockdep.c:5754
__raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline]
_raw_spin_lock+0x2e/0x40 kernel/locking/spinlock.c:154
spin_lock include/linux/spinlock.h:351 [inline]
__inet_hash+0x335/0xbe0 net/ipv4/inet_hashtables.c:743
inet_csk_listen_start+0x23a/0x320 net/ipv4/inet_connection_sock.c:1261
__inet_listen_sk+0x2a2/0x770 net/ipv4/af_inet.c:217
inet_listen+0xa3/0x110 net/ipv4/af_inet.c:239
rds_tcp_listen_init+0x3fd/0x5a0 net/rds/tcp_listen.c:316
rds_tcp_init_net+0x141/0x320 net/rds/tcp.c:577
ops_init+0x352/0x610 net/core/net_namespace.c:136
__register_pernet_operations net/core/net_namespace.c:1214 [inline]
register_pernet_operations+0x2cb/0x660 net/core/net_namespace.c:1283
register_pernet_device+0x33/0x80 net/core/net_namespace.c:1370
rds_tcp_init+0x62/0xd0 net/rds/tcp.c:735
do_one_initcall+0x238/0x830 init/main.c:1236
do_initcall_level+0x157/0x210 init/main.c:1298
do_initcalls+0x3f/0x80 init/main.c:1314
kernel_init_freeable+0x42f/0x5d0 init/main.c:1551
kernel_init+0x1d/0x2a0 init/main.c:1441
ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
ret_from_fork_asm+0x1b/0x30 arch/x86/entry/entry_64.S:242

-> #0 (k-sk_lock-AF_INET6){+.+.}-{0:0}:
check_prev_add kernel/locking/lockdep.c:3134 [inline]
check_prevs_add kernel/locking/lockdep.c:3253 [inline]
validate_chain+0x18ca/0x58e0 kernel/locking/lockdep.c:3869
__lock_acquire+0x1345/0x1fd0 kernel/locking/lockdep.c:5137
lock_acquire+0x1e3/0x530 kernel/locking/lockdep.c:5754
lock_sock_fast include/net/sock.h:1723 [inline]
subflow_get_info+0x166/0xd20 net/mptcp/diag.c:28
tcp_diag_put_ulp net/ipv4/tcp_diag.c:100 [inline]
tcp_diag_get_aux+0x738/0x830 net/ipv4/tcp_diag.c:137
inet_sk_diag_fill+0x10ed/0x1e00 net/ipv4/inet_diag.c:345
inet_diag_dump_icsk+0x55b/0x1f80 net/ipv4/inet_diag.c:1061
__inet_diag_dump+0x211/0x3a0 net/ipv4/inet_diag.c:1263
inet_diag_dump_compat+0x1c1/0x2d0 net/ipv4/inet_diag.c:1371
netlink_dump+0x59b/0xc80 net/netlink/af_netlink.c:2264
__netlink_dump_start+0x5df/0x790 net/netlink/af_netlink.c:2370
netlink_dump_start include/linux/netlink.h:338 [inline]
inet_diag_rcv_msg_compat+0x209/0x4c0 net/ipv4/inet_diag.c:1405
sock_diag_rcv_msg+0xe7/0x410
netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2543
sock_diag_rcv+0x2a/0x40 net/core/sock_diag.c:280
netlink_unicast_kernel net/netlink/af_netlink.c:1341 [inline]
netlink_unicast+0x7ea/0x980 net/netlink/af_netlink.c:1367
netlink_sendmsg+0xa3b/0xd70 net/netlink/af_netlink.c:1908
sock_sendmsg_nosec net/socket.c:730 [inline]
__sock_sendmsg+0x221/0x270 net/socket.c:745
____sys_sendmsg+0x525/0x7d0 net/socket.c:2584
___sys_sendmsg net/socket.c:2638 [inline]
__sys_sendmsg+0x2b0/0x3a0 net/socket.c:2667
do_syscall_64+0xf9/0x240
entry_SYSCALL_64_after_hwframe+0x6f/0x77

other info that might help us debug this:

Possible unsafe locking scenario:

CPU0 CPU1
---- ----
lock(&h->lhash2[i].lock);
lock(k-sk_lock-AF_INET6);
lock(&h->lhash2[i].lock);
lock(k-sk_lock-AF_INET6);

*** DEADLOCK ***

5 locks held by syz-executor.2/24141:
#0: ffffffff8f380bc8 (sock_diag_mutex){+.+.}-{3:3}, at:
sock_diag_rcv+0x1b/0x40 net/core/sock_diag.c:279
#1: ffffffff8f380a28 (sock_diag_table_mutex){+.+.}-{3:3}, at:
sock_diag_rcv_msg+0xc6/0x410 net/core/sock_diag.c:259
#2: ffff8880586f5680 (nlk_cb_mutex-SOCK_DIAG){+.+.}-{3:3}, at:
netlink_dump+0xde/0xc80 net/netlink/af_netlink.c:2211
#3: ffffffff8f464568 (inet_diag_table_mutex){+.+.}-{3:3}, at:
inet_diag_lock_handler net/ipv4/inet_diag.c:63 [inline]
#3: ffffffff8f464568 (inet_diag_table_mutex){+.+.}-{3:3}, at:
__inet_diag_dump+0x191/0x3a0 net/ipv4/inet_diag.c:1261
#4: ffffc9000135e488 (&h->lhash2[i].lock){+.+.}-{2:2}, at: spin_lock
include/linux/spinlock.h:351 [inline]
#4: ffffc9000135e488 (&h->lhash2[i].lock){+.+.}-{2:2}, at:
inet_diag_dump_icsk+0x39f/0x1f80 net/ipv4/inet_diag.c:1038

stack backtrace:
CPU: 0 PID: 24141 Comm: syz-executor.2 Not tainted
6.8.0-rc4-syzkaller-00212-g40b9385dd8e6 #0
Hardware name: Google Google Compute Engine/Google Compute Engine,
BIOS Google 01/25/2024
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0x1e7/0x2e0 lib/dump_stack.c:106
check_noncircular+0x36a/0x4a0 kernel/locking/lockdep.c:2187
check_prev_add kernel/locking/lockdep.c:3134 [inline]
check_prevs_add kernel/locking/lockdep.c:3253 [inline]
validate_chain+0x18ca/0x58e0 kernel/locking/lockdep.c:3869
__lock_acquire+0x1345/0x1fd0 kernel/locking/lockdep.c:5137
lock_acquire+0x1e3/0x530 kernel/locking/lockdep.c:5754
lock_sock_fast include/net/sock.h:1723 [inline]
subflow_get_info+0x166/0xd20 net/mptcp/diag.c:28
tcp_diag_put_ulp net/ipv4/tcp_diag.c:100 [inline]
tcp_diag_get_aux+0x738/0x830 net/ipv4/tcp_diag.c:137
inet_sk_diag_fill+0x10ed/0x1e00 net/ipv4/inet_diag.c:345
inet_diag_dump_icsk+0x55b/0x1f80 net/ipv4/inet_diag.c:1061
__inet_diag_dump+0x211/0x3a0 net/ipv4/inet_diag.c:1263
inet_diag_dump_compat+0x1c1/0x2d0 net/ipv4/inet_diag.c:1371
netlink_dump+0x59b/0xc80 net/netlink/af_netlink.c:2264
__netlink_dump_start+0x5df/0x790 net/netlink/af_netlink.c:2370
netlink_dump_start include/linux/netlink.h:338 [inline]
inet_diag_rcv_msg_compat+0x209/0x4c0 net/ipv4/inet_diag.c:1405
sock_diag_rcv_msg+0xe7/0x410
netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2543
sock_diag_rcv+0x2a/0x40 net/core/sock_diag.c:280
netlink_unicast_kernel net/netlink/af_netlink.c:1341 [inline]
netlink_unicast+0x7ea/0x980 net/netlink/af_netlink.c:1367
netlink_sendmsg+0xa3b/0xd70 net/netlink/af_netlink.c:1908
sock_sendmsg_nosec net/socket.c:730 [inline]
__sock_sendmsg+0x221/0x270 net/socket.c:745
____sys_sendmsg+0x525/0x7d0 net/socket.c:2584
___sys_sendmsg net/socket.c:2638 [inline]
__sys_sendmsg+0x2b0/0x3a0 net/socket.c:2667
do_syscall_64+0xf9/0x240
entry_SYSCALL_64_after_hwframe+0x6f/0x77
RIP: 0033:0x7fbc4c07dda9
Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 e1 20 00 00 90 48 89 f8 48
89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d
01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007fbc4ce750c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 00007fbc4c1abf80 RCX: 00007fbc4c07dda9
RDX: 0000000000000000 RSI: 0000000020000000 RDI: 0000000000000004
RBP: 00007fbc4c0ca47a R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 000000000000000b R14: 00007fbc4c1abf80 R15: 00007ffcc3d92258
</TASK>
BUG: sleeping function called from invalid context at net/core/sock.c:3554
in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 24141, name:
syz-executor.2
preempt_count: 1, expected: 0
RCU nest depth: 0, expected: 0
INFO: lockdep is turned off.
Preemption disabled at:
[<0000000000000000>] 0x0
CPU: 0 PID: 24141 Comm: syz-executor.2 Not tainted
6.8.0-rc4-syzkaller-00212-g40b9385dd8e6 #0
Hardware name: Google Google Compute Engine/Google Compute Engine,
BIOS Google 01/25/2024
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0x1e7/0x2e0 lib/dump_stack.c:106
__might_resched+0x5d3/0x780 kernel/sched/core.c:10176
__lock_sock_fast+0x31/0xe0 net/core/sock.c:3554
lock_sock_fast include/net/sock.h:1725 [inline]
subflow_get_info+0x172/0xd20 net/mptcp/diag.c:28
tcp_diag_put_ulp net/ipv4/tcp_diag.c:100 [inline]
tcp_diag_get_aux+0x738/0x830 net/ipv4/tcp_diag.c:137
inet_sk_diag_fill+0x10ed/0x1e00 net/ipv4/inet_diag.c:345
inet_diag_dump_icsk+0x55b/0x1f80 net/ipv4/inet_diag.c:1061
__inet_diag_dump+0x211/0x3a0 net/ipv4/inet_diag.c:1263
inet_diag_dump_compat+0x1c1/0x2d0 net/ipv4/inet_diag.c:1371
netlink_dump+0x59b/0xc80 net/netlink/af_netlink.c:2264
__netlink_dump_start+0x5df/0x790 net/netlink/af_netlink.c:2370
netlink_dump_start include/linux/netlink.h:338 [inline]
inet_diag_rcv_msg_compat+0x209/0x4c0 net/ipv4/inet_diag.c:1405
sock_diag_rcv_msg+0xe7/0x410
netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2543
sock_diag_rcv+0x2a/0x40 net/core/sock_diag.c:280
netlink_unicast_kernel net/netlink/af_netlink.c:1341 [inline]
netlink_unicast+0x7ea/0x980 net/netlink/af_netlink.c:1367
netlink_sendmsg+0xa3b/0xd70 net/netlink/af_netlink.c:1908
sock_sendmsg_nosec net/socket.c:730 [inline]
__sock_sendmsg+0x221/0x270 net/socket.c:745
____sys_sendmsg+0x525/0x7d0 net/socket.c:2584
___sys_sendmsg net/socket.c:2638 [inline]
__sys_sendmsg+0x2b0/0x3a0 net/socket.c:2667
do_syscall_64+0xf9/0x240
entry_SYSCALL_64_after_hwframe+0x6f/0x77
RIP: 0033:0x7fbc4c07dda9
Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 e1 20 00 00 90 48 89 f8 48
89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d
01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007fbc4ce750c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 00007fbc4c1abf80 RCX: 00007fbc4c07dda9
RDX: 0000000000000000 RSI: 0000000020000000 RDI: 0000000000000004
RBP: 00007fbc4c0ca47a R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 000000000000000b R14: 00007fbc4c1abf80 R15: 00007ffcc3d92258
  
Paolo Abeni Feb. 20, 2024, 5:33 p.m. UTC | #5
On Mon, 2024-02-19 at 19:33 +0100, Eric Dumazet wrote:
> On Mon, Feb 19, 2024 at 7:04 PM Paolo Abeni <pabeni@redhat.com> wrote:
> > Thanks for the head-up. This later option looks preferable, to avoid
> > quit a bit of noise with _ONCE annotation. Is there a syzkaller splat I
> > could look at? if it landed on the ML, I missed it.
> > 
> 
> Not landed yet, here is the splat :
> 
> ======================================================
> WARNING: possible circular locking dependency detected
> 6.8.0-rc4-syzkaller-00212-g40b9385dd8e6 #0 Not tainted
> ------------------------------------------------------
> syz-executor.2/24141 is trying to acquire lock:
> ffff888045870130 (k-sk_lock-AF_INET6){+.+.}-{0:0}, at:
> tcp_diag_put_ulp net/ipv4/tcp_diag.c:100 [inline]
> ffff888045870130 (k-sk_lock-AF_INET6){+.+.}-{0:0}, at:
> tcp_diag_get_aux+0x738/0x830 net/ipv4/tcp_diag.c:137
> 
> but task is already holding lock:
> ffffc9000135e488 (&h->lhash2[i].lock){+.+.}-{2:2}, at: spin_lock
> include/linux/spinlock.h:351 [inline]
> ffffc9000135e488 (&h->lhash2[i].lock){+.+.}-{2:2}, at:
> inet_diag_dump_icsk+0x39f/0x1f80 net/ipv4/inet_diag.c:1038

[Sorry for the latency]. Yes it looks like that checking the listener
status will work. I can test and send the formal patch - with the due
credits! - or do you prefer otherwise?

Thanks!

Paolo
  
Eric Dumazet Feb. 20, 2024, 6:03 p.m. UTC | #6
On Tue, Feb 20, 2024 at 6:33 PM Paolo Abeni <pabeni@redhat.com> wrote:
>
> On Mon, 2024-02-19 at 19:33 +0100, Eric Dumazet wrote:
> > On Mon, Feb 19, 2024 at 7:04 PM Paolo Abeni <pabeni@redhat.com> wrote:
> > > Thanks for the head-up. This later option looks preferable, to avoid
> > > quit a bit of noise with _ONCE annotation. Is there a syzkaller splat I
> > > could look at? if it landed on the ML, I missed it.
> > >
> >
> > Not landed yet, here is the splat :
> >
> > ======================================================
> > WARNING: possible circular locking dependency detected
> > 6.8.0-rc4-syzkaller-00212-g40b9385dd8e6 #0 Not tainted
> > ------------------------------------------------------
> > syz-executor.2/24141 is trying to acquire lock:
> > ffff888045870130 (k-sk_lock-AF_INET6){+.+.}-{0:0}, at:
> > tcp_diag_put_ulp net/ipv4/tcp_diag.c:100 [inline]
> > ffff888045870130 (k-sk_lock-AF_INET6){+.+.}-{0:0}, at:
> > tcp_diag_get_aux+0x738/0x830 net/ipv4/tcp_diag.c:137
> >
> > but task is already holding lock:
> > ffffc9000135e488 (&h->lhash2[i].lock){+.+.}-{2:2}, at: spin_lock
> > include/linux/spinlock.h:351 [inline]
> > ffffc9000135e488 (&h->lhash2[i].lock){+.+.}-{2:2}, at:
> > inet_diag_dump_icsk+0x39f/0x1f80 net/ipv4/inet_diag.c:1038
>
> [Sorry for the latency]. Yes it looks like that checking the listener
> status will work. I can test and send the formal patch - with the due
> credits! - or do you prefer otherwise?

Sure, please send the formal patch, thank you.
  

Patch

diff --git a/include/net/tcp.h b/include/net/tcp.h
index dd78a1181031..f6eba9652d01 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2506,7 +2506,7 @@  struct tcp_ulp_ops {
 	/* cleanup ulp */
 	void (*release)(struct sock *sk);
 	/* diagnostic */
-	int (*get_info)(const struct sock *sk, struct sk_buff *skb);
+	int (*get_info)(struct sock *sk, struct sk_buff *skb);
 	size_t (*get_info_size)(const struct sock *sk);
 	/* clone ulp */
 	void (*clone)(const struct request_sock *req, struct sock *newsk,
diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c
index a536586742f2..e57c5f47f035 100644
--- a/net/mptcp/diag.c
+++ b/net/mptcp/diag.c
@@ -13,17 +13,19 @@ 
 #include <uapi/linux/mptcp.h>
 #include "protocol.h"
 
-static int subflow_get_info(const struct sock *sk, struct sk_buff *skb)
+static int subflow_get_info(struct sock *sk, struct sk_buff *skb)
 {
 	struct mptcp_subflow_context *sf;
 	struct nlattr *start;
 	u32 flags = 0;
+	bool slow;
 	int err;
 
 	start = nla_nest_start_noflag(skb, INET_ULP_INFO_MPTCP);
 	if (!start)
 		return -EMSGSIZE;
 
+	slow = lock_sock_fast(sk);
 	rcu_read_lock();
 	sf = rcu_dereference(inet_csk(sk)->icsk_ulp_data);
 	if (!sf) {
@@ -69,11 +71,13 @@  static int subflow_get_info(const struct sock *sk, struct sk_buff *skb)
 	}
 
 	rcu_read_unlock();
+	unlock_sock_fast(sk, slow);
 	nla_nest_end(skb, start);
 	return 0;
 
 nla_failure:
 	rcu_read_unlock();
+	unlock_sock_fast(sk, slow);
 	nla_nest_cancel(skb, start);
 	return err;
 }
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 1c2c6800949d..b4674f03d71a 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -1003,7 +1003,7 @@  static u16 tls_user_config(struct tls_context *ctx, bool tx)
 	return 0;
 }
 
-static int tls_get_info(const struct sock *sk, struct sk_buff *skb)
+static int tls_get_info(struct sock *sk, struct sk_buff *skb)
 {
 	u16 version, cipher_type;
 	struct tls_context *ctx;