[RFC,nf-next,v3,1/2] netfilter: bpf: support prog update

Message ID 1703081351-85579-2-git-send-email-alibuda@linux.alibaba.com
State New
Headers
Series netfilter: bpf: support prog update |

Commit Message

D. Wythe Dec. 20, 2023, 2:09 p.m. UTC
  From: "D. Wythe" <alibuda@linux.alibaba.com>

To support the prog update, we need to ensure that the prog seen
within the hook is always valid. Considering that hooks are always
protected by rcu_read_lock(), which provide us the ability to
access the prog under rcu.

Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
---
 net/netfilter/nf_bpf_link.c | 63 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 48 insertions(+), 15 deletions(-)
  

Comments

Alexei Starovoitov Dec. 20, 2023, 9:11 p.m. UTC | #1
On Wed, Dec 20, 2023 at 6:09 AM D. Wythe <alibuda@linux.alibaba.com> wrote:
>
> From: "D. Wythe" <alibuda@linux.alibaba.com>
>
> To support the prog update, we need to ensure that the prog seen
> within the hook is always valid. Considering that hooks are always
> protected by rcu_read_lock(), which provide us the ability to
> access the prog under rcu.
>
> Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
> ---
>  net/netfilter/nf_bpf_link.c | 63 ++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 48 insertions(+), 15 deletions(-)
>
> diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
> index e502ec0..9bc91d1 100644
> --- a/net/netfilter/nf_bpf_link.c
> +++ b/net/netfilter/nf_bpf_link.c
> @@ -8,17 +8,8 @@
>  #include <net/netfilter/nf_bpf_link.h>
>  #include <uapi/linux/netfilter_ipv4.h>
>
> -static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb,
> -                                   const struct nf_hook_state *s)
> -{
> -       const struct bpf_prog *prog = bpf_prog;
> -       struct bpf_nf_ctx ctx = {
> -               .state = s,
> -               .skb = skb,
> -       };
> -
> -       return bpf_prog_run(prog, &ctx);
> -}
> +/* protect link update in parallel */
> +static DEFINE_MUTEX(bpf_nf_mutex);
>
>  struct bpf_nf_link {
>         struct bpf_link link;
> @@ -26,8 +17,20 @@ struct bpf_nf_link {
>         struct net *net;
>         u32 dead;
>         const struct nf_defrag_hook *defrag_hook;
> +       struct rcu_head head;

I have to point out the same issues as before, but
will ask them differently...

Why do you think above rcu_head is necessary?

>  };
>
> +static unsigned int nf_hook_run_bpf(void *bpf_link, struct sk_buff *skb,
> +                                   const struct nf_hook_state *s)
> +{
> +       const struct bpf_nf_link *nf_link = bpf_link;
> +       struct bpf_nf_ctx ctx = {
> +               .state = s,
> +               .skb = skb,
> +       };
> +       return bpf_prog_run(rcu_dereference_raw(nf_link->link.prog), &ctx);
> +}
> +
>  #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
>  static const struct nf_defrag_hook *
>  get_proto_defrag_hook(struct bpf_nf_link *link,
> @@ -126,8 +129,7 @@ static void bpf_nf_link_release(struct bpf_link *link)
>  static void bpf_nf_link_dealloc(struct bpf_link *link)
>  {
>         struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
> -
> -       kfree(nf_link);
> +       kfree_rcu(nf_link, head);

Why is this needed ?
Have you looked at tcx_link_lops ?

>  }
>
>  static int bpf_nf_link_detach(struct bpf_link *link)
> @@ -162,7 +164,34 @@ static int bpf_nf_link_fill_link_info(const struct bpf_link *link,
>  static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
>                               struct bpf_prog *old_prog)
>  {
> -       return -EOPNOTSUPP;
> +       struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
> +       int err = 0;
> +
> +       mutex_lock(&bpf_nf_mutex);

Why do you need this mutex?
What race does it solve?

> +
> +       if (nf_link->dead) {
> +               err = -EPERM;
> +               goto out;
> +       }
> +
> +       /* target old_prog mismatch */
> +       if (old_prog && link->prog != old_prog) {
> +               err = -EPERM;
> +               goto out;
> +       }
> +
> +       old_prog = link->prog;
> +       if (old_prog == new_prog) {
> +               /* don't need update */
> +               bpf_prog_put(new_prog);
> +               goto out;
> +       }
> +
> +       old_prog = xchg(&link->prog, new_prog);
> +       bpf_prog_put(old_prog);
> +out:
> +       mutex_unlock(&bpf_nf_mutex);
> +       return err;
>  }
>
>  static const struct bpf_link_ops bpf_nf_link_lops = {
> @@ -226,7 +255,11 @@ int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
>
>         link->hook_ops.hook = nf_hook_run_bpf;
>         link->hook_ops.hook_ops_type = NF_HOOK_OP_BPF;
> -       link->hook_ops.priv = prog;
> +
> +       /* bpf_nf_link_release & bpf_nf_link_dealloc() can ensures that link remains
> +        * valid at all times within nf_hook_run_bpf().
> +        */
> +       link->hook_ops.priv = link;
>
>         link->hook_ops.pf = attr->link_create.netfilter.pf;
>         link->hook_ops.priority = attr->link_create.netfilter.priority;
> --
> 1.8.3.1
>
  
D. Wythe Dec. 22, 2023, 7:06 a.m. UTC | #2
On 12/21/23 5:11 AM, Alexei Starovoitov wrote:
> On Wed, Dec 20, 2023 at 6:09 AM D. Wythe <alibuda@linux.alibaba.com> wrote:
>> From: "D. Wythe" <alibuda@linux.alibaba.com>
>>
>> To support the prog update, we need to ensure that the prog seen
>> within the hook is always valid. Considering that hooks are always
>> protected by rcu_read_lock(), which provide us the ability to
>> access the prog under rcu.
>>
>> Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
>> ---
>>   net/netfilter/nf_bpf_link.c | 63 ++++++++++++++++++++++++++++++++++-----------
>>   1 file changed, 48 insertions(+), 15 deletions(-)
>>
>> diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
>> index e502ec0..9bc91d1 100644
>> --- a/net/netfilter/nf_bpf_link.c
>> +++ b/net/netfilter/nf_bpf_link.c
>> @@ -8,17 +8,8 @@
>>   #include <net/netfilter/nf_bpf_link.h>
>>   #include <uapi/linux/netfilter_ipv4.h>
>>
>> -static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb,
>> -                                   const struct nf_hook_state *s)
>> -{
>> -       const struct bpf_prog *prog = bpf_prog;
>> -       struct bpf_nf_ctx ctx = {
>> -               .state = s,
>> -               .skb = skb,
>> -       };
>> -
>> -       return bpf_prog_run(prog, &ctx);
>> -}
>> +/* protect link update in parallel */
>> +static DEFINE_MUTEX(bpf_nf_mutex);
>>
>>   struct bpf_nf_link {
>>          struct bpf_link link;
>> @@ -26,8 +17,20 @@ struct bpf_nf_link {
>>          struct net *net;
>>          u32 dead;
>>          const struct nf_defrag_hook *defrag_hook;
>> +       struct rcu_head head;
> I have to point out the same issues as before, but
> will ask them differently...
>
> Why do you think above rcu_head is necessary?
>
>>   };
>>
>> +static unsigned int nf_hook_run_bpf(void *bpf_link, struct sk_buff *skb,
>> +                                   const struct nf_hook_state *s)
>> +{
>> +       const struct bpf_nf_link *nf_link = bpf_link;
>> +       struct bpf_nf_ctx ctx = {
>> +               .state = s,
>> +               .skb = skb,
>> +       };
>> +       return bpf_prog_run(rcu_dereference_raw(nf_link->link.prog), &ctx);
>> +}
>> +
>>   #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
>>   static const struct nf_defrag_hook *
>>   get_proto_defrag_hook(struct bpf_nf_link *link,
>> @@ -126,8 +129,7 @@ static void bpf_nf_link_release(struct bpf_link *link)
>>   static void bpf_nf_link_dealloc(struct bpf_link *link)
>>   {
>>          struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
>> -
>> -       kfree(nf_link);
>> +       kfree_rcu(nf_link, head);
> Why is this needed ?
> Have you looked at tcx_link_lops ?

Introducing rcu_head/kfree_rcu is to address the situation where the 
netfilter hooks might
still access the link after bpf_nf_link_dealloc.

                                                      nf_hook_run_bpf
                                                      const struct 
bpf_nf_link *nf_link = bpf_link;

bpf_nf_link_release
     nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops);

bpf_nf_link_dealloc
     free(link)
bpf_prog_run(link->prog);


I had checked the tcx_link_lops ,it's seems it use the synchronize_rcu() 
to solve the
same problem, which is also the way we used in the first version.

https://lore.kernel.org/bpf/1702467945-38866-1-git-send-email-alibuda@linux.alibaba.com/

However, we have received some opposing views, believing that this is a 
bit overkill,
so we decided to use kfree_rcu.

https://lore.kernel.org/bpf/20231213222415.GA13818@breakpoint.cc/

>>   }
>>
>>   static int bpf_nf_link_detach(struct bpf_link *link)
>> @@ -162,7 +164,34 @@ static int bpf_nf_link_fill_link_info(const struct bpf_link *link,
>>   static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
>>                                struct bpf_prog *old_prog)
>>   {
>> -       return -EOPNOTSUPP;
>> +       struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
>> +       int err = 0;
>> +
>> +       mutex_lock(&bpf_nf_mutex);
> Why do you need this mutex?
> What race does it solve?

To avoid user update a link with differ prog at the same time. I noticed 
that sys_bpf()
doesn't seem to prevent being invoked by user at the same time. Have I 
missed something?

Best wishes,
D. Wythe
>> +
>> +       if (nf_link->dead) {
>> +               err = -EPERM;
>> +               goto out;
>> +       }
>> +
>> +       /* target old_prog mismatch */
>> +       if (old_prog && link->prog != old_prog) {
>> +               err = -EPERM;
>> +               goto out;
>> +       }
>> +
>> +       old_prog = link->prog;
>> +       if (old_prog == new_prog) {
>> +               /* don't need update */
>> +               bpf_prog_put(new_prog);
>> +               goto out;
>> +       }
>> +
>> +       old_prog = xchg(&link->prog, new_prog);
>> +       bpf_prog_put(old_prog);
>> +out:
>> +       mutex_unlock(&bpf_nf_mutex);
>> +       return err;
>>   }
>>
>>   static const struct bpf_link_ops bpf_nf_link_lops = {
>> @@ -226,7 +255,11 @@ int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
>>
>>          link->hook_ops.hook = nf_hook_run_bpf;
>>          link->hook_ops.hook_ops_type = NF_HOOK_OP_BPF;
>> -       link->hook_ops.priv = prog;
>> +
>> +       /* bpf_nf_link_release & bpf_nf_link_dealloc() can ensures that link remains
>> +        * valid at all times within nf_hook_run_bpf().
>> +        */
>> +       link->hook_ops.priv = link;
>>
>>          link->hook_ops.pf = attr->link_create.netfilter.pf;
>>          link->hook_ops.priority = attr->link_create.netfilter.priority;
>> --
>> 1.8.3.1
>>
  
Alexei Starovoitov Dec. 22, 2023, 10:23 p.m. UTC | #3
On Thu, Dec 21, 2023 at 11:06 PM D. Wythe <alibuda@linux.alibaba.com> wrote:
>
>
>
> On 12/21/23 5:11 AM, Alexei Starovoitov wrote:
> > On Wed, Dec 20, 2023 at 6:09 AM D. Wythe <alibuda@linux.alibaba.com> wrote:
> >> From: "D. Wythe" <alibuda@linux.alibaba.com>
> >>
> >> To support the prog update, we need to ensure that the prog seen
> >> within the hook is always valid. Considering that hooks are always
> >> protected by rcu_read_lock(), which provide us the ability to
> >> access the prog under rcu.
> >>
> >> Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
> >> ---
> >>   net/netfilter/nf_bpf_link.c | 63 ++++++++++++++++++++++++++++++++++-----------
> >>   1 file changed, 48 insertions(+), 15 deletions(-)
> >>
> >> diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
> >> index e502ec0..9bc91d1 100644
> >> --- a/net/netfilter/nf_bpf_link.c
> >> +++ b/net/netfilter/nf_bpf_link.c
> >> @@ -8,17 +8,8 @@
> >>   #include <net/netfilter/nf_bpf_link.h>
> >>   #include <uapi/linux/netfilter_ipv4.h>
> >>
> >> -static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb,
> >> -                                   const struct nf_hook_state *s)
> >> -{
> >> -       const struct bpf_prog *prog = bpf_prog;
> >> -       struct bpf_nf_ctx ctx = {
> >> -               .state = s,
> >> -               .skb = skb,
> >> -       };
> >> -
> >> -       return bpf_prog_run(prog, &ctx);
> >> -}
> >> +/* protect link update in parallel */
> >> +static DEFINE_MUTEX(bpf_nf_mutex);
> >>
> >>   struct bpf_nf_link {
> >>          struct bpf_link link;
> >> @@ -26,8 +17,20 @@ struct bpf_nf_link {
> >>          struct net *net;
> >>          u32 dead;
> >>          const struct nf_defrag_hook *defrag_hook;
> >> +       struct rcu_head head;
> > I have to point out the same issues as before, but
> > will ask them differently...
> >
> > Why do you think above rcu_head is necessary?
> >
> >>   };
> >>
> >> +static unsigned int nf_hook_run_bpf(void *bpf_link, struct sk_buff *skb,
> >> +                                   const struct nf_hook_state *s)
> >> +{
> >> +       const struct bpf_nf_link *nf_link = bpf_link;
> >> +       struct bpf_nf_ctx ctx = {
> >> +               .state = s,
> >> +               .skb = skb,
> >> +       };
> >> +       return bpf_prog_run(rcu_dereference_raw(nf_link->link.prog), &ctx);
> >> +}
> >> +
> >>   #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
> >>   static const struct nf_defrag_hook *
> >>   get_proto_defrag_hook(struct bpf_nf_link *link,
> >> @@ -126,8 +129,7 @@ static void bpf_nf_link_release(struct bpf_link *link)
> >>   static void bpf_nf_link_dealloc(struct bpf_link *link)
> >>   {
> >>          struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
> >> -
> >> -       kfree(nf_link);
> >> +       kfree_rcu(nf_link, head);
> > Why is this needed ?
> > Have you looked at tcx_link_lops ?
>
> Introducing rcu_head/kfree_rcu is to address the situation where the
> netfilter hooks might
> still access the link after bpf_nf_link_dealloc.

Why do you think so?

>
>                                                       nf_hook_run_bpf
>                                                       const struct
> bpf_nf_link *nf_link = bpf_link;
>
> bpf_nf_link_release
>      nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops);
>
> bpf_nf_link_dealloc
>      free(link)
> bpf_prog_run(link->prog);
>
>
> I had checked the tcx_link_lops ,it's seems it use the synchronize_rcu()
> to solve the

Where do you see such code in tcx_link_lops ?

> same problem, which is also the way we used in the first version.
>
> https://lore.kernel.org/bpf/1702467945-38866-1-git-send-email-alibuda@linux.alibaba.com/
>
> However, we have received some opposing views, believing that this is a
> bit overkill,
> so we decided to use kfree_rcu.
>
> https://lore.kernel.org/bpf/20231213222415.GA13818@breakpoint.cc/
>
> >>   }
> >>
> >>   static int bpf_nf_link_detach(struct bpf_link *link)
> >> @@ -162,7 +164,34 @@ static int bpf_nf_link_fill_link_info(const struct bpf_link *link,
> >>   static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
> >>                                struct bpf_prog *old_prog)
> >>   {
> >> -       return -EOPNOTSUPP;
> >> +       struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
> >> +       int err = 0;
> >> +
> >> +       mutex_lock(&bpf_nf_mutex);
> > Why do you need this mutex?
> > What race does it solve?
>
> To avoid user update a link with differ prog at the same time. I noticed
> that sys_bpf()
> doesn't seem to prevent being invoked by user at the same time. Have I
> missed something?

You're correct that sys_bpf() doesn't lock anything.
But what are you serializing in this bpf_nf_link_update() ?
What will happen if multiple bpf_nf_link_update()
without mutex run on different CPUs in parallel ?
  
D. Wythe Dec. 27, 2023, 8:20 a.m. UTC | #4
On 12/23/23 6:23 AM, Alexei Starovoitov wrote:
> On Thu, Dec 21, 2023 at 11:06 PM D. Wythe <alibuda@linux.alibaba.com> wrote:
>>
>>
>> On 12/21/23 5:11 AM, Alexei Starovoitov wrote:
>>> On Wed, Dec 20, 2023 at 6:09 AM D. Wythe <alibuda@linux.alibaba.com> wrote:
>>>> From: "D. Wythe" <alibuda@linux.alibaba.com>
>>>>
>>>> To support the prog update, we need to ensure that the prog seen
>>>> within the hook is always valid. Considering that hooks are always
>>>> protected by rcu_read_lock(), which provide us the ability to
>>>> access the prog under rcu.
>>>>
>>>> Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
>>>> ---
>>>>    net/netfilter/nf_bpf_link.c | 63 ++++++++++++++++++++++++++++++++++-----------
>>>>    1 file changed, 48 insertions(+), 15 deletions(-)
>>>>
>>>> diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
>>>> index e502ec0..9bc91d1 100644
>>>> --- a/net/netfilter/nf_bpf_link.c
>>>> +++ b/net/netfilter/nf_bpf_link.c
>>>> @@ -8,17 +8,8 @@
>>>>    #include <net/netfilter/nf_bpf_link.h>
>>>>    #include <uapi/linux/netfilter_ipv4.h>
>>>>
>>>> -static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb,
>>>> -                                   const struct nf_hook_state *s)
>>>> -{
>>>> -       const struct bpf_prog *prog = bpf_prog;
>>>> -       struct bpf_nf_ctx ctx = {
>>>> -               .state = s,
>>>> -               .skb = skb,
>>>> -       };
>>>> -
>>>> -       return bpf_prog_run(prog, &ctx);
>>>> -}
>>>> +/* protect link update in parallel */
>>>> +static DEFINE_MUTEX(bpf_nf_mutex);
>>>>
>>>>    struct bpf_nf_link {
>>>>           struct bpf_link link;
>>>> @@ -26,8 +17,20 @@ struct bpf_nf_link {
>>>>           struct net *net;
>>>>           u32 dead;
>>>>           const struct nf_defrag_hook *defrag_hook;
>>>> +       struct rcu_head head;
>>> I have to point out the same issues as before, but
>>> will ask them differently...
>>>
>>> Why do you think above rcu_head is necessary?
>>>
>>>>    };
>>>>
>>>> +static unsigned int nf_hook_run_bpf(void *bpf_link, struct sk_buff *skb,
>>>> +                                   const struct nf_hook_state *s)
>>>> +{
>>>> +       const struct bpf_nf_link *nf_link = bpf_link;
>>>> +       struct bpf_nf_ctx ctx = {
>>>> +               .state = s,
>>>> +               .skb = skb,
>>>> +       };
>>>> +       return bpf_prog_run(rcu_dereference_raw(nf_link->link.prog), &ctx);
>>>> +}
>>>> +
>>>>    #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
>>>>    static const struct nf_defrag_hook *
>>>>    get_proto_defrag_hook(struct bpf_nf_link *link,
>>>> @@ -126,8 +129,7 @@ static void bpf_nf_link_release(struct bpf_link *link)
>>>>    static void bpf_nf_link_dealloc(struct bpf_link *link)
>>>>    {
>>>>           struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
>>>> -
>>>> -       kfree(nf_link);
>>>> +       kfree_rcu(nf_link, head);
>>> Why is this needed ?
>>> Have you looked at tcx_link_lops ?
>> Introducing rcu_head/kfree_rcu is to address the situation where the
>> netfilter hooks might
>> still access the link after bpf_nf_link_dealloc.
> Why do you think so?
>

Hi Alexei,


IMMO, nf_unregister_net_hook does not wait for the completion of the 
execution of the hook that is being removed,
instead, it allocates a new array without the very hook to replace the 
old arrayvia rcu_assign_pointer() (in __nf_hook_entries_try_shrink),
then it use call_rcu() to release the old one.

You can find more details in commit 
8c873e2199700c2de7dbd5eedb9d90d5f109462b.

In other words, when nf_unregister_net_hook returns, there may still be 
contexts executing hooks on the
old array, which means that the `link` may still be accessed after 
nf_unregister_net_hook returns.

And that's the reason why we use kfree_rcu() to release the `link`.
>>                                                        nf_hook_run_bpf
>>                                                        const struct
>> bpf_nf_link *nf_link = bpf_link;
>>
>> bpf_nf_link_release
>>       nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops);
>>
>> bpf_nf_link_dealloc
>>       free(link)
>> bpf_prog_run(link->prog);
>>
>>
>> I had checked the tcx_link_lops ,it's seems it use the synchronize_rcu()
>> to solve the
> Where do you see such code in tcx_link_lops ?

I'm not certain if the reason that it choose to use synchronize_rcu()is 
the same as mine,
but I did see it here:


tcx_link_release() -> tcx_entry_sync()


static inline void tcx_entry_sync(void)
{
     /* bpf_mprog_entry got a/b swapped, therefore ensure that
      * there are no inflight users on the old one anymore.
      */
     synchronize_rcu();
}

>> same problem, which is also the way we used in the first version.
>>
>> https://lore.kernel.org/bpf/1702467945-38866-1-git-send-email-alibuda@linux.alibaba.com/
>>
>> However, we have received some opposing views, believing that this is a
>> bit overkill,
>> so we decided to use kfree_rcu.
>>
>> https://lore.kernel.org/bpf/20231213222415.GA13818@breakpoint.cc/
>>
>>>>    }
>>>>
>>>>    static int bpf_nf_link_detach(struct bpf_link *link)
>>>> @@ -162,7 +164,34 @@ static int bpf_nf_link_fill_link_info(const struct bpf_link *link,
>>>>    static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
>>>>                                 struct bpf_prog *old_prog)
>>>>    {
>>>> -       return -EOPNOTSUPP;
>>>> +       struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
>>>> +       int err = 0;
>>>> +
>>>> +       mutex_lock(&bpf_nf_mutex);
>>> Why do you need this mutex?
>>> What race does it solve?
>> To avoid user update a link with differ prog at the same time. I noticed
>> that sys_bpf()
>> doesn't seem to prevent being invoked by user at the same time. Have I
>> missed something?
> You're correct that sys_bpf() doesn't lock anything.
> But what are you serializing in this bpf_nf_link_update() ?
> What will happen if multiple bpf_nf_link_update()
> without mutex run on different CPUs in parallel ?

I must admit that it is indeed feasible if we eliminate the mutex and 
use cmpxchg to swap the prog (we need to ensure that there is only one 
bpf_prog_put() on the old prog).
However, when cmpxchg fails, it means that this context has not 
outcompeted the other one, and we have to return a failure. Maybe 
something like this:

if (!cmpxchg(&link->prog, old_prog, new_prog)) {
     /* already replaced by another link_update */
     return -xxx;
}

As a comparison, The version with the mutex wouldn't encounter this 
error, every update would succeed. I think that it's too harsh for the 
user to receive a failure
in that case since they haven't done anything wrong.

Best wishes,
D. Wythe
  
Alexei Starovoitov Dec. 27, 2023, 7 p.m. UTC | #5
On Wed, Dec 27, 2023 at 12:20 AM D. Wythe <alibuda@linux.alibaba.com> wrote:
>
>
> Hi Alexei,
>
>
> IMMO, nf_unregister_net_hook does not wait for the completion of the
> execution of the hook that is being removed,
> instead, it allocates a new array without the very hook to replace the
> old arrayvia rcu_assign_pointer() (in __nf_hook_entries_try_shrink),
> then it use call_rcu() to release the old one.
>
> You can find more details in commit
> 8c873e2199700c2de7dbd5eedb9d90d5f109462b.
>
> In other words, when nf_unregister_net_hook returns, there may still be
> contexts executing hooks on the
> old array, which means that the `link` may still be accessed after
> nf_unregister_net_hook returns.
>
> And that's the reason why we use kfree_rcu() to release the `link`.
> >>                                                        nf_hook_run_bpf
> >>                                                        const struct
> >> bpf_nf_link *nf_link = bpf_link;
> >>
> >> bpf_nf_link_release
> >>       nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops);
> >>
> >> bpf_nf_link_dealloc
> >>       free(link)
> >> bpf_prog_run(link->prog);

Got it.
Sounds like it's an existing bug. If so it should be an independent
patch with Fixes tag.

Also please craft a test case to demonstrate UAF.

>
> I must admit that it is indeed feasible if we eliminate the mutex and
> use cmpxchg to swap the prog (we need to ensure that there is only one
> bpf_prog_put() on the old prog).
> However, when cmpxchg fails, it means that this context has not
> outcompeted the other one, and we have to return a failure. Maybe
> something like this:
>
> if (!cmpxchg(&link->prog, old_prog, new_prog)) {
>      /* already replaced by another link_update */
>      return -xxx;
> }
>
> As a comparison, The version with the mutex wouldn't encounter this
> error, every update would succeed. I think that it's too harsh for the
> user to receive a failure
> in that case since they haven't done anything wrong.

Disagree. The mutex doesn't prevent this issue.
There is always a race.
It happens when link_update.old_prog_fd and BPF_F_REPLACE
were specified.
One user space passes an FD of the old prog and
another user space doing the same. They both race and one of them
gets
if (old_prog && link->prog != old_prog) {
               err = -EPERM;

it's no different with dropping the mutex and doing:
if (old_prog) {
    if (!cmpxchg(&link->prog, old_prog, new_prog))
      -EPERM
} else {
   old_prog = xchg(&link->prog, new_prog);
}
  
D. Wythe Dec. 28, 2023, 11:06 a.m. UTC | #6
On 12/28/23 3:00 AM, Alexei Starovoitov wrote:
> On Wed, Dec 27, 2023 at 12:20 AM D. Wythe <alibuda@linux.alibaba.com> wrote:
>>
>> Hi Alexei,
>>
>>
>> IMMO, nf_unregister_net_hook does not wait for the completion of the
>> execution of the hook that is being removed,
>> instead, it allocates a new array without the very hook to replace the
>> old arrayvia rcu_assign_pointer() (in __nf_hook_entries_try_shrink),
>> then it use call_rcu() to release the old one.
>>
>> You can find more details in commit
>> 8c873e2199700c2de7dbd5eedb9d90d5f109462b.
>>
>> In other words, when nf_unregister_net_hook returns, there may still be
>> contexts executing hooks on the
>> old array, which means that the `link` may still be accessed after
>> nf_unregister_net_hook returns.
>>
>> And that's the reason why we use kfree_rcu() to release the `link`.
>>>>                                                         nf_hook_run_bpf
>>>>                                                         const struct
>>>> bpf_nf_link *nf_link = bpf_link;
>>>>
>>>> bpf_nf_link_release
>>>>        nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops);
>>>>
>>>> bpf_nf_link_dealloc
>>>>        free(link)
>>>> bpf_prog_run(link->prog);
> Got it.
> Sounds like it's an existing bug. If so it should be an independent
> patch with Fixes tag.
>
> Also please craft a test case to demonstrate UAF.
>

It is not an existing bug... Accessing the link within the hook was 
something I introduced here
to support updates😉, as previously there was no access to the link 
within the hook.
>> I must admit that it is indeed feasible if we eliminate the mutex and
>> use cmpxchg to swap the prog (we need to ensure that there is only one
>> bpf_prog_put() on the old prog).
>> However, when cmpxchg fails, it means that this context has not
>> outcompeted the other one, and we have to return a failure. Maybe
>> something like this:
>>
>> if (!cmpxchg(&link->prog, old_prog, new_prog)) {
>>       /* already replaced by another link_update */
>>       return -xxx;
>> }
>>
>> As a comparison, The version with the mutex wouldn't encounter this
>> error, every update would succeed. I think that it's too harsh for the
>> user to receive a failure
>> in that case since they haven't done anything wrong.
> Disagree. The mutex doesn't prevent this issue.
> There is always a race.
> It happens when link_update.old_prog_fd and BPF_F_REPLACE
> were specified.
> One user space passes an FD of the old prog and
> another user space doing the same. They both race and one of them
> gets
> if (old_prog && link->prog != old_prog) {
>                 err = -EPERM;
>
> it's no different with dropping the mutex and doing:
> if (old_prog) {
>      if (!cmpxchg(&link->prog, old_prog, new_prog))
>        -EPERM
> } else {
>     old_prog = xchg(&link->prog, new_prog);
> }

Got it!  It's very helpful, Thanks very much! I will modify my patch 
accordingly.


Best wishes,
D. Wythe
  

Patch

diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
index e502ec0..9bc91d1 100644
--- a/net/netfilter/nf_bpf_link.c
+++ b/net/netfilter/nf_bpf_link.c
@@ -8,17 +8,8 @@ 
 #include <net/netfilter/nf_bpf_link.h>
 #include <uapi/linux/netfilter_ipv4.h>
 
-static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb,
-				    const struct nf_hook_state *s)
-{
-	const struct bpf_prog *prog = bpf_prog;
-	struct bpf_nf_ctx ctx = {
-		.state = s,
-		.skb = skb,
-	};
-
-	return bpf_prog_run(prog, &ctx);
-}
+/* protect link update in parallel */
+static DEFINE_MUTEX(bpf_nf_mutex);
 
 struct bpf_nf_link {
 	struct bpf_link link;
@@ -26,8 +17,20 @@  struct bpf_nf_link {
 	struct net *net;
 	u32 dead;
 	const struct nf_defrag_hook *defrag_hook;
+	struct rcu_head head;
 };
 
+static unsigned int nf_hook_run_bpf(void *bpf_link, struct sk_buff *skb,
+				    const struct nf_hook_state *s)
+{
+	const struct bpf_nf_link *nf_link = bpf_link;
+	struct bpf_nf_ctx ctx = {
+		.state = s,
+		.skb = skb,
+	};
+	return bpf_prog_run(rcu_dereference_raw(nf_link->link.prog), &ctx);
+}
+
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
 static const struct nf_defrag_hook *
 get_proto_defrag_hook(struct bpf_nf_link *link,
@@ -126,8 +129,7 @@  static void bpf_nf_link_release(struct bpf_link *link)
 static void bpf_nf_link_dealloc(struct bpf_link *link)
 {
 	struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
-
-	kfree(nf_link);
+	kfree_rcu(nf_link, head);
 }
 
 static int bpf_nf_link_detach(struct bpf_link *link)
@@ -162,7 +164,34 @@  static int bpf_nf_link_fill_link_info(const struct bpf_link *link,
 static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
 			      struct bpf_prog *old_prog)
 {
-	return -EOPNOTSUPP;
+	struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+	int err = 0;
+
+	mutex_lock(&bpf_nf_mutex);
+
+	if (nf_link->dead) {
+		err = -EPERM;
+		goto out;
+	}
+
+	/* target old_prog mismatch */
+	if (old_prog && link->prog != old_prog) {
+		err = -EPERM;
+		goto out;
+	}
+
+	old_prog = link->prog;
+	if (old_prog == new_prog) {
+		/* don't need update */
+		bpf_prog_put(new_prog);
+		goto out;
+	}
+
+	old_prog = xchg(&link->prog, new_prog);
+	bpf_prog_put(old_prog);
+out:
+	mutex_unlock(&bpf_nf_mutex);
+	return err;
 }
 
 static const struct bpf_link_ops bpf_nf_link_lops = {
@@ -226,7 +255,11 @@  int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 
 	link->hook_ops.hook = nf_hook_run_bpf;
 	link->hook_ops.hook_ops_type = NF_HOOK_OP_BPF;
-	link->hook_ops.priv = prog;
+
+	/* bpf_nf_link_release & bpf_nf_link_dealloc() can ensures that link remains
+	 * valid at all times within nf_hook_run_bpf().
+	 */
+	link->hook_ops.priv = link;
 
 	link->hook_ops.pf = attr->link_create.netfilter.pf;
 	link->hook_ops.priority = attr->link_create.netfilter.priority;