[bpf-next,v2,5/6] bpf: teach the verifier to enforce css_iter and process_iter in RCU CS

Message ID 20230912070149.969939-6-zhouchuyi@bytedance.com
State New
Headers
Series Add Open-coded process and css iters |

Commit Message

Chuyi Zhou Sept. 12, 2023, 7:01 a.m. UTC
  css_iter and process_iter should be used in rcu section. Specifically, in
sleepable progs explicit bpf_rcu_read_lock() is needed before use these
iters. In normal bpf progs that have implicit rcu_read_lock(), it's OK to
use them directly.

This patch checks whether we are in rcu cs before we want to invoke
bpf_iter_process_new and bpf_iter_css_{pre, post}_new in
mark_stack_slots_iter(). If the rcu protection is guaranteed, we would
let st->type = PTR_TO_STACK | MEM_RCU. is_iter_reg_valid_init() will
reject if reg->type is UNTRUSTED.

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
---
 kernel/bpf/verifier.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)
  

Comments

Chuyi Zhou Sept. 13, 2023, 1:53 p.m. UTC | #1
Hello.

在 2023/9/12 15:01, Chuyi Zhou 写道:
> css_iter and process_iter should be used in rcu section. Specifically, in
> sleepable progs explicit bpf_rcu_read_lock() is needed before use these
> iters. In normal bpf progs that have implicit rcu_read_lock(), it's OK to
> use them directly.
> 
> This patch checks whether we are in rcu cs before we want to invoke
> bpf_iter_process_new and bpf_iter_css_{pre, post}_new in
> mark_stack_slots_iter(). If the rcu protection is guaranteed, we would
> let st->type = PTR_TO_STACK | MEM_RCU. is_iter_reg_valid_init() will
> reject if reg->type is UNTRUSTED.

I use the following BPF Prog to test this patch:

SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
int iter_task_for_each_sleep(void *ctx)
{
	struct task_struct *task;
	struct task_struct *cur_task = bpf_get_current_task_btf();

	if (cur_task->pid != target_pid)
		return 0;
	bpf_rcu_read_lock();
	bpf_for_each(process, task) {
		bpf_rcu_read_unlock();
		if (task->pid == target_pid)
			process_cnt += 1;
		bpf_rcu_read_lock();
	}
	bpf_rcu_read_unlock();
	return 0;
}

Unfortunately, we can pass the verifier.

Then I add some printk-messages before setting/clearing state to help debug:

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d151e6b43a5f..35f3fa9471a9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1200,7 +1200,7 @@ static int mark_stack_slots_iter(struct 
bpf_verifier_env *env,
                 __mark_reg_known_zero(st);
                 st->type = PTR_TO_STACK; /* we don't have dedicated reg 
type */
                 if (is_iter_need_rcu(meta)) {
+                       printk("mark reg_addr : %px", st);
                         if (in_rcu_cs(env))
                                 st->type |= MEM_RCU;
                         else
@@ -11472,8 +11472,8 @@ static int check_kfunc_call(struct 
bpf_verifier_env *env, struct bpf_insn *insn,
                         return -EINVAL;
                 } else if (rcu_unlock) {
                         bpf_for_each_reg_in_vstate(env->cur_state, 
state, reg, ({
+                               printk("clear reg_addr : %px MEM_RCU : 
%d PTR_UNTRUSTED : %d\n ", reg, reg->type & MEM_RCU, reg->type & 
PTR_UNTRUSTED);
                                 if (reg->type & MEM_RCU) {
-                                       printk("clear reg addr : %lld", 
reg);
                                         reg->type &= ~(MEM_RCU | 
PTR_MAYBE_NULL);
                                         reg->type |= PTR_UNTRUSTED;
                                 }


The demsg log:

[  393.705324] mark reg_addr : ffff88814e40e200

[  393.706883] clear reg_addr : ffff88814d5f8000 MEM_RCU : 0 
PTR_UNTRUSTED : 0

[  393.707353] clear reg_addr : ffff88814d5f8078 MEM_RCU : 0 
PTR_UNTRUSTED : 0

[  393.708099] clear reg_addr : ffff88814d5f80f0 MEM_RCU : 0 
PTR_UNTRUSTED : 0
....
....

I didn't see ffff88814e40e200 is cleared as expected because 
bpf_for_each_reg_in_vstate didn't find it.

It seems when we are doing bpf_read_unlock() in the middle of iteration 
and want to clearing state through bpf_for_each_reg_in_vstate, we can 
not find the previous reg which we marked MEM_RCU/PTR_UNTRUSTED in 
mark_stack_slots_iter().

I thought maybe the correct answer here is operating the *iter_reg* 
parameter in mark_stack_slots_iter() direcly so we can find it in 
bpf_for_each_reg_in_vstate.

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6a6827ba7a18..53330ddf2b3c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1218,6 +1218,12 @@ static int mark_stack_slots_iter(struct 
bpf_verifier_env *env,
                 mark_stack_slot_scratched(env, spi - i);
         }

+       if (is_iter_need_rcu(meta)) {
+               if (in_rcu_cs(env))
+                       reg->type |= MEM_RCU;
+               else
+                       reg->type |= PTR_UNTRUSTED;
+       }
         return 0;
  }

@@ -1307,7 +1315,8 @@ static bool is_iter_reg_valid_init(struct 
bpf_verifier_env *env, struct bpf_reg_
                         if (slot->slot_type[j] != STACK_ITER)
                                 Kumarreturn false;
         }
-
+       if (reg->type & PTR_UNTRUSTED)
+               return false;
         return true;
  }

However, it did not work either. The reason it didn't work is the state 
of iter_reg will be cleared implicitly before the 
is_iter_reg_valid_init() even we don't call bpf_rcu_unlock.

It would be appreciate if you could give some suggestion. Maby it worthy 
to try the solution proposed by Kumar?[1]

[1] 
https://lore.kernel.org/lkml/CAP01T77cWxWNwq5HLr+Woiu7k4-P3QQfJWX1OeQJUkxW3=P4bA@mail.gmail.com/#t
  
Chuyi Zhou Sept. 14, 2023, 8:56 a.m. UTC | #2
在 2023/9/13 21:53, Chuyi Zhou 写道:
> Hello.
> 
> 在 2023/9/12 15:01, Chuyi Zhou 写道:
>> css_iter and process_iter should be used in rcu section. Specifically, in
>> sleepable progs explicit bpf_rcu_read_lock() is needed before use these
>> iters. In normal bpf progs that have implicit rcu_read_lock(), it's OK to
>> use them directly.
>>
>> This patch checks whether we are in rcu cs before we want to invoke
>> bpf_iter_process_new and bpf_iter_css_{pre, post}_new in
>> mark_stack_slots_iter(). If the rcu protection is guaranteed, we would
>> let st->type = PTR_TO_STACK | MEM_RCU. is_iter_reg_valid_init() will
>> reject if reg->type is UNTRUSTED.
> 
> I use the following BPF Prog to test this patch:
> 
> SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
> int iter_task_for_each_sleep(void *ctx)
> {
>      struct task_struct *task;
>      struct task_struct *cur_task = bpf_get_current_task_btf();
> 
>      if (cur_task->pid != target_pid)
>          return 0;
>      bpf_rcu_read_lock();
>      bpf_for_each(process, task) {
>          bpf_rcu_read_unlock();
>          if (task->pid == target_pid)
>              process_cnt += 1;
>          bpf_rcu_read_lock();
>      }
>      bpf_rcu_read_unlock();
>      return 0;
> }
> 
> Unfortunately, we can pass the verifier.
> 
> Then I add some printk-messages before setting/clearing state to help 
> debug:
> 
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index d151e6b43a5f..35f3fa9471a9 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -1200,7 +1200,7 @@ static int mark_stack_slots_iter(struct 
> bpf_verifier_env *env,
>                  __mark_reg_known_zero(st);
>                  st->type = PTR_TO_STACK; /* we don't have dedicated reg 
> type */
>                  if (is_iter_need_rcu(meta)) {
> +                       printk("mark reg_addr : %px", st);
>                          if (in_rcu_cs(env))
>                                  st->type |= MEM_RCU;
>                          else
> @@ -11472,8 +11472,8 @@ static int check_kfunc_call(struct 
> bpf_verifier_env *env, struct bpf_insn *insn,
>                          return -EINVAL;
>                  } else if (rcu_unlock) {
>                          bpf_for_each_reg_in_vstate(env->cur_state, 
> state, reg, ({
> +                               printk("clear reg_addr : %px MEM_RCU : 
> %d PTR_UNTRUSTED : %d\n ", reg, reg->type & MEM_RCU, reg->type & 
> PTR_UNTRUSTED);
>                                  if (reg->type & MEM_RCU) {
> -                                       printk("clear reg addr : %lld", 
> reg);
>                                          reg->type &= ~(MEM_RCU | 
> PTR_MAYBE_NULL);
>                                          reg->type |= PTR_UNTRUSTED;
>                                  }
> 
> 
> The demsg log:
> 
> [  393.705324] mark reg_addr : ffff88814e40e200
> 
> [  393.706883] clear reg_addr : ffff88814d5f8000 MEM_RCU : 0 
> PTR_UNTRUSTED : 0
> 
> [  393.707353] clear reg_addr : ffff88814d5f8078 MEM_RCU : 0 
> PTR_UNTRUSTED : 0
> 
> [  393.708099] clear reg_addr : ffff88814d5f80f0 MEM_RCU : 0 
> PTR_UNTRUSTED : 0
> ....
> ....
> 
> I didn't see ffff88814e40e200 is cleared as expected because 
> bpf_for_each_reg_in_vstate didn't find it.
> 
> It seems when we are doing bpf_read_unlock() in the middle of iteration 
> and want to clearing state through bpf_for_each_reg_in_vstate, we can 
> not find the previous reg which we marked MEM_RCU/PTR_UNTRUSTED in 
> mark_stack_slots_iter().
> 

bpf_get_spilled_reg will skip slots if they are not STACK_SPILL, but in 
mark_stack_slots_iter() we has marked the slots *STACK_ITER*

With the following change, everything seems work OK.

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index a3236651ec64..83c5ecccadb4 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -387,7 +387,7 @@ struct bpf_verifier_state {

  #define bpf_get_spilled_reg(slot, frame)                               \
         (((slot < frame->allocated_stack / BPF_REG_SIZE) &&             \
-         (frame->stack[slot].slot_type[0] == STACK_SPILL))             \
+         (frame->stack[slot].slot_type[0] == STACK_SPILL || 
frame->stack[slot].slot_type[0] == STACK_ITER))            \
          ? &frame->stack[slot].spilled_ptr : NULL)

I am not sure whether this would harm some logic implicitly when using 
bpf_get_spilled_reg/bpf_for_each_spilled_reg in other place. If so, 
maybe we should add a extra parameter to control the picking behaviour.

#define bpf_get_spilled_reg(slot, frame, stack_type)
			\
	(((slot < frame->allocated_stack / BPF_REG_SIZE) &&		\
	  (frame->stack[slot].slot_type[0] == stack_type))		\
	 ? &frame->stack[slot].spilled_ptr : NULL)

Thanks.
  
Andrii Nakryiko Sept. 14, 2023, 11:26 p.m. UTC | #3
On Tue, Sep 12, 2023 at 12:02 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
> css_iter and process_iter should be used in rcu section. Specifically, in
> sleepable progs explicit bpf_rcu_read_lock() is needed before use these
> iters. In normal bpf progs that have implicit rcu_read_lock(), it's OK to
> use them directly.
>
> This patch checks whether we are in rcu cs before we want to invoke
> bpf_iter_process_new and bpf_iter_css_{pre, post}_new in
> mark_stack_slots_iter(). If the rcu protection is guaranteed, we would
> let st->type = PTR_TO_STACK | MEM_RCU. is_iter_reg_valid_init() will
> reject if reg->type is UNTRUSTED.

it would be nice to mention where this MEM_RCU is turned into
UNTRUSTED when we do rcu_read_unlock(). For someone unfamiliar with
these parts of verifier (like me) this is completely unobvious.

>
> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
> ---
>  kernel/bpf/verifier.c | 30 ++++++++++++++++++++++++++++--
>  1 file changed, 28 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 2367483bf4c2..6a6827ba7a18 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -1172,7 +1172,13 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg
>
>  static void __mark_reg_known_zero(struct bpf_reg_state *reg);
>
> +static bool in_rcu_cs(struct bpf_verifier_env *env);
> +
> +/* check whether we are using bpf_iter_process_*() or bpf_iter_css_*() */
> +static bool is_iter_need_rcu(struct bpf_kfunc_call_arg_meta *meta);
> +
>  static int mark_stack_slots_iter(struct bpf_verifier_env *env,
> +                                struct bpf_kfunc_call_arg_meta *meta,
>                                  struct bpf_reg_state *reg, int insn_idx,
>                                  struct btf *btf, u32 btf_id, int nr_slots)
>  {
> @@ -1193,6 +1199,12 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
>
>                 __mark_reg_known_zero(st);
>                 st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
> +               if (is_iter_need_rcu(meta)) {
> +                       if (in_rcu_cs(env))
> +                               st->type |= MEM_RCU;
> +                       else
> +                               st->type |= PTR_UNTRUSTED;
> +               }
>                 st->live |= REG_LIVE_WRITTEN;
>                 st->ref_obj_id = i == 0 ? id : 0;
>                 st->iter.btf = btf;
> @@ -1281,6 +1293,8 @@ static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_
>                 struct bpf_stack_state *slot = &state->stack[spi - i];
>                 struct bpf_reg_state *st = &slot->spilled_ptr;
>
> +               if (st->type & PTR_UNTRUSTED)
> +                       return false;
>                 /* only main (first) slot has ref_obj_id set */
>                 if (i == 0 && !st->ref_obj_id)
>                         return false;
> @@ -7503,13 +7517,13 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
>                                 return err;
>                 }
>
> -               err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots);
> +               err = mark_stack_slots_iter(env, meta, reg, insn_idx, meta->btf, btf_id, nr_slots);
>                 if (err)
>                         return err;
>         } else {
>                 /* iter_next() or iter_destroy() expect initialized iter state*/
>                 if (!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) {
> -                       verbose(env, "expected an initialized iter_%s as arg #%d\n",
> +                       verbose(env, "expected an initialized iter_%s as arg #%d or without bpf_rcu_read_lock()\n",
>                                 iter_type_str(meta->btf, btf_id), regno);

this message makes no sense, but even if reworded it would be
confusing for users. So maybe do the RCU check separately and report a
clear message that this iterator is expected to be within a single
continuous rcu_read_{lock+unlock} region.

I do think tracking RCU regions explicitly would make for much easier
to follow code, better messages, etc. Probably would be beneficial for
some other RCU-protected features. But that's a separate topic.

>                         return -EINVAL;
>                 }
> @@ -10382,6 +10396,18 @@ BTF_ID(func, bpf_percpu_obj_new_impl)
>  BTF_ID(func, bpf_percpu_obj_drop_impl)
>  BTF_ID(func, bpf_iter_css_task_new)
>
> +BTF_SET_START(rcu_protect_kfuns_set)
> +BTF_ID(func, bpf_iter_process_new)
> +BTF_ID(func, bpf_iter_css_pre_new)
> +BTF_ID(func, bpf_iter_css_post_new)
> +BTF_SET_END(rcu_protect_kfuns_set)
> +

instead of maintaining these extra special sets, why not add a KF
flag, like KF_RCU_PROTECTED?


> +static inline bool is_iter_need_rcu(struct bpf_kfunc_call_arg_meta *meta)
> +{
> +       return btf_id_set_contains(&rcu_protect_kfuns_set, meta->func_id);
> +}
> +
> +
>  static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
>  {
>         if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
> --
> 2.20.1
>
  
Andrii Nakryiko Sept. 14, 2023, 11:26 p.m. UTC | #4
On Thu, Sep 14, 2023 at 1:56 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
>
>
> 在 2023/9/13 21:53, Chuyi Zhou 写道:
> > Hello.
> >
> > 在 2023/9/12 15:01, Chuyi Zhou 写道:
> >> css_iter and process_iter should be used in rcu section. Specifically, in
> >> sleepable progs explicit bpf_rcu_read_lock() is needed before use these
> >> iters. In normal bpf progs that have implicit rcu_read_lock(), it's OK to
> >> use them directly.
> >>
> >> This patch checks whether we are in rcu cs before we want to invoke
> >> bpf_iter_process_new and bpf_iter_css_{pre, post}_new in
> >> mark_stack_slots_iter(). If the rcu protection is guaranteed, we would
> >> let st->type = PTR_TO_STACK | MEM_RCU. is_iter_reg_valid_init() will
> >> reject if reg->type is UNTRUSTED.
> >
> > I use the following BPF Prog to test this patch:
> >
> > SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
> > int iter_task_for_each_sleep(void *ctx)
> > {
> >      struct task_struct *task;
> >      struct task_struct *cur_task = bpf_get_current_task_btf();
> >
> >      if (cur_task->pid != target_pid)
> >          return 0;
> >      bpf_rcu_read_lock();
> >      bpf_for_each(process, task) {
> >          bpf_rcu_read_unlock();
> >          if (task->pid == target_pid)
> >              process_cnt += 1;
> >          bpf_rcu_read_lock();
> >      }
> >      bpf_rcu_read_unlock();
> >      return 0;
> > }
> >
> > Unfortunately, we can pass the verifier.
> >
> > Then I add some printk-messages before setting/clearing state to help
> > debug:
> >
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index d151e6b43a5f..35f3fa9471a9 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -1200,7 +1200,7 @@ static int mark_stack_slots_iter(struct
> > bpf_verifier_env *env,
> >                  __mark_reg_known_zero(st);
> >                  st->type = PTR_TO_STACK; /* we don't have dedicated reg
> > type */
> >                  if (is_iter_need_rcu(meta)) {
> > +                       printk("mark reg_addr : %px", st);
> >                          if (in_rcu_cs(env))
> >                                  st->type |= MEM_RCU;
> >                          else
> > @@ -11472,8 +11472,8 @@ static int check_kfunc_call(struct
> > bpf_verifier_env *env, struct bpf_insn *insn,
> >                          return -EINVAL;
> >                  } else if (rcu_unlock) {
> >                          bpf_for_each_reg_in_vstate(env->cur_state,
> > state, reg, ({
> > +                               printk("clear reg_addr : %px MEM_RCU :
> > %d PTR_UNTRUSTED : %d\n ", reg, reg->type & MEM_RCU, reg->type &
> > PTR_UNTRUSTED);
> >                                  if (reg->type & MEM_RCU) {
> > -                                       printk("clear reg addr : %lld",
> > reg);
> >                                          reg->type &= ~(MEM_RCU |
> > PTR_MAYBE_NULL);
> >                                          reg->type |= PTR_UNTRUSTED;
> >                                  }
> >
> >
> > The demsg log:
> >
> > [  393.705324] mark reg_addr : ffff88814e40e200
> >
> > [  393.706883] clear reg_addr : ffff88814d5f8000 MEM_RCU : 0
> > PTR_UNTRUSTED : 0
> >
> > [  393.707353] clear reg_addr : ffff88814d5f8078 MEM_RCU : 0
> > PTR_UNTRUSTED : 0
> >
> > [  393.708099] clear reg_addr : ffff88814d5f80f0 MEM_RCU : 0
> > PTR_UNTRUSTED : 0
> > ....
> > ....
> >
> > I didn't see ffff88814e40e200 is cleared as expected because
> > bpf_for_each_reg_in_vstate didn't find it.
> >
> > It seems when we are doing bpf_read_unlock() in the middle of iteration
> > and want to clearing state through bpf_for_each_reg_in_vstate, we can
> > not find the previous reg which we marked MEM_RCU/PTR_UNTRUSTED in
> > mark_stack_slots_iter().
> >
>
> bpf_get_spilled_reg will skip slots if they are not STACK_SPILL, but in
> mark_stack_slots_iter() we has marked the slots *STACK_ITER*
>
> With the following change, everything seems work OK.
>
> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> index a3236651ec64..83c5ecccadb4 100644
> --- a/include/linux/bpf_verifier.h
> +++ b/include/linux/bpf_verifier.h
> @@ -387,7 +387,7 @@ struct bpf_verifier_state {
>
>   #define bpf_get_spilled_reg(slot, frame)                               \
>          (((slot < frame->allocated_stack / BPF_REG_SIZE) &&             \
> -         (frame->stack[slot].slot_type[0] == STACK_SPILL))             \
> +         (frame->stack[slot].slot_type[0] == STACK_SPILL ||
> frame->stack[slot].slot_type[0] == STACK_ITER))            \
>           ? &frame->stack[slot].spilled_ptr : NULL)
>
> I am not sure whether this would harm some logic implicitly when using
> bpf_get_spilled_reg/bpf_for_each_spilled_reg in other place. If so,
> maybe we should add a extra parameter to control the picking behaviour.
>
> #define bpf_get_spilled_reg(slot, frame, stack_type)
>                         \
>         (((slot < frame->allocated_stack / BPF_REG_SIZE) &&             \
>           (frame->stack[slot].slot_type[0] == stack_type))              \
>          ? &frame->stack[slot].spilled_ptr : NULL)
>
> Thanks.

I don't think it's safe to just make bpf_get_spilled_reg, and
subsequently bpf_for_each_reg_in_vstate and bpf_for_each_spilled_reg
just suddenly start iterating iterator states and/or dynptrs. At least
some of existing uses of those assume they are really working just
with registers.
  
Chuyi Zhou Sept. 15, 2023, 5:46 a.m. UTC | #5
Hello.

在 2023/9/15 07:26, Andrii Nakryiko 写道:
> On Thu, Sep 14, 2023 at 1:56 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>>
>>
>>
>> 在 2023/9/13 21:53, Chuyi Zhou 写道:
>>> Hello.
>>>
>>> 在 2023/9/12 15:01, Chuyi Zhou 写道:
>>>> css_iter and process_iter should be used in rcu section. Specifically, in
>>>> sleepable progs explicit bpf_rcu_read_lock() is needed before use these
>>>> iters. In normal bpf progs that have implicit rcu_read_lock(), it's OK to
>>>> use them directly.
>>>>
>>>> This patch checks whether we are in rcu cs before we want to invoke
>>>> bpf_iter_process_new and bpf_iter_css_{pre, post}_new in
>>>> mark_stack_slots_iter(). If the rcu protection is guaranteed, we would
>>>> let st->type = PTR_TO_STACK | MEM_RCU. is_iter_reg_valid_init() will
>>>> reject if reg->type is UNTRUSTED.
>>>
>>> I use the following BPF Prog to test this patch:
>>>
>>> SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
>>> int iter_task_for_each_sleep(void *ctx)
>>> {
>>>       struct task_struct *task;
>>>       struct task_struct *cur_task = bpf_get_current_task_btf();
>>>
>>>       if (cur_task->pid != target_pid)
>>>           return 0;
>>>       bpf_rcu_read_lock();
>>>       bpf_for_each(process, task) {
>>>           bpf_rcu_read_unlock();
>>>           if (task->pid == target_pid)
>>>               process_cnt += 1;
>>>           bpf_rcu_read_lock();
>>>       }
>>>       bpf_rcu_read_unlock();
>>>       return 0;
>>> }
>>>
>>> Unfortunately, we can pass the verifier.
>>>
>>> Then I add some printk-messages before setting/clearing state to help
>>> debug:
>>>
>>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>>> index d151e6b43a5f..35f3fa9471a9 100644
>>> --- a/kernel/bpf/verifier.c
>>> +++ b/kernel/bpf/verifier.c
>>> @@ -1200,7 +1200,7 @@ static int mark_stack_slots_iter(struct
>>> bpf_verifier_env *env,
>>>                   __mark_reg_known_zero(st);
>>>                   st->type = PTR_TO_STACK; /* we don't have dedicated reg
>>> type */
>>>                   if (is_iter_need_rcu(meta)) {
>>> +                       printk("mark reg_addr : %px", st);
>>>                           if (in_rcu_cs(env))
>>>                                   st->type |= MEM_RCU;
>>>                           else
>>> @@ -11472,8 +11472,8 @@ static int check_kfunc_call(struct
>>> bpf_verifier_env *env, struct bpf_insn *insn,
>>>                           return -EINVAL;
>>>                   } else if (rcu_unlock) {
>>>                           bpf_for_each_reg_in_vstate(env->cur_state,
>>> state, reg, ({
>>> +                               printk("clear reg_addr : %px MEM_RCU :
>>> %d PTR_UNTRUSTED : %d\n ", reg, reg->type & MEM_RCU, reg->type &
>>> PTR_UNTRUSTED);
>>>                                   if (reg->type & MEM_RCU) {
>>> -                                       printk("clear reg addr : %lld",
>>> reg);
>>>                                           reg->type &= ~(MEM_RCU |
>>> PTR_MAYBE_NULL);
>>>                                           reg->type |= PTR_UNTRUSTED;
>>>                                   }
>>>
>>>
>>> The demsg log:
>>>
>>> [  393.705324] mark reg_addr : ffff88814e40e200
>>>
>>> [  393.706883] clear reg_addr : ffff88814d5f8000 MEM_RCU : 0
>>> PTR_UNTRUSTED : 0
>>>
>>> [  393.707353] clear reg_addr : ffff88814d5f8078 MEM_RCU : 0
>>> PTR_UNTRUSTED : 0
>>>
>>> [  393.708099] clear reg_addr : ffff88814d5f80f0 MEM_RCU : 0
>>> PTR_UNTRUSTED : 0
>>> ....
>>> ....
>>>
>>> I didn't see ffff88814e40e200 is cleared as expected because
>>> bpf_for_each_reg_in_vstate didn't find it.
>>>
>>> It seems when we are doing bpf_read_unlock() in the middle of iteration
>>> and want to clearing state through bpf_for_each_reg_in_vstate, we can
>>> not find the previous reg which we marked MEM_RCU/PTR_UNTRUSTED in
>>> mark_stack_slots_iter().
>>>
>>
>> bpf_get_spilled_reg will skip slots if they are not STACK_SPILL, but in
>> mark_stack_slots_iter() we has marked the slots *STACK_ITER*
>>
>> With the following change, everything seems work OK.
>>
>> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
>> index a3236651ec64..83c5ecccadb4 100644
>> --- a/include/linux/bpf_verifier.h
>> +++ b/include/linux/bpf_verifier.h
>> @@ -387,7 +387,7 @@ struct bpf_verifier_state {
>>
>>    #define bpf_get_spilled_reg(slot, frame)                               \
>>           (((slot < frame->allocated_stack / BPF_REG_SIZE) &&             \
>> -         (frame->stack[slot].slot_type[0] == STACK_SPILL))             \
>> +         (frame->stack[slot].slot_type[0] == STACK_SPILL ||
>> frame->stack[slot].slot_type[0] == STACK_ITER))            \
>>            ? &frame->stack[slot].spilled_ptr : NULL)
>>
>> I am not sure whether this would harm some logic implicitly when using
>> bpf_get_spilled_reg/bpf_for_each_spilled_reg in other place. If so,
>> maybe we should add a extra parameter to control the picking behaviour.
>>
>> #define bpf_get_spilled_reg(slot, frame, stack_type)
>>                          \
>>          (((slot < frame->allocated_stack / BPF_REG_SIZE) &&             \
>>            (frame->stack[slot].slot_type[0] == stack_type))              \
>>           ? &frame->stack[slot].spilled_ptr : NULL)
>>
>> Thanks.
> 
> I don't think it's safe to just make bpf_get_spilled_reg, and
> subsequently bpf_for_each_reg_in_vstate and bpf_for_each_spilled_reg
> just suddenly start iterating iterator states and/or dynptrs. At least
> some of existing uses of those assume they are really working just
> with registers.

IIUC, when we are doing bpf_rcu_unlock, we do need to clear the state of 
reg including STACK_ITER.

Maybe here we only need change the logic when using 
bpf_for_each_reg_in_vstate to clear state in bpf_rcu_unlock and keep 
everything else unchanged ?

Thanks.
  
Andrii Nakryiko Sept. 15, 2023, 8:23 p.m. UTC | #6
On Thu, Sep 14, 2023 at 10:46 PM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
> Hello.
>
> 在 2023/9/15 07:26, Andrii Nakryiko 写道:
> > On Thu, Sep 14, 2023 at 1:56 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
> >>
> >>
> >>
> >> 在 2023/9/13 21:53, Chuyi Zhou 写道:
> >>> Hello.
> >>>
> >>> 在 2023/9/12 15:01, Chuyi Zhou 写道:
> >>>> css_iter and process_iter should be used in rcu section. Specifically, in
> >>>> sleepable progs explicit bpf_rcu_read_lock() is needed before use these
> >>>> iters. In normal bpf progs that have implicit rcu_read_lock(), it's OK to
> >>>> use them directly.
> >>>>
> >>>> This patch checks whether we are in rcu cs before we want to invoke
> >>>> bpf_iter_process_new and bpf_iter_css_{pre, post}_new in
> >>>> mark_stack_slots_iter(). If the rcu protection is guaranteed, we would
> >>>> let st->type = PTR_TO_STACK | MEM_RCU. is_iter_reg_valid_init() will
> >>>> reject if reg->type is UNTRUSTED.
> >>>
> >>> I use the following BPF Prog to test this patch:
> >>>
> >>> SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
> >>> int iter_task_for_each_sleep(void *ctx)
> >>> {
> >>>       struct task_struct *task;
> >>>       struct task_struct *cur_task = bpf_get_current_task_btf();
> >>>
> >>>       if (cur_task->pid != target_pid)
> >>>           return 0;
> >>>       bpf_rcu_read_lock();
> >>>       bpf_for_each(process, task) {
> >>>           bpf_rcu_read_unlock();
> >>>           if (task->pid == target_pid)
> >>>               process_cnt += 1;
> >>>           bpf_rcu_read_lock();
> >>>       }
> >>>       bpf_rcu_read_unlock();
> >>>       return 0;
> >>> }
> >>>
> >>> Unfortunately, we can pass the verifier.
> >>>
> >>> Then I add some printk-messages before setting/clearing state to help
> >>> debug:
> >>>
> >>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> >>> index d151e6b43a5f..35f3fa9471a9 100644
> >>> --- a/kernel/bpf/verifier.c
> >>> +++ b/kernel/bpf/verifier.c
> >>> @@ -1200,7 +1200,7 @@ static int mark_stack_slots_iter(struct
> >>> bpf_verifier_env *env,
> >>>                   __mark_reg_known_zero(st);
> >>>                   st->type = PTR_TO_STACK; /* we don't have dedicated reg
> >>> type */
> >>>                   if (is_iter_need_rcu(meta)) {
> >>> +                       printk("mark reg_addr : %px", st);
> >>>                           if (in_rcu_cs(env))
> >>>                                   st->type |= MEM_RCU;
> >>>                           else
> >>> @@ -11472,8 +11472,8 @@ static int check_kfunc_call(struct
> >>> bpf_verifier_env *env, struct bpf_insn *insn,
> >>>                           return -EINVAL;
> >>>                   } else if (rcu_unlock) {
> >>>                           bpf_for_each_reg_in_vstate(env->cur_state,
> >>> state, reg, ({
> >>> +                               printk("clear reg_addr : %px MEM_RCU :
> >>> %d PTR_UNTRUSTED : %d\n ", reg, reg->type & MEM_RCU, reg->type &
> >>> PTR_UNTRUSTED);
> >>>                                   if (reg->type & MEM_RCU) {
> >>> -                                       printk("clear reg addr : %lld",
> >>> reg);
> >>>                                           reg->type &= ~(MEM_RCU |
> >>> PTR_MAYBE_NULL);
> >>>                                           reg->type |= PTR_UNTRUSTED;
> >>>                                   }
> >>>
> >>>
> >>> The demsg log:
> >>>
> >>> [  393.705324] mark reg_addr : ffff88814e40e200
> >>>
> >>> [  393.706883] clear reg_addr : ffff88814d5f8000 MEM_RCU : 0
> >>> PTR_UNTRUSTED : 0
> >>>
> >>> [  393.707353] clear reg_addr : ffff88814d5f8078 MEM_RCU : 0
> >>> PTR_UNTRUSTED : 0
> >>>
> >>> [  393.708099] clear reg_addr : ffff88814d5f80f0 MEM_RCU : 0
> >>> PTR_UNTRUSTED : 0
> >>> ....
> >>> ....
> >>>
> >>> I didn't see ffff88814e40e200 is cleared as expected because
> >>> bpf_for_each_reg_in_vstate didn't find it.
> >>>
> >>> It seems when we are doing bpf_read_unlock() in the middle of iteration
> >>> and want to clearing state through bpf_for_each_reg_in_vstate, we can
> >>> not find the previous reg which we marked MEM_RCU/PTR_UNTRUSTED in
> >>> mark_stack_slots_iter().
> >>>
> >>
> >> bpf_get_spilled_reg will skip slots if they are not STACK_SPILL, but in
> >> mark_stack_slots_iter() we has marked the slots *STACK_ITER*
> >>
> >> With the following change, everything seems work OK.
> >>
> >> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> >> index a3236651ec64..83c5ecccadb4 100644
> >> --- a/include/linux/bpf_verifier.h
> >> +++ b/include/linux/bpf_verifier.h
> >> @@ -387,7 +387,7 @@ struct bpf_verifier_state {
> >>
> >>    #define bpf_get_spilled_reg(slot, frame)                               \
> >>           (((slot < frame->allocated_stack / BPF_REG_SIZE) &&             \
> >> -         (frame->stack[slot].slot_type[0] == STACK_SPILL))             \
> >> +         (frame->stack[slot].slot_type[0] == STACK_SPILL ||
> >> frame->stack[slot].slot_type[0] == STACK_ITER))            \
> >>            ? &frame->stack[slot].spilled_ptr : NULL)
> >>
> >> I am not sure whether this would harm some logic implicitly when using
> >> bpf_get_spilled_reg/bpf_for_each_spilled_reg in other place. If so,
> >> maybe we should add a extra parameter to control the picking behaviour.
> >>
> >> #define bpf_get_spilled_reg(slot, frame, stack_type)
> >>                          \
> >>          (((slot < frame->allocated_stack / BPF_REG_SIZE) &&             \
> >>            (frame->stack[slot].slot_type[0] == stack_type))              \
> >>           ? &frame->stack[slot].spilled_ptr : NULL)
> >>
> >> Thanks.
> >
> > I don't think it's safe to just make bpf_get_spilled_reg, and
> > subsequently bpf_for_each_reg_in_vstate and bpf_for_each_spilled_reg
> > just suddenly start iterating iterator states and/or dynptrs. At least
> > some of existing uses of those assume they are really working just
> > with registers.
>
> IIUC, when we are doing bpf_rcu_unlock, we do need to clear the state of
> reg including STACK_ITER.
>
> Maybe here we only need change the logic when using
> bpf_for_each_reg_in_vstate to clear state in bpf_rcu_unlock and keep
> everything else unchanged ?

Right, maybe. I see 10 uses of bpf_for_each_reg_in_vstate() in
kernel/bpf/verifier.c. Before we change the definition of
bpf_for_each_reg_in_vstate() we should validate that iterating dynptr
and iter states doesn't break any of them, that's all.

>
> Thanks.
  

Patch

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2367483bf4c2..6a6827ba7a18 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1172,7 +1172,13 @@  static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg
 
 static void __mark_reg_known_zero(struct bpf_reg_state *reg);
 
+static bool in_rcu_cs(struct bpf_verifier_env *env);
+
+/* check whether we are using bpf_iter_process_*() or bpf_iter_css_*() */
+static bool is_iter_need_rcu(struct bpf_kfunc_call_arg_meta *meta);
+
 static int mark_stack_slots_iter(struct bpf_verifier_env *env,
+				 struct bpf_kfunc_call_arg_meta *meta,
 				 struct bpf_reg_state *reg, int insn_idx,
 				 struct btf *btf, u32 btf_id, int nr_slots)
 {
@@ -1193,6 +1199,12 @@  static int mark_stack_slots_iter(struct bpf_verifier_env *env,
 
 		__mark_reg_known_zero(st);
 		st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+		if (is_iter_need_rcu(meta)) {
+			if (in_rcu_cs(env))
+				st->type |= MEM_RCU;
+			else
+				st->type |= PTR_UNTRUSTED;
+		}
 		st->live |= REG_LIVE_WRITTEN;
 		st->ref_obj_id = i == 0 ? id : 0;
 		st->iter.btf = btf;
@@ -1281,6 +1293,8 @@  static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_
 		struct bpf_stack_state *slot = &state->stack[spi - i];
 		struct bpf_reg_state *st = &slot->spilled_ptr;
 
+		if (st->type & PTR_UNTRUSTED)
+			return false;
 		/* only main (first) slot has ref_obj_id set */
 		if (i == 0 && !st->ref_obj_id)
 			return false;
@@ -7503,13 +7517,13 @@  static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
 				return err;
 		}
 
-		err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots);
+		err = mark_stack_slots_iter(env, meta, reg, insn_idx, meta->btf, btf_id, nr_slots);
 		if (err)
 			return err;
 	} else {
 		/* iter_next() or iter_destroy() expect initialized iter state*/
 		if (!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) {
-			verbose(env, "expected an initialized iter_%s as arg #%d\n",
+			verbose(env, "expected an initialized iter_%s as arg #%d or without bpf_rcu_read_lock()\n",
 				iter_type_str(meta->btf, btf_id), regno);
 			return -EINVAL;
 		}
@@ -10382,6 +10396,18 @@  BTF_ID(func, bpf_percpu_obj_new_impl)
 BTF_ID(func, bpf_percpu_obj_drop_impl)
 BTF_ID(func, bpf_iter_css_task_new)
 
+BTF_SET_START(rcu_protect_kfuns_set)
+BTF_ID(func, bpf_iter_process_new)
+BTF_ID(func, bpf_iter_css_pre_new)
+BTF_ID(func, bpf_iter_css_post_new)
+BTF_SET_END(rcu_protect_kfuns_set)
+
+static inline bool is_iter_need_rcu(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return btf_id_set_contains(&rcu_protect_kfuns_set, meta->func_id);
+}
+
+
 static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
 {
 	if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&