[bpf-next,v3,3/7] bpf: Introduce task open coded iterator kfuncs
Commit Message
This patch adds kfuncs bpf_iter_task_{new,next,destroy} which allow
creation and manipulation of struct bpf_iter_task in open-coded iterator
style. BPF programs can use these kfuncs or through bpf_for_each macro to
iterate all processes in the system.
The API design keep consistent with SEC("iter/task"). bpf_iter_task_new()
accepts a specific task and iterating type which allows:
1. iterating all process in the system
2. iterating all threads in the system
3. iterating all threads of a specific task
Here we also resuse enum bpf_iter_task_type and rename BPF_TASK_ITER_TID
to BPF_TASK_ITER_THREAD, rename BPF_TASK_ITER_TGID to BPF_TASK_ITER_PROC.
The newly-added struct bpf_iter_task has a name collision with a selftest
for the seq_file task iter's bpf skel, so the selftests/bpf/progs file is
renamed in order to avoid the collision.
Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
---
include/linux/bpf.h | 8 +-
kernel/bpf/helpers.c | 3 +
kernel/bpf/task_iter.c | 96 ++++++++++++++++---
.../testing/selftests/bpf/bpf_experimental.h | 5 +
.../selftests/bpf/prog_tests/bpf_iter.c | 18 ++--
.../{bpf_iter_task.c => bpf_iter_tasks.c} | 0
6 files changed, 106 insertions(+), 24 deletions(-)
rename tools/testing/selftests/bpf/progs/{bpf_iter_task.c => bpf_iter_tasks.c} (100%)
Comments
On Mon, Sep 25, 2023 at 3:56 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
> This patch adds kfuncs bpf_iter_task_{new,next,destroy} which allow
> creation and manipulation of struct bpf_iter_task in open-coded iterator
> style. BPF programs can use these kfuncs or through bpf_for_each macro to
> iterate all processes in the system.
>
> The API design keep consistent with SEC("iter/task"). bpf_iter_task_new()
> accepts a specific task and iterating type which allows:
> 1. iterating all process in the system
>
> 2. iterating all threads in the system
>
> 3. iterating all threads of a specific task
> Here we also resuse enum bpf_iter_task_type and rename BPF_TASK_ITER_TID
> to BPF_TASK_ITER_THREAD, rename BPF_TASK_ITER_TGID to BPF_TASK_ITER_PROC.
>
> The newly-added struct bpf_iter_task has a name collision with a selftest
> for the seq_file task iter's bpf skel, so the selftests/bpf/progs file is
> renamed in order to avoid the collision.
>
> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
> ---
> include/linux/bpf.h | 8 +-
> kernel/bpf/helpers.c | 3 +
> kernel/bpf/task_iter.c | 96 ++++++++++++++++---
> .../testing/selftests/bpf/bpf_experimental.h | 5 +
> .../selftests/bpf/prog_tests/bpf_iter.c | 18 ++--
> .../{bpf_iter_task.c => bpf_iter_tasks.c} | 0
> 6 files changed, 106 insertions(+), 24 deletions(-)
> rename tools/testing/selftests/bpf/progs/{bpf_iter_task.c => bpf_iter_tasks.c} (100%)
>
[...]
> @@ -692,9 +692,9 @@ static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct b
> static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq)
> {
> seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]);
> - if (aux->task.type == BPF_TASK_ITER_TID)
> + if (aux->task.type == BPF_TASK_ITER_THREAD)
> seq_printf(seq, "tid:\t%u\n", aux->task.pid);
> - else if (aux->task.type == BPF_TASK_ITER_TGID)
> + else if (aux->task.type == BPF_TASK_ITER_PROC)
> seq_printf(seq, "pid:\t%u\n", aux->task.pid);
> }
>
> @@ -856,6 +856,80 @@ __bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
> bpf_mem_free(&bpf_global_ma, kit->css_it);
> }
>
> +struct bpf_iter_task {
> + __u64 __opaque[2];
> + __u32 __opaque_int[1];
this should be __u64 __opaque[3], because struct takes full 24 bytes
> +} __attribute__((aligned(8)));
> +
> +struct bpf_iter_task_kern {
> + struct task_struct *task;
> + struct task_struct *pos;
> + unsigned int type;
> +} __attribute__((aligned(8)));
> +
> +__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it, struct task_struct *task, unsigned int type)
nit: type -> flags, so we can add a bit more stuff, if necessary
> +{
> + struct bpf_iter_task_kern *kit = (void *)it;
empty line after variable declarations
> + BUILD_BUG_ON(sizeof(struct bpf_iter_task_kern) != sizeof(struct bpf_iter_task));
> + BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) !=
> + __alignof__(struct bpf_iter_task));
and I'd add empty line here to keep BUILD_BUG_ON block separate
> + kit->task = kit->pos = NULL;
> + switch (type) {
> + case BPF_TASK_ITER_ALL:
> + case BPF_TASK_ITER_PROC:
> + case BPF_TASK_ITER_THREAD:
> + break;
> + default:
> + return -EINVAL;
> + }
> +
> + if (type == BPF_TASK_ITER_THREAD)
> + kit->task = task;
> + else
> + kit->task = &init_task;
> + kit->pos = kit->task;
> + kit->type = type;
> + return 0;
> +}
> +
> +__bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it)
> +{
> + struct bpf_iter_task_kern *kit = (void *)it;
> + struct task_struct *pos;
> + unsigned int type;
> +
> + type = kit->type;
> + pos = kit->pos;
> +
> + if (!pos)
> + goto out;
> +
> + if (type == BPF_TASK_ITER_PROC)
> + goto get_next_task;
> +
> + kit->pos = next_thread(kit->pos);
> + if (kit->pos == kit->task) {
> + if (type == BPF_TASK_ITER_THREAD) {
> + kit->pos = NULL;
> + goto out;
> + }
> + } else
> + goto out;
> +
> +get_next_task:
> + kit->pos = next_task(kit->pos);
> + kit->task = kit->pos;
> + if (kit->pos == &init_task)
> + kit->pos = NULL;
I can't say I completely follow the logic (e.g., for
BPF_TASK_ITER_PROC, why do we do next_task() on first next() call)?
Can you elabore the expected behavior for various combinations of
types and starting task argument?
> +
> +out:
> + return pos;
> +}
> +
> +__bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it)
> +{
> +}
> +
> DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
>
> static void do_mmap_read_unlock(struct irq_work *entry)
> diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
> index d3ea90f0e142..d989775dbdb5 100644
> --- a/tools/testing/selftests/bpf/bpf_experimental.h
> +++ b/tools/testing/selftests/bpf/bpf_experimental.h
> @@ -169,4 +169,9 @@ extern int bpf_iter_css_task_new(struct bpf_iter_css_task *it,
> extern struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it) __weak __ksym;
> extern void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it) __weak __ksym;
>
> +struct bpf_iter_task;
> +extern int bpf_iter_task_new(struct bpf_iter_task *it, struct task_struct *task, unsigned int type) __weak __ksym;
> +extern struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it) __weak __ksym;
> +extern void bpf_iter_task_destroy(struct bpf_iter_task *it) __weak __ksym;
> +
> #endif
> diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
please split out selftests changes from kernel-side changes. We only
combine them if kernel changes break selftests, preventing bisection.
[...]
Hello,
在 2023/9/28 07:20, Andrii Nakryiko 写道:
> On Mon, Sep 25, 2023 at 3:56 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>>
>> This patch adds kfuncs bpf_iter_task_{new,next,destroy} which allow
>> creation and manipulation of struct bpf_iter_task in open-coded iterator
>> style. BPF programs can use these kfuncs or through bpf_for_each macro to
>> iterate all processes in the system.
>>
>> The API design keep consistent with SEC("iter/task"). bpf_iter_task_new()
>> accepts a specific task and iterating type which allows:
>> 1. iterating all process in the system
>>
>> 2. iterating all threads in the system
>>
>> 3. iterating all threads of a specific task
>> Here we also resuse enum bpf_iter_task_type and rename BPF_TASK_ITER_TID
>> to BPF_TASK_ITER_THREAD, rename BPF_TASK_ITER_TGID to BPF_TASK_ITER_PROC.
>>
>> The newly-added struct bpf_iter_task has a name collision with a selftest
>> for the seq_file task iter's bpf skel, so the selftests/bpf/progs file is
>> renamed in order to avoid the collision.
>>
>> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
>> ---
>> include/linux/bpf.h | 8 +-
>> kernel/bpf/helpers.c | 3 +
>> kernel/bpf/task_iter.c | 96 ++++++++++++++++---
>> .../testing/selftests/bpf/bpf_experimental.h | 5 +
>> .../selftests/bpf/prog_tests/bpf_iter.c | 18 ++--
>> .../{bpf_iter_task.c => bpf_iter_tasks.c} | 0
>> 6 files changed, 106 insertions(+), 24 deletions(-)
>> rename tools/testing/selftests/bpf/progs/{bpf_iter_task.c => bpf_iter_tasks.c} (100%)
>>
>
> [...]
>
>> @@ -692,9 +692,9 @@ static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct b
>> static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq)
>> {
>> seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]);
>> - if (aux->task.type == BPF_TASK_ITER_TID)
>> + if (aux->task.type == BPF_TASK_ITER_THREAD)
>> seq_printf(seq, "tid:\t%u\n", aux->task.pid);
>> - else if (aux->task.type == BPF_TASK_ITER_TGID)
>> + else if (aux->task.type == BPF_TASK_ITER_PROC)
>> seq_printf(seq, "pid:\t%u\n", aux->task.pid);
>> }
>>
>> @@ -856,6 +856,80 @@ __bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
>> bpf_mem_free(&bpf_global_ma, kit->css_it);
>> }
>>
>> +struct bpf_iter_task {
>> + __u64 __opaque[2];
>> + __u32 __opaque_int[1];
>
> this should be __u64 __opaque[3], because struct takes full 24 bytes
>
>> +} __attribute__((aligned(8)));
>> +
>> +struct bpf_iter_task_kern {
>> + struct task_struct *task;
>> + struct task_struct *pos;
>> + unsigned int type;
>> +} __attribute__((aligned(8)));
>> +
>> +__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it, struct task_struct *task, unsigned int type)
>
> nit: type -> flags, so we can add a bit more stuff, if necessary
>
>> +{
>> + struct bpf_iter_task_kern *kit = (void *)it;
>
> empty line after variable declarations
>
>> + BUILD_BUG_ON(sizeof(struct bpf_iter_task_kern) != sizeof(struct bpf_iter_task));
>> + BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) !=
>> + __alignof__(struct bpf_iter_task));
>
> and I'd add empty line here to keep BUILD_BUG_ON block separate
>
>> + kit->task = kit->pos = NULL;
>> + switch (type) {
>> + case BPF_TASK_ITER_ALL:
>> + case BPF_TASK_ITER_PROC:
>> + case BPF_TASK_ITER_THREAD:
>> + break;
>> + default:
>> + return -EINVAL;
>> + }
>> +
>> + if (type == BPF_TASK_ITER_THREAD)
>> + kit->task = task;
>> + else
>> + kit->task = &init_task;
>> + kit->pos = kit->task;
>> + kit->type = type;
>> + return 0;
>> +}
>> +
>> +__bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it)
>> +{
>> + struct bpf_iter_task_kern *kit = (void *)it;
>> + struct task_struct *pos;
>> + unsigned int type;
>> +
>> + type = kit->type;
>> + pos = kit->pos;
>> +
>> + if (!pos)
>> + goto out;
>> +
>> + if (type == BPF_TASK_ITER_PROC)
>> + goto get_next_task;
>> +
>> + kit->pos = next_thread(kit->pos);
>> + if (kit->pos == kit->task) {
>> + if (type == BPF_TASK_ITER_THREAD) {
>> + kit->pos = NULL;
>> + goto out;
>> + }
>> + } else
>> + goto out;
>> +
>> +get_next_task:
>> + kit->pos = next_task(kit->pos);
>> + kit->task = kit->pos;
>> + if (kit->pos == &init_task)
>> + kit->pos = NULL;
>
> I can't say I completely follow the logic (e.g., for
> BPF_TASK_ITER_PROC, why do we do next_task() on first next() call)?
> Can you elabore the expected behavior for various combinations of
> types and starting task argument?
>
Thanks for the review.
The expected behavior of current implementation is:
BPF_TASK_ITER_PROC:
init_task->first_process->second_process->...->last_process->init_task
We would exit before visiting init_task again.
BPF_TASK_ITER_THREAD:
group_task->first_thread->second_thread->...->last_thread->group_task
We would exit before visiting group_task again.
BPF_TASK_ITER_ALL:
init_task -> first_process -> second_process -> ...
| |
-> first_thread.. |
-> first_thread
Actually, every next() call, we would return the "pos" which was
prepared by previous next() call, and use next_task()/next_thread() to
update kit->pos. Once we meet the exit condition (next_task() return
init_task or next_thread() return group_task), we would update kit->pos
to NULL. In this way, when next() is called again, we will terminate the
iteration.
Here "kit->pos = NULL;" means we would return the last valid "pos" and
will return NULL in next call to exit from the iteration.
Am I miss something important?
Thanks.
On Wed, Sep 27, 2023 at 8:29 PM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
> Hello,
>
> 在 2023/9/28 07:20, Andrii Nakryiko 写道:
> > On Mon, Sep 25, 2023 at 3:56 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
> >>
> >> This patch adds kfuncs bpf_iter_task_{new,next,destroy} which allow
> >> creation and manipulation of struct bpf_iter_task in open-coded iterator
> >> style. BPF programs can use these kfuncs or through bpf_for_each macro to
> >> iterate all processes in the system.
> >>
> >> The API design keep consistent with SEC("iter/task"). bpf_iter_task_new()
> >> accepts a specific task and iterating type which allows:
> >> 1. iterating all process in the system
> >>
> >> 2. iterating all threads in the system
> >>
> >> 3. iterating all threads of a specific task
> >> Here we also resuse enum bpf_iter_task_type and rename BPF_TASK_ITER_TID
> >> to BPF_TASK_ITER_THREAD, rename BPF_TASK_ITER_TGID to BPF_TASK_ITER_PROC.
> >>
> >> The newly-added struct bpf_iter_task has a name collision with a selftest
> >> for the seq_file task iter's bpf skel, so the selftests/bpf/progs file is
> >> renamed in order to avoid the collision.
> >>
> >> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
> >> ---
> >> include/linux/bpf.h | 8 +-
> >> kernel/bpf/helpers.c | 3 +
> >> kernel/bpf/task_iter.c | 96 ++++++++++++++++---
> >> .../testing/selftests/bpf/bpf_experimental.h | 5 +
> >> .../selftests/bpf/prog_tests/bpf_iter.c | 18 ++--
> >> .../{bpf_iter_task.c => bpf_iter_tasks.c} | 0
> >> 6 files changed, 106 insertions(+), 24 deletions(-)
> >> rename tools/testing/selftests/bpf/progs/{bpf_iter_task.c => bpf_iter_tasks.c} (100%)
> >>
> >
> > [...]
> >
> >> @@ -692,9 +692,9 @@ static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct b
> >> static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq)
> >> {
> >> seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]);
> >> - if (aux->task.type == BPF_TASK_ITER_TID)
> >> + if (aux->task.type == BPF_TASK_ITER_THREAD)
> >> seq_printf(seq, "tid:\t%u\n", aux->task.pid);
> >> - else if (aux->task.type == BPF_TASK_ITER_TGID)
> >> + else if (aux->task.type == BPF_TASK_ITER_PROC)
> >> seq_printf(seq, "pid:\t%u\n", aux->task.pid);
> >> }
> >>
> >> @@ -856,6 +856,80 @@ __bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
> >> bpf_mem_free(&bpf_global_ma, kit->css_it);
> >> }
> >>
> >> +struct bpf_iter_task {
> >> + __u64 __opaque[2];
> >> + __u32 __opaque_int[1];
> >
> > this should be __u64 __opaque[3], because struct takes full 24 bytes
> >
> >> +} __attribute__((aligned(8)));
> >> +
> >> +struct bpf_iter_task_kern {
> >> + struct task_struct *task;
> >> + struct task_struct *pos;
> >> + unsigned int type;
> >> +} __attribute__((aligned(8)));
> >> +
> >> +__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it, struct task_struct *task, unsigned int type)
> >
> > nit: type -> flags, so we can add a bit more stuff, if necessary
> >
> >> +{
> >> + struct bpf_iter_task_kern *kit = (void *)it;
> >
> > empty line after variable declarations
> >
> >> + BUILD_BUG_ON(sizeof(struct bpf_iter_task_kern) != sizeof(struct bpf_iter_task));
> >> + BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) !=
> >> + __alignof__(struct bpf_iter_task));
> >
> > and I'd add empty line here to keep BUILD_BUG_ON block separate
> >
> >> + kit->task = kit->pos = NULL;
> >> + switch (type) {
> >> + case BPF_TASK_ITER_ALL:
> >> + case BPF_TASK_ITER_PROC:
> >> + case BPF_TASK_ITER_THREAD:
> >> + break;
> >> + default:
> >> + return -EINVAL;
> >> + }
> >> +
> >> + if (type == BPF_TASK_ITER_THREAD)
> >> + kit->task = task;
> >> + else
> >> + kit->task = &init_task;
> >> + kit->pos = kit->task;
> >> + kit->type = type;
> >> + return 0;
> >> +}
> >> +
> >> +__bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it)
> >> +{
> >> + struct bpf_iter_task_kern *kit = (void *)it;
> >> + struct task_struct *pos;
> >> + unsigned int type;
> >> +
> >> + type = kit->type;
> >> + pos = kit->pos;
> >> +
> >> + if (!pos)
> >> + goto out;
> >> +
> >> + if (type == BPF_TASK_ITER_PROC)
> >> + goto get_next_task;
> >> +
> >> + kit->pos = next_thread(kit->pos);
> >> + if (kit->pos == kit->task) {
> >> + if (type == BPF_TASK_ITER_THREAD) {
> >> + kit->pos = NULL;
> >> + goto out;
> >> + }
> >> + } else
> >> + goto out;
> >> +
> >> +get_next_task:
> >> + kit->pos = next_task(kit->pos);
> >> + kit->task = kit->pos;
> >> + if (kit->pos == &init_task)
> >> + kit->pos = NULL;
> >
> > I can't say I completely follow the logic (e.g., for
> > BPF_TASK_ITER_PROC, why do we do next_task() on first next() call)?
> > Can you elabore the expected behavior for various combinations of
> > types and starting task argument?
> >
>
> Thanks for the review.
>
> The expected behavior of current implementation is:
>
> BPF_TASK_ITER_PROC:
>
> init_task->first_process->second_process->...->last_process->init_task
>
> We would exit before visiting init_task again.
ah, ok, so in this case it's more like BPF_TASK_ITER_ALL_PROCS, i.e.,
we iterate all processes in the system. Input `task` that we provide
is ignored/meaningless, right? Maybe we should express it as
ALL_PROCS?
>
> BPF_TASK_ITER_THREAD:
>
> group_task->first_thread->second_thread->...->last_thread->group_task
>
> We would exit before visiting group_task again.
>
And this one is iterating threads of a process specified by given
`task`, right? This is where my confusion comes from. ITER_PROC and
ITER_THREAD, by their name, seems to be very similar, but in reality
ITER_PROC is more like ITER_ALL (except process vs thread iteration),
while ITER_THREAD is parameterized by input `task`.
I'm not sure what's the least confusing way to name and organize
everything, but I think it's quite confusing right now, unfortunately.
I wonder if you or someone else have a better suggestion on making
this more straightforward?
> BPF_TASK_ITER_ALL:
>
> init_task -> first_process -> second_process -> ...
> | |
> -> first_thread.. |
> -> first_thread
>
Ok, and this one is "all threads in the system". So
BPF_TASK_ITER_ALL_THREADS would describe it more precisely, right?
> Actually, every next() call, we would return the "pos" which was
> prepared by previous next() call, and use next_task()/next_thread() to
> update kit->pos. Once we meet the exit condition (next_task() return
> init_task or next_thread() return group_task), we would update kit->pos
> to NULL. In this way, when next() is called again, we will terminate the
> iteration.
>
> Here "kit->pos = NULL;" means we would return the last valid "pos" and
> will return NULL in next call to exit from the iteration.
>
> Am I miss something important?
No, it's my bad. I did check, but somehow concluded that you are
returning kit->pos, but you are returning locally cached previous
value of kit->pos. All good here, I think.
>
> Thanks.
>
>
>
Hello, Andrii
在 2023/9/30 05:27, Andrii Nakryiko 写道:
> On Wed, Sep 27, 2023 at 8:29 PM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>>
>> Hello,
>>
>> 在 2023/9/28 07:20, Andrii Nakryiko 写道:
>>> On Mon, Sep 25, 2023 at 3:56 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>>>>
>>>> This patch adds kfuncs bpf_iter_task_{new,next,destroy} which allow
>>>> creation and manipulation of struct bpf_iter_task in open-coded iterator
>>>> style. BPF programs can use these kfuncs or through bpf_for_each macro to
>>>> iterate all processes in the system.
>>>>
>>>> The API design keep consistent with SEC("iter/task"). bpf_iter_task_new()
>>>> accepts a specific task and iterating type which allows:
>>>> 1. iterating all process in the system
>>>>
>>>> 2. iterating all threads in the system
>>>>
>>>> 3. iterating all threads of a specific task
>>>> Here we also resuse enum bpf_iter_task_type and rename BPF_TASK_ITER_TID
>>>> to BPF_TASK_ITER_THREAD, rename BPF_TASK_ITER_TGID to BPF_TASK_ITER_PROC.
>>>>
>>>> The newly-added struct bpf_iter_task has a name collision with a selftest
>>>> for the seq_file task iter's bpf skel, so the selftests/bpf/progs file is
>>>> renamed in order to avoid the collision.
>>>>
>>>> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
>>>> ---
>>>> include/linux/bpf.h | 8 +-
>>>> kernel/bpf/helpers.c | 3 +
>>>> kernel/bpf/task_iter.c | 96 ++++++++++++++++---
>>>> .../testing/selftests/bpf/bpf_experimental.h | 5 +
>>>> .../selftests/bpf/prog_tests/bpf_iter.c | 18 ++--
>>>> .../{bpf_iter_task.c => bpf_iter_tasks.c} | 0
>>>> 6 files changed, 106 insertions(+), 24 deletions(-)
>>>> rename tools/testing/selftests/bpf/progs/{bpf_iter_task.c => bpf_iter_tasks.c} (100%)
>>>>
>>>
[...]
>>>> +get_next_task:
>>>> + kit->pos = next_task(kit->pos);
>>>> + kit->task = kit->pos;
>>>> + if (kit->pos == &init_task)
>>>> + kit->pos = NULL;
>>>
>>> I can't say I completely follow the logic (e.g., for
>>> BPF_TASK_ITER_PROC, why do we do next_task() on first next() call)?
>>> Can you elabore the expected behavior for various combinations of
>>> types and starting task argument?
>>>
>>
>> Thanks for the review.
>>
>> The expected behavior of current implementation is:
>>
>> BPF_TASK_ITER_PROC:
>>
>> init_task->first_process->second_process->...->last_process->init_task
>>
>> We would exit before visiting init_task again.
>
> ah, ok, so in this case it's more like BPF_TASK_ITER_ALL_PROCS, i.e.,
> we iterate all processes in the system. Input `task` that we provide
> is ignored/meaningless, right? Maybe we should express it as
> ALL_PROCS?
>
>>
>> BPF_TASK_ITER_THREAD:
>>
>> group_task->first_thread->second_thread->...->last_thread->group_task
>>
>> We would exit before visiting group_task again.
>>
>
> And this one is iterating threads of a process specified by given
> `task`, right? This is where my confusion comes from. ITER_PROC and
> ITER_THREAD, by their name, seems to be very similar, but in reality
> ITER_PROC is more like ITER_ALL (except process vs thread iteration),
> while ITER_THREAD is parameterized by input `task`.
>
> I'm not sure what's the least confusing way to name and organize
> everything, but I think it's quite confusing right now, unfortunately.
> I wonder if you or someone else have a better suggestion on making
> this more straightforward?
>
Maybe here we can introduce new enums and not reuse or rename
BPF_TASK_ITER_TID/BPF_TASK_ITER_TGID?
{
BPF_TASK_ITER_ALL_PROC,
BPF_TASK_ITER_ALL_THREAD,
BPF_TASK_ITER_THREAD
}
BPF_TASK_ITER_TID/BPF_TASK_ITER_TGID are inner flags. Looking at the
example usage of SEC("iter/task"), unlike
BPF_CGROUP_ITER_DESCENDANTS_PRE/BPF_CGROUP_ITER_DESCENDANTS_POST, we
actually don't use BPF_TASK_ITER_TID/BPF_TASK_ITER_TGID directly. When
using SEC("iter/task"), we just set pid/tid for struct
bpf_iter_link_info. Exposing new enums to users for open coded
task_iters will not confuse users.
Thanks.
On Sun, Oct 1, 2023 at 1:21 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
> Hello, Andrii
>
> 在 2023/9/30 05:27, Andrii Nakryiko 写道:
> > On Wed, Sep 27, 2023 at 8:29 PM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
> >>
> >> Hello,
> >>
> >> 在 2023/9/28 07:20, Andrii Nakryiko 写道:
> >>> On Mon, Sep 25, 2023 at 3:56 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
> >>>>
> >>>> This patch adds kfuncs bpf_iter_task_{new,next,destroy} which allow
> >>>> creation and manipulation of struct bpf_iter_task in open-coded iterator
> >>>> style. BPF programs can use these kfuncs or through bpf_for_each macro to
> >>>> iterate all processes in the system.
> >>>>
> >>>> The API design keep consistent with SEC("iter/task"). bpf_iter_task_new()
> >>>> accepts a specific task and iterating type which allows:
> >>>> 1. iterating all process in the system
> >>>>
> >>>> 2. iterating all threads in the system
> >>>>
> >>>> 3. iterating all threads of a specific task
> >>>> Here we also resuse enum bpf_iter_task_type and rename BPF_TASK_ITER_TID
> >>>> to BPF_TASK_ITER_THREAD, rename BPF_TASK_ITER_TGID to BPF_TASK_ITER_PROC.
> >>>>
> >>>> The newly-added struct bpf_iter_task has a name collision with a selftest
> >>>> for the seq_file task iter's bpf skel, so the selftests/bpf/progs file is
> >>>> renamed in order to avoid the collision.
> >>>>
> >>>> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
> >>>> ---
> >>>> include/linux/bpf.h | 8 +-
> >>>> kernel/bpf/helpers.c | 3 +
> >>>> kernel/bpf/task_iter.c | 96 ++++++++++++++++---
> >>>> .../testing/selftests/bpf/bpf_experimental.h | 5 +
> >>>> .../selftests/bpf/prog_tests/bpf_iter.c | 18 ++--
> >>>> .../{bpf_iter_task.c => bpf_iter_tasks.c} | 0
> >>>> 6 files changed, 106 insertions(+), 24 deletions(-)
> >>>> rename tools/testing/selftests/bpf/progs/{bpf_iter_task.c => bpf_iter_tasks.c} (100%)
> >>>>
> >>>
>
>
> [...]
>
> >>>> +get_next_task:
> >>>> + kit->pos = next_task(kit->pos);
> >>>> + kit->task = kit->pos;
> >>>> + if (kit->pos == &init_task)
> >>>> + kit->pos = NULL;
> >>>
> >>> I can't say I completely follow the logic (e.g., for
> >>> BPF_TASK_ITER_PROC, why do we do next_task() on first next() call)?
> >>> Can you elabore the expected behavior for various combinations of
> >>> types and starting task argument?
> >>>
> >>
> >> Thanks for the review.
> >>
> >> The expected behavior of current implementation is:
> >>
> >> BPF_TASK_ITER_PROC:
> >>
> >> init_task->first_process->second_process->...->last_process->init_task
> >>
> >> We would exit before visiting init_task again.
> >
> > ah, ok, so in this case it's more like BPF_TASK_ITER_ALL_PROCS, i.e.,
> > we iterate all processes in the system. Input `task` that we provide
> > is ignored/meaningless, right? Maybe we should express it as
> > ALL_PROCS?
> >
> >>
> >> BPF_TASK_ITER_THREAD:
> >>
> >> group_task->first_thread->second_thread->...->last_thread->group_task
> >>
> >> We would exit before visiting group_task again.
> >>
> >
> > And this one is iterating threads of a process specified by given
> > `task`, right? This is where my confusion comes from. ITER_PROC and
> > ITER_THREAD, by their name, seems to be very similar, but in reality
> > ITER_PROC is more like ITER_ALL (except process vs thread iteration),
> > while ITER_THREAD is parameterized by input `task`.
> >
> > I'm not sure what's the least confusing way to name and organize
> > everything, but I think it's quite confusing right now, unfortunately.
> > I wonder if you or someone else have a better suggestion on making
> > this more straightforward?
> >
>
> Maybe here we can introduce new enums and not reuse or rename
> BPF_TASK_ITER_TID/BPF_TASK_ITER_TGID?
Yep, probably it's cleaner
>
> {
> BPF_TASK_ITER_ALL_PROC,
BPF_TASK_ITER_ALL_PROCS
> BPF_TASK_ITER_ALL_THREAD,
BPF_TASK_ITER_ALL_THREADS
> BPF_TASK_ITER_THREAD
BPF_TASK_ITER_PROC_THREADS ?
> }
>
> BPF_TASK_ITER_TID/BPF_TASK_ITER_TGID are inner flags. Looking at the
> example usage of SEC("iter/task"), unlike
> BPF_CGROUP_ITER_DESCENDANTS_PRE/BPF_CGROUP_ITER_DESCENDANTS_POST, we
> actually don't use BPF_TASK_ITER_TID/BPF_TASK_ITER_TGID directly. When
> using SEC("iter/task"), we just set pid/tid for struct
> bpf_iter_link_info. Exposing new enums to users for open coded
> task_iters will not confuse users.
>
> Thanks.
>
@@ -2194,16 +2194,16 @@ int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags);
* BPF_TASK_ITER_ALL (default)
* Iterate over resources of every task.
*
- * BPF_TASK_ITER_TID
+ * BPF_TASK_ITER_THREAD
* Iterate over resources of a task/tid.
*
- * BPF_TASK_ITER_TGID
+ * BPF_TASK_ITER_PROC
* Iterate over resources of every task of a process / task group.
*/
enum bpf_iter_task_type {
BPF_TASK_ITER_ALL = 0,
- BPF_TASK_ITER_TID,
- BPF_TASK_ITER_TGID,
+ BPF_TASK_ITER_THREAD,
+ BPF_TASK_ITER_PROC,
};
struct bpf_iter_aux_info {
@@ -2507,6 +2507,9 @@ BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
BTF_ID_FLAGS(func, bpf_dynptr_adjust)
BTF_ID_FLAGS(func, bpf_dynptr_is_null)
BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
@@ -94,7 +94,7 @@ static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *co
struct task_struct *task = NULL;
struct pid *pid;
- if (common->type == BPF_TASK_ITER_TID) {
+ if (common->type == BPF_TASK_ITER_THREAD) {
if (*tid && *tid != common->pid)
return NULL;
rcu_read_lock();
@@ -108,7 +108,7 @@ static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *co
return task;
}
- if (common->type == BPF_TASK_ITER_TGID) {
+ if (common->type == BPF_TASK_ITER_PROC) {
rcu_read_lock();
task = task_group_seq_get_next(common, tid, skip_if_dup_files);
rcu_read_unlock();
@@ -217,15 +217,15 @@ static int bpf_iter_attach_task(struct bpf_prog *prog,
aux->task.type = BPF_TASK_ITER_ALL;
if (linfo->task.tid != 0) {
- aux->task.type = BPF_TASK_ITER_TID;
+ aux->task.type = BPF_TASK_ITER_THREAD;
aux->task.pid = linfo->task.tid;
}
if (linfo->task.pid != 0) {
- aux->task.type = BPF_TASK_ITER_TGID;
+ aux->task.type = BPF_TASK_ITER_PROC;
aux->task.pid = linfo->task.pid;
}
if (linfo->task.pid_fd != 0) {
- aux->task.type = BPF_TASK_ITER_TGID;
+ aux->task.type = BPF_TASK_ITER_PROC;
pid = pidfd_get_pid(linfo->task.pid_fd, &flags);
if (IS_ERR(pid))
@@ -305,7 +305,7 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
rcu_read_unlock();
put_task_struct(curr_task);
- if (info->common.type == BPF_TASK_ITER_TID) {
+ if (info->common.type == BPF_TASK_ITER_THREAD) {
info->task = NULL;
return NULL;
}
@@ -566,7 +566,7 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
return curr_vma;
next_task:
- if (info->common.type == BPF_TASK_ITER_TID)
+ if (info->common.type == BPF_TASK_ITER_THREAD)
goto finish;
put_task_struct(curr_task);
@@ -677,10 +677,10 @@ static const struct bpf_iter_seq_info task_seq_info = {
static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info)
{
switch (aux->task.type) {
- case BPF_TASK_ITER_TID:
+ case BPF_TASK_ITER_THREAD:
info->iter.task.tid = aux->task.pid;
break;
- case BPF_TASK_ITER_TGID:
+ case BPF_TASK_ITER_PROC:
info->iter.task.pid = aux->task.pid;
break;
default:
@@ -692,9 +692,9 @@ static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct b
static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq)
{
seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]);
- if (aux->task.type == BPF_TASK_ITER_TID)
+ if (aux->task.type == BPF_TASK_ITER_THREAD)
seq_printf(seq, "tid:\t%u\n", aux->task.pid);
- else if (aux->task.type == BPF_TASK_ITER_TGID)
+ else if (aux->task.type == BPF_TASK_ITER_PROC)
seq_printf(seq, "pid:\t%u\n", aux->task.pid);
}
@@ -856,6 +856,80 @@ __bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
bpf_mem_free(&bpf_global_ma, kit->css_it);
}
+struct bpf_iter_task {
+ __u64 __opaque[2];
+ __u32 __opaque_int[1];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_task_kern {
+ struct task_struct *task;
+ struct task_struct *pos;
+ unsigned int type;
+} __attribute__((aligned(8)));
+
+__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it, struct task_struct *task, unsigned int type)
+{
+ struct bpf_iter_task_kern *kit = (void *)it;
+ BUILD_BUG_ON(sizeof(struct bpf_iter_task_kern) != sizeof(struct bpf_iter_task));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) !=
+ __alignof__(struct bpf_iter_task));
+ kit->task = kit->pos = NULL;
+ switch (type) {
+ case BPF_TASK_ITER_ALL:
+ case BPF_TASK_ITER_PROC:
+ case BPF_TASK_ITER_THREAD:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (type == BPF_TASK_ITER_THREAD)
+ kit->task = task;
+ else
+ kit->task = &init_task;
+ kit->pos = kit->task;
+ kit->type = type;
+ return 0;
+}
+
+__bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it)
+{
+ struct bpf_iter_task_kern *kit = (void *)it;
+ struct task_struct *pos;
+ unsigned int type;
+
+ type = kit->type;
+ pos = kit->pos;
+
+ if (!pos)
+ goto out;
+
+ if (type == BPF_TASK_ITER_PROC)
+ goto get_next_task;
+
+ kit->pos = next_thread(kit->pos);
+ if (kit->pos == kit->task) {
+ if (type == BPF_TASK_ITER_THREAD) {
+ kit->pos = NULL;
+ goto out;
+ }
+ } else
+ goto out;
+
+get_next_task:
+ kit->pos = next_task(kit->pos);
+ kit->task = kit->pos;
+ if (kit->pos == &init_task)
+ kit->pos = NULL;
+
+out:
+ return pos;
+}
+
+__bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it)
+{
+}
+
DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
static void do_mmap_read_unlock(struct irq_work *entry)
@@ -169,4 +169,9 @@ extern int bpf_iter_css_task_new(struct bpf_iter_css_task *it,
extern struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it) __weak __ksym;
extern void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it) __weak __ksym;
+struct bpf_iter_task;
+extern int bpf_iter_task_new(struct bpf_iter_task *it, struct task_struct *task, unsigned int type) __weak __ksym;
+extern struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it) __weak __ksym;
+extern void bpf_iter_task_destroy(struct bpf_iter_task *it) __weak __ksym;
+
#endif
@@ -7,7 +7,7 @@
#include "bpf_iter_ipv6_route.skel.h"
#include "bpf_iter_netlink.skel.h"
#include "bpf_iter_bpf_map.skel.h"
-#include "bpf_iter_task.skel.h"
+#include "bpf_iter_tasks.skel.h"
#include "bpf_iter_task_stack.skel.h"
#include "bpf_iter_task_file.skel.h"
#include "bpf_iter_task_vma.skel.h"
@@ -215,12 +215,12 @@ static void *do_nothing_wait(void *arg)
static void test_task_common_nocheck(struct bpf_iter_attach_opts *opts,
int *num_unknown, int *num_known)
{
- struct bpf_iter_task *skel;
+ struct bpf_iter_tasks *skel;
pthread_t thread_id;
void *ret;
- skel = bpf_iter_task__open_and_load();
- if (!ASSERT_OK_PTR(skel, "bpf_iter_task__open_and_load"))
+ skel = bpf_iter_tasks__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "bpf_iter_tasks__open_and_load"))
return;
ASSERT_OK(pthread_mutex_lock(&do_nothing_mutex), "pthread_mutex_lock");
@@ -239,7 +239,7 @@ static void test_task_common_nocheck(struct bpf_iter_attach_opts *opts,
ASSERT_FALSE(pthread_join(thread_id, &ret) || ret != NULL,
"pthread_join");
- bpf_iter_task__destroy(skel);
+ bpf_iter_tasks__destroy(skel);
}
static void test_task_common(struct bpf_iter_attach_opts *opts, int num_unknown, int num_known)
@@ -307,10 +307,10 @@ static void test_task_pidfd(void)
static void test_task_sleepable(void)
{
- struct bpf_iter_task *skel;
+ struct bpf_iter_tasks *skel;
- skel = bpf_iter_task__open_and_load();
- if (!ASSERT_OK_PTR(skel, "bpf_iter_task__open_and_load"))
+ skel = bpf_iter_tasks__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "bpf_iter_tasks__open_and_load"))
return;
do_dummy_read(skel->progs.dump_task_sleepable);
@@ -320,7 +320,7 @@ static void test_task_sleepable(void)
ASSERT_GT(skel->bss->num_success_copy_from_user_task, 0,
"num_success_copy_from_user_task");
- bpf_iter_task__destroy(skel);
+ bpf_iter_tasks__destroy(skel);
}
static void test_task_stack(void)
similarity index 100%
rename from tools/testing/selftests/bpf/progs/bpf_iter_task.c
rename to tools/testing/selftests/bpf/progs/bpf_iter_tasks.c