[RFC,v2,1/5] mm, oom: Introduce bpf_oom_evaluate_task
Commit Message
This patch adds a new hook, bpf_oom_evaluate_task, to oom_evaluate_task. It
takes oc and the task currently being iterated as parameters and returns a
result indicating how that task should be treated: abort the search, skip
the task, select it, or fall back to the default logic. We can use it to
bypass the current logic of oom_evaluate_task and implement customized OOM
policies in the attached BPF programs.
Suggested-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
---
mm/oom_kill.c | 59 +++++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 50 insertions(+), 9 deletions(-)
Comments
在 2023/8/10 16:13, Chuyi Zhou 写道:
> +#include <linux/bpf.h>
> #include <linux/oom.h>
> #include <linux/mm.h>
> #include <linux/err.h>
> @@ -305,6 +306,27 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
> return CONSTRAINT_NONE;
> }
> +enum {
> + NO_BPF_POLICY,
> + BPF_EVAL_ABORT,
> + BPF_EVAL_NEXT,
> + BPF_EVAL_SELECT,
> +};
> +
I saw that tools/testing/selftests/bpf/progs/oom_policy.c is also using
NO_BPF_POLICY etc. I think
+enum {
+ NO_BPF_POLICY,
+ BPF_EVAL_ABORT,
+ BPF_EVAL_NEXT,
+ BPF_EVAL_SELECT,
+};
+
definitions can be placed in include/linux/oom.h
Thanks
Bixuan Cui
Hello, Bixuan.
在 2023/9/13 09:18, Bixuan Cui 写道:
>
>
> 在 2023/8/10 16:13, Chuyi Zhou 写道:
>> +#include <linux/bpf.h>
>> #include <linux/oom.h>
>> #include <linux/mm.h>
>> #include <linux/err.h>
>> @@ -305,6 +306,27 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
>> return CONSTRAINT_NONE;
>> }
>> +enum {
>> + NO_BPF_POLICY,
>> + BPF_EVAL_ABORT,
>> + BPF_EVAL_NEXT,
>> + BPF_EVAL_SELECT,
>> +};
>> +
>
> I saw that tools/testing/selftests/bpf/progs/oom_policy.c is also using
> NO_BPF_POLICY etc. I think
> +enum {
> + NO_BPF_POLICY,
> + BPF_EVAL_ABORT,
> + BPF_EVAL_NEXT,
> + BPF_EVAL_SELECT,
> +};
> +
> definitions can be placed in include/linux/oom.h
>
Thanks for your feedback!
Yes, maybe we should move these enums to a more appropriate place so that
BTF is generated for them and we can take them from vmlinux.h.
> Thanks
> Bixuan Cui
在 2023/8/10 16:13, Chuyi Zhou 写道:
> +
> +__weak noinline int bpf_oom_evaluate_task(struct task_struct *task, struct oom_control *oc)
> +{
> + return NO_BPF_POLICY;
> +}
> +
> +BTF_SET8_START(oom_bpf_fmodret_ids)
> +BTF_ID_FLAGS(func, bpf_oom_evaluate_task)
> +BTF_SET8_END(oom_bpf_fmodret_ids)
I have a question here: why use __weak? Are there other modules that can
redefine bpf_oom_evaluate_task? Why not use __bpf_kfunc
(Documentation/bpf/kfuncs.rst)?
Thanks
Bixuan Cui
@@ -18,6 +18,7 @@
* kernel subsystems and hints as to where to find out what things do.
*/
+#include <linux/bpf.h>
#include <linux/oom.h>
#include <linux/mm.h>
#include <linux/err.h>
@@ -305,6 +306,27 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
return CONSTRAINT_NONE;
}
+enum { /* verdicts returned by bpf_oom_evaluate_task() and consumed by oom_evaluate_task() */
+ NO_BPF_POLICY, /* no BPF verdict: oom_evaluate_task() continues with its built-in checks */
+ BPF_EVAL_ABORT, /* abort the search process */
+ BPF_EVAL_NEXT, /* ignore this task */
+ BPF_EVAL_SELECT, /* select this task */
+};
+
+__weak noinline int bpf_oom_evaluate_task(struct task_struct *task, struct oom_control *oc)
+{
+ return NO_BPF_POLICY; /* default body: no verdict, so the caller keeps its built-in logic; NOTE(review): presumably overridden by an attached BPF program - confirm */
+}
+
+BTF_SET8_START(oom_bpf_fmodret_ids) /* ID set referenced by oom_bpf_fmodret_set */
+BTF_ID_FLAGS(func, bpf_oom_evaluate_task) /* the only function listed in this set */
+BTF_SET8_END(oom_bpf_fmodret_ids)
+
+static const struct btf_kfunc_id_set oom_bpf_fmodret_set = { /* handed to register_btf_fmodret_id_set() in oom_init() */
+ .owner = THIS_MODULE,
+ .set = &oom_bpf_fmodret_ids, /* the BTF ID set that lists bpf_oom_evaluate_task */
+};
+
static int oom_evaluate_task(struct task_struct *task, void *arg)
{
struct oom_control *oc = arg;
@@ -317,6 +339,26 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))
goto next;
+ /*
+ * If task is allocating a lot of memory and has been marked to be
+ * killed first if it triggers an oom, then select it.
+ */
+ if (oom_task_origin(task)) {
+ points = LONG_MAX;
+ goto select;
+ }
+
+ switch (bpf_oom_evaluate_task(task, oc)) {
+ case BPF_EVAL_ABORT:
+ goto abort; /* abort search process */
+ case BPF_EVAL_NEXT:
+ goto next; /* ignore the task */
+ case BPF_EVAL_SELECT:
+ goto select; /* select the task */
+ default:
+ break; /* No BPF policy */
+ }
+
/*
* This task already has access to memory reserves and is being killed.
* Don't allow any other task to have access to the reserves unless
@@ -329,15 +371,6 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
goto abort;
}
- /*
- * If task is allocating a lot of memory and has been marked to be
- * killed first if it triggers an oom, then select it.
- */
- if (oom_task_origin(task)) {
- points = LONG_MAX;
- goto select;
- }
-
points = oom_badness(task, oc->totalpages);
if (points == LONG_MIN || points < oc->chosen_points)
goto next;
@@ -732,10 +765,18 @@ static struct ctl_table vm_oom_kill_table[] = {
static int __init oom_init(void)
{
+ int err __maybe_unused; /* only assigned and read when CONFIG_BPF_SYSCALL is set */
oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
#ifdef CONFIG_SYSCTL
register_sysctl_init("vm", vm_oom_kill_table);
#endif
+
+#ifdef CONFIG_BPF_SYSCALL
+ err = register_btf_fmodret_id_set(&oom_bpf_fmodret_set);
+ if (err) /* best effort: a failed registration is only logged, init still returns 0 */
+ pr_warn("error while registering oom fmodret entrypoints: %d\n", err);
+#endif
+
return 0;
}
subsys_initcall(oom_init)