[RFC,v2,2/5] mm: Add policy_name to identify OOM policies

Message ID 20230810081319.65668-3-zhouchuyi@bytedance.com
State New
Headers
Series mm: Select victim using bpf_oom_evaluate_task |

Commit Message

Chuyi Zhou Aug. 10, 2023, 8:13 a.m. UTC
  This patch adds a new metadata field, policy_name, to oom_control and reports
it in dump_header(), so we can tell which selection policy was used. In a
BPF program, we can call the kfunc set_oom_policy_name to set the current
user-defined policy name. The in-kernel policy_name is "default".

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
---
 include/linux/oom.h |  7 +++++++
 mm/oom_kill.c       | 42 +++++++++++++++++++++++++++++++++++++++---
 2 files changed, 46 insertions(+), 3 deletions(-)
  

Comments

Bixuan Cui Sept. 14, 2023, 12:02 p.m. UTC | #1
在 2023/8/15 4:51, Jonathan Corbet 写道:
>>   /**
>>    * dump_tasks - dump current memory state of all system tasks
>>    * @oc: pointer to struct oom_control
>> @@ -484,8 +513,8 @@ static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
>>   
>>   static void dump_header(struct oom_control *oc, struct task_struct *p)
>>   {
>> -	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
>> -		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
>> +	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, policy_name=%s, oom_score_adj=%hd\n",
>> +		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, oc->policy_name,
> ...and if the policy name is unterminated, this print will run off the
> end of the structure.
> 
> Am I missing something here?
Perhaps it is inaccurate to use the policy name in this way. For example,
someone may use BPF_PROG(bpf_oom_evaluate_task, ...) but not set the
policy name through bpf_set_policy_name. In that case, the result is
still policy_name=default, which ultimately leads to a misleading print in
dump_header().
I think a better way would be:

+static const char *const policy_select[] = {
+    "OOM_DEFAULT";
+    "BPF_ABORT",
+    "BPF_NEXT",
+    "BPF_SELECT",
+};

struct oom_control {

  	/* Used to print the constraint info. */
  	enum oom_constraint constraint;
+
+	/* Used to report the policy select. */
+	int policy_select;
  };

static int oom_evaluate_task(struct task_struct *task, void *arg)
{
...

+	switch (bpf_oom_evaluate_task(task, oc)) {
+	case BPF_EVAL_ABORT:
+              oc->policy_select = BPF_EVAL_ABORT;
+		goto abort; /* abort search process */
+	case BPF_EVAL_NEXT:
+              oc->policy_select = BPF_EVAL_NEXT;
+		goto next; /* ignore the task */
+	case BPF_EVAL_SELECT:
+             oc->policy_select = BPF_EVAL_SELECT;
+		goto select; /* select the task */
+	default:
+		break; /* No BPF policy */
+	}

  static void dump_header(struct oom_control *oc, struct task_struct *p)
  {
-	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, 
oom_score_adj=%hd\n",
-		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
+	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, 
policy_name=%s, oom_score_adj=%hd\n",
+		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, 
policy_select[oc->policy_select],
  			current->signal->oom_score_adj);


And every definition of oc should add the following initializer:
struct oom_control oc = {
      .select = NO_BPF_POLICY,
}

Delete set_oom_policy_name, it makes no sense for users to set policy 
names. :-)

Thanks
Bixuan Cui
  
Bixuan Cui Sept. 14, 2023, 12:04 p.m. UTC | #2
在 2023/8/15 4:51, Jonathan Corbet 写道:
>>   /**
>>    * dump_tasks - dump current memory state of all system tasks
>>    * @oc: pointer to struct oom_control
>> @@ -484,8 +513,8 @@ static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
>>   
>>   static void dump_header(struct oom_control *oc, struct task_struct *p)
>>   {
>> -	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
>> -		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
>> +	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, policy_name=%s, oom_score_adj=%hd\n",
>> +		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, oc->policy_name,
> ...and if the policy name is unterminated, this print will run off the
> end of the structure.
> 
> Am I missing something here?
Perhaps it is inaccurate to use the policy name in this way. For example,
someone may use BPF_PROG(bpf_oom_evaluate_task, ...) but not set the
policy name through bpf_set_policy_name. In that case, the result is
still policy_name=default, which ultimately leads to a misleading print in
dump_header().
I think a better way would be:

+static const char *const policy_select[] = {
+    "OOM_DEFAULT";
+    "BPF_ABORT",
+    "BPF_NEXT",
+    "BPF_SELECT",
+};

struct oom_control {

  	/* Used to print the constraint info. */
  	enum oom_constraint constraint;
+
+	/* Used to report the policy select. */
+	int policy_select;
  };

static int oom_evaluate_task(struct task_struct *task, void *arg)
{
...

+	switch (bpf_oom_evaluate_task(task, oc)) {
+	case BPF_EVAL_ABORT:
+              oc->policy_select = BPF_EVAL_ABORT;
+		goto abort; /* abort search process */
+	case BPF_EVAL_NEXT:
+              oc->policy_select = BPF_EVAL_NEXT;
+		goto next; /* ignore the task */
+	case BPF_EVAL_SELECT:
+             oc->policy_select = BPF_EVAL_SELECT;
+		goto select; /* select the task */
+	default:
+		break; /* No BPF policy */
+	}

  static void dump_header(struct oom_control *oc, struct task_struct *p)
  {
-	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, 
oom_score_adj=%hd\n",
-		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
+	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, 
policy_name=%s, oom_score_adj=%hd\n",
+		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, 
policy_select[oc->policy_select],
  			current->signal->oom_score_adj);


And every definition of oc should add the following initializer:
struct oom_control oc = {
      .select = NO_BPF_POLICY,
}

Delete set_oom_policy_name, it makes no sense for users to set policy 
names. :-)

Thanks
Bixuan Cui
  
Chuyi Zhou Sept. 14, 2023, 12:50 p.m. UTC | #3
Hello,

在 2023/9/14 20:02, Bixuan Cui 写道:
> 
> 
> 在 2023/8/15 4:51, Jonathan Corbet 写道:
>>>   /**
>>>    * dump_tasks - dump current memory state of all system tasks
>>>    * @oc: pointer to struct oom_control
>>> @@ -484,8 +513,8 @@ static void dump_oom_summary(struct oom_control 
>>> *oc, struct task_struct *victim)
>>>   static void dump_header(struct oom_control *oc, struct task_struct *p)
>>>   {
>>> -    pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, 
>>> oom_score_adj=%hd\n",
>>> -        current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
>>> +    pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, 
>>> policy_name=%s, oom_score_adj=%hd\n",
>>> +        current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, 
>>> oc->policy_name,
>> ...and if the policy name is unterminated, this print will run off the
>> end of the structure.
>>
>> Am I missing something here?
> Perhaps it is inaccurate to use policy name in this way. For example, 
> some one use BPF_PROG(bpf_oom_evaluate_task, ...) but do not set the 
> policy name through bpf_set_policy_name. In this way, the result is 
> still policy name=default, which ultimately leads to error print in the 
> dump_header.
> I think a better way:
> 
> +static const char *const policy_select[] = {
> +    "OOM_DEFAULT";
> +    "BPF_ABORT",
> +    "BPF_NEXT",
> +    "BPF_SELECT",
> +};
> 
> struct oom_control {
> 
>       /* Used to print the constraint info. */
>       enum oom_constraint constraint;
> +
> +    /* Used to report the policy select. */
> +    int policy_select;
>   };
> 
> static int oom_evaluate_task(struct task_struct *task, void *arg)
> {
> ...
> 
> +    switch (bpf_oom_evaluate_task(task, oc)) {
> +    case BPF_EVAL_ABORT:
> +              oc->policy_select = BPF_EVAL_ABORT;
> +        goto abort; /* abort search process */
> +    case BPF_EVAL_NEXT:
> +              oc->policy_select = BPF_EVAL_NEXT;
> +        goto next; /* ignore the task */
> +    case BPF_EVAL_SELECT:
> +             oc->policy_select = BPF_EVAL_SELECT;
> +        goto select; /* select the task */
> +    default:
> +        break; /* No BPF policy */
> +    }
> 
>   static void dump_header(struct oom_control *oc, struct task_struct *p)
>   {
> -    pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, 
> oom_score_adj=%hd\n",
> -        current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
> +    pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, 
> policy_name=%s, oom_score_adj=%hd\n",
> +        current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, 
> policy_select[oc->policy_select],
>               current->signal->oom_score_adj);
> 
> 

The policy_name may differ from the one in a previous OOM report, even
though both are using the same policy.

> And all definitions of oc should be added
> struct oom_control oc = {
>       .select = NO_BPF_POLICY,
> }
> 
> Delete set_oom_policy_name, it makes no sense for users to set policy 
> names. :-)
> 

There can be multiple OOM policies in the system at the same time.

If we need to apply different OOM policies to different memcgs based on
different scenarios, we can use this hook (set_oom_policy_name) to set
a name that identifies which policy is invoked at that time.

Just some thoughts.

Thanks.

> Thanks
> Bixuan Cui
> 
> 
> 
> 
>
  
Bixuan Cui Sept. 15, 2023, 2:28 a.m. UTC | #4
在 2023/9/14 20:50, Chuyi Zhou 写道:
>>
>> Delete set_oom_policy_name, it makes no sense for users to set policy 
>> names. 🙂
>>
> 
> There can be multiple OOM policy in the system at the same time.
> 
> If we need to apply different OOM policies to different memcgs based on 
> different scenarios, we can use this hook(set_oom_policy_name) to set 
> name to identify which policy in invoked at that time.
> 
> Just some thoughts.
Well, I thought the system would only load one OOM policy (and set one policy
name), which would set the priority of all memcgs.

Do you mean that multiple bpf.c programs may be loaded in the system, and
each bpf.c sets a different policy for different memcgs?
  
Chuyi Zhou Sept. 15, 2023, 3:31 a.m. UTC | #5
在 2023/9/15 10:28, Bixuan Cui 写道:
> 
> 
> 在 2023/9/14 20:50, Chuyi Zhou 写道:
>>>
>>> Delete set_oom_policy_name, it makes no sense for users to set policy 
>>> names. 🙂
>>>
>>
>> There can be multiple OOM policy in the system at the same time.
>>
>> If we need to apply different OOM policies to different memcgs based 
>> on different scenarios, we can use this hook(set_oom_policy_name) to 
>> set name to identify which policy in invoked at that time.
>>
>> Just some thoughts.
> Well, I thought the system would only load one OOM policy(set one policy 
> name), which would set the prio of all memcgs.
> 
> What you mean is that multiple bpf.c may be loaded at the system, and 
> each bpf.c sets the different policy for different memcgs?

No. Not multiple bpf_oompolicy.c programs, but multiple OOM policies in one BPF program.
  

Patch

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 7d0c9c48a0c5..69d0f2ec6ea6 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -22,6 +22,10 @@  enum oom_constraint {
 	CONSTRAINT_MEMCG,
 };
 
+enum {
+	POLICY_NAME_LEN = 16,
+};
+
 /*
  * Details of the page allocation that triggered the oom killer that are used to
  * determine what should be killed.
@@ -52,6 +56,9 @@  struct oom_control {
 
 	/* Used to print the constraint info. */
 	enum oom_constraint constraint;
+
+	/* Used to report the policy info. */
+	char policy_name[POLICY_NAME_LEN];
 };
 
 extern struct mutex oom_lock;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 255c9ef1d808..3239dcdba4d7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -443,6 +443,35 @@  static int dump_task(struct task_struct *p, void *arg)
 	return 0;
 }
 
+__bpf_kfunc void set_oom_policy_name(struct oom_control *oc, const char *src, size_t sz)
+{
+	memset(oc->policy_name, 0, sizeof(oc->policy_name));
+
+	if (sz > POLICY_NAME_LEN)
+		sz = POLICY_NAME_LEN;
+
+	memcpy(oc->policy_name, src, sz);
+}
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		  "kfuncs which will be used in BPF programs");
+
+__weak noinline void bpf_set_policy_name(struct oom_control *oc)
+{
+}
+
+__diag_pop();
+
+BTF_SET8_START(bpf_oom_policy_kfunc_ids)
+BTF_ID_FLAGS(func, set_oom_policy_name)
+BTF_SET8_END(bpf_oom_policy_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_oom_policy_kfunc_set = {
+	.owner          = THIS_MODULE,
+	.set            = &bpf_oom_policy_kfunc_ids,
+};
+
 /**
  * dump_tasks - dump current memory state of all system tasks
  * @oc: pointer to struct oom_control
@@ -484,8 +513,8 @@  static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
 
 static void dump_header(struct oom_control *oc, struct task_struct *p)
 {
-	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
-		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
+	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, policy_name=%s, oom_score_adj=%hd\n",
+		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, oc->policy_name,
 			current->signal->oom_score_adj);
 	if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
 		pr_warn("COMPACTION is disabled!!!\n");
@@ -775,8 +804,11 @@  static int __init oom_init(void)
 	err = register_btf_fmodret_id_set(&oom_bpf_fmodret_set);
 	if (err)
 		pr_warn("error while registering oom fmodret entrypoints: %d", err);
+	err = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+					&bpf_oom_policy_kfunc_set);
+	if (err)
+		pr_warn("error while registering oom kfunc entrypoints: %d", err);
 #endif
-
 	return 0;
 }
 subsys_initcall(oom_init)
@@ -1196,6 +1228,10 @@  bool out_of_memory(struct oom_control *oc)
 		return true;
 	}
 
+	set_oom_policy_name(oc, "default", sizeof("default"));
+
+	bpf_set_policy_name(oc);
+
 	select_bad_process(oc);
 	/* Found nothing?!?! */
 	if (!oc->chosen) {