[1/1] sched: fix user_mask double free

Message ID b8970a530d420109ee9fe0b268e097fb839211b0.1669020858.git.wangbiao3@xiaomi.com
State New
Series sched: fix user_mask double free

Commit Message

David Wang 王标 Nov. 21, 2022, 10:04 a.m. UTC
  From: wangbiao3 <wangbiao3@xiaomi.com>

When a task is cloned/forked, dup_task_struct() calls
arch_dup_task_struct(tsk, orig), which copies the parent/sibling task's
data including p->user_cpus_ptr, so the new task's user_cpus_ptr points
to the same mask as the orig task's. When dup_task_struct() then calls
dup_user_cpus_ptr(tsk, orig, node), it returns 0 directly if
src->user_cpus_ptr has meanwhile been freed by another task; in that
case the new task's user_cpus_ptr is left unchanged and still points at
the freed mask. Finally, when the new task is woken up to execute,
select_fallback_rq() (via task_cpu_possible_mask()) calls
do_set_cpus_allowed() to set the new task's user_cpus_ptr (user_mask),
which kfree()s the old user_mask at the end. This causes a SLUB
double-free panic.

Use pi_lock to protect the content of user_cpus_ptr in
dup_user_cpus_ptr(), and clear dst->user_cpus_ptr when
src->user_cpus_ptr is found to be NULL.

kernel BUG at mm/slub.c:363!
Call trace:
 __slab_free+0x230/0x28c
 kfree+0x220/0x2cc
 do_set_cpus_allowed+0x74/0xa4
 select_fallback_rq+0x12c/0x200
 wake_up_new_task+0x26c/0x304
 kernel_clone+0x2c0/0x470
 __arm64_sys_clone+0x5c/0x8c
 invoke_syscall+0x60/0x150
 el0_svc_common.llvm.13030543509303927816+0x98/0x114
 do_el0_svc_compat+0x20/0x30
 el0_svc_compat+0x28/0x90
 el0t_32_sync_handler+0x7c/0xbc
 el0t_32_sync+0x1b8/0x1bc

Signed-off-by: wangbiao3 <wangbiao3@xiaomi.com>
---
 kernel/sched/core.c | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)
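
For reference, dup_user_cpus_ptr() at the base this patch applies to looks
roughly like the sketch below (an illustration rather than a verbatim copy of
kernel/sched/core.c); the comment marks the window the changelog describes:

int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
		      int node)
{
	/*
	 * dst->user_cpus_ptr was already byte-copied from src by
	 * arch_dup_task_struct(). If src->user_cpus_ptr has been cleared
	 * and freed in the meantime, this early return leaves dst holding
	 * the stale pointer, which do_set_cpus_allowed() later kfree()s
	 * again -> double free.
	 */
	if (!src->user_cpus_ptr)
		return 0;

	dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
	if (!dst->user_cpus_ptr)
		return -ENOMEM;

	cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
	return 0;
}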

--
2.38.1

#/****** This e-mail and its attachments contain confidential information from XIAOMI, which is intended only for the person or entity whose address is listed above. Any use of the information contained herein in any way (including, but not limited to, total or partial disclosure, reproduction, or dissemination) by persons other than the intended recipient(s) is prohibited. If you receive this e-mail in error, please notify the sender by phone or email immediately and delete it!******/#
  

Comments

Quentin Perret Nov. 22, 2022, 1:18 p.m. UTC | #1
+CC the missing people from get_maintainers.pl

On Monday 21 Nov 2022 at 18:04:20 (+0800), wangbiao3@xiaomi.com wrote:
> From: wangbiao3 <wangbiao3@xiaomi.com>
> 
> Clone/Fork a new task,call dup_task_struct->arch_dup_task_struct(tsk,orig)
> which copy the data of parent/sibling task inclding p->user_cpus_ptr,so
> the user_cpus_ptr of newtask is the same with orig task's.When
> dup_task_struct call dup_user_cpus_ptr(tsk, orig, node),it return 0
> dircetly if src->user_cpus_ptris free by other task,in this case ,
> the newtask's address of user_cpus_ptr is not changed. Finally,
> wakup newtask to execute, call task_cpu_possible_mask-->
> do_set_cpus_allowed to set new task's user_cpus_ptr(user_mask) which
> call kfree user_mask at the end. So cause a slub double free panic.
> 
> Use pi_lock to protect content of user_cpus_ptr in dup_user_cpus_ptr and
> clear dst->user_cpus_ptr when found src->user_cpus_ptr is null
> 
> kernel BUG at mm/slub.c:363!
> Call trace:
>  __slab_free+0x230/0x28c
>  kfree+0x220/0x2cc
>  do_set_cpus_allowed+0x74/0xa4
>  select_fallback_rq+0x12c/0x200
>  wake_up_new_task+0x26c/0x304
>  kernel_clone+0x2c0/0x470
>  __arm64_sys_clone+0x5c/0x8c
>  invoke_syscall+0x60/0x150
>  el0_svc_common.llvm.13030543509303927816+0x98/0x114
>  do_el0_svc_compat+0x20/0x30
>  el0_svc_compat+0x28/0x90
>  el0t_32_sync_handler+0x7c/0xbc
>  el0t_32_sync+0x1b8/0x1bc
> 
> Signed-off-by: wangbiao3 <wangbiao3@xiaomi.com>
> ---
>  kernel/sched/core.c | 31 ++++++++++++++++++++-----------
>  1 file changed, 20 insertions(+), 11 deletions(-)
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index daff72f00385..b013d8b777b4 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2584,29 +2584,38 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
>         __do_set_cpus_allowed(p, new_mask, 0);
>  }
> 
> +static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
> +{
> +       struct cpumask *user_mask = NULL;
> +
> +       swap(p->user_cpus_ptr, user_mask);
> +
> +       return user_mask;
> +}
> +
>  int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
>                       int node)
>  {
> -       if (!src->user_cpus_ptr)
> -               return 0;

Removing this puts the kmalloc_node() (and kfree()) below on the fast
path for everyone. It wouldn't be surprising if this causes regressions
for some people ...

Can we optimize that?
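
One way to keep the fast path (a sketch only): keep the early return as an
explicitly racy, unlocked check and handle a lost race under pi_lock further
down, e.g.:

	/*
	 * Racy by design: losing the race is caught by a re-check under
	 * src->pi_lock before the copy. data_race() documents the
	 * intentionally unlocked read.
	 */
	if (data_race(!src->user_cpus_ptr))
		return 0;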

> +       unsigned long flags;
> +       struct cpumask *user_mask = NULL;
> 
>         dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
>         if (!dst->user_cpus_ptr)
>                 return -ENOMEM;
> 
> +       /* Use pi_lock to protect content of user_cpus_ptr */
> +       raw_spin_lock_irqsave(&src->pi_lock, flags);
> +       if (!src->user_cpus_ptr) {
> +               user_mask = clear_user_cpus_ptr(dst);
> +               raw_spin_unlock_irqrestore(&src->pi_lock, flags);
> +               kfree(user_mask);
> +               return 0;
> +       }
>         cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
> +       raw_spin_unlock_irqrestore(&src->pi_lock, flags);
>         return 0;
>  }
> 
> -static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
> -{
> -       struct cpumask *user_mask = NULL;
> -
> -       swap(p->user_cpus_ptr, user_mask);
> -
> -       return user_mask;
> -}
> -
>  void release_user_cpus_ptr(struct task_struct *p)
>  {
>         kfree(clear_user_cpus_ptr(p));
> --
> 2.38.1
> 
> #/****** This e-mail and its attachments contain confidential information from XIAOMI, which is intended only for the person or entity whose address is listed above. Any use of the information contained herein in any way (including, but not limited to, total or partial disclosure, reproduction, or dissemination) by persons other than the intended recipient(s) is prohibited. If you receive this e-mail in error, please notify the sender by phone or email immediately and delete it!******/#

Note that this legalese must be removed from your email before anybody
can apply this patch, please see
Documentation/process/submitting-patches.rst.

Thanks,
Quentin
  
Peter Zijlstra Nov. 22, 2022, 2:05 p.m. UTC | #2
So you failed:

 - to Cc the original author of this code (Will Deacon)
 - to report what version this is against (apparently Linus' tree)
 - to check if this still applies to the latest tree (it doesn't)
 - to Cc the author of the code it now conflicts with (Waiman)
 - write something coherent in the changelog.
 - to include a Fixes tag.

Still, let me try and make sense of things...

On Mon, Nov 21, 2022 at 06:04:20PM +0800, wangbiao3@xiaomi.com wrote:
> From: wangbiao3 <wangbiao3@xiaomi.com>
> 
> Clone/Fork a new task,call dup_task_struct->arch_dup_task_struct(tsk,orig)
> which copy the data of parent/sibling task inclding p->user_cpus_ptr,so
> the user_cpus_ptr of newtask is the same with orig task's.When
> dup_task_struct call dup_user_cpus_ptr(tsk, orig, node),it return 0
> dircetly if src->user_cpus_ptris free by other task,in this case ,
> the newtask's address of user_cpus_ptr is not changed.

(even just inserting some whitespace would've made it so much easier to
read)

But, the only way that would be possible is if
force_compatible_cpus_allowed_ptr() were to be called on !current, and
that just doesn't happen, the only callsite is:

arch/arm64/kernel/process.c:                    force_compatible_cpus_allowed_ptr(current);

And you can't be in fork() and exec() at the same time.

If it were possible to call restrict_cpus_allowed_ptr() on a non-current
task then yes, absolutely, which is why:

  8f9ea86fdf99 ("sched: Always preserve the user requested cpumask")

also wraps the thing in pi_lock, but looking at it now, perhaps it needs
to do the alloc/copy first and swap under pi_lock instead.
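
A minimal sketch of that idea (allocate and copy into a local mask first, then
publish it under src->pi_lock); this is an illustration of the suggestion, not
the patch that eventually landed:

int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
		      int node)
{
	cpumask_t *user_mask;
	unsigned long flags;

	/* Never keep the pointer byte-copied by arch_dup_task_struct(). */
	dst->user_cpus_ptr = NULL;

	if (!src->user_cpus_ptr)
		return 0;

	user_mask = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
	if (!user_mask)
		return -ENOMEM;

	raw_spin_lock_irqsave(&src->pi_lock, flags);
	if (src->user_cpus_ptr) {
		cpumask_copy(user_mask, src->user_cpus_ptr);
		dst->user_cpus_ptr = user_mask;
		user_mask = NULL;	/* ownership moved to dst */
	}
	raw_spin_unlock_irqrestore(&src->pi_lock, flags);

	kfree(user_mask);		/* NULL if the copy was published */
	return 0;
}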

(leaving the rest for the newly Cc'ed)

> Finally,
> wakup newtask to execute, call task_cpu_possible_mask-->
> do_set_cpus_allowed to set new task's user_cpus_ptr(user_mask) which
> call kfree user_mask at the end. So cause a slub double free panic.
> 
> Use pi_lock to protect content of user_cpus_ptr in dup_user_cpus_ptr and
> clear dst->user_cpus_ptr when found src->user_cpus_ptr is null
> 
> kernel BUG at mm/slub.c:363!
> Call trace:
>  __slab_free+0x230/0x28c
>  kfree+0x220/0x2cc
>  do_set_cpus_allowed+0x74/0xa4
>  select_fallback_rq+0x12c/0x200
>  wake_up_new_task+0x26c/0x304
>  kernel_clone+0x2c0/0x470
>  __arm64_sys_clone+0x5c/0x8c
>  invoke_syscall+0x60/0x150
>  el0_svc_common.llvm.13030543509303927816+0x98/0x114
>  do_el0_svc_compat+0x20/0x30
>  el0_svc_compat+0x28/0x90
>  el0t_32_sync_handler+0x7c/0xbc
>  el0t_32_sync+0x1b8/0x1bc
> 
> Signed-off-by: wangbiao3 <wangbiao3@xiaomi.com>
> ---
>  kernel/sched/core.c | 31 ++++++++++++++++++++-----------
>  1 file changed, 20 insertions(+), 11 deletions(-)
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index daff72f00385..b013d8b777b4 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2584,29 +2584,38 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
>         __do_set_cpus_allowed(p, new_mask, 0);
>  }
> 
> +static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
> +{
> +       struct cpumask *user_mask = NULL;
> +
> +       swap(p->user_cpus_ptr, user_mask);
> +
> +       return user_mask;
> +}
> +
>  int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
>                       int node)
>  {
> -       if (!src->user_cpus_ptr)
> -               return 0;
> +       unsigned long flags;
> +       struct cpumask *user_mask = NULL;
> 
>         dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
>         if (!dst->user_cpus_ptr)
>                 return -ENOMEM;
> 
> +       /* Use pi_lock to protect content of user_cpus_ptr */
> +       raw_spin_lock_irqsave(&src->pi_lock, flags);
> +       if (!src->user_cpus_ptr) {
> +               user_mask = clear_user_cpus_ptr(dst);
> +               raw_spin_unlock_irqrestore(&src->pi_lock, flags);
> +               kfree(user_mask);
> +               return 0;
> +       }
>         cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
> +       raw_spin_unlock_irqrestore(&src->pi_lock, flags);
>         return 0;
>  }
> 
> -static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
> -{
> -       struct cpumask *user_mask = NULL;
> -
> -       swap(p->user_cpus_ptr, user_mask);
> -
> -       return user_mask;
> -}
> -
>  void release_user_cpus_ptr(struct task_struct *p)
>  {
>         kfree(clear_user_cpus_ptr(p));
> --
> 2.38.1
> 
> #/****** This e-mail and its attachments contain confidential information from XIAOMI, which is intended only for the person or entity whose address is listed above. Any use of the information contained herein in any way (including, but not limited to, total or partial disclosure, reproduction, or dissemination) by persons other than the intended recipient(s) is prohibited. If you receive this e-mail in error, please notify the sender by phone or email immediately and delete it!******/#
  
Peter Zijlstra Nov. 22, 2022, 2:06 p.m. UTC | #3
On Tue, Nov 22, 2022 at 03:05:01PM +0100, Peter Zijlstra wrote:
> 
> So you failed:
> 

> > #/****** This e-mail and its attachments contain confidential information from XIAOMI, which is intended only for the person or entity whose address is listed above. Any use of the information contained herein in any way (including, but not limited to, total or partial disclosure, reproduction, or dissemination) by persons other than the intended recipient(s) is prohibited. If you receive this e-mail in error, please notify the sender by phone or email immediately and delete it!******/#

Also, that ^ is super fail, you're sending this to a public list. Please
tell your (IT) manager it makes your corporation look like an idiot.
  
Waiman Long Nov. 22, 2022, 3:39 p.m. UTC | #4
On 11/22/22 09:05, Peter Zijlstra wrote:
> So you failed:
>
>   - to Cc the original author of this code (Will Deacon)
>   - to report what version this is against (apparently Linus' tree)
>   - to check if this still applies to the latest tree (it doesn't)
>   - to Cc the author of the code it now conflicts with (Waiman)
>   - write something coherent in the changelog.
>   - to include a Fixes tag.
>
> Still, let me try and make sense of things...
>
> On Mon, Nov 21, 2022 at 06:04:20PM +0800, wangbiao3@xiaomi.com wrote:
>> From: wangbiao3 <wangbiao3@xiaomi.com>
>>
>> Clone/Fork a new task,call dup_task_struct->arch_dup_task_struct(tsk,orig)
>> which copy the data of parent/sibling task inclding p->user_cpus_ptr,so
>> the user_cpus_ptr of newtask is the same with orig task's.When
>> dup_task_struct call dup_user_cpus_ptr(tsk, orig, node),it return 0
>> dircetly if src->user_cpus_ptris free by other task,in this case ,
>> the newtask's address of user_cpus_ptr is not changed.
> (even just inserting some whitespace would've made it so much easier to
> read)
>
> But, the only way that would be possible is if
> force_compatible_cpus_allowed_ptr() were to be called on !current, and
> that just doesn't happen, the only callsite is:
>
> arch/arm64/kernel/process.c:                    force_compatible_cpus_allowed_ptr(current);
>
> And you can't be in fork() and exec() at the same time.
>
> If it were possible to call restrict_cpus_allowed_ptr() on a non-current
> task then yes, absolutely, which is why:
>
>    8f9ea86fdf99 ("sched: Always preserve the user requested cpumask")
>
> also wraps the thing in pi_lock, but looking at it now, perhaps it needs
> to do the alloc/copy first and swap under pi_lock instead.

With the latest change, user_cpus_ptr, once set, will not be cleared
until the task dies. That is why I don't recheck whether user_cpus_ptr
is NULL under pi_lock. The user_cpus_ptr value can certainly change
during its lifetime, but it will be stable under pi_lock.
clear_user_cpus_ptr() is called by release_user_cpus_ptr() only. As said
before, it is only called when the task dies at free_task(), so there
shouldn't be any other race conditions that can happen at the same time.

David, can you try the latest tip tree to see if the problem is still 
reproducible ?

Thanks,
Longman
  
Waiman Long Nov. 22, 2022, 6:13 p.m. UTC | #5
On 11/22/22 10:39, Waiman Long wrote:
>
> On 11/22/22 09:05, Peter Zijlstra wrote:
>> So you failed:
>>
>>   - to Cc the original author of this code (Will Deacon)
>>   - to report what version this is against (apparently Linus' tree)
>>   - to check if this still applies to the latest tree (it doesn't)
>>   - to Cc the author of the code it now conflicts with (Waiman)
>>   - write something coherent in the changelog.
>>   - to include a Fixes tag.
>>
>> Still, let me try and make sense of things...
>>
>> On Mon, Nov 21, 2022 at 06:04:20PM +0800, wangbiao3@xiaomi.com wrote:
>>> From: wangbiao3 <wangbiao3@xiaomi.com>
>>>
>>> Clone/Fork a new task,call 
>>> dup_task_struct->arch_dup_task_struct(tsk,orig)
>>> which copy the data of parent/sibling task inclding p->user_cpus_ptr,so
>>> the user_cpus_ptr of newtask is the same with orig task's.When
>>> dup_task_struct call dup_user_cpus_ptr(tsk, orig, node),it return 0
>>> dircetly if src->user_cpus_ptris free by other task,in this case ,
>>> the newtask's address of user_cpus_ptr is not changed.
>> (even just inserting some whitespace would've made it so much easier to
>> read)
>>
>> But, the only way that would be possible is if
>> force_compatible_cpus_allowed_ptr() were to be called on !current, and
>> that just doesn't happen, the only callsite is:
>>
>> arch/arm64/kernel/process.c: force_compatible_cpus_allowed_ptr(current);
>>
>> And you can't be in fork() and exec() at the same time.
>>
>> If it were possible to call restrict_cpus_allowed_ptr() on a non-current
>> task then yes, absolutely, which is why:
>>
>>    8f9ea86fdf99 ("sched: Always preserve the user requested cpumask")
>>
>> also wraps the thing in pi_lock, but looking at it now, perhaps it needs
>> to do the alloc/copy first and swap under pi_lock instead.
>
> With the latest change, user_cpus_ptr, once set, will not be cleared
> until the task dies. That is why I don't recheck whether user_cpus_ptr
> is NULL under pi_lock. The user_cpus_ptr value can certainly change
> during its lifetime, but it will be stable under pi_lock.
> clear_user_cpus_ptr() is called by release_user_cpus_ptr() only. As said
> before, it is only called when the task dies at free_task(), so there
> shouldn't be any other race conditions that can happen at the same time.

On second thought, do_set_cpus_allowed() can put NULL into
user_cpus_ptr. So I think we should do a NULL check in
dup_user_cpus_ptr() inside the pi_lock. Will send a patch to do that.

Cheers,
Longman
  
Waiman Long Nov. 24, 2022, 3:59 a.m. UTC | #6
On 11/23/22 21:37, David Wang 王标 wrote:
>
> Dear Waiman,
>
> Yes, we have read
> https://lore.kernel.org/lkml/20221123131612.914906-1-longman@redhat.com/
> and checked it carefully.
>
> We mean that dup_user_cpus_ptr() should not check whether src->user_cpus_ptr
> is NULL at the entry of the function. If it does, and user_cpus_ptr is freed
> by another thread after the parent task has already copied the user_cpus_ptr
> pointer into the new task via dup_task_struct() ->
> arch_dup_task_struct(tsk, orig), then when dup_task_struct() later calls
> dup_user_cpus_ptr(tsk, orig, node) it returns directly without doing
> anything. When the new task is woken up and select_fallback_rq() ->
> do_set_cpus_allowed() is called, it hits the slub double free. So the new
> patch cannot fix the issue in this scenario.
> void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
> {
> 	struct affinity_context ac = {
> 		.new_mask  = new_mask,
> 		.user_mask = NULL,
> 		.flags     = SCA_USER,	/* clear the user requested mask */
> 	};
> 	__do_set_cpus_allowed(p, &ac);
> 	kfree(ac.user_mask);
> }
> The kfree(ac.user_mask) here causes the double free issue. The new patch
> only covers the case where user_cpus_ptr is freed after the code has
> already entered raw_spin_lock_irqsave(); what happens if it never reaches
> the pi_lock critical section at all?
>
> Maybe the following code at the entry of the function should be deleted.
> Please help check it.
>
> -       if (!src->user_cpus_ptr)     //delete this
> -              return 0;             //delete this
>
> We think maybe the patch needs a little more modification, like the following:
>
> kernel/sched/core.c | 23 +++++++++++++++++++++--
> 1 file changed, 21 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 8df51b08bb38..6b259d9e127a 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2624,8 +2624,14 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
>  int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
>  		      int node)
>  {
> +	cpumask_t *user_mask = NULL;
>  	unsigned long flags;
> +	/*
> +	 * This check is racy and losing the race is a valid situation.
> +	 * It is not worth the extra overhead of taking the pi_lock on
> +	 * every fork/clone.
> +	 */
> -	if (!src->user_cpus_ptr)	//delete this
> -		return 0;		//delete this
>
The clearing of user_cpus_ptr is protected by pi_lock. IOW, racing
between dup_user_cpus_ptr() and do_set_cpus_allowed is not possible and
double free like what you have suggested should not happen. Yes, the
user_cpus_ptr check here is racy. The worst-case result is that a
user_cpus_ptr has just been set in the task to be cloned, but it fails to
copy over the user mask. With or without the check, the race can happen.
The check is an optimization; its effect is just to make one outcome more
likely than the other.

Cheers,
Longman
  
Wenjie Li (Evan) Nov. 24, 2022, 12:04 p.m. UTC | #7
Hi, Waiman.

"The clearing of user_cpus_ptr is protected by pi_lock. IOW, racing between dup_user_cpus_ptr() and do_set_cpus_allowed is not possible and double free like what you have suggested should not happen." We still can't understand why it is impossible to happen, because we did hit this issue. The following is what we got from ftrace.

1. Task A (pid 27961) runs on core6 and is forking/cloning task pid 28051; task B (pid 28051) copies its task struct data from task A (pid 27961). So task A's p->user_cpus_ptr = ffffff884fbf9200 equals task B's p->user_cpus_ptr = ffffff884fbf9200 via arch_dup_task_struct.

2. Core6 then goes through hotplug, and migration/6 (pid 61) preempts task A (pid 27961):
  	Trace log: rso-inner-27961 [006] 56114.972822: sched_switch:         rso-inner:27961 [139] R ==> migration/6:61 [0]

3. migration/6 (pid 61) then migrates task A (pid 27961) from core6 to core0 and clears task A's p->user_cpus_ptr via migrate_tasks -> select_fallback_rq -> do_set_cpus_allowed, which frees user_cpus_ptr and leaves task A (pid 27961) with user_cpus_ptr = NULL.
              Trace log:  migration/6-61    [006] 56114.972937: bprint:               do_set_cpus_allowed: do_set_cpus_allowed: p->comm:rso-inner pid:27961, maskp:0 ac.user_mask:ffffff884fbf9200

4. Task A (pid 27961) is then enqueued on core0 and runs there.
	Trace log: binder:27524_5-27775 [000] 56114.973592: sched_switch:         binder:27524_5:27775 [120] S ==> rso-inner:27961 [139]

5. Task A (pid 27961) then calls dup_user_cpus_ptr(), which finds src->user_cpus_ptr is NULL, so it returns directly without allocating task B (pid 28051)'s user_cpus_ptr. Task B's user_cpus_ptr is therefore still ffffff884fbf9200.

6. Task A (pid 27961) then wakes up task B (pid 28051) via kernel_clone -> wake_up_new_task -> select_fallback_rq -> do_set_cpus_allowed, which double-frees ffffff884fbf9200:
                Trace log:    rso-inner-27961 [000] 56114.973966: bprint:               do_set_cpus_allowed: do_set_cpus_allowed: p->comm:rso-inner pid:28051, maskp:0 ac.user_mask:ffffff884fbf9200
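
Condensed into an interleaving view (reconstructed from the steps above for
readability, not taken from the trace itself):

  core6                                   core0
  -----                                   -----
  task A forks B:
    arch_dup_task_struct() copies
    B->user_cpus_ptr = ffffff884fbf9200
  hotplug: migration/6 migrates A,
    do_set_cpus_allowed(A)
      kfree(ffffff884fbf9200)
      A->user_cpus_ptr = NULL
                                          A resumes on core0:
                                            dup_user_cpus_ptr(B, A)
                                              src is NULL -> return 0
                                              (B keeps the stale pointer)
                                            wake_up_new_task(B)
                                              do_set_cpus_allowed(B)
                                                kfree(ffffff884fbf9200)  <-- double free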


Thanks

  
Waiman Long Nov. 25, 2022, 2:08 a.m. UTC | #8
On 11/24/22 07:04, Wenjie Li (Evan) wrote:
> Hi, Waiman.
>
> "The clearing of user_cpus_ptr is protected by pi_lock. IOW, racing between dup_user_cpus_ptr() and do_set_cpus_allowed is not possible and double free like what you have suggested should not happen." We still can't understand why it is impossible to happen, because we did hit this issue. The following is what we got from ftrace.
>
> 1. Task A (pid 27961) runs on core6 and is forking/cloning task pid 28051; task B (pid 28051) copies its task struct data from task A (pid 27961). So task A's p->user_cpus_ptr = ffffff884fbf9200 equals task B's p->user_cpus_ptr = ffffff884fbf9200 via arch_dup_task_struct.

You are right. I forgot the fact that the value of dst->user_cpus_ptr is
a copy of src's. I have posted a v3 patch to address that. Thanks for
spotting that.

Cheers,
Longman
  

Patch

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index daff72f00385..b013d8b777b4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2584,29 +2584,38 @@  void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
        __do_set_cpus_allowed(p, new_mask, 0);
 }

+static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
+{
+       struct cpumask *user_mask = NULL;
+
+       swap(p->user_cpus_ptr, user_mask);
+
+       return user_mask;
+}
+
 int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
                      int node)
 {
-       if (!src->user_cpus_ptr)
-               return 0;
+       unsigned long flags;
+       struct cpumask *user_mask = NULL;

        dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
        if (!dst->user_cpus_ptr)
                return -ENOMEM;

+       /* Use pi_lock to protect content of user_cpus_ptr */
+       raw_spin_lock_irqsave(&src->pi_lock, flags);
+       if (!src->user_cpus_ptr) {
+               user_mask = clear_user_cpus_ptr(dst);
+               raw_spin_unlock_irqrestore(&src->pi_lock, flags);
+               kfree(user_mask);
+               return 0;
+       }
        cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+       raw_spin_unlock_irqrestore(&src->pi_lock, flags);
        return 0;
 }

-static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
-{
-       struct cpumask *user_mask = NULL;
-
-       swap(p->user_cpus_ptr, user_mask);
-
-       return user_mask;
-}
-
 void release_user_cpus_ptr(struct task_struct *p)
 {
        kfree(clear_user_cpus_ptr(p));