workqueue: Make sure that wq_unbound_cpumask is never empty

Message ID ZV0jmGSismObVncD@slm.duckdns.org
State New
Headers
Series workqueue: Make sure that wq_unbound_cpumask is never empty |

Commit Message

Tejun Heo Nov. 21, 2023, 9:39 p.m. UTC
  During boot, depending on how the housekeeping and workqueue.unbound_cpus
masks are set, wq_unbound_cpumask can end up empty. Since 8639ecebc9b1
("workqueue: Implement non-strict affinity scope for unbound workqueues"),
this may end up feeding -1 as a CPU number into scheduler leading to oopses.

  BUG: unable to handle page fault for address: ffffffff8305e9c0
  #PF: supervisor read access in kernel mode
  #PF: error_code(0x0000) - not-present page
  ...
  Call Trace:
   <TASK>
   select_idle_sibling+0x79/0xaf0
   select_task_rq_fair+0x1cb/0x7b0
   try_to_wake_up+0x29c/0x5c0
   wake_up_process+0x19/0x20
   kick_pool+0x5e/0xb0
   __queue_work+0x119/0x430
   queue_work_on+0x29/0x30
  ...

An empty wq_unbound_cpumask is a clear misconfiguration and already
disallowed once system is booted up. Let's warn on and ignore
unbound_cpumask restrictions which lead to no unbound cpus. While at it,
also remove now unncessary empty check on wq_unbound_cpumask in
wq_select_unbound_cpu().

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Yong He <alexyonghe@tencent.com>
Link: http://lkml.kernel.org/r/20231120121623.119780-1-alexyonghe@tencent.com
Fixes: 8639ecebc9b1 ("workqueue: Implement non-strict affinity scope for unbound workqueues")
Cc: stable@vger.kernel.org # v6.6+
---
Hello,

Yong He, zhuangel570, can you please verify that this patch makes the oops
go away? Waiman, this touches code that you've recently worked on. AFAICS,
they shouldn't interact or cause conflicts. cc'ing just in case.

Thanks.

 kernel/workqueue.c |   22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)
  

Comments

zhuangel570 Nov. 22, 2023, 3:36 a.m. UTC | #1
On Wed, Nov 22, 2023 at 5:39 AM Tejun Heo <tj@kernel.org> wrote:
>
> During boot, depending on how the housekeeping and workqueue.unbound_cpus
> masks are set, wq_unbound_cpumask can end up empty. Since 8639ecebc9b1
> ("workqueue: Implement non-strict affinity scope for unbound workqueues"),
> this may end up feeding -1 as a CPU number into scheduler leading to oopses.
>
>   BUG: unable to handle page fault for address: ffffffff8305e9c0
>   #PF: supervisor read access in kernel mode
>   #PF: error_code(0x0000) - not-present page
>   ...
>   Call Trace:
>    <TASK>
>    select_idle_sibling+0x79/0xaf0
>    select_task_rq_fair+0x1cb/0x7b0
>    try_to_wake_up+0x29c/0x5c0
>    wake_up_process+0x19/0x20
>    kick_pool+0x5e/0xb0
>    __queue_work+0x119/0x430
>    queue_work_on+0x29/0x30
>   ...
>
> An empty wq_unbound_cpumask is a clear misconfiguration and already
> disallowed once system is booted up. Let's warn on and ignore
> unbound_cpumask restrictions which lead to no unbound cpus. While at it,
> also remove now unncessary empty check on wq_unbound_cpumask in
> wq_select_unbound_cpu().
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Reported-by: Yong He <alexyonghe@tencent.com>
> Link: http://lkml.kernel.org/r/20231120121623.119780-1-alexyonghe@tencent.com
> Fixes: 8639ecebc9b1 ("workqueue: Implement non-strict affinity scope for unbound workqueues")
> Cc: stable@vger.kernel.org # v6.6+
> ---
> Hello,
>
> Yong He, zhuangel570, can you please verify that this patch makes the oops
> go away? Waiman, this touches code that you've recently worked on. AFAICS,
> they shouldn't interact or cause conflicts. cc'ing just in case.

Sure.
I port this patch to my 6.7 branch, and the kernel could boot successfully on BM
and VM, with the same configurations, also I can see the new added warning, so
this patch solves the oops.

So, one last check, do you think we still need to check return value from
cpumask_any_distribute() to make sure kick_pool() set a correct wake_cpu?

Tested-by: Yong He <alexyonghe@tencent.com>

>
> Thanks.
>
>  kernel/workqueue.c |   22 +++++++++++++++-------
>  1 file changed, 15 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index 6e578f576a6f..0295291d54bc 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -1684,9 +1684,6 @@ static int wq_select_unbound_cpu(int cpu)
>                 pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n");
>         }
>
> -       if (cpumask_empty(wq_unbound_cpumask))
> -               return cpu;
> -
>         new_cpu = __this_cpu_read(wq_rr_cpu_last);
>         new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
>         if (unlikely(new_cpu >= nr_cpu_ids)) {
> @@ -6515,6 +6512,17 @@ static inline void wq_watchdog_init(void) { }
>
>  #endif /* CONFIG_WQ_WATCHDOG */
>
> +static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask)
> +{
> +       if (!cpumask_intersects(wq_unbound_cpumask, mask)) {
> +               pr_warn("workqueue: Restricting unbound_cpumask (%*pb) with %s (%*pb) leaves no CPU, ignoring\n",
> +                       cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask));
> +               return;
> +       }
> +
> +       cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask);
> +}
> +
>  /**
>   * workqueue_init_early - early init for workqueue subsystem
>   *
> @@ -6534,11 +6542,11 @@ void __init workqueue_init_early(void)
>         BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
>
>         BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
> -       cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ));
> -       cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN));
> -
> +       cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
> +       restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ));
> +       restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN));
>         if (!cpumask_empty(&wq_cmdline_cpumask))
> -               cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask);
> +               restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask);
>
>         pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
>
  
Tejun Heo Nov. 22, 2023, 4:03 p.m. UTC | #2
Hello,

On Tue, Nov 21, 2023 at 05:08:29PM -0500, Waiman Long wrote:
> On 11/21/23 16:39, Tejun Heo wrote:
> > During boot, depending on how the housekeeping and workqueue.unbound_cpus
> > masks are set, wq_unbound_cpumask can end up empty. Since 8639ecebc9b1
> > ("workqueue: Implement non-strict affinity scope for unbound workqueues"),
> > this may end up feeding -1 as a CPU number into scheduler leading to oopses.
> > 
> >    BUG: unable to handle page fault for address: ffffffff8305e9c0
> >    #PF: supervisor read access in kernel mode
> >    #PF: error_code(0x0000) - not-present page
> >    ...
> >    Call Trace:
> >     <TASK>
> >     select_idle_sibling+0x79/0xaf0
> >     select_task_rq_fair+0x1cb/0x7b0
> >     try_to_wake_up+0x29c/0x5c0
> >     wake_up_process+0x19/0x20
> >     kick_pool+0x5e/0xb0
> >     __queue_work+0x119/0x430
> >     queue_work_on+0x29/0x30
> >    ...
> > 
> > An empty wq_unbound_cpumask is a clear misconfiguration and already
> > disallowed once system is booted up. Let's warn on and ignore
> > unbound_cpumask restrictions which lead to no unbound cpus. While at it,
> > also remove now unncessary empty check on wq_unbound_cpumask in
> > wq_select_unbound_cpu().
> > 
> > Signed-off-by: Tejun Heo<tj@kernel.org>
> > Reported-by: Yong He<alexyonghe@tencent.com>
> > Link:http://lkml.kernel.org/r/20231120121623.119780-1-alexyonghe@tencent.com
> > Fixes: 8639ecebc9b1 ("workqueue: Implement non-strict affinity scope for unbound workqueues")
> > Cc:stable@vger.kernel.org  # v6.6+
> > ---
> > Hello,
> > 
> > Yong He, zhuangel570, can you please verify that this patch makes the oops
> > go away? Waiman, this touches code that you've recently worked on. AFAICS,
> > they shouldn't interact or cause conflicts. cc'ing just in case.
> 
> It does conflict with commit fe28f631fa94 ("workqueue: Add
> workqueue_unbound_exclude_cpumask() to exclude CPUs from
> wq_unbound_cpumask") as it has the following hunk:
> 
> @@ -6534,11 +6606,14 @@ void __init workqueue_init_early(void)
>         BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long
> long));
> 
>         BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
> + BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL));
> +       BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL));
>         cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ));
>         cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask,
> housekeeping_cpumask(HK_TYPE_DOMAIN));
> 
>         if (!cpumask_empty(&wq_cmdline_cpumask))
>                 cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask,
> &wq_cmdline_cpumask);
> +       cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask);
> 
>         pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
...
> Is it possible to route this patch to cgroup for 6.8 to avoid conflict?
> Other than that, the patch looks good to me.

It's a workqueue fix patch, so what I'm gonna do is land this in
wq/for-6.6-fixes and just resolve it in cgroup/for-next.

Thanks.
  
Tejun Heo Nov. 22, 2023, 4:03 p.m. UTC | #3
On Tue, Nov 21, 2023 at 11:39:36AM -1000, Tejun Heo wrote:
> During boot, depending on how the housekeeping and workqueue.unbound_cpus
> masks are set, wq_unbound_cpumask can end up empty. Since 8639ecebc9b1
> ("workqueue: Implement non-strict affinity scope for unbound workqueues"),
> this may end up feeding -1 as a CPU number into scheduler leading to oopses.
> 
>   BUG: unable to handle page fault for address: ffffffff8305e9c0
>   #PF: supervisor read access in kernel mode
>   #PF: error_code(0x0000) - not-present page
>   ...
>   Call Trace:
>    <TASK>
>    select_idle_sibling+0x79/0xaf0
>    select_task_rq_fair+0x1cb/0x7b0
>    try_to_wake_up+0x29c/0x5c0
>    wake_up_process+0x19/0x20
>    kick_pool+0x5e/0xb0
>    __queue_work+0x119/0x430
>    queue_work_on+0x29/0x30
>   ...
> 
> An empty wq_unbound_cpumask is a clear misconfiguration and already
> disallowed once system is booted up. Let's warn on and ignore
> unbound_cpumask restrictions which lead to no unbound cpus. While at it,
> also remove now unncessary empty check on wq_unbound_cpumask in
> wq_select_unbound_cpu().
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Reported-by: Yong He <alexyonghe@tencent.com>
> Link: http://lkml.kernel.org/r/20231120121623.119780-1-alexyonghe@tencent.com
> Fixes: 8639ecebc9b1 ("workqueue: Implement non-strict affinity scope for unbound workqueues")
> Cc: stable@vger.kernel.org # v6.6+

Applied to wq/for-6.7-fixes.

Thanks.
  
Tejun Heo Nov. 22, 2023, 4:23 p.m. UTC | #4
On Wed, Nov 22, 2023 at 06:03:10AM -1000, Tejun Heo wrote:
> It's a workqueue fix patch, so what I'm gonna do is land this in
> wq/for-6.6-fixes and just resolve it in cgroup/for-next.
           ^
	   7

So, I applied the fix to wq/for-6.7-fixes and merged it into cgroup/for-6.8
for conflict resolution.

Thanks.
  
Waiman Long Nov. 22, 2023, 7:10 p.m. UTC | #5
On 11/22/23 11:23, Tejun Heo wrote:
> On Wed, Nov 22, 2023 at 06:03:10AM -1000, Tejun Heo wrote:
>> It's a workqueue fix patch, so what I'm gonna do is land this in
>> wq/for-6.6-fixes and just resolve it in cgroup/for-next.
>             ^
> 	   7
>
> So, I applied the fix to wq/for-6.7-fixes and merged it into cgroup/for-6.8
> for conflict resolution.

That will work too. Thanks for taking care of that.

Cheers,
Longman
  

Patch

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6e578f576a6f..0295291d54bc 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1684,9 +1684,6 @@  static int wq_select_unbound_cpu(int cpu)
 		pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n");
 	}
 
-	if (cpumask_empty(wq_unbound_cpumask))
-		return cpu;
-
 	new_cpu = __this_cpu_read(wq_rr_cpu_last);
 	new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
 	if (unlikely(new_cpu >= nr_cpu_ids)) {
@@ -6515,6 +6512,17 @@  static inline void wq_watchdog_init(void) { }
 
 #endif	/* CONFIG_WQ_WATCHDOG */
 
+static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask)
+{
+	if (!cpumask_intersects(wq_unbound_cpumask, mask)) {
+		pr_warn("workqueue: Restricting unbound_cpumask (%*pb) with %s (%*pb) leaves no CPU, ignoring\n",
+			cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask));
+		return;
+	}
+
+	cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask);
+}
+
 /**
  * workqueue_init_early - early init for workqueue subsystem
  *
@@ -6534,11 +6542,11 @@  void __init workqueue_init_early(void)
 	BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
 
 	BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
-	cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ));
-	cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN));
-
+	cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
+	restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ));
+	restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN));
 	if (!cpumask_empty(&wq_cmdline_cpumask))
-		cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask);
+		restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask);
 
 	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);