[v5,4/5] workqueue: Convert the idle_timer to a timer + work_struct

Message ID 20221122192937.2386494-5-vschneid@redhat.com
State New
Series workqueue: destroy_worker() vs isolated CPUs

Commit Message

Valentin Schneider Nov. 22, 2022, 7:29 p.m. UTC
A later patch will require a sleepable context in the idle worker timeout
function. Converting worker_pool.idle_timer to a delayed_work would give us
just that; however, it would also turn every idle_timer expiry into a
scheduler event (waking up a worker to handle the dwork).

Instead, implement a "custom dwork" where the timer callback does some
extra checks before queuing the associated work.

No change in functionality intended.

The new worker_pool.idle_cull_list is made ____cacheline_aligned to prevent
it from spanning two cachelines.

Signed-off-by: Valentin Schneider <vschneid@redhat.com>
---
 kernel/workqueue.c | 68 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 64 insertions(+), 4 deletions(-)
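
For readers skimming the patch below, the "custom dwork" shape reduces to the
following generic sketch. All names here (my_obj, my_timeout(), my_work_fn(),
work_is_ripe(), do_sleepable_cleanup(), MY_TIMEOUT) are illustrative, not
from the patch:

    struct my_obj {
            struct timer_list       timer;
            struct work_struct      work;
    };

    static void my_timeout(struct timer_list *t)
    {
            struct my_obj *obj = from_timer(obj, t, timer);

            /* Timer context: cheap, non-sleeping checks only */
            if (!work_is_ripe(obj)) {
                    mod_timer(&obj->timer, jiffies + MY_TIMEOUT);
                    return;
            }
            /* Only a ripe expiry becomes a scheduler event */
            queue_work(system_unbound_wq, &obj->work);
    }

    static void my_work_fn(struct work_struct *work)
    {
            struct my_obj *obj = container_of(work, struct my_obj, work);

            /* Process context: sleeping is fine here */
            do_sleepable_cleanup(obj);
    }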
  

Comments

Tejun Heo Nov. 22, 2022, 8:23 p.m. UTC | #1
Hello,

On Tue, Nov 22, 2022 at 07:29:36PM +0000, Valentin Schneider wrote:
> @@ -2039,12 +2060,48 @@ static void idle_worker_timeout(struct timer_list *t)
>  		worker = list_entry(pool->idle_list.prev, struct worker, entry);
>  		expires = worker->last_active + IDLE_WORKER_TIMEOUT;
>  
> +		/* All remaining entries will be younger than this */
>  		if (time_before(jiffies, expires)) {
> -			mod_timer(&pool->idle_timer, expires);
> +			if (!cull_cnt)
> +				mod_timer(&pool->idle_timer, expires);
>  			break;
>  		}
>  
> +		/*
> +		 * Mark the idle worker ripe for culling.
> +		 * If a preempted idle worker gets to run before the idle cull
> +		 * handles it, it will just pop itself out of that list and
> +		 * continue as normal.
> +		 */
> +		list_move(&worker->entry, &pool->idle_cull_list);
> +	}
> +	raw_spin_unlock_irq(&pool->lock);
> +
> +	if (cull_cnt)
> +		queue_work(system_unbound_wq, &pool->idle_cull_work);
> +}

So, you mentioned this explicitly in the cover letter but I think I'd prefer
if the timer were simpler and all logic were in the work item. It just needs
to peek at the first worker and compare the expiration once, right? If that
bothers you, we can make workers keep track of the oldest idle's timestamp
in, say, wq->first_idle_at as the workers go idle and busy and then the
timer can simply look at the value and decide to schedule the work item or
not.

Thanks.
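
For illustration, here is a minimal sketch of the first_idle_at bookkeeping
suggested above. The field and helper names are hypothetical, and locking is
elided (the real thing would update the field under pool->lock, where
idle_list is already manipulated):

    /* Hypothetical: cache the oldest idle worker's last_active stamp. */
    static void pool_update_first_idle_at(struct worker_pool *pool)
    {
            struct worker *oldest;

            if (list_empty(&pool->idle_list)) {
                    pool->first_idle_at = 0;
                    return;
            }
            /* idle_list is LIFO, so the oldest idle worker sits at the tail */
            oldest = list_entry(pool->idle_list.prev, struct worker, entry);
            pool->first_idle_at = oldest->last_active;
    }

    static void idle_worker_timeout(struct timer_list *t)
    {
            struct worker_pool *pool = from_timer(pool, t, idle_timer);
            unsigned long expires = pool->first_idle_at + IDLE_WORKER_TIMEOUT;

            if (!pool->first_idle_at)
                    return; /* no idle workers; worker_enter_idle() re-arms */

            if (time_before(jiffies, expires))
                    mod_timer(&pool->idle_timer, expires);
            else
                    /* One comparison, then defer the sleepable cull */
                    queue_work(system_unbound_wq, &pool->idle_cull_work);
    }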
  
Valentin Schneider Nov. 28, 2022, 11:24 a.m. UTC | #2
On 22/11/22 10:23, Tejun Heo wrote:
> Hello,
>

Thanks for having a look at this so quickly!

> On Tue, Nov 22, 2022 at 07:29:36PM +0000, Valentin Schneider wrote:
>> @@ -2039,12 +2060,48 @@ static void idle_worker_timeout(struct timer_list *t)
>>              worker = list_entry(pool->idle_list.prev, struct worker, entry);
>>              expires = worker->last_active + IDLE_WORKER_TIMEOUT;
>>
>> +		/* All remaining entries will be younger than this */
>>              if (time_before(jiffies, expires)) {
>> -			mod_timer(&pool->idle_timer, expires);
>> +			if (!cull_cnt)
>> +				mod_timer(&pool->idle_timer, expires);
>>                      break;
>>              }
>>
>> +		/*
>> +		 * Mark the idle worker ripe for culling.
>> +		 * If a preempted idle worker gets to run before the idle cull
>> +		 * handles it, it will just pop itself out of that list and
>> +		 * continue as normal.
>> +		 */
>> +		list_move(&worker->entry, &pool->idle_cull_list);
>> +	}
>> +	raw_spin_unlock_irq(&pool->lock);
>> +
>> +	if (cull_cnt)
>> +		queue_work(system_unbound_wq, &pool->idle_cull_work);
>> +}
>
> So, you mentioned this explicitly in the cover letter but I think I'd prefer
> if the timer were simpler and all logic were in the work item. It just needs
> to peek at the first worker and compare the expiration once, right?

Yep, exactly. I wasn't fond of repeating the expiration-check pattern, and it
also meant the culling work item could end up seeing different state than the
timer did.

Moving everything into the work item does, however, mean we can get rid of
the new worker_pool.idle_cull_list, which is my least favourite bit of this
approach, so I'll give that a try.

> If that
> bothers you, we can make workers keep track of the oldest idle's timestamp
> in, say, wq->first_idle_at as the workers go idle and busy and then the
> timer can simply look at the value and decide to schedule the work item or
> not.
>

I don't think the overhead at worker_{enter,leave}_idle() would be worth it.


> Thanks.
>
> --
> tejun
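
For reference, a rough sketch of the direction agreed on above: the timer only
peeks at the oldest idle worker, and the scan-and-destroy moves wholesale into
the work item, dropping idle_cull_list. This is an illustration of the idea,
not the revision that was eventually posted:

    static void idle_worker_timeout(struct timer_list *t)
    {
            struct worker_pool *pool = from_timer(pool, t, idle_timer);
            bool do_cull = false;

            if (work_pending(&pool->idle_cull_work))
                    return;

            raw_spin_lock_irq(&pool->lock);
            if (worker_cull_count(pool)) {
                    struct worker *worker;
                    unsigned long expires;

                    /* idle_list is LIFO; the oldest idle worker is last */
                    worker = list_entry(pool->idle_list.prev, struct worker, entry);
                    expires = worker->last_active + IDLE_WORKER_TIMEOUT;
                    do_cull = !time_before(jiffies, expires);

                    if (!do_cull)
                            mod_timer(&pool->idle_timer, expires);
            }
            raw_spin_unlock_irq(&pool->lock);

            if (do_cull)
                    queue_work(system_unbound_wq, &pool->idle_cull_work);
    }

    static void idle_cull_fn(struct work_struct *work)
    {
            struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work);

            raw_spin_lock_irq(&pool->lock);
            /* Redo the expiry checks here; state may have moved on */
            while (worker_cull_count(pool)) {
                    struct worker *worker;
                    unsigned long expires;

                    worker = list_entry(pool->idle_list.prev, struct worker, entry);
                    expires = worker->last_active + IDLE_WORKER_TIMEOUT;
                    if (time_before(jiffies, expires)) {
                            mod_timer(&pool->idle_timer, expires);
                            break;
                    }
                    destroy_worker(worker);
            }
            raw_spin_unlock_irq(&pool->lock);
    }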
  

Patch

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4fc8085f3fe17..b744288c58a4b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -169,7 +169,12 @@ struct worker_pool {
 
 	struct list_head	idle_list;	/* L: list of idle workers */
 	struct timer_list	idle_timer;	/* L: worker idle timeout */
-	struct timer_list	mayday_timer;	/* L: SOS timer for workers */
+
+	struct list_head	idle_cull_list  /* L: list of idle workers to cull */
+	____cacheline_aligned;
+	struct work_struct      idle_cull_work; /* L: worker idle cleanup */
+
+	struct timer_list	mayday_timer;	  /* L: SOS timer for workers */
 
 	/* a workers is either on busy_hash or idle_list, or the manager */
 	DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
@@ -1812,7 +1817,9 @@ static void worker_enter_idle(struct worker *worker)
 	/* idle_list is LIFO */
 	list_add(&worker->entry, &pool->idle_list);
 
-	if (worker_cull_count(pool) && !timer_pending(&pool->idle_timer))
+	if (worker_cull_count(pool) &&
+	    !timer_pending(&pool->idle_timer) &&
+	    !work_pending(&pool->idle_cull_work))
 		mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
 
 	/* Sanity check nr_running. */
@@ -2025,13 +2032,27 @@ static void destroy_worker(struct worker *worker)
 	wake_up_process(worker->task);
 }
 
+/*
+ * idle_worker_timeout - check if some idle workers can now be deleted.
+ *
+ * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in
+ * worker_leave_idle(), as a worker flicking between idle and active while its
+ * pool is at the worker_cull_count() tipping point would cause too much timer
+ * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let
+ * it expire and re-evaluate things from there.
+ */
 static void idle_worker_timeout(struct timer_list *t)
 {
 	struct worker_pool *pool = from_timer(pool, t, idle_timer);
+	unsigned int max_cull_cnt, cull_cnt;
+
+	if (work_pending(&pool->idle_cull_work))
+		return;
 
 	raw_spin_lock_irq(&pool->lock);
 
-	while (worker_cull_count(pool)) {
+	max_cull_cnt = worker_cull_count(pool);
+	for (cull_cnt = 0; cull_cnt < max_cull_cnt; cull_cnt++) {
 		struct worker *worker;
 		unsigned long expires;
 
@@ -2039,12 +2060,48 @@ static void idle_worker_timeout(struct timer_list *t)
 		worker = list_entry(pool->idle_list.prev, struct worker, entry);
 		expires = worker->last_active + IDLE_WORKER_TIMEOUT;
 
+		/* All remaining entries will be younger than this */
 		if (time_before(jiffies, expires)) {
-			mod_timer(&pool->idle_timer, expires);
+			if (!cull_cnt)
+				mod_timer(&pool->idle_timer, expires);
 			break;
 		}
 
+		/*
+		 * Mark the idle worker ripe for culling.
+		 * If a preempted idle worker gets to run before the idle cull
+		 * handles it, it will just pop itself out of that list and
+		 * continue as normal.
+		 */
+		list_move(&worker->entry, &pool->idle_cull_list);
+	}
+	raw_spin_unlock_irq(&pool->lock);
+
+	if (cull_cnt)
+		queue_work(system_unbound_wq, &pool->idle_cull_work);
+}
+
+/*
+ * idle_cull_fn - cull workers that have been idle for too long.
+ */
+static void idle_cull_fn(struct work_struct *work)
+{
+	struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work);
+	struct worker *worker, *tmp;
+
+	raw_spin_lock_irq(&pool->lock);
+
+	list_for_each_entry_safe(worker, tmp, &pool->idle_cull_list, entry)
 		destroy_worker(worker);
+
+	/* Re-arm the idle timer if necessary */
+	if (pool->nr_idle) {
+		unsigned long expires;
+
+		worker = list_entry(pool->idle_list.prev, struct worker, entry);
+		expires = worker->last_active + IDLE_WORKER_TIMEOUT;
+		if (time_before(jiffies, expires))
+			mod_timer(&pool->idle_timer, expires);
 	}
 
 	raw_spin_unlock_irq(&pool->lock);
@@ -3482,9 +3539,11 @@ static int init_worker_pool(struct worker_pool *pool)
 	pool->watchdog_ts = jiffies;
 	INIT_LIST_HEAD(&pool->worklist);
 	INIT_LIST_HEAD(&pool->idle_list);
+	INIT_LIST_HEAD(&pool->idle_cull_list);
 	hash_init(pool->busy_hash);
 
 	timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE);
+	INIT_WORK(&pool->idle_cull_work, idle_cull_fn);
 
 	timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
 
@@ -3632,6 +3691,7 @@ static void put_unbound_pool(struct worker_pool *pool)
 
 	/* shut down the timers */
 	del_timer_sync(&pool->idle_timer);
+	cancel_work_sync(&pool->idle_cull_work);
 	del_timer_sync(&pool->mayday_timer);
 
 	/* RCU protected to allow dereferences from get_work_pool() */
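
A brief note on the teardown ordering in the last hunk: within this patch, the
idle timer is the only thing that queues idle_cull_work, so the shutdown path
disarms the timer before flushing the work. Restated with explanatory comments
(same calls as in the patch):

    /* shut down the timers */
    del_timer_sync(&pool->idle_timer);       /* stop the timer queueing the cull work */
    cancel_work_sync(&pool->idle_cull_work); /* wait for any in-flight cull to finish */
    del_timer_sync(&pool->mayday_timer);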