[v3,06/17] timer: Keep the pinned timers separate from the others
Commit Message
Separate the storage space for pinned timers.

This is preparatory work for changing the NOHZ timer placement from a
push-at-enqueue-time model to a pull-at-expiry-time model.
No functional change.
Originally-by: Richard Cochran (linutronix GmbH) <richardcochran@gmail.com>
Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
---
kernel/time/timer.c | 109 ++++++++++++++++++++++++++++++++------------
1 file changed, 81 insertions(+), 28 deletions(-)
Comments
On Tue, Oct 25, 2022 at 03:58:39PM +0200, Anna-Maria Behnsen wrote:
> @@ -1711,38 +1724,69 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
>  	if (cpu_is_offline(smp_processor_id()))
>  		return expires;
> 
> -	raw_spin_lock(&base->lock);
> +	base_local = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
> +	base_global = this_cpu_ptr(&timer_bases[BASE_GLOBAL]);
> 
> -	nextevt = next_timer_interrupt(base);
> +	raw_spin_lock(&base_local->lock);
> +	raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING);
> +
> +	nextevt_local = next_timer_interrupt(base_local);
> +	nextevt_global = next_timer_interrupt(base_global);
> 
>  	/*
>  	 * We have a fresh next event. Check whether we can forward the
>  	 * base. We can only do that when @basej is past base->clk
>  	 * otherwise we might rewind base->clk.
>  	 */
> -	if (time_after(basej, base->clk)) {
> -		if (time_after(nextevt, basej))
> -			base->clk = basej;
> -		else if (time_after(nextevt, base->clk))
> -			base->clk = nextevt;
> +	if (time_after(basej, base_local->clk)) {
> +		if (time_after(nextevt_local, basej))
> +			base_local->clk = basej;
> +		else if (time_after(nextevt_local, base_local->clk))
> +			base_local->clk = nextevt_local;
> +	}
> +
> +	if (time_after(basej, base_global->clk)) {
> +		if (time_after(nextevt_global, basej))
> +			base_global->clk = basej;
> +		else if (time_after(nextevt_global, base_global->clk))
> +			base_global->clk = nextevt_global;
Perhaps make a helper for the two above?
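Something along these lines perhaps (untested sketch, helper name made up):

	static void forward_base_clk(struct timer_base *base,
				     unsigned long nextevt, unsigned long basej)
	{
		/* Forward base->clk, never rewind it */
		if (time_after(basej, base->clk)) {
			if (time_after(nextevt, basej))
				base->clk = basej;
			else if (time_after(nextevt, base->clk))
				base->clk = nextevt;
		}
	}

so the two blocks above become forward_base_clk(base_local, nextevt_local, basej)
and forward_base_clk(base_global, nextevt_global, basej).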
>  	}
> 
> @@ -1763,6 +1807,9 @@ void timer_clear_idle(void)
>  	 * the lock in the exit from idle path.
>  	 */
>  	base->is_idle = false;
> +
> +	base = this_cpu_ptr(&timer_bases[BASE_GLOBAL]);
> +	base->is_idle = false;
Maybe just:

	__this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false);
	__this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false);
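That would also spare materializing the per-CPU base pointers just to clear
the two flags.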
>  }
>  #endif
> 
> @@ -1820,17 +1869,21 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
>   */
>  static void run_local_timers(void)
>  {
> -	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
> +	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
> 
>  	hrtimer_run_queues();
>  	/* Raise the softirq only if required. */
>  	if (time_before(jiffies, base->next_expiry)) {
>  		if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
>  			return;
> -		/* CPU is awake, so check the deferrable base. */
> +		/* CPU is awake, so check for the global base. */
>  		base++;
> -		if (time_before(jiffies, base->next_expiry))
> -			return;
> +		if (time_before(jiffies, base->next_expiry)) {
> +			/* CPU is awake, so check the deferrable base. */
> +			base++;
> +			if (time_before(jiffies, base->next_expiry))
> +				return;
> +		}
Could be:
	int i;

	for (i = 0; i < NR_BASES; i++) {
		struct timer_base *base = this_cpu_ptr(&timer_bases[i]);

		if (time_after_eq(jiffies, base->next_expiry)) {
			raise_softirq(TIMER_SOFTIRQ);
			return;
		}
	}
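That would also cover the !CONFIG_NO_HZ_COMMON case naturally, since
NR_BASES is 1 there, so the explicit IS_ENABLED() check could go away.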
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Thanks!
>  	}
>  	raise_softirq(TIMER_SOFTIRQ);
>  }
> --
> 2.30.2
>
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -187,12 +187,18 @@ EXPORT_SYMBOL(jiffies_64);
 #define WHEEL_SIZE	(LVL_SIZE * LVL_DEPTH)
 
 #ifdef CONFIG_NO_HZ_COMMON
-# define NR_BASES	2
-# define BASE_STD	0
-# define BASE_DEF	1
+/*
+ * If multiple bases need to be locked, use the base ordering for lock
+ * nesting, i.e. lowest number first.
+ */
+# define NR_BASES	3
+# define BASE_LOCAL	0
+# define BASE_GLOBAL	1
+# define BASE_DEF	2
 #else
 # define NR_BASES	1
-# define BASE_STD	0
+# define BASE_LOCAL	0
+# define BASE_GLOBAL	0
 # define BASE_DEF	0
 #endif
 
@@ -902,7 +908,10 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
 
 static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
 {
-	struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
+	int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL;
+	struct timer_base *base;
+
+	base = per_cpu_ptr(&timer_bases[index], cpu);
 
 	/*
 	 * If the timer is deferrable and NO_HZ_COMMON is set then we need
@@ -915,7 +924,10 @@ static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
 
 static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
 {
-	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+	int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL;
+	struct timer_base *base;
+
+	base = this_cpu_ptr(&timer_bases[index]);
 
 	/*
 	 * If the timer is deferrable and NO_HZ_COMMON is set then we need
@@ -1700,9 +1712,10 @@ static unsigned long next_timer_interrupt(struct timer_base *base)
  */
 u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 {
-	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+	unsigned long nextevt, nextevt_local, nextevt_global;
+	struct timer_base *base_local, *base_global;
+	bool local_first, is_idle;
 	u64 expires = KTIME_MAX;
-	unsigned long nextevt;
 
 	/*
 	 * Pretend that there is no timer pending if the cpu is offline.
@@ -1711,38 +1724,69 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 	if (cpu_is_offline(smp_processor_id()))
 		return expires;
 
-	raw_spin_lock(&base->lock);
+	base_local = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
+	base_global = this_cpu_ptr(&timer_bases[BASE_GLOBAL]);
 
-	nextevt = next_timer_interrupt(base);
+	raw_spin_lock(&base_local->lock);
+	raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING);
+
+	nextevt_local = next_timer_interrupt(base_local);
+	nextevt_global = next_timer_interrupt(base_global);
 
 	/*
 	 * We have a fresh next event. Check whether we can forward the
 	 * base. We can only do that when @basej is past base->clk
 	 * otherwise we might rewind base->clk.
 	 */
-	if (time_after(basej, base->clk)) {
-		if (time_after(nextevt, basej))
-			base->clk = basej;
-		else if (time_after(nextevt, base->clk))
-			base->clk = nextevt;
+	if (time_after(basej, base_local->clk)) {
+		if (time_after(nextevt_local, basej))
+			base_local->clk = basej;
+		else if (time_after(nextevt_local, base_local->clk))
+			base_local->clk = nextevt_local;
+	}
+
+	if (time_after(basej, base_global->clk)) {
+		if (time_after(nextevt_global, basej))
+			base_global->clk = basej;
+		else if (time_after(nextevt_global, base_global->clk))
+			base_global->clk = nextevt_global;
 	}
 
 	/*
-	 * Base is idle if the next event is more than a tick away. Also
+	 * Check whether the local event is expiring before or at the same
+	 * time as the global event.
+	 *
+	 * Note that nextevt_global and nextevt_local might be based on
+	 * different base->clk values. So it's not guaranteed that
+	 * comparing with empty bases results in a correct local_first.
+	 */
+	if (base_local->timers_pending && base_global->timers_pending)
+		local_first = time_before_eq(nextevt_local, nextevt_global);
+	else
+		local_first = base_local->timers_pending;
+
+	nextevt = local_first ? nextevt_local : nextevt_global;
+
+	/*
+	 * Bases are idle if the next event is more than a tick away. Also
 	 * the tick is stopped so any added timer must forward the base clk
 	 * itself to keep granularity small. This idle logic is only
-	 * maintained for the BASE_STD base, deferrable timers may still
-	 * see large granularity skew (by design).
+	 * maintained for the BASE_LOCAL and BASE_GLOBAL bases, deferrable
+	 * timers may still see large granularity skew (by design).
 	 */
-	base->is_idle = time_after(nextevt, basej + 1);
+	is_idle = time_after(nextevt, basej + 1);
+
+	/* We need to mark both bases in sync */
+	base_local->is_idle = base_global->is_idle = is_idle;
 
-	if (base->timers_pending) {
+	if (base_local->timers_pending || base_global->timers_pending) {
 		/* If we missed a tick already, force 0 delta */
 		if (time_before_eq(nextevt, basej))
 			nextevt = basej;
 		expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
 	}
-	raw_spin_unlock(&base->lock);
+	raw_spin_unlock(&base_global->lock);
+	raw_spin_unlock(&base_local->lock);
 
 	return cmp_next_hrtimer_event(basem, expires);
 }
@@ -1754,7 +1798,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
  */
 void timer_clear_idle(void)
 {
-	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
 
 	/*
 	 * We do this unlocked. The worst outcome is a remote enqueue sending
@@ -1763,6 +1807,9 @@ void timer_clear_idle(void)
 	 * the lock in the exit from idle path.
 	 */
 	base->is_idle = false;
+
+	base = this_cpu_ptr(&timer_bases[BASE_GLOBAL]);
+	base->is_idle = false;
 }
 #endif
 
@@ -1808,11 +1855,13 @@ static inline void __run_timers(struct timer_base *base)
  */
 static __latent_entropy void run_timer_softirq(struct softirq_action *h)
 {
-	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
 
 	__run_timers(base);
-	if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
+	if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) {
+		__run_timers(this_cpu_ptr(&timer_bases[BASE_GLOBAL]));
 		__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
+	}
 }
 
 /*
@@ -1820,17 +1869,21 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
  */
 static void run_local_timers(void)
 {
-	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
 
 	hrtimer_run_queues();
 	/* Raise the softirq only if required. */
 	if (time_before(jiffies, base->next_expiry)) {
 		if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
 			return;
-		/* CPU is awake, so check the deferrable base. */
+		/* CPU is awake, so check for the global base. */
 		base++;
-		if (time_before(jiffies, base->next_expiry))
-			return;
+		if (time_before(jiffies, base->next_expiry)) {
+			/* CPU is awake, so check the deferrable base. */
+			base++;
+			if (time_before(jiffies, base->next_expiry))
+				return;
+		}
 	}
 	raise_softirq(TIMER_SOFTIRQ);
 }