@@ -187,12 +187,18 @@ EXPORT_SYMBOL(jiffies_64);
#define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH)
#ifdef CONFIG_NO_HZ_COMMON
-# define NR_BASES 2
-# define BASE_STD 0
-# define BASE_DEF 1
+/*
+ * If multiple bases need to be locked, use the base ordering for lock
+ * nesting, i.e. lowest number first.
+ */
+# define NR_BASES 3
+# define BASE_LOCAL 0
+# define BASE_GLOBAL 1
+# define BASE_DEF 2
#else
# define NR_BASES 1
-# define BASE_STD 0
+# define BASE_LOCAL 0
+# define BASE_GLOBAL 0
# define BASE_DEF 0
#endif
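
The new comment establishes a lock-ordering rule between the bases: lowest index first. A minimal userspace sketch of that rule, assuming plain pthread mutexes stand in for the per-base locks; the MODEL_* names and the two helpers are illustrative, not part of the patch:

#include <pthread.h>

/* Stand-ins for the three NO_HZ_COMMON bases: local, global, deferrable. */
#define MODEL_NR_BASES	3

static pthread_mutex_t model_base_lock[MODEL_NR_BASES] = {
	PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER,
};

/* Honour the ordering rule: take the locks in ascending base index order. */
static void model_lock_bases(void)
{
	for (int i = 0; i < MODEL_NR_BASES; i++)
		pthread_mutex_lock(&model_base_lock[i]);
}

/* Release in the opposite order. */
static void model_unlock_bases(void)
{
	for (int i = MODEL_NR_BASES - 1; i >= 0; i--)
		pthread_mutex_unlock(&model_base_lock[i]);
}
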
@@ -944,7 +950,10 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
{
- struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
+ int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL;
+ struct timer_base *base;
+
+ base = per_cpu_ptr(&timer_bases[index], cpu);
/*
* If the timer is deferrable and NO_HZ_COMMON is set then we need
@@ -957,7 +966,10 @@ static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
{
- struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+ int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL;
+ struct timer_base *base;
+
+ base = this_cpu_ptr(&timer_bases[index]);
/*
* If the timer is deferrable and NO_HZ_COMMON is set then we need
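
Both helpers above now derive the array index from the TIMER_PINNED flag instead of always using the former BASE_STD base. A minimal sketch of that selection, with made-up constants (the flag value and the enum below are illustrative, not the kernel definitions):

/* Illustrative flag value only; the real TIMER_PINNED bit differs. */
#define MODEL_TIMER_PINNED	0x1u

enum model_base_index {
	MODEL_BASE_LOCAL,	/* pinned timers: expire on this CPU */
	MODEL_BASE_GLOBAL,	/* unpinned timers */
	MODEL_BASE_DEF,		/* deferrable timers */
};

/* Pinned timers go to the local base, everything else to the global one. */
static enum model_base_index model_pick_base(unsigned int tflags)
{
	return (tflags & MODEL_TIMER_PINNED) ? MODEL_BASE_LOCAL : MODEL_BASE_GLOBAL;
}
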
@@ -2006,6 +2018,9 @@ static unsigned long next_timer_interrupt(struct timer_base *base,
* Move next_expiry for the empty base into the future to prevent an
* unnecessary raise of the timer softirq when the next_expiry value
* will be reached even if there is no timer pending.
+ *
+ * This update is also required to make timer_base::next_expiry values
+ * easily comparable to find out which base holds the first pending timer.
*/
if (!base->timers_pending)
base->next_expiry = basej + NEXT_TIMER_MAX_DELTA;
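
Pushing an empty base out to basej + NEXT_TIMER_MAX_DELTA is what makes the two next_expiry values directly comparable later on. A small userspace sketch of that comparison, using jiffies-style unsigned long counters; model_time_before_eq() only mirrors the semantics of the kernel's time_before_eq() and is not the kernel macro:

/* Wrap-safe "a is before or equal to b" for jiffies-style counters. */
static int model_time_before_eq(unsigned long a, unsigned long b)
{
	return (long)(a - b) <= 0;
}

/*
 * With an empty base pushed far into the future, the base that holds
 * the first pending timer wins this comparison without any separate
 * "is the base empty?" check.
 */
static unsigned long model_first_event(unsigned long nextevt_local,
				       unsigned long nextevt_global)
{
	return model_time_before_eq(nextevt_local, nextevt_global) ?
	       nextevt_local : nextevt_global;
}
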
@@ -2016,9 +2031,10 @@ static unsigned long next_timer_interrupt(struct timer_base *base,
static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
bool *idle)
{
- struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+ unsigned long nextevt, nextevt_local, nextevt_global;
+ struct timer_base *base_local, *base_global;
u64 expires = KTIME_MAX;
- unsigned long nextevt;
+ bool local_first;
/*
* Pretend that there is no timer pending if the cpu is offline.
@@ -2030,10 +2046,20 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
return expires;
}
- raw_spin_lock(&base->lock);
- nextevt = next_timer_interrupt(base, basej);
+ base_local = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
+ base_global = this_cpu_ptr(&timer_bases[BASE_GLOBAL]);
- if (base->timers_pending) {
+ raw_spin_lock(&base_local->lock);
+ raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING);
+
+ nextevt_local = next_timer_interrupt(base_local, basej);
+ nextevt_global = next_timer_interrupt(base_global, basej);
+
+ local_first = time_before_eq(nextevt_local, nextevt_global);
+
+ nextevt = local_first ? nextevt_local : nextevt_global;
+
+ if (base_local->timers_pending || base_global->timers_pending) {
/* If we missed a tick already, force 0 delta */
if (time_before(nextevt, basej))
nextevt = basej;
@@ -2044,31 +2070,31 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
* We have a fresh next event. Check whether we can forward the
* base.
*/
- __forward_timer_base(base, basej);
+ __forward_timer_base(base_local, basej);
+ __forward_timer_base(base_global, basej);
/*
* Set base->is_idle only when caller is timer_base_try_to_set_idle()
*/
if (idle) {
/*
- * Base is idle if the next event is more than a tick away.
+ * Bases are idle if the next event is more than a tick away.
*
* If the base is marked idle then any timer add operation must
* forward the base clk itself to keep granularity small. This
- * idle logic is only maintained for the BASE_STD base,
- * deferrable timers may still see large granularity skew (by
- * design).
+ * idle logic is only maintained for the BASE_LOCAL and
+ * BASE_GLOBAL bases; deferrable timers may still see large
+ * granularity skew (by design).
*/
- if (!base->is_idle) {
- if (time_after(nextevt, basej + 1)) {
- base->is_idle = true;
- trace_timer_base_idle(true, base->cpu);
- }
+ if (!base_local->is_idle && time_after(nextevt, basej + 1)) {
+ base_local->is_idle = base_global->is_idle = true;
+ trace_timer_base_idle(true, base_local->cpu);
}
- *idle = base->is_idle;
+ *idle = base_local->is_idle;
}
- raw_spin_unlock(&base->lock);
+ raw_spin_unlock(&base_global->lock);
+ raw_spin_unlock(&base_local->lock);
return cmp_next_hrtimer_event(basem, expires);
}
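
The rewritten function takes both base locks in the documented order, picks the earlier of the two next events, and marks the local and global base idle together, but only when that event is more than one tick away. A compact sketch of the idle decision, under the same illustrative assumptions as the sketches above:

struct model_timer_base {
	int	is_idle;
};

/* Wrap-safe "a is strictly after b", mirroring time_after() semantics. */
static int model_time_after(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;
}

/*
 * The local and global base share one idle decision: both are marked
 * idle in one go, and only if the first event across the two bases
 * lies beyond the next tick (basej + 1).
 */
static void model_try_set_idle(struct model_timer_base *local,
			       struct model_timer_base *global,
			       unsigned long nextevt, unsigned long basej)
{
	if (!local->is_idle && model_time_after(nextevt, basej + 1)) {
		local->is_idle = 1;
		global->is_idle = 1;
	}
}
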
@@ -2112,15 +2138,14 @@ u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle)
*/
void timer_clear_idle(void)
{
- struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
-
/*
* We do this unlocked. The worst outcome is a remote enqueue sending
* a pointless IPI, but taking the lock would just make the window for
* sending the IPI a few instructions smaller for the cost of taking
* the lock in the exit from idle path.
*/
- base->is_idle = false;
+ __this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false);
+ __this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false);
trace_timer_base_idle(false, smp_processor_id());
}
#endif
@@ -2171,11 +2196,13 @@ static inline void __run_timers(struct timer_base *base)
*/
static __latent_entropy void run_timer_softirq(struct softirq_action *h)
{
- struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+ struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
__run_timers(base);
- if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) {
+ __run_timers(this_cpu_ptr(&timer_bases[BASE_GLOBAL]));
__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
+ }
}
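
With CONFIG_NO_HZ_COMMON enabled, the softirq handler now expires three per-CPU bases instead of two, always in the same fixed order. A trivial sketch of that order, with the per-base expiry work left abstract behind a callback (all names illustrative):

enum { MODEL_RUN_LOCAL, MODEL_RUN_GLOBAL, MODEL_RUN_DEF };

/* Expire the local base first, then (on NO_HZ_COMMON) global and deferrable. */
static void model_run_timer_softirq(void (*run_base)(int index), int nohz_common)
{
	run_base(MODEL_RUN_LOCAL);

	if (nohz_common) {
		run_base(MODEL_RUN_GLOBAL);
		run_base(MODEL_RUN_DEF);
	}
}
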
/*
@@ -2183,7 +2210,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
*/
static void run_local_timers(void)
{
- struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+ struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
hrtimer_run_queues();