[RFT,v1] cpuidle: menu: Skip tick_nohz_get_sleep_length() call in some cases
Commit Message
From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Because the cost of calling tick_nohz_get_sleep_length() may increase
in the future, reorder the code in menu_select() so it first uses the
statistics to determine the expected idle duration. If that value is
higher than RESIDENCY_THRESHOLD_NS, tick_nohz_get_sleep_length() will
be called to obtain the time till the closest timer and refine the
idle duration prediction if necessary.
This causes the governor to always take the full overhead of
get_typical_interval() with the assumption that the cost will be
amortized by skipping the tick_nohz_get_sleep_length() call in the
cases when the predicted idle duration is relatively very small.
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
This is a counterpart of the following two teo commits:
https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git/commit/?h=pm-cpuidle-teo&id=06b80d45d3f94b1bbd3ee02bf3e007f55ed3e120
https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git/commit/?h=pm-cpuidle-teo&id=57c51179c203de17cf17e357d0e56c04f5e2494a
and it is based on top of
https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git/commit/?h=pm-cpuidle-teo&id=9ebff7f2e77dc1cf7a143f6eee2852b6380096e5
Later today I will expose a git branch with all of the recent cpuidle governor
changes related to the tick_nohz_get_sleep_length() avoidance.
---
drivers/cpuidle/governors/gov.h | 14 ++++++++
drivers/cpuidle/governors/menu.c | 66 +++++++++++++++++++++------------------
drivers/cpuidle/governors/teo.c | 2 +
3 files changed, 53 insertions(+), 29 deletions(-)
Comments
On Wed, Aug 9, 2023 at 4:53 PM Rafael J. Wysocki <rjw@rjwysocki.net> wrote:
>
> From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
>
> Because the cost of calling tick_nohz_get_sleep_length() may increase
> in the future, reorder the code in menu_select() so it first uses the
> statistics to determine the expected idle duration. If that value is
> higher than RESIDENCY_THRESHOLD_NS, tick_nohz_get_sleep_length() will
> be called to obtain the time till the closest timer and refine the
> idle duration prediction if necessary.
>
> This causes the governor to always take the full overhead of
> get_typical_interval() with the assumption that the cost will be
> amortized by skipping the tick_nohz_get_sleep_length() call in the
> cases when the predicted idle duration is relatively very small.
>
> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> ---
>
> This is a counterpart of the following two teo commits:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git/commit/?h=pm-cpuidle-teo&id=06b80d45d3f94b1bbd3ee02bf3e007f55ed3e120
> https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git/commit/?h=pm-cpuidle-teo&id=57c51179c203de17cf17e357d0e56c04f5e2494a
>
> and it is based on top of
>
> https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git/commit/?h=pm-cpuidle-teo&id=9ebff7f2e77dc1cf7a143f6eee2852b6380096e5
>
> Later today I will expose a git branch with all of the recent cpuidle governor
> changes related to the tick_nohz_get_sleep_length() avoidance.
>
> ---
> drivers/cpuidle/governors/gov.h | 14 ++++++++
> drivers/cpuidle/governors/menu.c | 66 +++++++++++++++++++++------------------
> drivers/cpuidle/governors/teo.c | 2 +
> 3 files changed, 53 insertions(+), 29 deletions(-)
>
> Index: linux-pm/drivers/cpuidle/governors/menu.c
> ===================================================================
> --- linux-pm.orig/drivers/cpuidle/governors/menu.c
> +++ linux-pm/drivers/cpuidle/governors/menu.c
> @@ -19,6 +19,8 @@
> #include <linux/sched/stat.h>
> #include <linux/math64.h>
>
> +#include "gov.h"
> +
> #define BUCKETS 12
> #define INTERVAL_SHIFT 3
> #define INTERVALS (1UL << INTERVAL_SHIFT)
> @@ -166,8 +168,7 @@ static void menu_update(struct cpuidle_d
> * of points is below a threshold. If it is... then use the
> * average of these 8 points as the estimated value.
> */
> -static unsigned int get_typical_interval(struct menu_device *data,
> - unsigned int predicted_us)
> +static unsigned int get_typical_interval(struct menu_device *data)
> {
> int i, divisor;
> unsigned int min, max, thresh, avg;
> @@ -195,13 +196,6 @@ again:
> }
> }
>
> - /*
> - * If the result of the computation is going to be discarded anyway,
> - * avoid the computation altogether.
> - */
> - if (min >= predicted_us)
> - return UINT_MAX;
> -
> if (divisor == INTERVALS)
> avg = sum >> INTERVAL_SHIFT;
> else
> @@ -267,7 +261,6 @@ static int menu_select(struct cpuidle_dr
> {
> struct menu_device *data = this_cpu_ptr(&menu_devices);
> s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
> - unsigned int predicted_us;
> u64 predicted_ns;
> u64 interactivity_req;
> unsigned int nr_iowaiters;
> @@ -279,16 +272,41 @@ static int menu_select(struct cpuidle_dr
> data->needs_update = 0;
> }
>
> - /* determine the expected residency time, round up */
> - delta = tick_nohz_get_sleep_length(&delta_tick);
> - if (unlikely(delta < 0)) {
> - delta = 0;
> - delta_tick = 0;
> - }
> - data->next_timer_ns = delta;
> -
> nr_iowaiters = nr_iowait_cpu(dev->cpu);
> - data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters);
> +
> + /* Find the shortest expected idle interval. */
> + predicted_ns = get_typical_interval(data) * NSEC_PER_USEC;
> + if (predicted_ns > RESIDENCY_THRESHOLD_NS) {
> + unsigned int timer_us;
> +
> + /* Determine the time till the closest timer. */
> + delta = tick_nohz_get_sleep_length(&delta_tick);
> + if (unlikely(delta < 0)) {
> + delta = 0;
> + delta_tick = 0;
> + }
> +
> + data->next_timer_ns = delta;
> + data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters);
> +
> + /* Round up the result for half microseconds. */
> + timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 +
> + data->next_timer_ns *
> + data->correction_factor[data->bucket],
> + RESOLUTION * DECAY * NSEC_PER_USEC);
> + /* Use the lowest expected idle interval to pick the idle state. */
> + predicted_ns = min((u64)timer_us * NSEC_PER_USEC, predicted_ns);
> + } else {
> + /*
> + * Because the next timer event is not going to be determined
> + * in this case, assume that without the tick the closest timer
> + * will be in distant future and that the closest tick will occur
> + * after 1/2 of the tick period.
> + */
> + data->next_timer_ns = KTIME_MAX;
> + delta_tick = TICK_NSEC / 2;
> + data->bucket = which_bucket(KTIME_MAX, nr_iowaiters);
> + }
>
> if (unlikely(drv->state_count <= 1 || latency_req == 0) ||
> ((data->next_timer_ns < drv->states[1].target_residency_ns ||
> @@ -303,16 +321,6 @@ static int menu_select(struct cpuidle_dr
> return 0;
> }
>
> - /* Round up the result for half microseconds. */
> - predicted_us = div_u64(data->next_timer_ns *
> - data->correction_factor[data->bucket] +
> - (RESOLUTION * DECAY * NSEC_PER_USEC) / 2,
> - RESOLUTION * DECAY * NSEC_PER_USEC);
> - /* Use the lowest expected idle interval to pick the idle state. */
> - predicted_ns = (u64)min(predicted_us,
> - get_typical_interval(data, predicted_us)) *
> - NSEC_PER_USEC;
> -
> if (tick_nohz_tick_stopped()) {
> /*
> * If the tick is already stopped, the cost of possible short
> Index: linux-pm/drivers/cpuidle/governors/teo.c
> ===================================================================
> --- linux-pm.orig/drivers/cpuidle/governors/teo.c
> +++ linux-pm/drivers/cpuidle/governors/teo.c
> @@ -140,6 +140,8 @@
> #include <linux/sched/topology.h>
> #include <linux/tick.h>
>
> +#include "gov.h"
> +
> /*
> * The number of bits to shift the CPU's capacity by in order to determine
> * the utilized threshold.
> Index: linux-pm/drivers/cpuidle/governors/gov.h
> ===================================================================
> --- /dev/null
> +++ linux-pm/drivers/cpuidle/governors/gov.h
> @@ -0,0 +1,14 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +/* Common definitions for cpuidle governors. */
> +
> +#ifndef __CPUIDLE_GOVERNOR_H
> +#define __CPUIDLE_GOVERNOR_H
> +
> +/*
> + * Idle state target residency threshold used for deciding whether or not to
> + * check the time till the closest expected timer event.
> + */
> +#define RESIDENCY_THRESHOLD_NS (15 * NSEC_PER_USEC)
> +
> +#endif /* __CPUIDLE_GOVERNOR_H */
This patch is now present in the git branch at
git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git \
pm-cpuidle-gov
along with the previous teo governor changes.
On Wed, Aug 9, 2023 at 11:16 AM Rafael J. Wysocki <rafael@kernel.org> wrote:
> On Wed, Aug 9, 2023 at 4:53 PM Rafael J. Wysocki <rjw@rjwysocki.net> wrote:
> >
> > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> >
> > Because the cost of calling tick_nohz_get_sleep_length() may increase
> > in the future, reorder the code in menu_select() so it first uses the
> > statistics to determine the expected idle duration. If that value is
> > higher than RESIDENCY_THRESHOLD_NS, tick_nohz_get_sleep_length() will
> > be called to obtain the time till the closest timer and refine the
> > idle duration prediction if necessary.
> >
> > This causes the governor to always take the full overhead of
> > get_typical_interval() with the assumption that the cost will be
> > amortized by skipping the tick_nohz_get_sleep_length() call in the
> > cases when the predicted idle duration is relatively very small.
> >
> > Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > ---
... deleted...
>
> This patch is now present in the git branch at
>
> git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git \
> pm-cpuidle-gov
>
> along with the previous teo governor changes.
Hi Rafael,
Thanks for the branch and adding it to the previous 6.5-rc4 code,
as now I can re-use the menu baseline tests already done.
My test computer boots by default to use the teo idle governor.
When I change to the menu governor, my system becomes
unresponsive and I have to re-boot.
Is anyone else having this difficulty?
... Doug
Hi Doug,
On Thu, Aug 10, 2023 at 6:28 PM Doug Smythies <dsmythies@telus.net> wrote:
>
> On Wed, Aug 9, 2023 at 11:16 AM Rafael J. Wysocki <rafael@kernel.org> wrote:
> > On Wed, Aug 9, 2023 at 4:53 PM Rafael J. Wysocki <rjw@rjwysocki.net> wrote:
> > >
> > > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > >
> > > Because the cost of calling tick_nohz_get_sleep_length() may increase
> > > in the future, reorder the code in menu_select() so it first uses the
> > > statistics to determine the expected idle duration. If that value is
> > > higher than RESIDENCY_THRESHOLD_NS, tick_nohz_get_sleep_length() will
> > > be called to obtain the time till the closest timer and refine the
> > > idle duration prediction if necessary.
> > >
> > > This causes the governor to always take the full overhead of
> > > get_typical_interval() with the assumption that the cost will be
> > > amortized by skipping the tick_nohz_get_sleep_length() call in the
> > > cases when the predicted idle duration is relatively very small.
> > >
> > > Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > > ---
> ... deleted...
> >
> > This patch is now present in the git branch at
> >
> > git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git \
> > pm-cpuidle-gov
> >
> > along with the previous teo governor changes.
>
> Hi Rafael,
>
> Thanks for the branch and adding it to the previous 6.5-rc4 code,
> as now I can re-use the menu baseline tests already done.
>
> My test computer boots by default to use the teo idle governor.
> When I change to the menu governor, my system becomes
> unresponsive and I have to re-boot.
>
> Is anyone else having this difficulty?
There is a missing check it get_typical_interval(), my bad.
I'll send a v2 of the menu governor patch shortly and I'll update the
pm-cpuidle-gov branch accordingly.
Thanks!
===================================================================
@@ -19,6 +19,8 @@
#include <linux/sched/stat.h>
#include <linux/math64.h>
+#include "gov.h"
+
#define BUCKETS 12
#define INTERVAL_SHIFT 3
#define INTERVALS (1UL << INTERVAL_SHIFT)
@@ -166,8 +168,7 @@ static void menu_update(struct cpuidle_d
* of points is below a threshold. If it is... then use the
* average of these 8 points as the estimated value.
*/
-static unsigned int get_typical_interval(struct menu_device *data,
- unsigned int predicted_us)
+static unsigned int get_typical_interval(struct menu_device *data)
{
int i, divisor;
unsigned int min, max, thresh, avg;
@@ -195,13 +196,6 @@ again:
}
}
- /*
- * If the result of the computation is going to be discarded anyway,
- * avoid the computation altogether.
- */
- if (min >= predicted_us)
- return UINT_MAX;
-
if (divisor == INTERVALS)
avg = sum >> INTERVAL_SHIFT;
else
@@ -267,7 +261,6 @@ static int menu_select(struct cpuidle_dr
{
struct menu_device *data = this_cpu_ptr(&menu_devices);
s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
- unsigned int predicted_us;
u64 predicted_ns;
u64 interactivity_req;
unsigned int nr_iowaiters;
@@ -279,16 +272,41 @@ static int menu_select(struct cpuidle_dr
data->needs_update = 0;
}
- /* determine the expected residency time, round up */
- delta = tick_nohz_get_sleep_length(&delta_tick);
- if (unlikely(delta < 0)) {
- delta = 0;
- delta_tick = 0;
- }
- data->next_timer_ns = delta;
-
nr_iowaiters = nr_iowait_cpu(dev->cpu);
- data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters);
+
+ /* Find the shortest expected idle interval. */
+ predicted_ns = get_typical_interval(data) * NSEC_PER_USEC;
+ if (predicted_ns > RESIDENCY_THRESHOLD_NS) {
+ unsigned int timer_us;
+
+ /* Determine the time till the closest timer. */
+ delta = tick_nohz_get_sleep_length(&delta_tick);
+ if (unlikely(delta < 0)) {
+ delta = 0;
+ delta_tick = 0;
+ }
+
+ data->next_timer_ns = delta;
+ data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters);
+
+ /* Round up the result for half microseconds. */
+ timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 +
+ data->next_timer_ns *
+ data->correction_factor[data->bucket],
+ RESOLUTION * DECAY * NSEC_PER_USEC);
+ /* Use the lowest expected idle interval to pick the idle state. */
+ predicted_ns = min((u64)timer_us * NSEC_PER_USEC, predicted_ns);
+ } else {
+ /*
+ * Because the next timer event is not going to be determined
+ * in this case, assume that without the tick the closest timer
+ * will be in distant future and that the closest tick will occur
+ * after 1/2 of the tick period.
+ */
+ data->next_timer_ns = KTIME_MAX;
+ delta_tick = TICK_NSEC / 2;
+ data->bucket = which_bucket(KTIME_MAX, nr_iowaiters);
+ }
if (unlikely(drv->state_count <= 1 || latency_req == 0) ||
((data->next_timer_ns < drv->states[1].target_residency_ns ||
@@ -303,16 +321,6 @@ static int menu_select(struct cpuidle_dr
return 0;
}
- /* Round up the result for half microseconds. */
- predicted_us = div_u64(data->next_timer_ns *
- data->correction_factor[data->bucket] +
- (RESOLUTION * DECAY * NSEC_PER_USEC) / 2,
- RESOLUTION * DECAY * NSEC_PER_USEC);
- /* Use the lowest expected idle interval to pick the idle state. */
- predicted_ns = (u64)min(predicted_us,
- get_typical_interval(data, predicted_us)) *
- NSEC_PER_USEC;
-
if (tick_nohz_tick_stopped()) {
/*
* If the tick is already stopped, the cost of possible short
===================================================================
@@ -140,6 +140,8 @@
#include <linux/sched/topology.h>
#include <linux/tick.h>
+#include "gov.h"
+
/*
* The number of bits to shift the CPU's capacity by in order to determine
* the utilized threshold.
===================================================================
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/* Common definitions for cpuidle governors. */
+
+#ifndef __CPUIDLE_GOVERNOR_H
+#define __CPUIDLE_GOVERNOR_H
+
+/*
+ * Idle state target residency threshold used for deciding whether or not to
+ * check the time till the closest expected timer event.
+ */
+#define RESIDENCY_THRESHOLD_NS (15 * NSEC_PER_USEC)
+
+#endif /* __CPUIDLE_GOVERNOR_H */