diff mbox series

[1/4] sched/psi: rearrange polling code in preparation

Message ID	20230309170756.52927-2-cerasuolodomenico@gmail.com
State	New
Headers	Received-SPF: pass (google.com: domain of linux-kernel-owner@vger.kernel.org designates 2620:137:e000::1:20 as permitted sender) client-ip=2620:137:e000::1:20; From: Domenico Cerasuolo <cerasuolodomenico@gmail.com> To: linux-kernel@vger.kernel.org Cc: peterz@infradead.org, surenb@google.com, brauner@kernel.org, chris@chrisdown.name, hannes@cmpxchg.org, Domenico Cerasuolo <cerasuolodomenico@gmail.com> Subject: [PATCH 1/4] sched/psi: rearrange polling code in preparation Date: Thu, 9 Mar 2023 18:07:53 +0100 Message-Id: <20230309170756.52927-2-cerasuolodomenico@gmail.com> In-Reply-To: <20230309170756.52927-1-cerasuolodomenico@gmail.com> References: <20230309170756.52927-1-cerasuolodomenico@gmail.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Precedence: bulk
Series	sched/psi: Allow unprivileged PSI polling \| [0/4] sched/psi: Allow unprivileged PSI polling [1/4] sched/psi: rearrange polling code in preparation [2/4] sched/psi: rename existing poll members in preparation [3/4] sched/psi: extract update_triggers side effect [4/4] sched/psi: allow unprivileged polling of N*2s period

Commit Message

Domenico Cerasuolo March 9, 2023, 5:07 p.m. UTC

  Move a few functions up in the file to avoid the forward declarations
that would otherwise be needed by the patch implementing unprivileged PSI
triggers.

Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
---
 kernel/sched/psi.c | 196 ++++++++++++++++++++++-----------------------
 1 file changed, 98 insertions(+), 98 deletions(-)

Comments

Suren Baghdasaryan March 20, 2023, 9:06 p.m. UTC | #1

On Thu, Mar 9, 2023 at 9:08 AM Domenico Cerasuolo
<cerasuolodomenico@gmail.com> wrote:
>
> Move a few functions up in the file to avoid the forward declarations
> that would otherwise be needed by the patch implementing unprivileged PSI
> triggers.
>
> Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
> Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>

LGTM. Will Ack when we finalize the rest of the patchset.

> ---
>  kernel/sched/psi.c | 196 ++++++++++++++++++++++-----------------------
>  1 file changed, 98 insertions(+), 98 deletions(-)
>
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 02e011cabe91..fe9269f1d2a4 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -384,92 +384,6 @@ static void collect_percpu_times(struct psi_group *group,
>                 *pchanged_states = changed_states;
>  }
>
> -static u64 update_averages(struct psi_group *group, u64 now)
> -{
> -       unsigned long missed_periods = 0;
> -       u64 expires, period;
> -       u64 avg_next_update;
> -       int s;
> -
> -       /* avgX= */
> -       expires = group->avg_next_update;
> -       if (now - expires >= psi_period)
> -               missed_periods = div_u64(now - expires, psi_period);
> -
> -       /*
> -        * The periodic clock tick can get delayed for various
> -        * reasons, especially on loaded systems. To avoid clock
> -        * drift, we schedule the clock in fixed psi_period intervals.
> -        * But the deltas we sample out of the per-cpu buckets above
> -        * are based on the actual time elapsing between clock ticks.
> -        */
> -       avg_next_update = expires + ((1 + missed_periods) * psi_period);
> -       period = now - (group->avg_last_update + (missed_periods * psi_period));
> -       group->avg_last_update = now;
> -
> -       for (s = 0; s < NR_PSI_STATES - 1; s++) {
> -               u32 sample;
> -
> -               sample = group->total[PSI_AVGS][s] - group->avg_total[s];
> -               /*
> -                * Due to the lockless sampling of the time buckets,
> -                * recorded time deltas can slip into the next period,
> -                * which under full pressure can result in samples in
> -                * excess of the period length.
> -                *
> -                * We don't want to report non-sensical pressures in
> -                * excess of 100%, nor do we want to drop such events
> -                * on the floor. Instead we punt any overage into the
> -                * future until pressure subsides. By doing this we
> -                * don't underreport the occurring pressure curve, we
> -                * just report it delayed by one period length.
> -                *
> -                * The error isn't cumulative. As soon as another
> -                * delta slips from a period P to P+1, by definition
> -                * it frees up its time T in P.
> -                */
> -               if (sample > period)
> -                       sample = period;
> -               group->avg_total[s] += sample;
> -               calc_avgs(group->avg[s], missed_periods, sample, period);
> -       }
> -
> -       return avg_next_update;
> -}
> -
> -static void psi_avgs_work(struct work_struct *work)
> -{
> -       struct delayed_work *dwork;
> -       struct psi_group *group;
> -       u32 changed_states;
> -       u64 now;
> -
> -       dwork = to_delayed_work(work);
> -       group = container_of(dwork, struct psi_group, avgs_work);
> -
> -       mutex_lock(&group->avgs_lock);
> -
> -       now = sched_clock();
> -
> -       collect_percpu_times(group, PSI_AVGS, &changed_states);
> -       /*
> -        * If there is task activity, periodically fold the per-cpu
> -        * times and feed samples into the running averages. If things
> -        * are idle and there is no data to process, stop the clock.
> -        * Once restarted, we'll catch up the running averages in one
> -        * go - see calc_avgs() and missed_periods.
> -        */
> -       if (now >= group->avg_next_update)
> -               group->avg_next_update = update_averages(group, now);
> -
> -       if (changed_states & PSI_STATE_RESCHEDULE) {
> -               schedule_delayed_work(dwork, nsecs_to_jiffies(
> -                               group->avg_next_update - now) + 1);
> -       }
> -
> -       mutex_unlock(&group->avgs_lock);
> -}
> -
>  /* Trigger tracking window manipulations */
>  static void window_reset(struct psi_window *win, u64 now, u64 value,
>                          u64 prev_growth)
> @@ -516,18 +430,6 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
>         return growth;
>  }
>
> -static void init_triggers(struct psi_group *group, u64 now)
> -{
> -       struct psi_trigger *t;
> -
> -       list_for_each_entry(t, &group->triggers, node)
> -               window_reset(&t->win, now,
> -                               group->total[PSI_POLL][t->state], 0);
> -       memcpy(group->polling_total, group->total[PSI_POLL],
> -                  sizeof(group->polling_total));
> -       group->polling_next_update = now + group->poll_min_period;
> -}
> -
>  static u64 update_triggers(struct psi_group *group, u64 now)
>  {
>         struct psi_trigger *t;
> @@ -590,6 +492,104 @@ static u64 update_triggers(struct psi_group *group, u64 now)
>         return now + group->poll_min_period;
>  }
>
> +static u64 update_averages(struct psi_group *group, u64 now)
> +{
> +       unsigned long missed_periods = 0;
> +       u64 expires, period;
> +       u64 avg_next_update;
> +       int s;
> +
> +       /* avgX= */
> +       expires = group->avg_next_update;
> +       if (now - expires >= psi_period)
> +               missed_periods = div_u64(now - expires, psi_period);
> +
> +       /*
> +        * The periodic clock tick can get delayed for various
> +        * reasons, especially on loaded systems. To avoid clock
> +        * drift, we schedule the clock in fixed psi_period intervals.
> +        * But the deltas we sample out of the per-cpu buckets above
> +        * are based on the actual time elapsing between clock ticks.
> +        */
> +       avg_next_update = expires + ((1 + missed_periods) * psi_period);
> +       period = now - (group->avg_last_update + (missed_periods * psi_period));
> +       group->avg_last_update = now;
> +
> +       for (s = 0; s < NR_PSI_STATES - 1; s++) {
> +               u32 sample;
> +
> +               sample = group->total[PSI_AVGS][s] - group->avg_total[s];
> +               /*
> +                * Due to the lockless sampling of the time buckets,
> +                * recorded time deltas can slip into the next period,
> +                * which under full pressure can result in samples in
> +                * excess of the period length.
> +                *
> +                * We don't want to report non-sensical pressures in
> +                * excess of 100%, nor do we want to drop such events
> +                * on the floor. Instead we punt any overage into the
> +                * future until pressure subsides. By doing this we
> +                * don't underreport the occurring pressure curve, we
> +                * just report it delayed by one period length.
> +                *
> +                * The error isn't cumulative. As soon as another
> +                * delta slips from a period P to P+1, by definition
> +                * it frees up its time T in P.
> +                */
> +               if (sample > period)
> +                       sample = period;
> +               group->avg_total[s] += sample;
> +               calc_avgs(group->avg[s], missed_periods, sample, period);
> +       }
> +
> +       return avg_next_update;
> +}
> +
> +static void psi_avgs_work(struct work_struct *work)
> +{
> +       struct delayed_work *dwork;
> +       struct psi_group *group;
> +       u32 changed_states;
> +       u64 now;
> +
> +       dwork = to_delayed_work(work);
> +       group = container_of(dwork, struct psi_group, avgs_work);
> +
> +       mutex_lock(&group->avgs_lock);
> +
> +       now = sched_clock();
> +
> +       collect_percpu_times(group, PSI_AVGS, &changed_states);
> +       /*
> +        * If there is task activity, periodically fold the per-cpu
> +        * times and feed samples into the running averages. If things
> +        * are idle and there is no data to process, stop the clock.
> +        * Once restarted, we'll catch up the running averages in one
> +        * go - see calc_avgs() and missed_periods.
> +        */
> +       if (now >= group->avg_next_update)
> +               group->avg_next_update = update_averages(group, now);
> +
> +       if (changed_states & PSI_STATE_RESCHEDULE) {
> +               schedule_delayed_work(dwork, nsecs_to_jiffies(
> +                               group->avg_next_update - now) + 1);
> +       }
> +
> +       mutex_unlock(&group->avgs_lock);
> +}
> +
> +static void init_triggers(struct psi_group *group, u64 now)
> +{
> +       struct psi_trigger *t;
> +
> +       list_for_each_entry(t, &group->triggers, node)
> +               window_reset(&t->win, now,
> +                               group->total[PSI_POLL][t->state], 0);
> +       memcpy(group->polling_total, group->total[PSI_POLL],
> +                  sizeof(group->polling_total));
> +       group->polling_next_update = now + group->poll_min_period;
> +}
> +
>  /* Schedule polling if it's not already scheduled or forced. */
>  static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay,
>                                    bool force)
> --
> 2.34.1
>

diff mbox series

Patch

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 02e011cabe91..fe9269f1d2a4 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -384,92 +384,6 @@  static void collect_percpu_times(struct psi_group *group,
 		*pchanged_states = changed_states;
 }
 
-static u64 update_averages(struct psi_group *group, u64 now)
-{
-	unsigned long missed_periods = 0;
-	u64 expires, period;
-	u64 avg_next_update;
-	int s;
-
-	/* avgX= */
-	expires = group->avg_next_update;
-	if (now - expires >= psi_period)
-		missed_periods = div_u64(now - expires, psi_period);
-
-	/*
-	 * The periodic clock tick can get delayed for various
-	 * reasons, especially on loaded systems. To avoid clock
-	 * drift, we schedule the clock in fixed psi_period intervals.
-	 * But the deltas we sample out of the per-cpu buckets above
-	 * are based on the actual time elapsing between clock ticks.
-	 */
-	avg_next_update = expires + ((1 + missed_periods) * psi_period);
-	period = now - (group->avg_last_update + (missed_periods * psi_period));
-	group->avg_last_update = now;
-
-	for (s = 0; s < NR_PSI_STATES - 1; s++) {
-		u32 sample;
-
-		sample = group->total[PSI_AVGS][s] - group->avg_total[s];
-		/*
-		 * Due to the lockless sampling of the time buckets,
-		 * recorded time deltas can slip into the next period,
-		 * which under full pressure can result in samples in
-		 * excess of the period length.
-		 *
-		 * We don't want to report non-sensical pressures in
-		 * excess of 100%, nor do we want to drop such events
-		 * on the floor. Instead we punt any overage into the
-		 * future until pressure subsides. By doing this we
-		 * don't underreport the occurring pressure curve, we
-		 * just report it delayed by one period length.
-		 *
-		 * The error isn't cumulative. As soon as another
-		 * delta slips from a period P to P+1, by definition
-		 * it frees up its time T in P.
-		 */
-		if (sample > period)
-			sample = period;
-		group->avg_total[s] += sample;
-		calc_avgs(group->avg[s], missed_periods, sample, period);
-	}
-
-	return avg_next_update;
-}
-
-static void psi_avgs_work(struct work_struct *work)
-{
-	struct delayed_work *dwork;
-	struct psi_group *group;
-	u32 changed_states;
-	u64 now;
-
-	dwork = to_delayed_work(work);
-	group = container_of(dwork, struct psi_group, avgs_work);
-
-	mutex_lock(&group->avgs_lock);
-
-	now = sched_clock();
-
-	collect_percpu_times(group, PSI_AVGS, &changed_states);
-	/*
-	 * If there is task activity, periodically fold the per-cpu
-	 * times and feed samples into the running averages. If things
-	 * are idle and there is no data to process, stop the clock.
-	 * Once restarted, we'll catch up the running averages in one
-	 * go - see calc_avgs() and missed_periods.
-	 */
-	if (now >= group->avg_next_update)
-		group->avg_next_update = update_averages(group, now);
-
-	if (changed_states & PSI_STATE_RESCHEDULE) {
-		schedule_delayed_work(dwork, nsecs_to_jiffies(
-				group->avg_next_update - now) + 1);
-	}
-
-	mutex_unlock(&group->avgs_lock);
-}
-
 /* Trigger tracking window manipulations */
 static void window_reset(struct psi_window *win, u64 now, u64 value,
 			 u64 prev_growth)
@@ -516,18 +430,6 @@  static u64 window_update(struct psi_window *win, u64 now, u64 value)
 	return growth;
 }
 
-static void init_triggers(struct psi_group *group, u64 now)
-{
-	struct psi_trigger *t;
-
-	list_for_each_entry(t, &group->triggers, node)
-		window_reset(&t->win, now,
-				group->total[PSI_POLL][t->state], 0);
-	memcpy(group->polling_total, group->total[PSI_POLL],
-		   sizeof(group->polling_total));
-	group->polling_next_update = now + group->poll_min_period;
-}
-
 static u64 update_triggers(struct psi_group *group, u64 now)
 {
 	struct psi_trigger *t;
@@ -590,6 +492,104 @@  static u64 update_triggers(struct psi_group *group, u64 now)
 	return now + group->poll_min_period;
 }
 
+static u64 update_averages(struct psi_group *group, u64 now)
+{
+	unsigned long missed_periods = 0;
+	u64 expires, period;
+	u64 avg_next_update;
+	int s;
+
+	/* avgX= */
+	expires = group->avg_next_update;
+	if (now - expires >= psi_period)
+		missed_periods = div_u64(now - expires, psi_period);
+
+	/*
+	 * The periodic clock tick can get delayed for various
+	 * reasons, especially on loaded systems. To avoid clock
+	 * drift, we schedule the clock in fixed psi_period intervals.
+	 * But the deltas we sample out of the per-cpu buckets above
+	 * are based on the actual time elapsing between clock ticks.
+	 */
+	avg_next_update = expires + ((1 + missed_periods) * psi_period);
+	period = now - (group->avg_last_update + (missed_periods * psi_period));
+	group->avg_last_update = now;
+
+	for (s = 0; s < NR_PSI_STATES - 1; s++) {
+		u32 sample;
+
+		sample = group->total[PSI_AVGS][s] - group->avg_total[s];
+		/*
+		 * Due to the lockless sampling of the time buckets,
+		 * recorded time deltas can slip into the next period,
+		 * which under full pressure can result in samples in
+		 * excess of the period length.
+		 *
+		 * We don't want to report non-sensical pressures in
+		 * excess of 100%, nor do we want to drop such events
+		 * on the floor. Instead we punt any overage into the
+		 * future until pressure subsides. By doing this we
+		 * don't underreport the occurring pressure curve, we
+		 * just report it delayed by one period length.
+		 *
+		 * The error isn't cumulative. As soon as another
+		 * delta slips from a period P to P+1, by definition
+		 * it frees up its time T in P.
+		 */
+		if (sample > period)
+			sample = period;
+		group->avg_total[s] += sample;
+		calc_avgs(group->avg[s], missed_periods, sample, period);
+	}
+
+	return avg_next_update;
+}
+
+static void psi_avgs_work(struct work_struct *work)
+{
+	struct delayed_work *dwork;
+	struct psi_group *group;
+	u32 changed_states;
+	u64 now;
+
+	dwork = to_delayed_work(work);
+	group = container_of(dwork, struct psi_group, avgs_work);
+
+	mutex_lock(&group->avgs_lock);
+
+	now = sched_clock();
+
+	collect_percpu_times(group, PSI_AVGS, &changed_states);
+	/*
+	 * If there is task activity, periodically fold the per-cpu
+	 * times and feed samples into the running averages. If things
+	 * are idle and there is no data to process, stop the clock.
+	 * Once restarted, we'll catch up the running averages in one
+	 * go - see calc_avgs() and missed_periods.
+	 */
+	if (now >= group->avg_next_update)
+		group->avg_next_update = update_averages(group, now);
+
+	if (changed_states & PSI_STATE_RESCHEDULE) {
+		schedule_delayed_work(dwork, nsecs_to_jiffies(
+				group->avg_next_update - now) + 1);
+	}
+
+	mutex_unlock(&group->avgs_lock);
+}
+
+static void init_triggers(struct psi_group *group, u64 now)
+{
+	struct psi_trigger *t;
+
+	list_for_each_entry(t, &group->triggers, node)
+		window_reset(&t->win, now,
+				group->total[PSI_POLL][t->state], 0);
+	memcpy(group->polling_total, group->total[PSI_POLL],
+		   sizeof(group->polling_total));
+	group->polling_next_update = now + group->poll_min_period;
+}
+
 /* Schedule polling if it's not already scheduled or forced. */
 static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay,
 				   bool force)