[3/4] sched/psi: extract update_triggers side effect

Message ID 20230309170756.52927-4-cerasuolodomenico@gmail.com
State New
Headers
Series sched/psi: Allow unprivileged PSI polling |

Commit Message

Domenico Cerasuolo March 9, 2023, 5:07 p.m. UTC
  The update of rtpoll_total inside update_triggers can be moved out of
the function since changed_states has the same information as the
update_total flag used in the function. Besides the simplification of
the function, with the next patch it would become an unwanted side
effect needed only for PSI_POLL.

Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
---
 kernel/sched/psi.c | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)
  

Comments

Suren Baghdasaryan March 20, 2023, 11 p.m. UTC | #1
On Thu, Mar 9, 2023 at 9:08 AM Domenico Cerasuolo
<cerasuolodomenico@gmail.com> wrote:
>
> The update of rtpoll_total inside update_triggers can be moved out of
> the function since changed_states has the same information as the
> update_total flag used in the function. Besides the simplification of
> the function, with the next patch it would become an unwanted side
> effect needed only for PSI_POLL.

(changed_states & group->rtpoll_states) and update_total flag are not
really equivalent. update_total flag depends on the difference between
group->polling_total[state] and group->total[PSI_POLL][state] while
changed_states depends on the difference between groupc->times and
groupc->times_prev. groupc->times_prev is updated every time
collect_percpu_times() is called and there are 3 places where that
happens: from psi_avgs_work(), from psi_poll_work() and from
psi_show(). group->polling_total[state] is updated only from
psi_poll_work(). Therefore the deltas between these values might not
always be in-sync.

Consider the following sequence as an example:

psi_poll_work()
...
psi_avgs_work()/psi_show()
  collect_percpu_times() // we detect a change in a monitored state
...
psi_poll_work()
  collect_percpu_times() // this time no change in monitored states
  update_triggers() // group->polling_total[state] !=
group->total[PSI_POLL][state]

In the last psi_poll_work() collect_percpu_times() recorded no change
in monitored states, so (changed_states & group->rtpoll_states) == 0,
however since the last time psi_poll_work() was called there was
actually a change in monitored states recorded by the first
collect_percpu_times(), therefore (group->polling_total[t->state] !=
total[t->state]) and we should update the totals. With your change we
will miss that update.

I think you can easily fix that by introducing update_triggers as an
output parameter in window_update() like this:

static u64 window_update(struct psi_window *win, u64 now, u64 value,
bool *update_triggers) {
    *update_total = false;
...
    if (new_stall) {
        *update_total = true;
...
}

static void psi_rtpoll_work(struct psi_group *group) {
+       bool update_triggers;
...
-       if (now >= group->rtpoll_next_update)
+       if (now >= group->rtpoll_next_update) {
                group->rtpoll_next_update = update_triggers(group,
now, &update_triggers);
+               if (update_triggers)
+                       memcpy(group->rtpoll_total, group->total[PSI_POLL],
+                                  sizeof(group->rtpoll_total));
+       }
}


>
> Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
> Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> ---
>  kernel/sched/psi.c | 20 +++++---------------
>  1 file changed, 5 insertions(+), 15 deletions(-)
>
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index a3d0b5cf797a..476941c1cbea 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -433,7 +433,6 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
>  static u64 update_triggers(struct psi_group *group, u64 now)
>  {
>         struct psi_trigger *t;
> -       bool update_total = false;
>         u64 *total = group->total[PSI_POLL];
>
>         /*
> @@ -456,14 +455,6 @@ static u64 update_triggers(struct psi_group *group, u64 now)
>                  * events without dropping any).
>                  */
>                 if (new_stall) {
> -                       /*
> -                        * Multiple triggers might be looking at the same state,
> -                        * remember to update group->polling_total[] once we've
> -                        * been through all of them. Also remember to extend the
> -                        * polling time if we see new stall activity.
> -                        */
> -                       update_total = true;
> -
>                         /* Calculate growth since last update */
>                         growth = window_update(&t->win, now, total[t->state]);
>                         if (!t->pending_event) {
> @@ -484,11 +475,6 @@ static u64 update_triggers(struct psi_group *group, u64 now)
>                 /* Reset threshold breach flag once event got generated */
>                 t->pending_event = false;
>         }
> -
> -       if (update_total)
> -               memcpy(group->rtpoll_total, total,
> -                               sizeof(group->rtpoll_total));
> -
>         return now + group->rtpoll_min_period;
>  }
>
> @@ -686,8 +672,12 @@ static void psi_rtpoll_work(struct psi_group *group)
>                 goto out;
>         }
>
> -       if (now >= group->rtpoll_next_update)
> +       if (now >= group->rtpoll_next_update) {
>                 group->rtpoll_next_update = update_triggers(group, now);
> +               if (changed_states & group->rtpoll_states)
> +                       memcpy(group->rtpoll_total, group->total[PSI_POLL],
> +                                  sizeof(group->rtpoll_total));
> +       }
>
>         psi_schedule_rtpoll_work(group,
>                 nsecs_to_jiffies(group->rtpoll_next_update - now) + 1,
> --
> 2.34.1
>
  
Suren Baghdasaryan March 22, 2023, 3:40 a.m. UTC | #2
On Tue, Mar 21, 2023 at 3:18 AM Domenico Cerasuolo
<cerasuolodomenico@gmail.com> wrote:
>
> Hi Suren, thanks for all the feedback! This makes sense, I only have one doubt, if we set update_total flag to window_update() and update_triggers(), that flag would be ignored when the caller is psi_avgs_work(), this would be happening in the next patch in the set.

I don't see why the update_triggers part should be conceptually
different between RT and unprivileged triggers. Could you please
explain?

> What do you think if I just remove this change from the patchset and then work on a solution after the iterations on the main change are completed? This was in fact just an attempt to clean up.
> I'll apply your suggested changes on the other patches, wait a bit for comments from someone else and then send V2.
>
> On Tue, Mar 21, 2023 at 12:00 AM Suren Baghdasaryan <surenb@google.com> wrote:
>>
>> On Thu, Mar 9, 2023 at 9:08 AM Domenico Cerasuolo
>> <cerasuolodomenico@gmail.com> wrote:
>> >
>> > The update of rtpoll_total inside update_triggers can be moved out of
>> > the function since changed_states has the same information as the
>> > update_total flag used in the function. Besides the simplification of
>> > the function, with the next patch it would become an unwanted side
>> > effect needed only for PSI_POLL.
>>
>> (changed_states & group->rtpoll_states) and update_total flag are not
>> really equivalent. update_total flag depends on the difference between
>> group->polling_total[state] and group->total[PSI_POLL][state] while
>> changed_states depends on the difference between groupc->times and
>> groupc->times_prev. groupc->times_prev is updated every time
>> collect_percpu_times() is called and there are 3 places where that
>> happens: from psi_avgs_work(), from psi_poll_work() and from
>> psi_show(). group->polling_total[state] is updated only from
>> psi_poll_work(). Therefore the deltas between these values might not
>> always be in-sync.
>>
>> Consider the following sequence as an example:
>>
>> psi_poll_work()
>> ...
>> psi_avgs_work()/psi_show()
>>   collect_percpu_times() // we detect a change in a monitored state
>> ...
>> psi_poll_work()
>>   collect_percpu_times() // this time no change in monitored states
>>   update_triggers() // group->polling_total[state] !=
>> group->total[PSI_POLL][state]
>>
>> In the last psi_poll_work() collect_percpu_times() recorded no change
>> in monitored states, so (changed_states & group->rtpoll_states) == 0,
>> however since the last time psi_poll_work() was called there was
>> actually a change in monitored states recorded by the first
>> collect_percpu_times(), therefore (group->polling_total[t->state] !=
>> total[t->state]) and we should update the totals. With your change we
>> will miss that update.
>>
>> I think you can easily fix that by introducing update_triggers as an
>> output parameter in window_update() like this:
>>
>> static u64 window_update(struct psi_window *win, u64 now, u64 value,
>> bool *update_triggers) {
>>     *update_total = false;
>> ...
>>     if (new_stall) {
>>         *update_total = true;
>> ...
>> }
>>
>> static void psi_rtpoll_work(struct psi_group *group) {
>> +       bool update_triggers;
>> ...
>> -       if (now >= group->rtpoll_next_update)
>> +       if (now >= group->rtpoll_next_update) {
>>                 group->rtpoll_next_update = update_triggers(group,
>> now, &update_triggers);
>> +               if (update_triggers)
>> +                       memcpy(group->rtpoll_total, group->total[PSI_POLL],
>> +                                  sizeof(group->rtpoll_total));
>> +       }
>> }
>>
>>
>> >
>> > Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
>> > Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
>> > ---
>> >  kernel/sched/psi.c | 20 +++++---------------
>> >  1 file changed, 5 insertions(+), 15 deletions(-)
>> >
>> > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
>> > index a3d0b5cf797a..476941c1cbea 100644
>> > --- a/kernel/sched/psi.c
>> > +++ b/kernel/sched/psi.c
>> > @@ -433,7 +433,6 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
>> >  static u64 update_triggers(struct psi_group *group, u64 now)
>> >  {
>> >         struct psi_trigger *t;
>> > -       bool update_total = false;
>> >         u64 *total = group->total[PSI_POLL];
>> >
>> >         /*
>> > @@ -456,14 +455,6 @@ static u64 update_triggers(struct psi_group *group, u64 now)
>> >                  * events without dropping any).
>> >                  */
>> >                 if (new_stall) {
>> > -                       /*
>> > -                        * Multiple triggers might be looking at the same state,
>> > -                        * remember to update group->polling_total[] once we've
>> > -                        * been through all of them. Also remember to extend the
>> > -                        * polling time if we see new stall activity.
>> > -                        */
>> > -                       update_total = true;
>> > -
>> >                         /* Calculate growth since last update */
>> >                         growth = window_update(&t->win, now, total[t->state]);
>> >                         if (!t->pending_event) {
>> > @@ -484,11 +475,6 @@ static u64 update_triggers(struct psi_group *group, u64 now)
>> >                 /* Reset threshold breach flag once event got generated */
>> >                 t->pending_event = false;
>> >         }
>> > -
>> > -       if (update_total)
>> > -               memcpy(group->rtpoll_total, total,
>> > -                               sizeof(group->rtpoll_total));
>> > -
>> >         return now + group->rtpoll_min_period;
>> >  }
>> >
>> > @@ -686,8 +672,12 @@ static void psi_rtpoll_work(struct psi_group *group)
>> >                 goto out;
>> >         }
>> >
>> > -       if (now >= group->rtpoll_next_update)
>> > +       if (now >= group->rtpoll_next_update) {
>> >                 group->rtpoll_next_update = update_triggers(group, now);
>> > +               if (changed_states & group->rtpoll_states)
>> > +                       memcpy(group->rtpoll_total, group->total[PSI_POLL],
>> > +                                  sizeof(group->rtpoll_total));
>> > +       }
>> >
>> >         psi_schedule_rtpoll_work(group,
>> >                 nsecs_to_jiffies(group->rtpoll_next_update - now) + 1,
>> > --
>> > 2.34.1
>> >
  
Suren Baghdasaryan March 22, 2023, 4:41 p.m. UTC | #3
On Wed, Mar 22, 2023 at 3:14 AM Domenico Cerasuolo
<cerasuolodomenico@gmail.com> wrote:
>
> I'm not suggesting that update_triggers should be different, I agree that they should behave the same for both types of trigger.
> The problem is that if we extract the update_total information out of update_triggers, that information will be ignored by psi_avgs_work because avg_total is always updated in update_averages, only psi_poll_work would use it to copy the total to polling_total.
> If this is the only alternative to having `if (update_total && aggregator == PSI_POLL)` inside update_triggers, I'll add the argument to update_triggers, I'm just wondering if there could be another alternative.

I suggest you post the V2 with suggested changes and this approach and
it will be easier to decide whether this can be improved further.
Also, please do not top-post (read through
https://kernelnewbies.org/mailinglistguidelines for more hints).
Thanks,
Suren.

>
> On Wed, Mar 22, 2023 at 4:41 AM Suren Baghdasaryan <surenb@google.com> wrote:
>>
>> On Tue, Mar 21, 2023 at 3:18 AM Domenico Cerasuolo
>> <cerasuolodomenico@gmail.com> wrote:
>> >
>> > Hi Suren, thanks for all the feedback! This makes sense, I only have one doubt, if we set update_total flag to window_update() and update_triggers(), that flag would be ignored when the caller is psi_avgs_work(), this would be happening in the next patch in the set.
>>
>> I don't see why the update_triggers part should be conceptually
>> different between RT and unprivileged triggers. Could you please
>> explain?
>>
>> > What do you think if I just remove this change from the patchset and then work on a solution after the iterations on the main change are completed? This was in fact just an attempt to clean up.
>> > I'll apply your suggested changes on the other patches, wait a bit for comments from someone else and then send V2.
>> >
>> > On Tue, Mar 21, 2023 at 12:00 AM Suren Baghdasaryan <surenb@google.com> wrote:
>> >>
>> >> On Thu, Mar 9, 2023 at 9:08 AM Domenico Cerasuolo
>> >> <cerasuolodomenico@gmail.com> wrote:
>> >> >
>> >> > The update of rtpoll_total inside update_triggers can be moved out of
>> >> > the function since changed_states has the same information as the
>> >> > update_total flag used in the function. Besides the simplification of
>> >> > the function, with the next patch it would become an unwanted side
>> >> > effect needed only for PSI_POLL.
>> >>
>> >> (changed_states & group->rtpoll_states) and update_total flag are not
>> >> really equivalent. update_total flag depends on the difference between
>> >> group->polling_total[state] and group->total[PSI_POLL][state] while
>> >> changed_states depends on the difference between groupc->times and
>> >> groupc->times_prev. groupc->times_prev is updated every time
>> >> collect_percpu_times() is called and there are 3 places where that
>> >> happens: from psi_avgs_work(), from psi_poll_work() and from
>> >> psi_show(). group->polling_total[state] is updated only from
>> >> psi_poll_work(). Therefore the deltas between these values might not
>> >> always be in-sync.
>> >>
>> >> Consider the following sequence as an example:
>> >>
>> >> psi_poll_work()
>> >> ...
>> >> psi_avgs_work()/psi_show()
>> >>   collect_percpu_times() // we detect a change in a monitored state
>> >> ...
>> >> psi_poll_work()
>> >>   collect_percpu_times() // this time no change in monitored states
>> >>   update_triggers() // group->polling_total[state] !=
>> >> group->total[PSI_POLL][state]
>> >>
>> >> In the last psi_poll_work() collect_percpu_times() recorded no change
>> >> in monitored states, so (changed_states & group->rtpoll_states) == 0,
>> >> however since the last time psi_poll_work() was called there was
>> >> actually a change in monitored states recorded by the first
>> >> collect_percpu_times(), therefore (group->polling_total[t->state] !=
>> >> total[t->state]) and we should update the totals. With your change we
>> >> will miss that update.
>> >>
>> >> I think you can easily fix that by introducing update_triggers as an
>> >> output parameter in window_update() like this:
>> >>
>> >> static u64 window_update(struct psi_window *win, u64 now, u64 value,
>> >> bool *update_triggers) {
>> >>     *update_total = false;
>> >> ...
>> >>     if (new_stall) {
>> >>         *update_total = true;
>> >> ...
>> >> }
>> >>
>> >> static void psi_rtpoll_work(struct psi_group *group) {
>> >> +       bool update_triggers;
>> >> ...
>> >> -       if (now >= group->rtpoll_next_update)
>> >> +       if (now >= group->rtpoll_next_update) {
>> >>                 group->rtpoll_next_update = update_triggers(group,
>> >> now, &update_triggers);
>> >> +               if (update_triggers)
>> >> +                       memcpy(group->rtpoll_total, group->total[PSI_POLL],
>> >> +                                  sizeof(group->rtpoll_total));
>> >> +       }
>> >> }
>> >>
>> >>
>> >> >
>> >> > Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
>> >> > Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
>> >> > ---
>> >> >  kernel/sched/psi.c | 20 +++++---------------
>> >> >  1 file changed, 5 insertions(+), 15 deletions(-)
>> >> >
>> >> > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
>> >> > index a3d0b5cf797a..476941c1cbea 100644
>> >> > --- a/kernel/sched/psi.c
>> >> > +++ b/kernel/sched/psi.c
>> >> > @@ -433,7 +433,6 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
>> >> >  static u64 update_triggers(struct psi_group *group, u64 now)
>> >> >  {
>> >> >         struct psi_trigger *t;
>> >> > -       bool update_total = false;
>> >> >         u64 *total = group->total[PSI_POLL];
>> >> >
>> >> >         /*
>> >> > @@ -456,14 +455,6 @@ static u64 update_triggers(struct psi_group *group, u64 now)
>> >> >                  * events without dropping any).
>> >> >                  */
>> >> >                 if (new_stall) {
>> >> > -                       /*
>> >> > -                        * Multiple triggers might be looking at the same state,
>> >> > -                        * remember to update group->polling_total[] once we've
>> >> > -                        * been through all of them. Also remember to extend the
>> >> > -                        * polling time if we see new stall activity.
>> >> > -                        */
>> >> > -                       update_total = true;
>> >> > -
>> >> >                         /* Calculate growth since last update */
>> >> >                         growth = window_update(&t->win, now, total[t->state]);
>> >> >                         if (!t->pending_event) {
>> >> > @@ -484,11 +475,6 @@ static u64 update_triggers(struct psi_group *group, u64 now)
>> >> >                 /* Reset threshold breach flag once event got generated */
>> >> >                 t->pending_event = false;
>> >> >         }
>> >> > -
>> >> > -       if (update_total)
>> >> > -               memcpy(group->rtpoll_total, total,
>> >> > -                               sizeof(group->rtpoll_total));
>> >> > -
>> >> >         return now + group->rtpoll_min_period;
>> >> >  }
>> >> >
>> >> > @@ -686,8 +672,12 @@ static void psi_rtpoll_work(struct psi_group *group)
>> >> >                 goto out;
>> >> >         }
>> >> >
>> >> > -       if (now >= group->rtpoll_next_update)
>> >> > +       if (now >= group->rtpoll_next_update) {
>> >> >                 group->rtpoll_next_update = update_triggers(group, now);
>> >> > +               if (changed_states & group->rtpoll_states)
>> >> > +                       memcpy(group->rtpoll_total, group->total[PSI_POLL],
>> >> > +                                  sizeof(group->rtpoll_total));
>> >> > +       }
>> >> >
>> >> >         psi_schedule_rtpoll_work(group,
>> >> >                 nsecs_to_jiffies(group->rtpoll_next_update - now) + 1,
>> >> > --
>> >> > 2.34.1
>> >> >
  

Patch

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index a3d0b5cf797a..476941c1cbea 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -433,7 +433,6 @@  static u64 window_update(struct psi_window *win, u64 now, u64 value)
 static u64 update_triggers(struct psi_group *group, u64 now)
 {
 	struct psi_trigger *t;
-	bool update_total = false;
 	u64 *total = group->total[PSI_POLL];
 
 	/*
@@ -456,14 +455,6 @@  static u64 update_triggers(struct psi_group *group, u64 now)
 		 * events without dropping any).
 		 */
 		if (new_stall) {
-			/*
-			 * Multiple triggers might be looking at the same state,
-			 * remember to update group->polling_total[] once we've
-			 * been through all of them. Also remember to extend the
-			 * polling time if we see new stall activity.
-			 */
-			update_total = true;
-
 			/* Calculate growth since last update */
 			growth = window_update(&t->win, now, total[t->state]);
 			if (!t->pending_event) {
@@ -484,11 +475,6 @@  static u64 update_triggers(struct psi_group *group, u64 now)
 		/* Reset threshold breach flag once event got generated */
 		t->pending_event = false;
 	}
-
-	if (update_total)
-		memcpy(group->rtpoll_total, total,
-				sizeof(group->rtpoll_total));
-
 	return now + group->rtpoll_min_period;
 }
 
@@ -686,8 +672,12 @@  static void psi_rtpoll_work(struct psi_group *group)
 		goto out;
 	}
 
-	if (now >= group->rtpoll_next_update)
+	if (now >= group->rtpoll_next_update) {
 		group->rtpoll_next_update = update_triggers(group, now);
+		if (changed_states & group->rtpoll_states)
+			memcpy(group->rtpoll_total, group->total[PSI_POLL],
+				   sizeof(group->rtpoll_total));
+	}
 
 	psi_schedule_rtpoll_work(group,
 		nsecs_to_jiffies(group->rtpoll_next_update - now) + 1,