[2/2] sched/psi: iterate through cgroups directly

Message ID 20230208161654.99556-3-ryncsn@gmail.com
State New
Series sched/psi: Optimize PSI iteration

Commit Message

Kairui Song Feb. 8, 2023, 4:16 p.m. UTC
  From: Kairui Song <kasong@tencent.com>

psi_group->parent has the same hierarchy as the cgroup it's in,
so just iterate through the cgroup instead.

By adjusting the iteration logic, this saves some space in the
psi_group struct, and performance is actually better. I see a
measurable performance gain using mmtests/perfpipe:

(AVG of 100 tests, ops/sec, the higher the better)
KVM guest on an i7-9700:
        psi=0         root cgroup   5 levels of cgroup
Before: 59221         55352         47821
After:  60100         56036         50884

KVM guest on a Ryzen 9 5900HX:
        psi=0         root cgroup   5 levels of cgroup
Before: 144566        138919        128888
After:  145812        139580        133514

Signed-off-by: Kairui Song <kasong@tencent.com>
Signed-off-by: Kairui Song <ryncsn@gmail.com>
---
 include/linux/psi_types.h |  1 -
 kernel/sched/psi.c        | 47 ++++++++++++++++++++++++++++-----------
 2 files changed, 34 insertions(+), 14 deletions(-)
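
The core of the change in sketch form, simplified from the diff at the
bottom of this page (the psi_task_change() loop is shown; the other call
sites follow the same pattern): instead of chasing the parent pointer
cached in each psi_group, the walk derives the next group from the
cgroup tree through an opaque iterator.

	/* Before: follow the parent pointer cached in each psi_group. */
	group = task_psi_group(task);
	do {
		psi_group_change(group, cpu, clear, set, now, true);
	} while ((group = group->parent));

	/* After: walk the cgroup hierarchy via psi_iter_first()/psi_iter_next(). */
	group = psi_iter_first(task, &iter);
	do {
		psi_group_change(group, cpu, clear, set, now, true);
	} while ((group = psi_iter_next(&iter)));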
  

Comments

Michal Koutný Feb. 8, 2023, 5:29 p.m. UTC | #1
On Thu, Feb 09, 2023 at 12:16:54AM +0800, Kairui Song <ryncsn@gmail.com> wrote:
> Signed-off-by: Kairui Song <kasong@tencent.com>
> Signed-off-by: Kairui Song <ryncsn@gmail.com>

Typo?

> -static inline struct psi_group *task_psi_group(struct task_struct *task)
> +static inline struct psi_group *psi_iter_first(struct task_struct *task, void **iter)
>  {
>  #ifdef CONFIG_CGROUPS
> -	if (static_branch_likely(&psi_cgroups_enabled))
> -		return cgroup_psi(task_dfl_cgroup(task));
> +	if (static_branch_likely(&psi_cgroups_enabled)) {
> +		struct cgroup *cgroup = task_dfl_cgroup(task);
> +
> +		*iter = cgroup_parent(cgroup);

This seems to skip a cgroup level -- maybe that's the observed
performance gain?

> +		return cgroup_psi(cgroup);
> +	}
>  #endif
>  	return &psi_system;
>  }

Michal
  
Johannes Weiner Feb. 8, 2023, 7:15 p.m. UTC | #2
On Thu, Feb 09, 2023 at 12:16:54AM +0800, Kairui Song wrote:
> From: Kairui Song <kasong@tencent.com>
> 
> psi_group->parent has the same hierarchy as the cgroup it's in,
> so just iterate through the cgroup instead.
>
> By adjusting the iteration logic, this saves some space in the
> psi_group struct, and performance is actually better. I see a
> measurable performance gain using mmtests/perfpipe:
>
> (AVG of 100 tests, ops/sec, the higher the better)
> KVM guest on an i7-9700:
>         psi=0         root cgroup   5 levels of cgroup
> Before: 59221         55352         47821
> After:  60100         56036         50884
> 
> KVM guest on a Ryzen 9 5900HX:
>         psi=0         root cgroup   5 levels of cgroup
> Before: 144566        138919        128888
> After:  145812        139580        133514
> 
> Signed-off-by: Kairui Song <kasong@tencent.com>
> Signed-off-by: Kairui Song <ryncsn@gmail.com>

Awesome!

A few comments below:

> @@ -858,15 +858,34 @@ static void psi_group_change(struct psi_group *group, int cpu,
>  		schedule_delayed_work(&group->avgs_work, PSI_FREQ);
>  }
>  
> -static inline struct psi_group *task_psi_group(struct task_struct *task)
> +static inline struct psi_group *psi_iter_first(struct task_struct *task, void **iter)

Please name these psi_groups_first() and psi_groups_next().

>  #ifdef CONFIG_CGROUPS
> -	if (static_branch_likely(&psi_cgroups_enabled))
> -		return cgroup_psi(task_dfl_cgroup(task));
> +	if (static_branch_likely(&psi_cgroups_enabled)) {
> +		struct cgroup *cgroup = task_dfl_cgroup(task);
> +
> +		*iter = cgroup_parent(cgroup);
> +		return cgroup_psi(cgroup);
> +	}
>  #endif
>  	return &psi_system;
>  }
>  
> +static inline struct psi_group *psi_iter_next(void **iter)
> +{
> +#ifdef CONFIG_CGROUPS
> +	if (static_branch_likely(&psi_cgroups_enabled)) {
> +		struct cgroup *cgroup = *iter;
> +
> +		if (cgroup) {
> +			*iter = cgroup_parent(cgroup);
> +			return cgroup_psi(cgroup);
> +		}
> +	}
> +#endif
> +	return NULL;
> +}

> @@ -886,6 +905,7 @@ void psi_task_change(struct task_struct *task, int clear, int set)
>  {
>  	int cpu = task_cpu(task);
>  	struct psi_group *group;
> +	void *iter;
>  	u64 now;
>  
>  	if (!task->pid)
> @@ -895,16 +915,17 @@ void psi_task_change(struct task_struct *task, int clear, int set)
>  
>  	now = cpu_clock(cpu);
>  
> -	group = task_psi_group(task);
> +	group = psi_iter_first(task, &iter);
>  	do {
>  		psi_group_change(group, cpu, clear, set, now, true);
> -	} while ((group = group->parent));
> +	} while ((group = psi_iter_next(&iter)));
>  }
>  
>  void psi_task_switch(struct task_struct *prev, struct task_struct *next,
>  		     bool sleep)
>  {
>  	struct psi_group *group, *common = NULL;
> +	void *iter;
>  	int cpu = task_cpu(prev);
>  	u64 now = cpu_clock(cpu);

Please add @iter at the end to keep line length sorting.

> @@ -915,7 +936,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
>  		 * ancestors with @prev, those will already have @prev's
>  		 * TSK_ONCPU bit set, and we can stop the iteration there.
>  		 */
> -		group = task_psi_group(next);
> +		group = psi_iter_first(prev, &iter);
>  		do {
>  			if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
>  			    PSI_ONCPU) {
> @@ -924,7 +945,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
>  			}
>  
>  			psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
> -		} while ((group = group->parent));
> +		} while ((group = psi_iter_next(&iter)));
>  	}
>  
>  	if (prev->pid) {
> @@ -957,12 +978,12 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
>  
>  		psi_flags_change(prev, clear, set);
>  
> -		group = task_psi_group(prev);
> +		group = psi_iter_first(prev, &iter);
>  		do {
>  			if (group == common)
>  				break;
>  			psi_group_change(group, cpu, clear, set, now, wake_clock);
> -		} while ((group = group->parent));
> +		} while ((group = psi_iter_next(&iter)));
>  
>  		/*
>  		 * TSK_ONCPU is handled up to the common ancestor. If there are
> @@ -972,7 +993,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
>  		 */
>  		if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
>  			clear &= ~TSK_ONCPU;
> -			for (; group; group = group->parent)
> +			for (; group; group = psi_iter_next(&iter))
>  				psi_group_change(group, cpu, clear, set, now, wake_clock);
>  		}
>  	}
> @@ -983,6 +1004,7 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)
>  {
>  	int cpu = task_cpu(task);
>  	struct psi_group *group;
> +	void *iter;
>  	struct psi_group_cpu *groupc;
>  	u64 now;

Ditto. You can move @groupc in the same patch.

Otherwise, this looks good to me. Please add:

Acked-by: Johannes Weiner <hannes@cmpxchg.org>
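
For readers unfamiliar with the "line length sorting" nit above: it refers
to the kernel habit of ordering local variable declarations longest line
first (reverse xmas tree). A sketch of what the declarations in
psi_account_irqtime() might look like with @iter and @groupc moved
accordingly (the exact ordering here is one reading of the request, not
something spelled out in the thread):

	struct psi_group_cpu *groupc;
	int cpu = task_cpu(task);
	struct psi_group *group;
	void *iter;
	u64 now;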
  
Johannes Weiner Feb. 8, 2023, 7:20 p.m. UTC | #3
On Wed, Feb 08, 2023 at 06:29:56PM +0100, Michal Koutný wrote:
> On Thu, Feb 09, 2023 at 12:16:54AM +0800, Kairui Song <ryncsn@gmail.com> wrote:
> > Signed-off-by: Kairui Song <kasong@tencent.com>
> > Signed-off-by: Kairui Song <ryncsn@gmail.com>
> 
> Typo?
> 
> > -static inline struct psi_group *task_psi_group(struct task_struct *task)
> > +static inline struct psi_group *psi_iter_first(struct task_struct *task, void **iter)
> >  {
> >  #ifdef CONFIG_CGROUPS
> > -	if (static_branch_likely(&psi_cgroups_enabled))
> > -		return cgroup_psi(task_dfl_cgroup(task));
> > +	if (static_branch_likely(&psi_cgroups_enabled)) {
> > +		struct cgroup *cgroup = task_dfl_cgroup(task);
> > +
> > +		*iter = cgroup_parent(cgroup);
> 
> This seems to skip a cgroup level -- maybe that's the observed
> performance gain?

Hm, I don't think it does. It sets up *iter to point to the parent for
the _next() call, but it returns task_dfl_cgroup()->psi. The next call
does the same: cgroup = *iter, *iter = parent, return cgroup->psi.

It could be a bit more readable to have *iter always point to the
current cgroup - but no strong preference either way from me:

psi_groups_first(task, iter)
{
#ifdef CONFIG_CGROUPS
	if (static_branch_likely(&psi_cgroups_enabled)) {
		struct cgroup *cgroup = task_dfl_cgroup(task);

		*iter = cgroup;
		return cgroup_psi(cgroup);
	}
#endif
	return &psi_system;
}

psi_groups_next(iter)
{
#ifdef CONFIG_CGROUPS
	if (static_branch_likely(&psi_cgroups_enabled)) {
		struct cgroup *cgroup = *iter;

		if (cgroup) {
			*iter = cgroup_parent(cgroup);
			return cgroup_psi(cgroup);
		}
	}
	return NULL;
#endif
}
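
Traced out for a task in a hypothetical cgroup /a/b, both variants visit
every level exactly once; with the version in the patch:

	group = psi_iter_first(task, &iter);	/* returns psi of b,    iter = a    */
	group = psi_iter_next(&iter);		/* returns psi of a,    iter = root */
	group = psi_iter_next(&iter);		/* returns psi of root, iter = NULL */
	group = psi_iter_next(&iter);		/* returns NULL, the loop ends      */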
  
Michal Koutný Feb. 8, 2023, 9:57 p.m. UTC | #4
On Wed, Feb 08, 2023 at 02:20:12PM -0500, Johannes Weiner <hannes@cmpxchg.org> wrote:
> Hm, I don't think it does. It sets up *iter to point to the parent for
> the _next() call, but it returns task_dfl_cgroup()->psi. The next call
> does the same: cgroup = *iter, *iter = parent, return cgroup->psi.

You are right.

> It could be a bit more readable to have *iter always point to the
> current cgroup - but no strong preference either way from me:

A casually reading half-brain finds this more idiomatic :)

I just hope the "preloading" of the parent via iter isn't crucial to the
performance change.

Kairui, I like this reorganization and its effect!

Michal
  
Michal Koutný Feb. 8, 2023, 10:03 p.m. UTC | #5
On Wed, Feb 08, 2023 at 02:15:12PM -0500, Johannes Weiner <hannes@cmpxchg.org> wrote:
> Please name these psi_groups_first() and psi_groups_next().

One more suggestion: s/next/parent/ to avoid the impression of a (sub)tree
iteration and for consistency with the cgroup-related iterators.

Thanks,
Michal
  
Kairui Song Feb. 9, 2023, 3:30 p.m. UTC | #6
On Thu, Feb 9, 2023 at 06:03, Michal Koutný <mkoutny@suse.com> wrote:
> On Wed, Feb 08, 2023 at 02:15:12PM -0500, Johannes Weiner <hannes@cmpxchg.org> wrote:
> > Please name these psi_groups_first() and psi_groups_next().
>
> One more suggestion: s/next/parent/ to avoid the impression of a (sub)tree
> iteration and for consistency with the cgroup-related iterators.
>
> Thanks,
> Michal
>

Thanks for the review, I'll update the patch to use this new name.
  
Kairui Song Feb. 9, 2023, 3:32 p.m. UTC | #7
On Thu, Feb 9, 2023 at 03:15, Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Thu, Feb 09, 2023 at 12:16:54AM +0800, Kairui Song wrote:
> > From: Kairui Song <kasong@tencent.com>
> >
> > psi_group->parent has the same hierarchy as the cgroup it's in,
> > so just iterate through the cgroup instead.
> >
> > By adjusting the iteration logic, this saves some space in the
> > psi_group struct, and performance is actually better. I see a
> > measurable performance gain using mmtests/perfpipe:
> >
> > (AVG of 100 tests, ops/sec, the higher the better)
> > KVM guest on an i7-9700:
> >         psi=0         root cgroup   5 levels of cgroup
> > Before: 59221         55352         47821
> > After:  60100         56036         50884
> >
> > KVM guest on a Ryzen 9 5900HX:
> >         psi=0         root cgroup   5 levels of cgroup
> > Before: 144566        138919        128888
> > After:  145812        139580        133514
> >
> > Signed-off-by: Kairui Song <kasong@tencent.com>
> > Signed-off-by: Kairui Song <ryncsn@gmail.com>
>
> Awesome!
>
> A few comments below:
>
> > @@ -858,15 +858,34 @@ static void psi_group_change(struct psi_group *group, int cpu,
> >               schedule_delayed_work(&group->avgs_work, PSI_FREQ);
> >  }
> >
> > -static inline struct psi_group *task_psi_group(struct task_struct *task)
> > +static inline struct psi_group *psi_iter_first(struct task_struct *task, void **iter)
>
> Please name these psi_groups_first() and psi_groups_next().
>
> >  #ifdef CONFIG_CGROUPS
> > -     if (static_branch_likely(&psi_cgroups_enabled))
> > -             return cgroup_psi(task_dfl_cgroup(task));
> > +     if (static_branch_likely(&psi_cgroups_enabled)) {
> > +             struct cgroup *cgroup = task_dfl_cgroup(task);
> > +
> > +             *iter = cgroup_parent(cgroup);
> > +             return cgroup_psi(cgroup);
> > +     }
> >  #endif
> >       return &psi_system;
> >  }
> >
> > +static inline struct psi_group *psi_iter_next(void **iter)
> > +{
> > +#ifdef CONFIG_CGROUPS
> > +     if (static_branch_likely(&psi_cgroups_enabled)) {
> > +             struct cgroup *cgroup = *iter;
> > +
> > +             if (cgroup) {
> > +                     *iter = cgroup_parent(cgroup);
> > +                     return cgroup_psi(cgroup);
> > +             }
> > +     }
> > +#endif
> > +     return NULL;
> > +}
>
> > @@ -886,6 +905,7 @@ void psi_task_change(struct task_struct *task, int clear, int set)
> >  {
> >       int cpu = task_cpu(task);
> >       struct psi_group *group;
> > +     void *iter;
> >       u64 now;
> >
> >       if (!task->pid)
> > @@ -895,16 +915,17 @@ void psi_task_change(struct task_struct *task, int clear, int set)
> >
> >       now = cpu_clock(cpu);
> >
> > -     group = task_psi_group(task);
> > +     group = psi_iter_first(task, &iter);
> >       do {
> >               psi_group_change(group, cpu, clear, set, now, true);
> > -     } while ((group = group->parent));
> > +     } while ((group = psi_iter_next(&iter)));
> >  }
> >
> >  void psi_task_switch(struct task_struct *prev, struct task_struct *next,
> >                    bool sleep)
> >  {
> >       struct psi_group *group, *common = NULL;
> > +     void *iter;
> >       int cpu = task_cpu(prev);
> >       u64 now = cpu_clock(cpu);
>
> Please add @iter at the end to keep line length sorting.
>
> > @@ -915,7 +936,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
> >                * ancestors with @prev, those will already have @prev's
> >                * TSK_ONCPU bit set, and we can stop the iteration there.
> >                */
> > -             group = task_psi_group(next);
> > +             group = psi_iter_first(prev, &iter);
> >               do {
> >                       if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
> >                           PSI_ONCPU) {
> > @@ -924,7 +945,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
> >                       }
> >
> >                       psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
> > -             } while ((group = group->parent));
> > +             } while ((group = psi_iter_next(&iter)));
> >       }
> >
> >       if (prev->pid) {
> > @@ -957,12 +978,12 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
> >
> >               psi_flags_change(prev, clear, set);
> >
> > -             group = task_psi_group(prev);
> > +             group = psi_iter_first(prev, &iter);
> >               do {
> >                       if (group == common)
> >                               break;
> >                       psi_group_change(group, cpu, clear, set, now, wake_clock);
> > -             } while ((group = group->parent));
> > +             } while ((group = psi_iter_next(&iter)));
> >
> >               /*
> >                * TSK_ONCPU is handled up to the common ancestor. If there are
> > @@ -972,7 +993,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
> >                */
> >               if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
> >                       clear &= ~TSK_ONCPU;
> > -                     for (; group; group = group->parent)
> > +                     for (; group; group = psi_iter_next(&iter))
> >                               psi_group_change(group, cpu, clear, set, now, wake_clock);
> >               }
> >       }
> > @@ -983,6 +1004,7 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)
> >  {
> >       int cpu = task_cpu(task);
> >       struct psi_group *group;
> > +     void *iter;
> >       struct psi_group_cpu *groupc;
> >       u64 now;
>
> Ditto. You can move @groupc in the same patch.
>
> Otherwise, this looks good to me. Please add:
>
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>

Thanks! I'll update the patch as you suggested.
  
Kairui Song Feb. 9, 2023, 4:08 p.m. UTC | #8
On Thu, Feb 9, 2023 at 03:20, Johannes Weiner <hannes@cmpxchg.org> wrote:
> On Wed, Feb 08, 2023 at 06:29:56PM +0100, Michal Koutný wrote:
> > On Thu, Feb 09, 2023 at 12:16:54AM +0800, Kairui Song <ryncsn@gmail.com> wrote:
> > > Signed-off-by: Kairui Song <kasong@tencent.com>
> > > Signed-off-by: Kairui Song <ryncsn@gmail.com>
> >
> > Typo?
> >
> > > -static inline struct psi_group *task_psi_group(struct task_struct *task)
> > > +static inline struct psi_group *psi_iter_first(struct task_struct *task, void **iter)
> > >  {
> > >  #ifdef CONFIG_CGROUPS
> > > -   if (static_branch_likely(&psi_cgroups_enabled))
> > > -           return cgroup_psi(task_dfl_cgroup(task));
> > > +   if (static_branch_likely(&psi_cgroups_enabled)) {
> > > +           struct cgroup *cgroup = task_dfl_cgroup(task);
> > > +
> > > +           *iter = cgroup_parent(cgroup);
> >
> > This seems to skip a cgroup level -- maybe that's the observed
> > performance gain?
>
> Hm, I don't think it does. It sets up *iter to point to the parent for
> the _next() call, but it returns task_dfl_cgroup()->psi. The next call
> does the same: cgroup = *iter, *iter = parent, return cgroup->psi.
>
> It could be a bit more readable to have *iter always point to the
> current cgroup - but no strong preference either way from me:
>
> psi_groups_first(task, iter)
> {
> #ifdef CONFIG_CGROUPS
>         if (static_branch_likely(&psi_cgroups_enabled)) {
>                 struct cgroup *cgroup = task_dfl_cgroup(task);
>
>                 *iter = cgroup;
>                 return cgroup_psi(cgroup);
>         }
> #endif
>         return &psi_system;
> }
>
> psi_groups_next(iter)
> {
> #ifdef CONFIG_CGROUPS
>         if (static_branch_likely(&psi_cgroups_enabled)) {
>                 struct cgroup *cgroup = *iter;
>
>                 if (cgroup) {
>                         *iter = cgroup_parent(cgroup);
>                         return cgroup_psi(cgroup);
>                 }
>         }
>         return NULL;
> #endif
> }

It should be like this, right? For psi_groups_next, retrieving cgroup
parent should be done before "if (cgroup)".
+ psi_groups_next(iter)
+ {
+ #ifdef CONFIG_CGROUPS
+         if (static_branch_likely(&psi_cgroups_enabled)) {
+                 struct cgroup *cgroup = *iter;
+
+                 cgroup = cgroup_parent(cgroup);
+                 if (cgroup) {
+                         *iter = cgroup;
+                         return cgroup_psi(cgroup);
+                 }
+         }
+         return NULL;
+ #endif
+ }

Thanks for the suggestion!

I think your style is better indeed.

I tried to re-benchmark the code just in case, and it seems my
previous benchmark results are not accurate enough now: some results
changed after I rebased to the latest master, or maybe 100 runs of
perfpipe are just not enough to get a stable result.

With a few retests, the final conclusion of the commit message is
still valid; I will post a V2 after more testing.
  
Kairui Song Feb. 15, 2023, 5:49 p.m. UTC | #9
On Fri, Feb 10, 2023 at 00:08, Kairui Song <ryncsn@gmail.com> wrote:
> On Thu, Feb 9, 2023 at 03:20, Johannes Weiner <hannes@cmpxchg.org> wrote:
> > On Wed, Feb 08, 2023 at 06:29:56PM +0100, Michal Koutný wrote:
> > > On Thu, Feb 09, 2023 at 12:16:54AM +0800, Kairui Song <ryncsn@gmail.com> wrote:
> > > > Signed-off-by: Kairui Song <kasong@tencent.com>
> > > > Signed-off-by: Kairui Song <ryncsn@gmail.com>
> > >
> > > Typo?
> > >
> > > > -static inline struct psi_group *task_psi_group(struct task_struct *task)
> > > > +static inline struct psi_group *psi_iter_first(struct task_struct *task, void **iter)
> > > >  {
> > > >  #ifdef CONFIG_CGROUPS
> > > > -   if (static_branch_likely(&psi_cgroups_enabled))
> > > > -           return cgroup_psi(task_dfl_cgroup(task));
> > > > +   if (static_branch_likely(&psi_cgroups_enabled)) {
> > > > +           struct cgroup *cgroup = task_dfl_cgroup(task);
> > > > +
> > > > +           *iter = cgroup_parent(cgroup);
> > >
> > > This seems to skip a cgroup level -- maybe that's the observed
> > > performance gain?
> >
> > Hm, I don't think it does. It sets up *iter to point to the parent for
> > the _next() call, but it returns task_dfl_cgroup()->psi. The next call
> > does the same: cgroup = *iter, *iter = parent, return cgroup->psi.
> >
> > It could be a bit more readable to have *iter always point to the
> > current cgroup - but no strong preference either way from me:
> >
> > psi_groups_first(task, iter)
> > {
> > #ifdef CONFIG_CGROUPS
> >         if (static_branch_likely(&psi_cgroups_enabled)) {
> >                 struct cgroup *cgroup = task_dfl_cgroup(task);
> >
> >                 *iter = cgroup;
> >                 return cgroup_psi(cgroup);
> >         }
> > #endif
> >         return &psi_system;
> > }
> >
> > psi_groups_next(iter)
> > {
> > #ifdef CONFIG_CGROUPS
> >         if (static_branch_likely(&psi_cgroups_enabled)) {
> >                 struct cgroup *cgroup = *iter;
> >
> >                 if (cgroup) {
> >                         *iter = cgroup_parent(cgroup);
> >                         return cgroup_psi(cgroup);
> >                 }
> >         }
> >         return NULL;
> > #endif
> > }
>
> It should be like this, right? For psi_groups_next, retrieving cgroup
> parent should be done before "if (cgroup)".
> + psi_groups_next(iter)
> + {
> + #ifdef CONFIG_CGROUPS
> +         if (static_branch_likely(&psi_cgroups_enabled)) {
> +                 struct cgroup *cgroup = *iter;
> +
> +                 cgroup = cgroup_parent(cgroup);
> +                 if (cgroup) {
> +                         *iter = cgroup;
> +                         return cgroup_psi(cgroup);
> +                 }
> +         }
> +         return NULL;
> + #endif
> + }
>
> Thanks for the suggestion!
>
> I think your style is better indeed.
>
> I tried to re-benchmark the code just in case, and it seems my
> previous benchmark results are not accurate enough now: some results
> changed after I rebased to the latest master, or maybe 100 runs of
> perfpipe are just not enough to get a stable result.
>
> With a few retests, the final conclusion of the commit message is
> still valid; I will post a V2 after more testing.

Hi, I just ran the perf bench test for a few days on a machine to get a
stable average for the latest rebased patch, and found it is indeed
faster when the cgroup hierarchy is not too deep:

With rootcg:
55718.9 op/s (unpatched) compared to 55862.2 (patched)
With 5 levels:
49975.5 op/s (unpatched) compared to 50778.5 op/s (patched)

Previous tests are a bit biased since I only ran the test 100 * 3
times, or maybe it is sensitive to some random kernel structure
changes.

But I ignored one important thing in my previous test: the
performance actually drops heavily with deeper levels of cgroups:
With 8 levels:
48902.4 op/s (unpatched) compared to 47476.6 op/s (patched)
With 50 levels of cgroup:
25605.7375 op/s (unpatched) compared to 20417.275 op/s (patched)

That's a terrible regression :( My guess is that when there are too
many cgroup structures, the CPU cache can't hold them all, and since
*parent shares a cacheline with the rest of the psi_group struct (like
the bool enabled, which is accessed every time), the unpatched version
ends up faster.

Sorry for missing such a critical point in v1. Let's drop this series
until there is a better way to optimize it.
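
For reference, a minimal sketch of the layout point made above, with the
field order taken from the struct psi_group hunk in the diff below (whether
the two fields really end up in the same cache line depends on the build
and is the hypothesis above, not something verified here):

	struct psi_group {
		struct psi_group *parent;	/* the pointer this patch removes */
		bool enabled;			/* read on every group update, per the discussion */
		/* ... aggregator lock, per-CPU state, averages, triggers ... */
	};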
  
Michal Koutný Feb. 15, 2023, 6:25 p.m. UTC | #10
On Thu, Feb 16, 2023 at 01:49:22AM +0800, Kairui Song <ryncsn@gmail.com> wrote:
> With rootcg:
> 55718.9 op/s (unpatched) compared to 55862.2 (patched)
> With 5 levels:
> 49975.5 op/s (unpatched) compared to 50778.5 op/s (patched)
> 
> Previous tests are a bit biased since I only ran the test 100 * 3
> times, or maybe it is sensitive to some random kernel structure
> changes.
> 
> But I ignored one important thing in my previous test: the
> performance actually drops heavily with deeper levels of cgroups:
> With 8 levels:
> 48902.4 op/s (unpatched) compared to 47476.6 op/s (patched)
> With 50 levels of cgroup:
> 25605.7375 op/s (unpatched) compared to 20417.275 op/s (patched)

IIUC, one could also interpret this as the parent caching within
psi_group being especially effective with deep hierarchies.

I'd say practical hierarchies are below 10 levels deep. But yeah, the
averaged results aren't so impressive.

Thanks for sharing your insights,
Michal
  

Patch

diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 1e0a0d7ace3a..4066b846ce4a 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -154,7 +154,6 @@  struct psi_trigger {
 };
 
 struct psi_group {
-	struct psi_group *parent;
 	bool enabled;
 
 	/* Protects data used by the aggregator */
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 8ac8b81bfee6..c74f8ce46f81 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -858,15 +858,34 @@  static void psi_group_change(struct psi_group *group, int cpu,
 		schedule_delayed_work(&group->avgs_work, PSI_FREQ);
 }
 
-static inline struct psi_group *task_psi_group(struct task_struct *task)
+static inline struct psi_group *psi_iter_first(struct task_struct *task, void **iter)
 {
 #ifdef CONFIG_CGROUPS
-	if (static_branch_likely(&psi_cgroups_enabled))
-		return cgroup_psi(task_dfl_cgroup(task));
+	if (static_branch_likely(&psi_cgroups_enabled)) {
+		struct cgroup *cgroup = task_dfl_cgroup(task);
+
+		*iter = cgroup_parent(cgroup);
+		return cgroup_psi(cgroup);
+	}
 #endif
 	return &psi_system;
 }
 
+static inline struct psi_group *psi_iter_next(void **iter)
+{
+#ifdef CONFIG_CGROUPS
+	if (static_branch_likely(&psi_cgroups_enabled)) {
+		struct cgroup *cgroup = *iter;
+
+		if (cgroup) {
+			*iter = cgroup_parent(cgroup);
+			return cgroup_psi(cgroup);
+		}
+	}
+#endif
+	return NULL;
+}
+
 static void psi_flags_change(struct task_struct *task, int clear, int set)
 {
 	if (((task->psi_flags & set) ||
@@ -886,6 +905,7 @@  void psi_task_change(struct task_struct *task, int clear, int set)
 {
 	int cpu = task_cpu(task);
 	struct psi_group *group;
+	void *iter;
 	u64 now;
 
 	if (!task->pid)
@@ -895,16 +915,17 @@  void psi_task_change(struct task_struct *task, int clear, int set)
 
 	now = cpu_clock(cpu);
 
-	group = task_psi_group(task);
+	group = psi_iter_first(task, &iter);
 	do {
 		psi_group_change(group, cpu, clear, set, now, true);
-	} while ((group = group->parent));
+	} while ((group = psi_iter_next(&iter)));
 }
 
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		     bool sleep)
 {
 	struct psi_group *group, *common = NULL;
+	void *iter;
 	int cpu = task_cpu(prev);
 	u64 now = cpu_clock(cpu);
 
@@ -915,7 +936,7 @@  void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		 * ancestors with @prev, those will already have @prev's
 		 * TSK_ONCPU bit set, and we can stop the iteration there.
 		 */
-		group = task_psi_group(next);
+		group = psi_iter_first(prev, &iter);
 		do {
 			if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
 			    PSI_ONCPU) {
@@ -924,7 +945,7 @@  void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 			}
 
 			psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
-		} while ((group = group->parent));
+		} while ((group = psi_iter_next(&iter)));
 	}
 
 	if (prev->pid) {
@@ -957,12 +978,12 @@  void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 
 		psi_flags_change(prev, clear, set);
 
-		group = task_psi_group(prev);
+		group = psi_iter_first(prev, &iter);
 		do {
 			if (group == common)
 				break;
 			psi_group_change(group, cpu, clear, set, now, wake_clock);
-		} while ((group = group->parent));
+		} while ((group = psi_iter_next(&iter)));
 
 		/*
 		 * TSK_ONCPU is handled up to the common ancestor. If there are
@@ -972,7 +993,7 @@  void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		 */
 		if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
 			clear &= ~TSK_ONCPU;
-			for (; group; group = group->parent)
+			for (; group; group = psi_iter_next(&iter))
 				psi_group_change(group, cpu, clear, set, now, wake_clock);
 		}
 	}
@@ -983,6 +1004,7 @@  void psi_account_irqtime(struct task_struct *task, u32 delta)
 {
 	int cpu = task_cpu(task);
 	struct psi_group *group;
+	void *iter;
 	struct psi_group_cpu *groupc;
 	u64 now;
 
@@ -991,7 +1013,7 @@  void psi_account_irqtime(struct task_struct *task, u32 delta)
 
 	now = cpu_clock(cpu);
 
-	group = task_psi_group(task);
+	group = psi_iter_first(task, &iter);
 	do {
 		if (!group->enabled)
 			continue;
@@ -1007,7 +1029,7 @@  void psi_account_irqtime(struct task_struct *task, u32 delta)
 
 		if (group->poll_states & (1 << PSI_IRQ_FULL))
 			psi_schedule_poll_work(group, 1, false);
-	} while ((group = group->parent));
+	} while ((group = psi_iter_next(&iter)));
 }
 #endif
 
@@ -1089,7 +1111,6 @@  int psi_cgroup_alloc(struct cgroup *cgroup)
 		return -ENOMEM;
 	}
 	group_init(cgroup->psi);
-	cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup));
 	return 0;
 }