[RFC] rcu/kfree: Do not request RCU when not needed

Message ID 20221029132856.3752018-1-joel@joelfernandes.org
State New
Headers
Series [RFC] rcu/kfree: Do not request RCU when not needed |

Commit Message

Joel Fernandes Oct. 29, 2022, 1:28 p.m. UTC
  On ChromeOS, I am (almost) always seeing the optimization trigger.
Tested boot up and trace_printk'ing how often it triggers.

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
 kernel/rcu/tree.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)
  

Comments

Joel Fernandes Oct. 29, 2022, 1:31 p.m. UTC | #1
On Sat, Oct 29, 2022 at 9:29 AM Joel Fernandes (Google)
<joel@joelfernandes.org> wrote:
>
> On ChromeOS, I am (almost) always seeing the optimization trigger.
> Tested boot up and trace_printk'ing how often it triggers.
>
> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> ---
>  kernel/rcu/tree.c | 18 +++++++++++++++++-
>  1 file changed, 17 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 591187b6352e..3e4c50b9fd33 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2935,6 +2935,7 @@ struct kfree_rcu_cpu_work {
>
>  /**
>   * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
> + * @rdp: The rdp of the CPU that this kfree_rcu corresponds to.
>   * @head: List of kfree_rcu() objects not yet waiting for a grace period
>   * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period

Yuck. I forgot to add last_gp_seq to the comments. Will await feedback
before re-spin.

 - Joel


>   * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
> @@ -2964,6 +2965,8 @@ struct kfree_rcu_cpu {
>         struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
>         raw_spinlock_t lock;
>         struct delayed_work monitor_work;
> +       struct rcu_data *rdp;
> +       unsigned long last_gp_seq;
>         bool initialized;
>         int count;
>
> @@ -3167,6 +3170,7 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
>                         mod_delayed_work(system_wq, &krcp->monitor_work, delay);
>                 return;
>         }
> +       krcp->last_gp_seq = krcp->rdp->gp_seq;
>         queue_delayed_work(system_wq, &krcp->monitor_work, delay);
>  }
>
> @@ -3217,7 +3221,17 @@ static void kfree_rcu_monitor(struct work_struct *work)
>                         // be that the work is in the pending state when
>                         // channels have been detached following by each
>                         // other.
> -                       queue_rcu_work(system_wq, &krwp->rcu_work);
> +                       //
> +                       // NOTE about gp_seq wrap: In case of gp_seq overflow,
> +                       // it is possible for rdp->gp_seq to be less than
> +                       // krcp->last_gp_seq even though a GP might be over. In
> +                       // this rare case, we would just have one extra GP.
> +                       if (krcp->last_gp_seq &&
> +                           rcu_seq_completed_gp(krcp->last_gp_seq, krcp->rdp->gp_seq)) {
> +                               queue_work(system_wq, &krwp->rcu_work.work);
> +                       } else {
> +                               queue_rcu_work(system_wq, &krwp->rcu_work);
> +                       }
>                 }
>         }
>
> @@ -4802,6 +4816,8 @@ static void __init kfree_rcu_batch_init(void)
>         for_each_possible_cpu(cpu) {
>                 struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
>
> +               krcp->rdp = per_cpu_ptr(&rcu_data, cpu);
> +               krcp->last_gp_seq = 0;
>                 for (i = 0; i < KFREE_N_BATCHES; i++) {
>                         INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
>                         krcp->krw_arr[i].krcp = krcp;
> --
> 2.38.1.273.g43a17bfeac-goog
>
  
Uladzislau Rezki Nov. 2, 2022, 12:37 p.m. UTC | #2
On Sat, Oct 29, 2022 at 01:28:56PM +0000, Joel Fernandes (Google) wrote:
> On ChromeOS, I am (almost) always seeing the optimization trigger.
> Tested boot up and trace_printk'ing how often it triggers.
> 
> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> ---
>  kernel/rcu/tree.c | 18 +++++++++++++++++-
>  1 file changed, 17 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 591187b6352e..3e4c50b9fd33 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2935,6 +2935,7 @@ struct kfree_rcu_cpu_work {
>  
>  /**
>   * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
> + * @rdp: The rdp of the CPU that this kfree_rcu corresponds to.
>   * @head: List of kfree_rcu() objects not yet waiting for a grace period
>   * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
>   * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
> @@ -2964,6 +2965,8 @@ struct kfree_rcu_cpu {
>  	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
>  	raw_spinlock_t lock;
>  	struct delayed_work monitor_work;
> +	struct rcu_data *rdp;
> +	unsigned long last_gp_seq;
>  	bool initialized;
>  	int count;
>  
> @@ -3167,6 +3170,7 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
>  			mod_delayed_work(system_wq, &krcp->monitor_work, delay);
>  		return;
>  	}
> +	krcp->last_gp_seq = krcp->rdp->gp_seq;
>  	queue_delayed_work(system_wq, &krcp->monitor_work, delay);
>  }
>  
> @@ -3217,7 +3221,17 @@ static void kfree_rcu_monitor(struct work_struct *work)
>  			// be that the work is in the pending state when
>  			// channels have been detached following by each
>  			// other.
> -			queue_rcu_work(system_wq, &krwp->rcu_work);
> +			//
> +			// NOTE about gp_seq wrap: In case of gp_seq overflow,
> +			// it is possible for rdp->gp_seq to be less than
> +			// krcp->last_gp_seq even though a GP might be over. In
> +			// this rare case, we would just have one extra GP.
> +			if (krcp->last_gp_seq &&
>
This check can be eliminated i think. A kfree_rcu_cpu is defined as
static so by default the last_gp_set is set to zero.

>  
> @@ -4802,6 +4816,8 @@ static void __init kfree_rcu_batch_init(void)
>  	for_each_possible_cpu(cpu) {
>  		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
>  
> +		krcp->rdp = per_cpu_ptr(&rcu_data, cpu);
> +		krcp->last_gp_seq = 0;
>
Yep. This one can be just dropped.

But all the rest looks good :) I will give it a try from test point of
view. It is interested from the memory footprint point of view.

--
Uladzislau Rezki
  
Joel Fernandes Nov. 2, 2022, 4:11 p.m. UTC | #3
On Sat, Oct 29, 2022 at 9:29 AM Joel Fernandes (Google)
<joel@joelfernandes.org> wrote:
>
> On ChromeOS, I am (almost) always seeing the optimization trigger.
> Tested boot up and trace_printk'ing how often it triggers.
>
> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> ---
>  kernel/rcu/tree.c | 18 +++++++++++++++++-
>  1 file changed, 17 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 591187b6352e..3e4c50b9fd33 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2935,6 +2935,7 @@ struct kfree_rcu_cpu_work {
>
>  /**
>   * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
> + * @rdp: The rdp of the CPU that this kfree_rcu corresponds to.
>   * @head: List of kfree_rcu() objects not yet waiting for a grace period
>   * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
>   * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
> @@ -2964,6 +2965,8 @@ struct kfree_rcu_cpu {
>         struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
>         raw_spinlock_t lock;
>         struct delayed_work monitor_work;
> +       struct rcu_data *rdp;
> +       unsigned long last_gp_seq;
>         bool initialized;
>         int count;
>
> @@ -3167,6 +3170,7 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
>                         mod_delayed_work(system_wq, &krcp->monitor_work, delay);
>                 return;
>         }
> +       krcp->last_gp_seq = krcp->rdp->gp_seq;

TODO: As Paul pointed out today in a meeting, we can't just sample
rdp->gp_seq directly as it can jump forward quickly from out-of-date
to non-out-of-date. Such GP can happen before a grace period actually
elapses. Boom!

He did make me feel better though that it is something easy to miss,
and to audit the current code about.

 - Joel



 - Joel



>         queue_delayed_work(system_wq, &krcp->monitor_work, delay);
>  }
>
> @@ -3217,7 +3221,17 @@ static void kfree_rcu_monitor(struct work_struct *work)
>                         // be that the work is in the pending state when
>                         // channels have been detached following by each
>                         // other.
> -                       queue_rcu_work(system_wq, &krwp->rcu_work);
> +                       //
> +                       // NOTE about gp_seq wrap: In case of gp_seq overflow,
> +                       // it is possible for rdp->gp_seq to be less than
> +                       // krcp->last_gp_seq even though a GP might be over. In
> +                       // this rare case, we would just have one extra GP.
> +                       if (krcp->last_gp_seq &&
> +                           rcu_seq_completed_gp(krcp->last_gp_seq, krcp->rdp->gp_seq)) {
> +                               queue_work(system_wq, &krwp->rcu_work.work);
> +                       } else {
> +                               queue_rcu_work(system_wq, &krwp->rcu_work);
> +                       }
>                 }
>         }
>
> @@ -4802,6 +4816,8 @@ static void __init kfree_rcu_batch_init(void)
>         for_each_possible_cpu(cpu) {
>                 struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
>
> +               krcp->rdp = per_cpu_ptr(&rcu_data, cpu);
> +               krcp->last_gp_seq = 0;
>                 for (i = 0; i < KFREE_N_BATCHES; i++) {
>                         INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
>                         krcp->krw_arr[i].krcp = krcp;
> --
> 2.38.1.273.g43a17bfeac-goog
>
  
Joel Fernandes Nov. 2, 2022, 4:13 p.m. UTC | #4
On Wed, Nov 2, 2022 at 8:37 AM Uladzislau Rezki <urezki@gmail.com> wrote:
>
> On Sat, Oct 29, 2022 at 01:28:56PM +0000, Joel Fernandes (Google) wrote:
> > On ChromeOS, I am (almost) always seeing the optimization trigger.
> > Tested boot up and trace_printk'ing how often it triggers.
> >
> > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > ---
> >  kernel/rcu/tree.c | 18 +++++++++++++++++-
> >  1 file changed, 17 insertions(+), 1 deletion(-)
> >
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index 591187b6352e..3e4c50b9fd33 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -2935,6 +2935,7 @@ struct kfree_rcu_cpu_work {
> >
> >  /**
> >   * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
> > + * @rdp: The rdp of the CPU that this kfree_rcu corresponds to.
> >   * @head: List of kfree_rcu() objects not yet waiting for a grace period
> >   * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
> >   * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
> > @@ -2964,6 +2965,8 @@ struct kfree_rcu_cpu {
> >       struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
> >       raw_spinlock_t lock;
> >       struct delayed_work monitor_work;
> > +     struct rcu_data *rdp;
> > +     unsigned long last_gp_seq;
> >       bool initialized;
> >       int count;
> >
> > @@ -3167,6 +3170,7 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
> >                       mod_delayed_work(system_wq, &krcp->monitor_work, delay);
> >               return;
> >       }
> > +     krcp->last_gp_seq = krcp->rdp->gp_seq;
> >       queue_delayed_work(system_wq, &krcp->monitor_work, delay);
> >  }
> >
> > @@ -3217,7 +3221,17 @@ static void kfree_rcu_monitor(struct work_struct *work)
> >                       // be that the work is in the pending state when
> >                       // channels have been detached following by each
> >                       // other.
> > -                     queue_rcu_work(system_wq, &krwp->rcu_work);
> > +                     //
> > +                     // NOTE about gp_seq wrap: In case of gp_seq overflow,
> > +                     // it is possible for rdp->gp_seq to be less than
> > +                     // krcp->last_gp_seq even though a GP might be over. In
> > +                     // this rare case, we would just have one extra GP.
> > +                     if (krcp->last_gp_seq &&
> >
> This check can be eliminated i think. A kfree_rcu_cpu is defined as
> static so by default the last_gp_set is set to zero.

Ack.

> > @@ -4802,6 +4816,8 @@ static void __init kfree_rcu_batch_init(void)
> >       for_each_possible_cpu(cpu) {
> >               struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> >
> > +             krcp->rdp = per_cpu_ptr(&rcu_data, cpu);
> > +             krcp->last_gp_seq = 0;
> >
> Yep. This one can be just dropped.
>
> But all the rest looks good :) I will give it a try from test point of
> view. It is interested from the memory footprint point of view.

Ack. Thanks. Even though we should not sample rdp->gp_seq, I think it
is still worth a test.
  
Paul E. McKenney Nov. 2, 2022, 4:35 p.m. UTC | #5
On Wed, Nov 02, 2022 at 12:13:17PM -0400, Joel Fernandes wrote:
> On Wed, Nov 2, 2022 at 8:37 AM Uladzislau Rezki <urezki@gmail.com> wrote:
> >
> > On Sat, Oct 29, 2022 at 01:28:56PM +0000, Joel Fernandes (Google) wrote:
> > > On ChromeOS, I am (almost) always seeing the optimization trigger.
> > > Tested boot up and trace_printk'ing how often it triggers.
> > >
> > > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > > ---
> > >  kernel/rcu/tree.c | 18 +++++++++++++++++-
> > >  1 file changed, 17 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > index 591187b6352e..3e4c50b9fd33 100644
> > > --- a/kernel/rcu/tree.c
> > > +++ b/kernel/rcu/tree.c
> > > @@ -2935,6 +2935,7 @@ struct kfree_rcu_cpu_work {
> > >
> > >  /**
> > >   * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
> > > + * @rdp: The rdp of the CPU that this kfree_rcu corresponds to.
> > >   * @head: List of kfree_rcu() objects not yet waiting for a grace period
> > >   * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
> > >   * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
> > > @@ -2964,6 +2965,8 @@ struct kfree_rcu_cpu {
> > >       struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
> > >       raw_spinlock_t lock;
> > >       struct delayed_work monitor_work;
> > > +     struct rcu_data *rdp;
> > > +     unsigned long last_gp_seq;
> > >       bool initialized;
> > >       int count;
> > >
> > > @@ -3167,6 +3170,7 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
> > >                       mod_delayed_work(system_wq, &krcp->monitor_work, delay);
> > >               return;
> > >       }
> > > +     krcp->last_gp_seq = krcp->rdp->gp_seq;
> > >       queue_delayed_work(system_wq, &krcp->monitor_work, delay);
> > >  }
> > >
> > > @@ -3217,7 +3221,17 @@ static void kfree_rcu_monitor(struct work_struct *work)
> > >                       // be that the work is in the pending state when
> > >                       // channels have been detached following by each
> > >                       // other.
> > > -                     queue_rcu_work(system_wq, &krwp->rcu_work);
> > > +                     //
> > > +                     // NOTE about gp_seq wrap: In case of gp_seq overflow,
> > > +                     // it is possible for rdp->gp_seq to be less than
> > > +                     // krcp->last_gp_seq even though a GP might be over. In
> > > +                     // this rare case, we would just have one extra GP.
> > > +                     if (krcp->last_gp_seq &&
> > >
> > This check can be eliminated i think. A kfree_rcu_cpu is defined as
> > static so by default the last_gp_set is set to zero.
> 
> Ack.
> 
> > > @@ -4802,6 +4816,8 @@ static void __init kfree_rcu_batch_init(void)
> > >       for_each_possible_cpu(cpu) {
> > >               struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> > >
> > > +             krcp->rdp = per_cpu_ptr(&rcu_data, cpu);
> > > +             krcp->last_gp_seq = 0;
> > >
> > Yep. This one can be just dropped.
> >
> > But all the rest looks good :) I will give it a try from test point of
> > view. It is interested from the memory footprint point of view.
> 
> Ack. Thanks. Even though we should not sample rdp->gp_seq, I think it
> is still worth a test.

Just for completeness, the main purpose of rdp->gp_seq is to reject
quiescent states that were seen during already-completed grace periods.

							Thanx, Paul
  
Uladzislau Rezki Nov. 2, 2022, 5:24 p.m. UTC | #6
On Wed, Nov 02, 2022 at 09:35:44AM -0700, Paul E. McKenney wrote:
> On Wed, Nov 02, 2022 at 12:13:17PM -0400, Joel Fernandes wrote:
> > On Wed, Nov 2, 2022 at 8:37 AM Uladzislau Rezki <urezki@gmail.com> wrote:
> > >
> > > On Sat, Oct 29, 2022 at 01:28:56PM +0000, Joel Fernandes (Google) wrote:
> > > > On ChromeOS, I am (almost) always seeing the optimization trigger.
> > > > Tested boot up and trace_printk'ing how often it triggers.
> > > >
> > > > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > > > ---
> > > >  kernel/rcu/tree.c | 18 +++++++++++++++++-
> > > >  1 file changed, 17 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > > index 591187b6352e..3e4c50b9fd33 100644
> > > > --- a/kernel/rcu/tree.c
> > > > +++ b/kernel/rcu/tree.c
> > > > @@ -2935,6 +2935,7 @@ struct kfree_rcu_cpu_work {
> > > >
> > > >  /**
> > > >   * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
> > > > + * @rdp: The rdp of the CPU that this kfree_rcu corresponds to.
> > > >   * @head: List of kfree_rcu() objects not yet waiting for a grace period
> > > >   * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
> > > >   * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
> > > > @@ -2964,6 +2965,8 @@ struct kfree_rcu_cpu {
> > > >       struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
> > > >       raw_spinlock_t lock;
> > > >       struct delayed_work monitor_work;
> > > > +     struct rcu_data *rdp;
> > > > +     unsigned long last_gp_seq;
> > > >       bool initialized;
> > > >       int count;
> > > >
> > > > @@ -3167,6 +3170,7 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
> > > >                       mod_delayed_work(system_wq, &krcp->monitor_work, delay);
> > > >               return;
> > > >       }
> > > > +     krcp->last_gp_seq = krcp->rdp->gp_seq;
> > > >       queue_delayed_work(system_wq, &krcp->monitor_work, delay);
> > > >  }
> > > >
> > > > @@ -3217,7 +3221,17 @@ static void kfree_rcu_monitor(struct work_struct *work)
> > > >                       // be that the work is in the pending state when
> > > >                       // channels have been detached following by each
> > > >                       // other.
> > > > -                     queue_rcu_work(system_wq, &krwp->rcu_work);
> > > > +                     //
> > > > +                     // NOTE about gp_seq wrap: In case of gp_seq overflow,
> > > > +                     // it is possible for rdp->gp_seq to be less than
> > > > +                     // krcp->last_gp_seq even though a GP might be over. In
> > > > +                     // this rare case, we would just have one extra GP.
> > > > +                     if (krcp->last_gp_seq &&
> > > >
> > > This check can be eliminated i think. A kfree_rcu_cpu is defined as
> > > static so by default the last_gp_set is set to zero.
> > 
> > Ack.
> > 
> > > > @@ -4802,6 +4816,8 @@ static void __init kfree_rcu_batch_init(void)
> > > >       for_each_possible_cpu(cpu) {
> > > >               struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> > > >
> > > > +             krcp->rdp = per_cpu_ptr(&rcu_data, cpu);
> > > > +             krcp->last_gp_seq = 0;
> > > >
> > > Yep. This one can be just dropped.
> > >
> > > But all the rest looks good :) I will give it a try from test point of
> > > view. It is interested from the memory footprint point of view.
> > 
> > Ack. Thanks. Even though we should not sample rdp->gp_seq, I think it
> > is still worth a test.
> 
> Just for completeness, the main purpose of rdp->gp_seq is to reject
> quiescent states that were seen during already-completed grace periods.
> 
So it means that instead of gp_seq reading we should take a snaphshot
of the current state:

snp = get_state_synchronize_rcu();

and later on do a:

cond_synchronize_rcu(snp);

to wait for a GP. Or if the poll_state_synchronize_rcu(oldstate)) != 0
queue_rcu_work().

Sorry for a description using the RCU API functions name :) 

--
Uladzislau Rezki
  
Joel Fernandes Nov. 2, 2022, 5:29 p.m. UTC | #7
On Wed, Nov 2, 2022 at 1:24 PM Uladzislau Rezki <urezki@gmail.com> wrote:
>
> On Wed, Nov 02, 2022 at 09:35:44AM -0700, Paul E. McKenney wrote:
> > On Wed, Nov 02, 2022 at 12:13:17PM -0400, Joel Fernandes wrote:
> > > On Wed, Nov 2, 2022 at 8:37 AM Uladzislau Rezki <urezki@gmail.com> wrote:
> > > >
> > > > On Sat, Oct 29, 2022 at 01:28:56PM +0000, Joel Fernandes (Google) wrote:
> > > > > On ChromeOS, I am (almost) always seeing the optimization trigger.
> > > > > Tested boot up and trace_printk'ing how often it triggers.
> > > > >
> > > > > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > > > > ---
> > > > >  kernel/rcu/tree.c | 18 +++++++++++++++++-
> > > > >  1 file changed, 17 insertions(+), 1 deletion(-)
> > > > >
> > > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > > > index 591187b6352e..3e4c50b9fd33 100644
> > > > > --- a/kernel/rcu/tree.c
> > > > > +++ b/kernel/rcu/tree.c
> > > > > @@ -2935,6 +2935,7 @@ struct kfree_rcu_cpu_work {
> > > > >
> > > > >  /**
> > > > >   * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
> > > > > + * @rdp: The rdp of the CPU that this kfree_rcu corresponds to.
> > > > >   * @head: List of kfree_rcu() objects not yet waiting for a grace period
> > > > >   * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
> > > > >   * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
> > > > > @@ -2964,6 +2965,8 @@ struct kfree_rcu_cpu {
> > > > >       struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
> > > > >       raw_spinlock_t lock;
> > > > >       struct delayed_work monitor_work;
> > > > > +     struct rcu_data *rdp;
> > > > > +     unsigned long last_gp_seq;
> > > > >       bool initialized;
> > > > >       int count;
> > > > >
> > > > > @@ -3167,6 +3170,7 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
> > > > >                       mod_delayed_work(system_wq, &krcp->monitor_work, delay);
> > > > >               return;
> > > > >       }
> > > > > +     krcp->last_gp_seq = krcp->rdp->gp_seq;
> > > > >       queue_delayed_work(system_wq, &krcp->monitor_work, delay);
> > > > >  }
> > > > >
> > > > > @@ -3217,7 +3221,17 @@ static void kfree_rcu_monitor(struct work_struct *work)
> > > > >                       // be that the work is in the pending state when
> > > > >                       // channels have been detached following by each
> > > > >                       // other.
> > > > > -                     queue_rcu_work(system_wq, &krwp->rcu_work);
> > > > > +                     //
> > > > > +                     // NOTE about gp_seq wrap: In case of gp_seq overflow,
> > > > > +                     // it is possible for rdp->gp_seq to be less than
> > > > > +                     // krcp->last_gp_seq even though a GP might be over. In
> > > > > +                     // this rare case, we would just have one extra GP.
> > > > > +                     if (krcp->last_gp_seq &&
> > > > >
> > > > This check can be eliminated i think. A kfree_rcu_cpu is defined as
> > > > static so by default the last_gp_set is set to zero.
> > >
> > > Ack.
> > >
> > > > > @@ -4802,6 +4816,8 @@ static void __init kfree_rcu_batch_init(void)
> > > > >       for_each_possible_cpu(cpu) {
> > > > >               struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> > > > >
> > > > > +             krcp->rdp = per_cpu_ptr(&rcu_data, cpu);
> > > > > +             krcp->last_gp_seq = 0;
> > > > >
> > > > Yep. This one can be just dropped.
> > > >
> > > > But all the rest looks good :) I will give it a try from test point of
> > > > view. It is interested from the memory footprint point of view.
> > >
> > > Ack. Thanks. Even though we should not sample rdp->gp_seq, I think it
> > > is still worth a test.
> >
> > Just for completeness, the main purpose of rdp->gp_seq is to reject
> > quiescent states that were seen during already-completed grace periods.
> >
> So it means that instead of gp_seq reading we should take a snaphshot
> of the current state:
>
> snp = get_state_synchronize_rcu();
>
> and later on do a:
>
> cond_synchronize_rcu(snp);
>
> to wait for a GP.

This can't be called from the timer IRQ handler though (monitor)

> Or if the poll_state_synchronize_rcu(oldstate)) != 0
> queue_rcu_work().

But something like this should be possible (maybe)

> Sorry for a description using the RCU API functions name :)

I believe you will have to call rcu_poll_gp_seq_start() as well if you
are using polled API. I am planning to look at this properly more,
soon. Right now I am going to write up the rcutop doc and share with
you guys.

(Maybe RCU polling is the right thing to do as we reuse all the infra
and any corner case it is handling)

thanks,

- Joel


> --
> Uladzislau Rezki
  
Uladzislau Rezki Nov. 2, 2022, 5:30 p.m. UTC | #8
> 
> to wait for a GP. Or if the poll_state_synchronize_rcu(oldstate)) != 0
> queue_rcu_work().
> 
A small fix. If poll_state_synchronize_rcu(oldstate)) == 0 then
queue_rcu_work() since a GP is still in progress.
  
Uladzislau Rezki Nov. 2, 2022, 6:31 p.m. UTC | #9
On Wed, Nov 02, 2022 at 01:29:17PM -0400, Joel Fernandes wrote:
> On Wed, Nov 2, 2022 at 1:24 PM Uladzislau Rezki <urezki@gmail.com> wrote:
> >
> > On Wed, Nov 02, 2022 at 09:35:44AM -0700, Paul E. McKenney wrote:
> > > On Wed, Nov 02, 2022 at 12:13:17PM -0400, Joel Fernandes wrote:
> > > > On Wed, Nov 2, 2022 at 8:37 AM Uladzislau Rezki <urezki@gmail.com> wrote:
> > > > >
> > > > > On Sat, Oct 29, 2022 at 01:28:56PM +0000, Joel Fernandes (Google) wrote:
> > > > > > On ChromeOS, I am (almost) always seeing the optimization trigger.
> > > > > > Tested boot up and trace_printk'ing how often it triggers.
> > > > > >
> > > > > > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > > > > > ---
> > > > > >  kernel/rcu/tree.c | 18 +++++++++++++++++-
> > > > > >  1 file changed, 17 insertions(+), 1 deletion(-)
> > > > > >
> > > > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > > > > index 591187b6352e..3e4c50b9fd33 100644
> > > > > > --- a/kernel/rcu/tree.c
> > > > > > +++ b/kernel/rcu/tree.c
> > > > > > @@ -2935,6 +2935,7 @@ struct kfree_rcu_cpu_work {
> > > > > >
> > > > > >  /**
> > > > > >   * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
> > > > > > + * @rdp: The rdp of the CPU that this kfree_rcu corresponds to.
> > > > > >   * @head: List of kfree_rcu() objects not yet waiting for a grace period
> > > > > >   * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
> > > > > >   * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
> > > > > > @@ -2964,6 +2965,8 @@ struct kfree_rcu_cpu {
> > > > > >       struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
> > > > > >       raw_spinlock_t lock;
> > > > > >       struct delayed_work monitor_work;
> > > > > > +     struct rcu_data *rdp;
> > > > > > +     unsigned long last_gp_seq;
> > > > > >       bool initialized;
> > > > > >       int count;
> > > > > >
> > > > > > @@ -3167,6 +3170,7 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
> > > > > >                       mod_delayed_work(system_wq, &krcp->monitor_work, delay);
> > > > > >               return;
> > > > > >       }
> > > > > > +     krcp->last_gp_seq = krcp->rdp->gp_seq;
> > > > > >       queue_delayed_work(system_wq, &krcp->monitor_work, delay);
> > > > > >  }
> > > > > >
> > > > > > @@ -3217,7 +3221,17 @@ static void kfree_rcu_monitor(struct work_struct *work)
> > > > > >                       // be that the work is in the pending state when
> > > > > >                       // channels have been detached following by each
> > > > > >                       // other.
> > > > > > -                     queue_rcu_work(system_wq, &krwp->rcu_work);
> > > > > > +                     //
> > > > > > +                     // NOTE about gp_seq wrap: In case of gp_seq overflow,
> > > > > > +                     // it is possible for rdp->gp_seq to be less than
> > > > > > +                     // krcp->last_gp_seq even though a GP might be over. In
> > > > > > +                     // this rare case, we would just have one extra GP.
> > > > > > +                     if (krcp->last_gp_seq &&
> > > > > >
> > > > > This check can be eliminated i think. A kfree_rcu_cpu is defined as
> > > > > static so by default the last_gp_set is set to zero.
> > > >
> > > > Ack.
> > > >
> > > > > > @@ -4802,6 +4816,8 @@ static void __init kfree_rcu_batch_init(void)
> > > > > >       for_each_possible_cpu(cpu) {
> > > > > >               struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> > > > > >
> > > > > > +             krcp->rdp = per_cpu_ptr(&rcu_data, cpu);
> > > > > > +             krcp->last_gp_seq = 0;
> > > > > >
> > > > > Yep. This one can be just dropped.
> > > > >
> > > > > But all the rest looks good :) I will give it a try from test point of
> > > > > view. It is interested from the memory footprint point of view.
> > > >
> > > > Ack. Thanks. Even though we should not sample rdp->gp_seq, I think it
> > > > is still worth a test.
> > >
> > > Just for completeness, the main purpose of rdp->gp_seq is to reject
> > > quiescent states that were seen during already-completed grace periods.
> > >
> > So it means that instead of gp_seq reading we should take a snaphshot
> > of the current state:
> >
> > snp = get_state_synchronize_rcu();
> >
> > and later on do a:
> >
> > cond_synchronize_rcu(snp);
> >
> > to wait for a GP.
> 
> This can't be called from the timer IRQ handler though (monitor)
> 
> > Or if the poll_state_synchronize_rcu(oldstate)) != 0
> > queue_rcu_work().
> 
> But something like this should be possible (maybe)
> 
> > Sorry for a description using the RCU API functions name :)
> 
> I believe you will have to call rcu_poll_gp_seq_start() as well if you
> are using polled API. I am planning to look at this properly more,
> soon. Right now I am going to write up the rcutop doc and share with
> you guys.
> 
> (Maybe RCU polling is the right thing to do as we reuse all the infra
> and any corner case it is handling)
> 
OK. This is in my todo list also. Since we have discussed it let's move
it forward.

Below what i have came up with to switch for polling APIs:

From 799ce1653d159ef3d35f34a284f738c2c267c75f Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Wed, 2 Nov 2022 19:26:27 +0100
Subject: [PATCH 1/1] rcu: kvfree_rcu: Reduce a memory footptint by using
 polling APIs

Total time taken by all kfree'ers: 6564718459 ns, loops: 10000, batches: 1110, memory footprint: 5057MB
Total time taken by all kfree'ers: 8431051895 ns, loops: 10000, batches: 1109, memory footprint: 2749MB
Total time taken by all kfree'ers: 9477830789 ns, loops: 10000, batches: 1158, memory footprint: 2934MB
Total time taken by all kfree'ers: 9950211144 ns, loops: 10000, batches: 981, memory footprint: 2704MB

with a patch:

Total time taken by all kfree'ers: 7712110118 ns, loops: 10000, batches: 1660, memory footprint: 91MB
Total time taken by all kfree'ers: 7002403664 ns, loops: 10000, batches: 1482, memory footprint: 86MB
Total time taken by all kfree'ers: 7842282319 ns, loops: 10000, batches: 1738, memory footprint: 86MB
Total time taken by all kfree'ers: 7230161977 ns, loops: 10000, batches: 1542, memory footprint: 72MB

Tested with NOCB option, all offloading CPUs:

kvm.sh --memory 10G --torture rcuscale --allcpus --duration 1 \
  --kconfig CONFIG_NR_CPUS=64 \
  --kconfig CONFIG_RCU_NOCB_CPU=y \
  --kconfig CONFIG_RCU_NOCB_CPU_DEFAULT_ALL=y \
  --bootargs "rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 \
  rcuscale.holdoff=20 rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot" --trust-make

According to data there is a big gain in memory footprint with a patch.
It is because of call_rcu() and call_rcu_flush() take more effort and
time to queue a callback and then wait for a gp.

With polling API:
  a) we do not need to queue any callback;
  b) we might not even need wait for a GP completion.

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 kernel/rcu/tree.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 76973d716921..17c3d6f2c55b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2919,18 +2919,20 @@ struct kvfree_rcu_bulk_data {
 	((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
 
 /**
+ * @rcu_work: A work to reclaim a memory after a grace period
  * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
- * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
  * @head_free: List of kfree_rcu() objects waiting for a grace period
  * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
  * @krcp: Pointer to @kfree_rcu_cpu structure
+ * @gp_snap: A snapshot of current grace period
  */
 
 struct kfree_rcu_cpu_work {
-	struct rcu_work rcu_work;
+	struct work_struct rcu_work;
 	struct rcu_head *head_free;
 	struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS];
 	struct kfree_rcu_cpu *krcp;
+	unsigned long gp_snap;
 };
 
 /**
@@ -3066,10 +3068,12 @@ static void kfree_rcu_work(struct work_struct *work)
 	struct kfree_rcu_cpu_work *krwp;
 	int i, j;
 
-	krwp = container_of(to_rcu_work(work),
+	krwp = container_of(work,
 			    struct kfree_rcu_cpu_work, rcu_work);
 	krcp = krwp->krcp;
 
+	cond_synchronize_rcu(krwp->gp_snap);
+
 	raw_spin_lock_irqsave(&krcp->lock, flags);
 	// Channels 1 and 2.
 	for (i = 0; i < FREE_N_CHANNELS; i++) {
@@ -3194,6 +3198,13 @@ static void kfree_rcu_monitor(struct work_struct *work)
 		if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
 			(krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
 				(krcp->head && !krwp->head_free)) {
+			/*
+			 * Take a snapshot for this krwp. Please note no
+			 * more any objects can be added to this krwp free
+			 * channels.
+			 */
+			krwp->gp_snap = get_state_synchronize_rcu();
+
 			// Channel 1 corresponds to the SLAB-pointer bulk path.
 			// Channel 2 corresponds to vmalloc-pointer bulk path.
 			for (j = 0; j < FREE_N_CHANNELS; j++) {
@@ -3217,7 +3228,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
 			// be that the work is in the pending state when
 			// channels have been detached following by each
 			// other.
-			queue_rcu_work(system_wq, &krwp->rcu_work);
+			queue_work(system_wq, &krwp->rcu_work);
 		}
 	}
 
@@ -4808,7 +4819,7 @@ static void __init kfree_rcu_batch_init(void)
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
 
 		for (i = 0; i < KFREE_N_BATCHES; i++) {
-			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
+			INIT_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
 			krcp->krw_arr[i].krcp = krcp;
 		}
  
Paul E. McKenney Nov. 2, 2022, 6:32 p.m. UTC | #10
On Wed, Nov 02, 2022 at 06:30:24PM +0100, Uladzislau Rezki wrote:
> > 
> > to wait for a GP. Or if the poll_state_synchronize_rcu(oldstate)) != 0
> > queue_rcu_work().
> > 
> A small fix. If poll_state_synchronize_rcu(oldstate)) == 0 then
> queue_rcu_work() since a GP is still in progress.

Both get_state_synchronize_rcu() and poll_state_synchronize_rcu()
may be invoked from interrupt handlers.  And from NMI handlers,
for that matter.

							Thanx, Paul
  
Paul E. McKenney Nov. 2, 2022, 6:49 p.m. UTC | #11
On Wed, Nov 02, 2022 at 07:31:40PM +0100, Uladzislau Rezki wrote:
> On Wed, Nov 02, 2022 at 01:29:17PM -0400, Joel Fernandes wrote:
> > On Wed, Nov 2, 2022 at 1:24 PM Uladzislau Rezki <urezki@gmail.com> wrote:
> > >
> > > On Wed, Nov 02, 2022 at 09:35:44AM -0700, Paul E. McKenney wrote:
> > > > On Wed, Nov 02, 2022 at 12:13:17PM -0400, Joel Fernandes wrote:
> > > > > On Wed, Nov 2, 2022 at 8:37 AM Uladzislau Rezki <urezki@gmail.com> wrote:
> > > > > >
> > > > > > On Sat, Oct 29, 2022 at 01:28:56PM +0000, Joel Fernandes (Google) wrote:
> > > > > > > On ChromeOS, I am (almost) always seeing the optimization trigger.
> > > > > > > Tested boot up and trace_printk'ing how often it triggers.
> > > > > > >
> > > > > > > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > > > > > > ---
> > > > > > >  kernel/rcu/tree.c | 18 +++++++++++++++++-
> > > > > > >  1 file changed, 17 insertions(+), 1 deletion(-)
> > > > > > >
> > > > > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > > > > > index 591187b6352e..3e4c50b9fd33 100644
> > > > > > > --- a/kernel/rcu/tree.c
> > > > > > > +++ b/kernel/rcu/tree.c
> > > > > > > @@ -2935,6 +2935,7 @@ struct kfree_rcu_cpu_work {
> > > > > > >
> > > > > > >  /**
> > > > > > >   * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
> > > > > > > + * @rdp: The rdp of the CPU that this kfree_rcu corresponds to.
> > > > > > >   * @head: List of kfree_rcu() objects not yet waiting for a grace period
> > > > > > >   * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
> > > > > > >   * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
> > > > > > > @@ -2964,6 +2965,8 @@ struct kfree_rcu_cpu {
> > > > > > >       struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
> > > > > > >       raw_spinlock_t lock;
> > > > > > >       struct delayed_work monitor_work;
> > > > > > > +     struct rcu_data *rdp;
> > > > > > > +     unsigned long last_gp_seq;
> > > > > > >       bool initialized;
> > > > > > >       int count;
> > > > > > >
> > > > > > > @@ -3167,6 +3170,7 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
> > > > > > >                       mod_delayed_work(system_wq, &krcp->monitor_work, delay);
> > > > > > >               return;
> > > > > > >       }
> > > > > > > +     krcp->last_gp_seq = krcp->rdp->gp_seq;
> > > > > > >       queue_delayed_work(system_wq, &krcp->monitor_work, delay);
> > > > > > >  }
> > > > > > >
> > > > > > > @@ -3217,7 +3221,17 @@ static void kfree_rcu_monitor(struct work_struct *work)
> > > > > > >                       // be that the work is in the pending state when
> > > > > > >                       // channels have been detached following by each
> > > > > > >                       // other.
> > > > > > > -                     queue_rcu_work(system_wq, &krwp->rcu_work);
> > > > > > > +                     //
> > > > > > > +                     // NOTE about gp_seq wrap: In case of gp_seq overflow,
> > > > > > > +                     // it is possible for rdp->gp_seq to be less than
> > > > > > > +                     // krcp->last_gp_seq even though a GP might be over. In
> > > > > > > +                     // this rare case, we would just have one extra GP.
> > > > > > > +                     if (krcp->last_gp_seq &&
> > > > > > >
> > > > > > This check can be eliminated i think. A kfree_rcu_cpu is defined as
> > > > > > static so by default the last_gp_set is set to zero.
> > > > >
> > > > > Ack.
> > > > >
> > > > > > > @@ -4802,6 +4816,8 @@ static void __init kfree_rcu_batch_init(void)
> > > > > > >       for_each_possible_cpu(cpu) {
> > > > > > >               struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> > > > > > >
> > > > > > > +             krcp->rdp = per_cpu_ptr(&rcu_data, cpu);
> > > > > > > +             krcp->last_gp_seq = 0;
> > > > > > >
> > > > > > Yep. This one can be just dropped.
> > > > > >
> > > > > > But all the rest looks good :) I will give it a try from test point of
> > > > > > view. It is interested from the memory footprint point of view.
> > > > >
> > > > > Ack. Thanks. Even though we should not sample rdp->gp_seq, I think it
> > > > > is still worth a test.
> > > >
> > > > Just for completeness, the main purpose of rdp->gp_seq is to reject
> > > > quiescent states that were seen during already-completed grace periods.
> > > >
> > > So it means that instead of gp_seq reading we should take a snaphshot
> > > of the current state:
> > >
> > > snp = get_state_synchronize_rcu();
> > >
> > > and later on do a:
> > >
> > > cond_synchronize_rcu(snp);
> > >
> > > to wait for a GP.
> > 
> > This can't be called from the timer IRQ handler though (monitor)
> > 
> > > Or if the poll_state_synchronize_rcu(oldstate)) != 0
> > > queue_rcu_work().
> > 
> > But something like this should be possible (maybe)
> > 
> > > Sorry for a description using the RCU API functions name :)
> > 
> > I believe you will have to call rcu_poll_gp_seq_start() as well if you
> > are using polled API. I am planning to look at this properly more,
> > soon. Right now I am going to write up the rcutop doc and share with
> > you guys.
> > 
> > (Maybe RCU polling is the right thing to do as we reuse all the infra
> > and any corner case it is handling)
> > 
> OK. This is in my todo list also. Since we have discussed it let's move
> it forward.
> 
> Below what i have came up with to switch for polling APIs:
> 
> >From 799ce1653d159ef3d35f34a284f738c2c267c75f Mon Sep 17 00:00:00 2001
> From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
> Date: Wed, 2 Nov 2022 19:26:27 +0100
> Subject: [PATCH 1/1] rcu: kvfree_rcu: Reduce a memory footptint by using
>  polling APIs
> 
> Total time taken by all kfree'ers: 6564718459 ns, loops: 10000, batches: 1110, memory footprint: 5057MB
> Total time taken by all kfree'ers: 8431051895 ns, loops: 10000, batches: 1109, memory footprint: 2749MB
> Total time taken by all kfree'ers: 9477830789 ns, loops: 10000, batches: 1158, memory footprint: 2934MB
> Total time taken by all kfree'ers: 9950211144 ns, loops: 10000, batches: 981, memory footprint: 2704MB
> 
> with a patch:
> 
> Total time taken by all kfree'ers: 7712110118 ns, loops: 10000, batches: 1660, memory footprint: 91MB
> Total time taken by all kfree'ers: 7002403664 ns, loops: 10000, batches: 1482, memory footprint: 86MB
> Total time taken by all kfree'ers: 7842282319 ns, loops: 10000, batches: 1738, memory footprint: 86MB
> Total time taken by all kfree'ers: 7230161977 ns, loops: 10000, batches: 1542, memory footprint: 72MB
> 
> Tested with NOCB option, all offloading CPUs:
> 
> kvm.sh --memory 10G --torture rcuscale --allcpus --duration 1 \
>   --kconfig CONFIG_NR_CPUS=64 \
>   --kconfig CONFIG_RCU_NOCB_CPU=y \
>   --kconfig CONFIG_RCU_NOCB_CPU_DEFAULT_ALL=y \
>   --bootargs "rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 \
>   rcuscale.holdoff=20 rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot" --trust-make
> 
> According to data there is a big gain in memory footprint with a patch.
> It is because of call_rcu() and call_rcu_flush() take more effort and
> time to queue a callback and then wait for a gp.
> 
> With polling API:
>   a) we do not need to queue any callback;
>   b) we might not even need wait for a GP completion.
> 
> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> ---
>  kernel/rcu/tree.c | 21 ++++++++++++++++-----
>  1 file changed, 16 insertions(+), 5 deletions(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 76973d716921..17c3d6f2c55b 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2919,18 +2919,20 @@ struct kvfree_rcu_bulk_data {
>  	((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
>  
>  /**
> + * @rcu_work: A work to reclaim a memory after a grace period
>   * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
> - * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
>   * @head_free: List of kfree_rcu() objects waiting for a grace period
>   * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
>   * @krcp: Pointer to @kfree_rcu_cpu structure
> + * @gp_snap: A snapshot of current grace period
>   */
>  
>  struct kfree_rcu_cpu_work {
> -	struct rcu_work rcu_work;
> +	struct work_struct rcu_work;
>  	struct rcu_head *head_free;
>  	struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS];
>  	struct kfree_rcu_cpu *krcp;
> +	unsigned long gp_snap;
>  };
>  
>  /**
> @@ -3066,10 +3068,12 @@ static void kfree_rcu_work(struct work_struct *work)
>  	struct kfree_rcu_cpu_work *krwp;
>  	int i, j;
>  
> -	krwp = container_of(to_rcu_work(work),
> +	krwp = container_of(work,
>  			    struct kfree_rcu_cpu_work, rcu_work);
>  	krcp = krwp->krcp;
>  
> +	cond_synchronize_rcu(krwp->gp_snap);

Might this provoke OOMs in case of callback flooding?

An alternative might be something like this:

	if (!poll_state_synchronize_rcu(krwp->gp_snap)) {
		queue_rcu_work(system_wq, &krwp->rcu_work);
		return;
	}

Either way gets you a non-lazy callback in the case where a grace
period has not yet elapsed.

Or am I missing something that prevents OOMs here?

							Thanx, Paul

> +
>  	raw_spin_lock_irqsave(&krcp->lock, flags);
>  	// Channels 1 and 2.
>  	for (i = 0; i < FREE_N_CHANNELS; i++) {
> @@ -3194,6 +3198,13 @@ static void kfree_rcu_monitor(struct work_struct *work)
>  		if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
>  			(krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
>  				(krcp->head && !krwp->head_free)) {
> +			/*
> +			 * Take a snapshot for this krwp. Please note no
> +			 * more any objects can be added to this krwp free
> +			 * channels.
> +			 */
> +			krwp->gp_snap = get_state_synchronize_rcu();
> +
>  			// Channel 1 corresponds to the SLAB-pointer bulk path.
>  			// Channel 2 corresponds to vmalloc-pointer bulk path.
>  			for (j = 0; j < FREE_N_CHANNELS; j++) {
> @@ -3217,7 +3228,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
>  			// be that the work is in the pending state when
>  			// channels have been detached following by each
>  			// other.
> -			queue_rcu_work(system_wq, &krwp->rcu_work);
> +			queue_work(system_wq, &krwp->rcu_work);
>  		}
>  	}
>  
> @@ -4808,7 +4819,7 @@ static void __init kfree_rcu_batch_init(void)
>  		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
>  
>  		for (i = 0; i < KFREE_N_BATCHES; i++) {
> -			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
> +			INIT_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
>  			krcp->krw_arr[i].krcp = krcp;
>  		}
>  
> -- 
> 2.30.2
> 
> --
> Uladzislau Rezki
  
Joel Fernandes Nov. 2, 2022, 7:46 p.m. UTC | #12
> On Nov 2, 2022, at 2:49 PM, Paul E. McKenney <paulmck@kernel.org> wrote:
> 
> On Wed, Nov 02, 2022 at 07:31:40PM +0100, Uladzislau Rezki wrote:
>>> On Wed, Nov 02, 2022 at 01:29:17PM -0400, Joel Fernandes wrote:
>>> On Wed, Nov 2, 2022 at 1:24 PM Uladzislau Rezki <urezki@gmail.com> wrote:
>>>> 
>>>> On Wed, Nov 02, 2022 at 09:35:44AM -0700, Paul E. McKenney wrote:
>>>>> On Wed, Nov 02, 2022 at 12:13:17PM -0400, Joel Fernandes wrote:
>>>>>> On Wed, Nov 2, 2022 at 8:37 AM Uladzislau Rezki <urezki@gmail.com> wrote:
>>>>>>> 
>>>>>>> On Sat, Oct 29, 2022 at 01:28:56PM +0000, Joel Fernandes (Google) wrote:
>>>>>>>> On ChromeOS, I am (almost) always seeing the optimization trigger.
>>>>>>>> Tested boot up and trace_printk'ing how often it triggers.
>>>>>>>> 
>>>>>>>> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
>>>>>>>> ---
>>>>>>>> kernel/rcu/tree.c | 18 +++++++++++++++++-
>>>>>>>> 1 file changed, 17 insertions(+), 1 deletion(-)
>>>>>>>> 
>>>>>>>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
>>>>>>>> index 591187b6352e..3e4c50b9fd33 100644
>>>>>>>> --- a/kernel/rcu/tree.c
>>>>>>>> +++ b/kernel/rcu/tree.c
>>>>>>>> @@ -2935,6 +2935,7 @@ struct kfree_rcu_cpu_work {
>>>>>>>> 
>>>>>>>> /**
>>>>>>>>  * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
>>>>>>>> + * @rdp: The rdp of the CPU that this kfree_rcu corresponds to.
>>>>>>>>  * @head: List of kfree_rcu() objects not yet waiting for a grace period
>>>>>>>>  * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
>>>>>>>>  * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
>>>>>>>> @@ -2964,6 +2965,8 @@ struct kfree_rcu_cpu {
>>>>>>>>      struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
>>>>>>>>      raw_spinlock_t lock;
>>>>>>>>      struct delayed_work monitor_work;
>>>>>>>> +     struct rcu_data *rdp;
>>>>>>>> +     unsigned long last_gp_seq;
>>>>>>>>      bool initialized;
>>>>>>>>      int count;
>>>>>>>> 
>>>>>>>> @@ -3167,6 +3170,7 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
>>>>>>>>                      mod_delayed_work(system_wq, &krcp->monitor_work, delay);
>>>>>>>>              return;
>>>>>>>>      }
>>>>>>>> +     krcp->last_gp_seq = krcp->rdp->gp_seq;
>>>>>>>>      queue_delayed_work(system_wq, &krcp->monitor_work, delay);
>>>>>>>> }
>>>>>>>> 
>>>>>>>> @@ -3217,7 +3221,17 @@ static void kfree_rcu_monitor(struct work_struct *work)
>>>>>>>>                      // be that the work is in the pending state when
>>>>>>>>                      // channels have been detached following by each
>>>>>>>>                      // other.
>>>>>>>> -                     queue_rcu_work(system_wq, &krwp->rcu_work);
>>>>>>>> +                     //
>>>>>>>> +                     // NOTE about gp_seq wrap: In case of gp_seq overflow,
>>>>>>>> +                     // it is possible for rdp->gp_seq to be less than
>>>>>>>> +                     // krcp->last_gp_seq even though a GP might be over. In
>>>>>>>> +                     // this rare case, we would just have one extra GP.
>>>>>>>> +                     if (krcp->last_gp_seq &&
>>>>>>>> 
>>>>>>> This check can be eliminated i think. A kfree_rcu_cpu is defined as
>>>>>>> static so by default the last_gp_set is set to zero.
>>>>>> 
>>>>>> Ack.
>>>>>> 
>>>>>>>> @@ -4802,6 +4816,8 @@ static void __init kfree_rcu_batch_init(void)
>>>>>>>>      for_each_possible_cpu(cpu) {
>>>>>>>>              struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
>>>>>>>> 
>>>>>>>> +             krcp->rdp = per_cpu_ptr(&rcu_data, cpu);
>>>>>>>> +             krcp->last_gp_seq = 0;
>>>>>>>> 
>>>>>>> Yep. This one can be just dropped.
>>>>>>> 
>>>>>>> But all the rest looks good :) I will give it a try from test point of
>>>>>>> view. It is interested from the memory footprint point of view.
>>>>>> 
>>>>>> Ack. Thanks. Even though we should not sample rdp->gp_seq, I think it
>>>>>> is still worth a test.
>>>>> 
>>>>> Just for completeness, the main purpose of rdp->gp_seq is to reject
>>>>> quiescent states that were seen during already-completed grace periods.
>>>>> 
>>>> So it means that instead of gp_seq reading we should take a snaphshot
>>>> of the current state:
>>>> 
>>>> snp = get_state_synchronize_rcu();
>>>> 
>>>> and later on do a:
>>>> 
>>>> cond_synchronize_rcu(snp);
>>>> 
>>>> to wait for a GP.
>>> 
>>> This can't be called from the timer IRQ handler though (monitor)
>>> 
>>>> Or if the poll_state_synchronize_rcu(oldstate)) != 0
>>>> queue_rcu_work().
>>> 
>>> But something like this should be possible (maybe)
>>> 
>>>> Sorry for a description using the RCU API functions name :)
>>> 
>>> I believe you will have to call rcu_poll_gp_seq_start() as well if you
>>> are using polled API. I am planning to look at this properly more,
>>> soon. Right now I am going to write up the rcutop doc and share with
>>> you guys.
>>> 
>>> (Maybe RCU polling is the right thing to do as we reuse all the infra
>>> and any corner case it is handling)
>>> 
>> OK. This is in my todo list also. Since we have discussed it let's move
>> it forward.
>> 
>> Below what i have came up with to switch for polling APIs:
>> 
>>> From 799ce1653d159ef3d35f34a284f738c2c267c75f Mon Sep 17 00:00:00 2001
>> From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
>> Date: Wed, 2 Nov 2022 19:26:27 +0100
>> Subject: [PATCH 1/1] rcu: kvfree_rcu: Reduce a memory footptint by using
>> polling APIs
>> 
>> Total time taken by all kfree'ers: 6564718459 ns, loops: 10000, batches: 1110, memory footprint: 5057MB
>> Total time taken by all kfree'ers: 8431051895 ns, loops: 10000, batches: 1109, memory footprint: 2749MB
>> Total time taken by all kfree'ers: 9477830789 ns, loops: 10000, batches: 1158, memory footprint: 2934MB
>> Total time taken by all kfree'ers: 9950211144 ns, loops: 10000, batches: 981, memory footprint: 2704MB
>> 
>> with a patch:
>> 
>> Total time taken by all kfree'ers: 7712110118 ns, loops: 10000, batches: 1660, memory footprint: 91MB
>> Total time taken by all kfree'ers: 7002403664 ns, loops: 10000, batches: 1482, memory footprint: 86MB
>> Total time taken by all kfree'ers: 7842282319 ns, loops: 10000, batches: 1738, memory footprint: 86MB
>> Total time taken by all kfree'ers: 7230161977 ns, loops: 10000, batches: 1542, memory footprint: 72MB
>> 
>> Tested with NOCB option, all offloading CPUs:
>> 
>> kvm.sh --memory 10G --torture rcuscale --allcpus --duration 1 \
>>  --kconfig CONFIG_NR_CPUS=64 \
>>  --kconfig CONFIG_RCU_NOCB_CPU=y \
>>  --kconfig CONFIG_RCU_NOCB_CPU_DEFAULT_ALL=y \
>>  --bootargs "rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 \
>>  rcuscale.holdoff=20 rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot" --trust-make
>> 
>> According to data there is a big gain in memory footprint with a patch.
>> It is because of call_rcu() and call_rcu_flush() take more effort and
>> time to queue a callback and then wait for a gp.
>> 
>> With polling API:
>>  a) we do not need to queue any callback;
>>  b) we might not even need wait for a GP completion.
>> 
>> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
>> ---
>> kernel/rcu/tree.c | 21 ++++++++++++++++-----
>> 1 file changed, 16 insertions(+), 5 deletions(-)
>> 
>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
>> index 76973d716921..17c3d6f2c55b 100644
>> --- a/kernel/rcu/tree.c
>> +++ b/kernel/rcu/tree.c
>> @@ -2919,18 +2919,20 @@ struct kvfree_rcu_bulk_data {
>>    ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
>> 
>> /**
>> + * @rcu_work: A work to reclaim a memory after a grace period
>>  * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
>> - * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
>>  * @head_free: List of kfree_rcu() objects waiting for a grace period
>>  * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
>>  * @krcp: Pointer to @kfree_rcu_cpu structure
>> + * @gp_snap: A snapshot of current grace period
>>  */
>> 
>> struct kfree_rcu_cpu_work {
>> -    struct rcu_work rcu_work;
>> +    struct work_struct rcu_work;
>>    struct rcu_head *head_free;
>>    struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS];
>>    struct kfree_rcu_cpu *krcp;
>> +    unsigned long gp_snap;
>> };
>> 
>> /**
>> @@ -3066,10 +3068,12 @@ static void kfree_rcu_work(struct work_struct *work)
>>    struct kfree_rcu_cpu_work *krwp;
>>    int i, j;
>> 
>> -    krwp = container_of(to_rcu_work(work),
>> +    krwp = container_of(work,
>>                struct kfree_rcu_cpu_work, rcu_work);
>>    krcp = krwp->krcp;
>> 
>> +    cond_synchronize_rcu(krwp->gp_snap);
> 
> Might this provoke OOMs in case of callback flooding?
> 
> An alternative might be something like this:
> 
>    if (!poll_state_synchronize_rcu(krwp->gp_snap)) {
>        queue_rcu_work(system_wq, &krwp->rcu_work);
>        return;
>    }
> 
> Either way gets you a non-lazy callback in the case where a grace
> period has not yet elapsed.
> Or am I missing something that prevents OOMs here?

The memory consumptions appears to be much less in his testing with the onslaught of kfree, which makes OOM probably less likely.

Though, was your reasoning that in case of a grace period not elapsing, we need a non lazy callback queued, so as to make the reclaim happen sooner?

If so, the cond_synchronize_rcu() should already be conditionally queueing non-lazy CB since we don’t make synchronous users wait for seconds. Or did I miss something?

Thanks,

- Joel

> 
>                            Thanx, Paul
> 
>> +
>>    raw_spin_lock_irqsave(&krcp->lock, flags);
>>    // Channels 1 and 2.
>>    for (i = 0; i < FREE_N_CHANNELS; i++) {
>> @@ -3194,6 +3198,13 @@ static void kfree_rcu_monitor(struct work_struct *work)
>>        if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
>>            (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
>>                (krcp->head && !krwp->head_free)) {
>> +            /*
>> +             * Take a snapshot for this krwp. Please note no
>> +             * more any objects can be added to this krwp free
>> +             * channels.
>> +             */
>> +            krwp->gp_snap = get_state_synchronize_rcu();
>> +
>>            // Channel 1 corresponds to the SLAB-pointer bulk path.
>>            // Channel 2 corresponds to vmalloc-pointer bulk path.
>>            for (j = 0; j < FREE_N_CHANNELS; j++) {
>> @@ -3217,7 +3228,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
>>            // be that the work is in the pending state when
>>            // channels have been detached following by each
>>            // other.
>> -            queue_rcu_work(system_wq, &krwp->rcu_work);
>> +            queue_work(system_wq, &krwp->rcu_work);
>>        }
>>    }
>> 
>> @@ -4808,7 +4819,7 @@ static void __init kfree_rcu_batch_init(void)
>>        struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
>> 
>>        for (i = 0; i < KFREE_N_BATCHES; i++) {
>> -            INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
>> +            INIT_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
>>            krcp->krw_arr[i].krcp = krcp;
>>        }
>> 
>> -- 
>> 2.30.2
>> 
>> --
>> Uladzislau Rezki
  
Joel Fernandes Nov. 2, 2022, 7:51 p.m. UTC | #13
> On Nov 2, 2022, at 2:32 PM, Paul E. McKenney <paulmck@kernel.org> wrote:
> 
> On Wed, Nov 02, 2022 at 06:30:24PM +0100, Uladzislau Rezki wrote:
>>> 
>>> to wait for a GP. Or if the poll_state_synchronize_rcu(oldstate)) != 0
>>> queue_rcu_work().
>>> 
>> A small fix. If poll_state_synchronize_rcu(oldstate)) == 0 then
>> queue_rcu_work() since a GP is still in progress.
> 
> Both get_state_synchronize_rcu() and poll_state_synchronize_rcu()
> may be invoked from interrupt handlers.  And from NMI handlers,
> for that matter.

Thanks for clarifying. I meant to say cond_synchronize_rcu() cannot be called from interrupt context, but I made that comment before Vlad cooked his patch.

Thanks,

 - Joel


>                            Thanx, Paul
  
Paul E. McKenney Nov. 2, 2022, 8:28 p.m. UTC | #14
On Wed, Nov 02, 2022 at 03:46:59PM -0400, Joel Fernandes wrote:
> 
> 
> > On Nov 2, 2022, at 2:49 PM, Paul E. McKenney <paulmck@kernel.org> wrote:
> > 
> > On Wed, Nov 02, 2022 at 07:31:40PM +0100, Uladzislau Rezki wrote:
> >>> On Wed, Nov 02, 2022 at 01:29:17PM -0400, Joel Fernandes wrote:
> >>> On Wed, Nov 2, 2022 at 1:24 PM Uladzislau Rezki <urezki@gmail.com> wrote:
> >>>> 
> >>>> On Wed, Nov 02, 2022 at 09:35:44AM -0700, Paul E. McKenney wrote:
> >>>>> On Wed, Nov 02, 2022 at 12:13:17PM -0400, Joel Fernandes wrote:
> >>>>>> On Wed, Nov 2, 2022 at 8:37 AM Uladzislau Rezki <urezki@gmail.com> wrote:
> >>>>>>> 
> >>>>>>> On Sat, Oct 29, 2022 at 01:28:56PM +0000, Joel Fernandes (Google) wrote:
> >>>>>>>> On ChromeOS, I am (almost) always seeing the optimization trigger.
> >>>>>>>> Tested boot up and trace_printk'ing how often it triggers.
> >>>>>>>> 
> >>>>>>>> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> >>>>>>>> ---
> >>>>>>>> kernel/rcu/tree.c | 18 +++++++++++++++++-
> >>>>>>>> 1 file changed, 17 insertions(+), 1 deletion(-)
> >>>>>>>> 
> >>>>>>>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> >>>>>>>> index 591187b6352e..3e4c50b9fd33 100644
> >>>>>>>> --- a/kernel/rcu/tree.c
> >>>>>>>> +++ b/kernel/rcu/tree.c
> >>>>>>>> @@ -2935,6 +2935,7 @@ struct kfree_rcu_cpu_work {
> >>>>>>>> 
> >>>>>>>> /**
> >>>>>>>>  * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
> >>>>>>>> + * @rdp: The rdp of the CPU that this kfree_rcu corresponds to.
> >>>>>>>>  * @head: List of kfree_rcu() objects not yet waiting for a grace period
> >>>>>>>>  * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
> >>>>>>>>  * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
> >>>>>>>> @@ -2964,6 +2965,8 @@ struct kfree_rcu_cpu {
> >>>>>>>>      struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
> >>>>>>>>      raw_spinlock_t lock;
> >>>>>>>>      struct delayed_work monitor_work;
> >>>>>>>> +     struct rcu_data *rdp;
> >>>>>>>> +     unsigned long last_gp_seq;
> >>>>>>>>      bool initialized;
> >>>>>>>>      int count;
> >>>>>>>> 
> >>>>>>>> @@ -3167,6 +3170,7 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
> >>>>>>>>                      mod_delayed_work(system_wq, &krcp->monitor_work, delay);
> >>>>>>>>              return;
> >>>>>>>>      }
> >>>>>>>> +     krcp->last_gp_seq = krcp->rdp->gp_seq;
> >>>>>>>>      queue_delayed_work(system_wq, &krcp->monitor_work, delay);
> >>>>>>>> }
> >>>>>>>> 
> >>>>>>>> @@ -3217,7 +3221,17 @@ static void kfree_rcu_monitor(struct work_struct *work)
> >>>>>>>>                      // be that the work is in the pending state when
> >>>>>>>>                      // channels have been detached following by each
> >>>>>>>>                      // other.
> >>>>>>>> -                     queue_rcu_work(system_wq, &krwp->rcu_work);
> >>>>>>>> +                     //
> >>>>>>>> +                     // NOTE about gp_seq wrap: In case of gp_seq overflow,
> >>>>>>>> +                     // it is possible for rdp->gp_seq to be less than
> >>>>>>>> +                     // krcp->last_gp_seq even though a GP might be over. In
> >>>>>>>> +                     // this rare case, we would just have one extra GP.
> >>>>>>>> +                     if (krcp->last_gp_seq &&
> >>>>>>>> 
> >>>>>>> This check can be eliminated i think. A kfree_rcu_cpu is defined as
> >>>>>>> static so by default the last_gp_set is set to zero.
> >>>>>> 
> >>>>>> Ack.
> >>>>>> 
> >>>>>>>> @@ -4802,6 +4816,8 @@ static void __init kfree_rcu_batch_init(void)
> >>>>>>>>      for_each_possible_cpu(cpu) {
> >>>>>>>>              struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> >>>>>>>> 
> >>>>>>>> +             krcp->rdp = per_cpu_ptr(&rcu_data, cpu);
> >>>>>>>> +             krcp->last_gp_seq = 0;
> >>>>>>>> 
> >>>>>>> Yep. This one can be just dropped.
> >>>>>>> 
> >>>>>>> But all the rest looks good :) I will give it a try from test point of
> >>>>>>> view. It is interested from the memory footprint point of view.
> >>>>>> 
> >>>>>> Ack. Thanks. Even though we should not sample rdp->gp_seq, I think it
> >>>>>> is still worth a test.
> >>>>> 
> >>>>> Just for completeness, the main purpose of rdp->gp_seq is to reject
> >>>>> quiescent states that were seen during already-completed grace periods.
> >>>>> 
> >>>> So it means that instead of gp_seq reading we should take a snaphshot
> >>>> of the current state:
> >>>> 
> >>>> snp = get_state_synchronize_rcu();
> >>>> 
> >>>> and later on do a:
> >>>> 
> >>>> cond_synchronize_rcu(snp);
> >>>> 
> >>>> to wait for a GP.
> >>> 
> >>> This can't be called from the timer IRQ handler though (monitor)
> >>> 
> >>>> Or if the poll_state_synchronize_rcu(oldstate)) != 0
> >>>> queue_rcu_work().
> >>> 
> >>> But something like this should be possible (maybe)
> >>> 
> >>>> Sorry for a description using the RCU API functions name :)
> >>> 
> >>> I believe you will have to call rcu_poll_gp_seq_start() as well if you
> >>> are using polled API. I am planning to look at this properly more,
> >>> soon. Right now I am going to write up the rcutop doc and share with
> >>> you guys.
> >>> 
> >>> (Maybe RCU polling is the right thing to do as we reuse all the infra
> >>> and any corner case it is handling)
> >>> 
> >> OK. This is in my todo list also. Since we have discussed it let's move
> >> it forward.
> >> 
> >> Below what i have came up with to switch for polling APIs:
> >> 
> >>> From 799ce1653d159ef3d35f34a284f738c2c267c75f Mon Sep 17 00:00:00 2001
> >> From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
> >> Date: Wed, 2 Nov 2022 19:26:27 +0100
> >> Subject: [PATCH 1/1] rcu: kvfree_rcu: Reduce a memory footptint by using
> >> polling APIs
> >> 
> >> Total time taken by all kfree'ers: 6564718459 ns, loops: 10000, batches: 1110, memory footprint: 5057MB
> >> Total time taken by all kfree'ers: 8431051895 ns, loops: 10000, batches: 1109, memory footprint: 2749MB
> >> Total time taken by all kfree'ers: 9477830789 ns, loops: 10000, batches: 1158, memory footprint: 2934MB
> >> Total time taken by all kfree'ers: 9950211144 ns, loops: 10000, batches: 981, memory footprint: 2704MB
> >> 
> >> with a patch:
> >> 
> >> Total time taken by all kfree'ers: 7712110118 ns, loops: 10000, batches: 1660, memory footprint: 91MB
> >> Total time taken by all kfree'ers: 7002403664 ns, loops: 10000, batches: 1482, memory footprint: 86MB
> >> Total time taken by all kfree'ers: 7842282319 ns, loops: 10000, batches: 1738, memory footprint: 86MB
> >> Total time taken by all kfree'ers: 7230161977 ns, loops: 10000, batches: 1542, memory footprint: 72MB
> >> 
> >> Tested with NOCB option, all offloading CPUs:
> >> 
> >> kvm.sh --memory 10G --torture rcuscale --allcpus --duration 1 \
> >>  --kconfig CONFIG_NR_CPUS=64 \
> >>  --kconfig CONFIG_RCU_NOCB_CPU=y \
> >>  --kconfig CONFIG_RCU_NOCB_CPU_DEFAULT_ALL=y \
> >>  --bootargs "rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 \
> >>  rcuscale.holdoff=20 rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot" --trust-make
> >> 
> >> According to data there is a big gain in memory footprint with a patch.
> >> It is because of call_rcu() and call_rcu_flush() take more effort and
> >> time to queue a callback and then wait for a gp.
> >> 
> >> With polling API:
> >>  a) we do not need to queue any callback;
> >>  b) we might not even need wait for a GP completion.
> >> 
> >> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> >> ---
> >> kernel/rcu/tree.c | 21 ++++++++++++++++-----
> >> 1 file changed, 16 insertions(+), 5 deletions(-)
> >> 
> >> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> >> index 76973d716921..17c3d6f2c55b 100644
> >> --- a/kernel/rcu/tree.c
> >> +++ b/kernel/rcu/tree.c
> >> @@ -2919,18 +2919,20 @@ struct kvfree_rcu_bulk_data {
> >>    ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
> >> 
> >> /**
> >> + * @rcu_work: A work to reclaim a memory after a grace period
> >>  * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
> >> - * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
> >>  * @head_free: List of kfree_rcu() objects waiting for a grace period
> >>  * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
> >>  * @krcp: Pointer to @kfree_rcu_cpu structure
> >> + * @gp_snap: A snapshot of current grace period
> >>  */
> >> 
> >> struct kfree_rcu_cpu_work {
> >> -    struct rcu_work rcu_work;
> >> +    struct work_struct rcu_work;
> >>    struct rcu_head *head_free;
> >>    struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS];
> >>    struct kfree_rcu_cpu *krcp;
> >> +    unsigned long gp_snap;
> >> };
> >> 
> >> /**
> >> @@ -3066,10 +3068,12 @@ static void kfree_rcu_work(struct work_struct *work)
> >>    struct kfree_rcu_cpu_work *krwp;
> >>    int i, j;
> >> 
> >> -    krwp = container_of(to_rcu_work(work),
> >> +    krwp = container_of(work,
> >>                struct kfree_rcu_cpu_work, rcu_work);
> >>    krcp = krwp->krcp;
> >> 
> >> +    cond_synchronize_rcu(krwp->gp_snap);
> > 
> > Might this provoke OOMs in case of callback flooding?
> > 
> > An alternative might be something like this:
> > 
> >    if (!poll_state_synchronize_rcu(krwp->gp_snap)) {
> >        queue_rcu_work(system_wq, &krwp->rcu_work);
> >        return;
> >    }
> > 
> > Either way gets you a non-lazy callback in the case where a grace
> > period has not yet elapsed.
> > Or am I missing something that prevents OOMs here?
> 
> The memory consumptions appears to be much less in his testing with the onslaught of kfree, which makes OOM probably less likely.
> 
> Though, was your reasoning that in case of a grace period not elapsing, we need a non lazy callback queued, so as to make the reclaim happen sooner?
> 
> If so, the cond_synchronize_rcu() should already be conditionally queueing non-lazy CB since we don’t make synchronous users wait for seconds. Or did I miss something?

My concern is that the synchronize_rcu() will block a kworker kthread
for some time, and that in callback-flood situations this might slow
things down due to exhausting the supply of kworkers.

In contrast, use of queue_rcu_work() frees up the kworker to handle
other pages that are filling up.

Perhaps your point is that the delay from synchronize_rcu() should make
the following pages take the fastpath through cond_synchronize_rcu()?

Either way, it might well be that context-switch overhead forces us
to batch these things somehow.  But let's worry about that when and if
it actually happens.

							Thanx, Paul

> Thanks,
> 
> - Joel
> 
> > 
> >                            Thanx, Paul
> > 
> >> +
> >>    raw_spin_lock_irqsave(&krcp->lock, flags);
> >>    // Channels 1 and 2.
> >>    for (i = 0; i < FREE_N_CHANNELS; i++) {
> >> @@ -3194,6 +3198,13 @@ static void kfree_rcu_monitor(struct work_struct *work)
> >>        if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
> >>            (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
> >>                (krcp->head && !krwp->head_free)) {
> >> +            /*
> >> +             * Take a snapshot for this krwp. Please note no
> >> +             * more any objects can be added to this krwp free
> >> +             * channels.
> >> +             */
> >> +            krwp->gp_snap = get_state_synchronize_rcu();
> >> +
> >>            // Channel 1 corresponds to the SLAB-pointer bulk path.
> >>            // Channel 2 corresponds to vmalloc-pointer bulk path.
> >>            for (j = 0; j < FREE_N_CHANNELS; j++) {
> >> @@ -3217,7 +3228,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
> >>            // be that the work is in the pending state when
> >>            // channels have been detached following by each
> >>            // other.
> >> -            queue_rcu_work(system_wq, &krwp->rcu_work);
> >> +            queue_work(system_wq, &krwp->rcu_work);
> >>        }
> >>    }
> >> 
> >> @@ -4808,7 +4819,7 @@ static void __init kfree_rcu_batch_init(void)
> >>        struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> >> 
> >>        for (i = 0; i < KFREE_N_BATCHES; i++) {
> >> -            INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
> >> +            INIT_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
> >>            krcp->krw_arr[i].krcp = krcp;
> >>        }
> >> 
> >> -- 
> >> 2.30.2
> >> 
> >> --
> >> Uladzislau Rezki
  
Joel Fernandes Nov. 2, 2022, 9:26 p.m. UTC | #15
On Wed, Nov 2, 2022 at 4:28 PM Paul E. McKenney <paulmck@kernel.org> wrote:
>
> On Wed, Nov 02, 2022 at 03:46:59PM -0400, Joel Fernandes wrote:
> >
> >
> > > On Nov 2, 2022, at 2:49 PM, Paul E. McKenney <paulmck@kernel.org> wrote:
> > >
> > > On Wed, Nov 02, 2022 at 07:31:40PM +0100, Uladzislau Rezki wrote:
> > >>> On Wed, Nov 02, 2022 at 01:29:17PM -0400, Joel Fernandes wrote:
> > >>> On Wed, Nov 2, 2022 at 1:24 PM Uladzislau Rezki <urezki@gmail.com> wrote:
> > >>>>
> > >>>> On Wed, Nov 02, 2022 at 09:35:44AM -0700, Paul E. McKenney wrote:
> > >>>>> On Wed, Nov 02, 2022 at 12:13:17PM -0400, Joel Fernandes wrote:
> > >>>>>> On Wed, Nov 2, 2022 at 8:37 AM Uladzislau Rezki <urezki@gmail.com> wrote:
> > >>>>>>>
> > >>>>>>> On Sat, Oct 29, 2022 at 01:28:56PM +0000, Joel Fernandes (Google) wrote:
> > >>>>>>>> On ChromeOS, I am (almost) always seeing the optimization trigger.
> > >>>>>>>> Tested boot up and trace_printk'ing how often it triggers.
> > >>>>>>>>
> > >>>>>>>> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > >>>>>>>> ---
> > >>>>>>>> kernel/rcu/tree.c | 18 +++++++++++++++++-
> > >>>>>>>> 1 file changed, 17 insertions(+), 1 deletion(-)
> > >>>>>>>>
> > >>>>>>>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > >>>>>>>> index 591187b6352e..3e4c50b9fd33 100644
> > >>>>>>>> --- a/kernel/rcu/tree.c
> > >>>>>>>> +++ b/kernel/rcu/tree.c
> > >>>>>>>> @@ -2935,6 +2935,7 @@ struct kfree_rcu_cpu_work {
> > >>>>>>>>
> > >>>>>>>> /**
> > >>>>>>>>  * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
> > >>>>>>>> + * @rdp: The rdp of the CPU that this kfree_rcu corresponds to.
> > >>>>>>>>  * @head: List of kfree_rcu() objects not yet waiting for a grace period
> > >>>>>>>>  * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
> > >>>>>>>>  * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
> > >>>>>>>> @@ -2964,6 +2965,8 @@ struct kfree_rcu_cpu {
> > >>>>>>>>      struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
> > >>>>>>>>      raw_spinlock_t lock;
> > >>>>>>>>      struct delayed_work monitor_work;
> > >>>>>>>> +     struct rcu_data *rdp;
> > >>>>>>>> +     unsigned long last_gp_seq;
> > >>>>>>>>      bool initialized;
> > >>>>>>>>      int count;
> > >>>>>>>>
> > >>>>>>>> @@ -3167,6 +3170,7 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
> > >>>>>>>>                      mod_delayed_work(system_wq, &krcp->monitor_work, delay);
> > >>>>>>>>              return;
> > >>>>>>>>      }
> > >>>>>>>> +     krcp->last_gp_seq = krcp->rdp->gp_seq;
> > >>>>>>>>      queue_delayed_work(system_wq, &krcp->monitor_work, delay);
> > >>>>>>>> }
> > >>>>>>>>
> > >>>>>>>> @@ -3217,7 +3221,17 @@ static void kfree_rcu_monitor(struct work_struct *work)
> > >>>>>>>>                      // be that the work is in the pending state when
> > >>>>>>>>                      // channels have been detached following by each
> > >>>>>>>>                      // other.
> > >>>>>>>> -                     queue_rcu_work(system_wq, &krwp->rcu_work);
> > >>>>>>>> +                     //
> > >>>>>>>> +                     // NOTE about gp_seq wrap: In case of gp_seq overflow,
> > >>>>>>>> +                     // it is possible for rdp->gp_seq to be less than
> > >>>>>>>> +                     // krcp->last_gp_seq even though a GP might be over. In
> > >>>>>>>> +                     // this rare case, we would just have one extra GP.
> > >>>>>>>> +                     if (krcp->last_gp_seq &&
> > >>>>>>>>
> > >>>>>>> This check can be eliminated i think. A kfree_rcu_cpu is defined as
> > >>>>>>> static so by default the last_gp_set is set to zero.
> > >>>>>>
> > >>>>>> Ack.
> > >>>>>>
> > >>>>>>>> @@ -4802,6 +4816,8 @@ static void __init kfree_rcu_batch_init(void)
> > >>>>>>>>      for_each_possible_cpu(cpu) {
> > >>>>>>>>              struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> > >>>>>>>>
> > >>>>>>>> +             krcp->rdp = per_cpu_ptr(&rcu_data, cpu);
> > >>>>>>>> +             krcp->last_gp_seq = 0;
> > >>>>>>>>
> > >>>>>>> Yep. This one can be just dropped.
> > >>>>>>>
> > >>>>>>> But all the rest looks good :) I will give it a try from test point of
> > >>>>>>> view. It is interested from the memory footprint point of view.
> > >>>>>>
> > >>>>>> Ack. Thanks. Even though we should not sample rdp->gp_seq, I think it
> > >>>>>> is still worth a test.
> > >>>>>
> > >>>>> Just for completeness, the main purpose of rdp->gp_seq is to reject
> > >>>>> quiescent states that were seen during already-completed grace periods.
> > >>>>>
> > >>>> So it means that instead of gp_seq reading we should take a snaphshot
> > >>>> of the current state:
> > >>>>
> > >>>> snp = get_state_synchronize_rcu();
> > >>>>
> > >>>> and later on do a:
> > >>>>
> > >>>> cond_synchronize_rcu(snp);
> > >>>>
> > >>>> to wait for a GP.
> > >>>
> > >>> This can't be called from the timer IRQ handler though (monitor)
> > >>>
> > >>>> Or if the poll_state_synchronize_rcu(oldstate)) != 0
> > >>>> queue_rcu_work().
> > >>>
> > >>> But something like this should be possible (maybe)
> > >>>
> > >>>> Sorry for a description using the RCU API functions name :)
> > >>>
> > >>> I believe you will have to call rcu_poll_gp_seq_start() as well if you
> > >>> are using polled API. I am planning to look at this properly more,
> > >>> soon. Right now I am going to write up the rcutop doc and share with
> > >>> you guys.
> > >>>
> > >>> (Maybe RCU polling is the right thing to do as we reuse all the infra
> > >>> and any corner case it is handling)
> > >>>
> > >> OK. This is in my todo list also. Since we have discussed it let's move
> > >> it forward.
> > >>
> > >> Below what i have came up with to switch for polling APIs:
> > >>
> > >>> From 799ce1653d159ef3d35f34a284f738c2c267c75f Mon Sep 17 00:00:00 2001
> > >> From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
> > >> Date: Wed, 2 Nov 2022 19:26:27 +0100
> > >> Subject: [PATCH 1/1] rcu: kvfree_rcu: Reduce a memory footptint by using
> > >> polling APIs
> > >>
> > >> Total time taken by all kfree'ers: 6564718459 ns, loops: 10000, batches: 1110, memory footprint: 5057MB
> > >> Total time taken by all kfree'ers: 8431051895 ns, loops: 10000, batches: 1109, memory footprint: 2749MB
> > >> Total time taken by all kfree'ers: 9477830789 ns, loops: 10000, batches: 1158, memory footprint: 2934MB
> > >> Total time taken by all kfree'ers: 9950211144 ns, loops: 10000, batches: 981, memory footprint: 2704MB
> > >>
> > >> with a patch:
> > >>
> > >> Total time taken by all kfree'ers: 7712110118 ns, loops: 10000, batches: 1660, memory footprint: 91MB
> > >> Total time taken by all kfree'ers: 7002403664 ns, loops: 10000, batches: 1482, memory footprint: 86MB
> > >> Total time taken by all kfree'ers: 7842282319 ns, loops: 10000, batches: 1738, memory footprint: 86MB
> > >> Total time taken by all kfree'ers: 7230161977 ns, loops: 10000, batches: 1542, memory footprint: 72MB
> > >>
> > >> Tested with NOCB option, all offloading CPUs:
> > >>
> > >> kvm.sh --memory 10G --torture rcuscale --allcpus --duration 1 \
> > >>  --kconfig CONFIG_NR_CPUS=64 \
> > >>  --kconfig CONFIG_RCU_NOCB_CPU=y \
> > >>  --kconfig CONFIG_RCU_NOCB_CPU_DEFAULT_ALL=y \
> > >>  --bootargs "rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 \
> > >>  rcuscale.holdoff=20 rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot" --trust-make
> > >>
> > >> According to data there is a big gain in memory footprint with a patch.
> > >> It is because of call_rcu() and call_rcu_flush() take more effort and
> > >> time to queue a callback and then wait for a gp.
> > >>
> > >> With polling API:
> > >>  a) we do not need to queue any callback;
> > >>  b) we might not even need wait for a GP completion.
> > >>
> > >> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > >> ---
> > >> kernel/rcu/tree.c | 21 ++++++++++++++++-----
> > >> 1 file changed, 16 insertions(+), 5 deletions(-)
> > >>
> > >> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > >> index 76973d716921..17c3d6f2c55b 100644
> > >> --- a/kernel/rcu/tree.c
> > >> +++ b/kernel/rcu/tree.c
> > >> @@ -2919,18 +2919,20 @@ struct kvfree_rcu_bulk_data {
> > >>    ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
> > >>
> > >> /**
> > >> + * @rcu_work: A work to reclaim a memory after a grace period
> > >>  * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
> > >> - * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
> > >>  * @head_free: List of kfree_rcu() objects waiting for a grace period
> > >>  * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
> > >>  * @krcp: Pointer to @kfree_rcu_cpu structure
> > >> + * @gp_snap: A snapshot of current grace period
> > >>  */
> > >>
> > >> struct kfree_rcu_cpu_work {
> > >> -    struct rcu_work rcu_work;
> > >> +    struct work_struct rcu_work;
> > >>    struct rcu_head *head_free;
> > >>    struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS];
> > >>    struct kfree_rcu_cpu *krcp;
> > >> +    unsigned long gp_snap;
> > >> };
> > >>
> > >> /**
> > >> @@ -3066,10 +3068,12 @@ static void kfree_rcu_work(struct work_struct *work)
> > >>    struct kfree_rcu_cpu_work *krwp;
> > >>    int i, j;
> > >>
> > >> -    krwp = container_of(to_rcu_work(work),
> > >> +    krwp = container_of(work,
> > >>                struct kfree_rcu_cpu_work, rcu_work);
> > >>    krcp = krwp->krcp;
> > >>
> > >> +    cond_synchronize_rcu(krwp->gp_snap);
> > >
> > > Might this provoke OOMs in case of callback flooding?
> > >
> > > An alternative might be something like this:
> > >
> > >    if (!poll_state_synchronize_rcu(krwp->gp_snap)) {
> > >        queue_rcu_work(system_wq, &krwp->rcu_work);
> > >        return;
> > >    }
> > >
> > > Either way gets you a non-lazy callback in the case where a grace
> > > period has not yet elapsed.
> > > Or am I missing something that prevents OOMs here?
> >
> > The memory consumptions appears to be much less in his testing with the onslaught of kfree, which makes OOM probably less likely.
> >
> > Though, was your reasoning that in case of a grace period not elapsing, we need a non lazy callback queued, so as to make the reclaim happen sooner?
> >
> > If so, the cond_synchronize_rcu() should already be conditionally queueing non-lazy CB since we don’t make synchronous users wait for seconds. Or did I miss something?
>
> My concern is that the synchronize_rcu() will block a kworker kthread
> for some time, and that in callback-flood situations this might slow
> things down due to exhausting the supply of kworkers.
>
> In contrast, use of queue_rcu_work() frees up the kworker to handle
> other pages that are filling up.
>
> Perhaps your point is that the delay from synchronize_rcu() should make
> the following pages take the fastpath through cond_synchronize_rcu()?
>
> Either way, it might well be that context-switch overhead forces us
> to batch these things somehow.  But let's worry about that when and if
> it actually happens.

Your point sounds reasonable. Though we'd hope cmwq scales worker
thread count as needed, but we shouldn't probably stress it.

Though I am thinking, workqueue context is normally used to invoke
code that can block, and would the issue you mentioned affect those as
well, or affect RCU when those non-RCU work items block. So for
example, when other things in the system that can queue things on the
system_wq and block.  (I might be throwing darts in the dark).

To be safe, we can implement your suggestion which is basically a form
of my initial patch.

Should we add Tejun to the thread?

thanks,

 - Joel

>
>                                                         Thanx, Paul
>
> > Thanks,
> >
> > - Joel
> >
> > >
> > >                            Thanx, Paul
> > >
> > >> +
> > >>    raw_spin_lock_irqsave(&krcp->lock, flags);
> > >>    // Channels 1 and 2.
> > >>    for (i = 0; i < FREE_N_CHANNELS; i++) {
> > >> @@ -3194,6 +3198,13 @@ static void kfree_rcu_monitor(struct work_struct *work)
> > >>        if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
> > >>            (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
> > >>                (krcp->head && !krwp->head_free)) {
> > >> +            /*
> > >> +             * Take a snapshot for this krwp. Please note no
> > >> +             * more any objects can be added to this krwp free
> > >> +             * channels.
> > >> +             */
> > >> +            krwp->gp_snap = get_state_synchronize_rcu();
> > >> +
> > >>            // Channel 1 corresponds to the SLAB-pointer bulk path.
> > >>            // Channel 2 corresponds to vmalloc-pointer bulk path.
> > >>            for (j = 0; j < FREE_N_CHANNELS; j++) {
> > >> @@ -3217,7 +3228,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
> > >>            // be that the work is in the pending state when
> > >>            // channels have been detached following by each
> > >>            // other.
> > >> -            queue_rcu_work(system_wq, &krwp->rcu_work);
> > >> +            queue_work(system_wq, &krwp->rcu_work);
> > >>        }
> > >>    }
> > >>
> > >> @@ -4808,7 +4819,7 @@ static void __init kfree_rcu_batch_init(void)
> > >>        struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> > >>
> > >>        for (i = 0; i < KFREE_N_BATCHES; i++) {
> > >> -            INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
> > >> +            INIT_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
> > >>            krcp->krw_arr[i].krcp = krcp;
> > >>        }
> > >>
> > >> --
> > >> 2.30.2
> > >>
> > >> --
> > >> Uladzislau Rezki
  
Paul E. McKenney Nov. 2, 2022, 10:35 p.m. UTC | #16
On Wed, Nov 02, 2022 at 05:26:40PM -0400, Joel Fernandes wrote:
> On Wed, Nov 2, 2022 at 4:28 PM Paul E. McKenney <paulmck@kernel.org> wrote:
> >
> > On Wed, Nov 02, 2022 at 03:46:59PM -0400, Joel Fernandes wrote:
> > >
> > >
> > > > On Nov 2, 2022, at 2:49 PM, Paul E. McKenney <paulmck@kernel.org> wrote:
> > > >
> > > > On Wed, Nov 02, 2022 at 07:31:40PM +0100, Uladzislau Rezki wrote:
> > > >>> On Wed, Nov 02, 2022 at 01:29:17PM -0400, Joel Fernandes wrote:
> > > >>> On Wed, Nov 2, 2022 at 1:24 PM Uladzislau Rezki <urezki@gmail.com> wrote:
> > > >>>>
> > > >>>> On Wed, Nov 02, 2022 at 09:35:44AM -0700, Paul E. McKenney wrote:
> > > >>>>> On Wed, Nov 02, 2022 at 12:13:17PM -0400, Joel Fernandes wrote:
> > > >>>>>> On Wed, Nov 2, 2022 at 8:37 AM Uladzislau Rezki <urezki@gmail.com> wrote:
> > > >>>>>>>
> > > >>>>>>> On Sat, Oct 29, 2022 at 01:28:56PM +0000, Joel Fernandes (Google) wrote:
> > > >>>>>>>> On ChromeOS, I am (almost) always seeing the optimization trigger.
> > > >>>>>>>> Tested boot up and trace_printk'ing how often it triggers.
> > > >>>>>>>>
> > > >>>>>>>> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > > >>>>>>>> ---
> > > >>>>>>>> kernel/rcu/tree.c | 18 +++++++++++++++++-
> > > >>>>>>>> 1 file changed, 17 insertions(+), 1 deletion(-)
> > > >>>>>>>>
> > > >>>>>>>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > >>>>>>>> index 591187b6352e..3e4c50b9fd33 100644
> > > >>>>>>>> --- a/kernel/rcu/tree.c
> > > >>>>>>>> +++ b/kernel/rcu/tree.c
> > > >>>>>>>> @@ -2935,6 +2935,7 @@ struct kfree_rcu_cpu_work {
> > > >>>>>>>>
> > > >>>>>>>> /**
> > > >>>>>>>>  * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
> > > >>>>>>>> + * @rdp: The rdp of the CPU that this kfree_rcu corresponds to.
> > > >>>>>>>>  * @head: List of kfree_rcu() objects not yet waiting for a grace period
> > > >>>>>>>>  * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
> > > >>>>>>>>  * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
> > > >>>>>>>> @@ -2964,6 +2965,8 @@ struct kfree_rcu_cpu {
> > > >>>>>>>>      struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
> > > >>>>>>>>      raw_spinlock_t lock;
> > > >>>>>>>>      struct delayed_work monitor_work;
> > > >>>>>>>> +     struct rcu_data *rdp;
> > > >>>>>>>> +     unsigned long last_gp_seq;
> > > >>>>>>>>      bool initialized;
> > > >>>>>>>>      int count;
> > > >>>>>>>>
> > > >>>>>>>> @@ -3167,6 +3170,7 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
> > > >>>>>>>>                      mod_delayed_work(system_wq, &krcp->monitor_work, delay);
> > > >>>>>>>>              return;
> > > >>>>>>>>      }
> > > >>>>>>>> +     krcp->last_gp_seq = krcp->rdp->gp_seq;
> > > >>>>>>>>      queue_delayed_work(system_wq, &krcp->monitor_work, delay);
> > > >>>>>>>> }
> > > >>>>>>>>
> > > >>>>>>>> @@ -3217,7 +3221,17 @@ static void kfree_rcu_monitor(struct work_struct *work)
> > > >>>>>>>>                      // be that the work is in the pending state when
> > > >>>>>>>>                      // channels have been detached following by each
> > > >>>>>>>>                      // other.
> > > >>>>>>>> -                     queue_rcu_work(system_wq, &krwp->rcu_work);
> > > >>>>>>>> +                     //
> > > >>>>>>>> +                     // NOTE about gp_seq wrap: In case of gp_seq overflow,
> > > >>>>>>>> +                     // it is possible for rdp->gp_seq to be less than
> > > >>>>>>>> +                     // krcp->last_gp_seq even though a GP might be over. In
> > > >>>>>>>> +                     // this rare case, we would just have one extra GP.
> > > >>>>>>>> +                     if (krcp->last_gp_seq &&
> > > >>>>>>>>
> > > >>>>>>> This check can be eliminated i think. A kfree_rcu_cpu is defined as
> > > >>>>>>> static so by default the last_gp_set is set to zero.
> > > >>>>>>
> > > >>>>>> Ack.
> > > >>>>>>
> > > >>>>>>>> @@ -4802,6 +4816,8 @@ static void __init kfree_rcu_batch_init(void)
> > > >>>>>>>>      for_each_possible_cpu(cpu) {
> > > >>>>>>>>              struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> > > >>>>>>>>
> > > >>>>>>>> +             krcp->rdp = per_cpu_ptr(&rcu_data, cpu);
> > > >>>>>>>> +             krcp->last_gp_seq = 0;
> > > >>>>>>>>
> > > >>>>>>> Yep. This one can be just dropped.
> > > >>>>>>>
> > > >>>>>>> But all the rest looks good :) I will give it a try from test point of
> > > >>>>>>> view. It is interested from the memory footprint point of view.
> > > >>>>>>
> > > >>>>>> Ack. Thanks. Even though we should not sample rdp->gp_seq, I think it
> > > >>>>>> is still worth a test.
> > > >>>>>
> > > >>>>> Just for completeness, the main purpose of rdp->gp_seq is to reject
> > > >>>>> quiescent states that were seen during already-completed grace periods.
> > > >>>>>
> > > >>>> So it means that instead of gp_seq reading we should take a snaphshot
> > > >>>> of the current state:
> > > >>>>
> > > >>>> snp = get_state_synchronize_rcu();
> > > >>>>
> > > >>>> and later on do a:
> > > >>>>
> > > >>>> cond_synchronize_rcu(snp);
> > > >>>>
> > > >>>> to wait for a GP.
> > > >>>
> > > >>> This can't be called from the timer IRQ handler though (monitor)
> > > >>>
> > > >>>> Or if the poll_state_synchronize_rcu(oldstate)) != 0
> > > >>>> queue_rcu_work().
> > > >>>
> > > >>> But something like this should be possible (maybe)
> > > >>>
> > > >>>> Sorry for a description using the RCU API functions name :)
> > > >>>
> > > >>> I believe you will have to call rcu_poll_gp_seq_start() as well if you
> > > >>> are using polled API. I am planning to look at this properly more,
> > > >>> soon. Right now I am going to write up the rcutop doc and share with
> > > >>> you guys.
> > > >>>
> > > >>> (Maybe RCU polling is the right thing to do as we reuse all the infra
> > > >>> and any corner case it is handling)
> > > >>>
> > > >> OK. This is in my todo list also. Since we have discussed it let's move
> > > >> it forward.
> > > >>
> > > >> Below what i have came up with to switch for polling APIs:
> > > >>
> > > >>> From 799ce1653d159ef3d35f34a284f738c2c267c75f Mon Sep 17 00:00:00 2001
> > > >> From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
> > > >> Date: Wed, 2 Nov 2022 19:26:27 +0100
> > > >> Subject: [PATCH 1/1] rcu: kvfree_rcu: Reduce a memory footptint by using
> > > >> polling APIs
> > > >>
> > > >> Total time taken by all kfree'ers: 6564718459 ns, loops: 10000, batches: 1110, memory footprint: 5057MB
> > > >> Total time taken by all kfree'ers: 8431051895 ns, loops: 10000, batches: 1109, memory footprint: 2749MB
> > > >> Total time taken by all kfree'ers: 9477830789 ns, loops: 10000, batches: 1158, memory footprint: 2934MB
> > > >> Total time taken by all kfree'ers: 9950211144 ns, loops: 10000, batches: 981, memory footprint: 2704MB
> > > >>
> > > >> with a patch:
> > > >>
> > > >> Total time taken by all kfree'ers: 7712110118 ns, loops: 10000, batches: 1660, memory footprint: 91MB
> > > >> Total time taken by all kfree'ers: 7002403664 ns, loops: 10000, batches: 1482, memory footprint: 86MB
> > > >> Total time taken by all kfree'ers: 7842282319 ns, loops: 10000, batches: 1738, memory footprint: 86MB
> > > >> Total time taken by all kfree'ers: 7230161977 ns, loops: 10000, batches: 1542, memory footprint: 72MB
> > > >>
> > > >> Tested with NOCB option, all offloading CPUs:
> > > >>
> > > >> kvm.sh --memory 10G --torture rcuscale --allcpus --duration 1 \
> > > >>  --kconfig CONFIG_NR_CPUS=64 \
> > > >>  --kconfig CONFIG_RCU_NOCB_CPU=y \
> > > >>  --kconfig CONFIG_RCU_NOCB_CPU_DEFAULT_ALL=y \
> > > >>  --bootargs "rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 \
> > > >>  rcuscale.holdoff=20 rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot" --trust-make
> > > >>
> > > >> According to data there is a big gain in memory footprint with a patch.
> > > >> It is because of call_rcu() and call_rcu_flush() take more effort and
> > > >> time to queue a callback and then wait for a gp.
> > > >>
> > > >> With polling API:
> > > >>  a) we do not need to queue any callback;
> > > >>  b) we might not even need wait for a GP completion.
> > > >>
> > > >> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > >> ---
> > > >> kernel/rcu/tree.c | 21 ++++++++++++++++-----
> > > >> 1 file changed, 16 insertions(+), 5 deletions(-)
> > > >>
> > > >> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > >> index 76973d716921..17c3d6f2c55b 100644
> > > >> --- a/kernel/rcu/tree.c
> > > >> +++ b/kernel/rcu/tree.c
> > > >> @@ -2919,18 +2919,20 @@ struct kvfree_rcu_bulk_data {
> > > >>    ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
> > > >>
> > > >> /**
> > > >> + * @rcu_work: A work to reclaim a memory after a grace period
> > > >>  * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
> > > >> - * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
> > > >>  * @head_free: List of kfree_rcu() objects waiting for a grace period
> > > >>  * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
> > > >>  * @krcp: Pointer to @kfree_rcu_cpu structure
> > > >> + * @gp_snap: A snapshot of current grace period
> > > >>  */
> > > >>
> > > >> struct kfree_rcu_cpu_work {
> > > >> -    struct rcu_work rcu_work;
> > > >> +    struct work_struct rcu_work;
> > > >>    struct rcu_head *head_free;
> > > >>    struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS];
> > > >>    struct kfree_rcu_cpu *krcp;
> > > >> +    unsigned long gp_snap;
> > > >> };
> > > >>
> > > >> /**
> > > >> @@ -3066,10 +3068,12 @@ static void kfree_rcu_work(struct work_struct *work)
> > > >>    struct kfree_rcu_cpu_work *krwp;
> > > >>    int i, j;
> > > >>
> > > >> -    krwp = container_of(to_rcu_work(work),
> > > >> +    krwp = container_of(work,
> > > >>                struct kfree_rcu_cpu_work, rcu_work);
> > > >>    krcp = krwp->krcp;
> > > >>
> > > >> +    cond_synchronize_rcu(krwp->gp_snap);
> > > >
> > > > Might this provoke OOMs in case of callback flooding?
> > > >
> > > > An alternative might be something like this:
> > > >
> > > >    if (!poll_state_synchronize_rcu(krwp->gp_snap)) {
> > > >        queue_rcu_work(system_wq, &krwp->rcu_work);
> > > >        return;
> > > >    }
> > > >
> > > > Either way gets you a non-lazy callback in the case where a grace
> > > > period has not yet elapsed.
> > > > Or am I missing something that prevents OOMs here?
> > >
> > > The memory consumptions appears to be much less in his testing with the onslaught of kfree, which makes OOM probably less likely.
> > >
> > > Though, was your reasoning that in case of a grace period not elapsing, we need a non lazy callback queued, so as to make the reclaim happen sooner?
> > >
> > > If so, the cond_synchronize_rcu() should already be conditionally queueing non-lazy CB since we don’t make synchronous users wait for seconds. Or did I miss something?
> >
> > My concern is that the synchronize_rcu() will block a kworker kthread
> > for some time, and that in callback-flood situations this might slow
> > things down due to exhausting the supply of kworkers.
> >
> > In contrast, use of queue_rcu_work() frees up the kworker to handle
> > other pages that are filling up.
> >
> > Perhaps your point is that the delay from synchronize_rcu() should make
> > the following pages take the fastpath through cond_synchronize_rcu()?
> >
> > Either way, it might well be that context-switch overhead forces us
> > to batch these things somehow.  But let's worry about that when and if
> > it actually happens.
> 
> Your point sounds reasonable. Though we'd hope cmwq scales worker
> thread count as needed, but we shouldn't probably stress it.

It is especially inadvisable to stress it during OOM events, when the
workqueue system might well get an allocation failure when it tries
creating another kworker kthread.  And we do want this to work well when
at/near OOM, because this is one of the things that can free memory.  ;-)

> Though I am thinking, workqueue context is normally used to invoke
> code that can block, and would the issue you mentioned affect those as
> well, or affect RCU when those non-RCU work items block. So for
> example, when other things in the system that can queue things on the
> system_wq and block.  (I might be throwing darts in the dark).
> 
> To be safe, we can implement your suggestion which is basically a form
> of my initial patch.
> 
> Should we add Tejun to the thread?

Let's get organized first, but that would be a good thing.  Or I could
reach out to Tejun internally.

For but one thing to get organized about, maybe kfree_rcu() should be
using a workqueue with the WQ_MEM_RECLAIM flag set.

							Thanx, Paul

> thanks,
> 
>  - Joel
> 
> >
> >                                                         Thanx, Paul
> >
> > > Thanks,
> > >
> > > - Joel
> > >
> > > >
> > > >                            Thanx, Paul
> > > >
> > > >> +
> > > >>    raw_spin_lock_irqsave(&krcp->lock, flags);
> > > >>    // Channels 1 and 2.
> > > >>    for (i = 0; i < FREE_N_CHANNELS; i++) {
> > > >> @@ -3194,6 +3198,13 @@ static void kfree_rcu_monitor(struct work_struct *work)
> > > >>        if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
> > > >>            (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
> > > >>                (krcp->head && !krwp->head_free)) {
> > > >> +            /*
> > > >> +             * Take a snapshot for this krwp. Please note no
> > > >> +             * more any objects can be added to this krwp free
> > > >> +             * channels.
> > > >> +             */
> > > >> +            krwp->gp_snap = get_state_synchronize_rcu();
> > > >> +
> > > >>            // Channel 1 corresponds to the SLAB-pointer bulk path.
> > > >>            // Channel 2 corresponds to vmalloc-pointer bulk path.
> > > >>            for (j = 0; j < FREE_N_CHANNELS; j++) {
> > > >> @@ -3217,7 +3228,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
> > > >>            // be that the work is in the pending state when
> > > >>            // channels have been detached following by each
> > > >>            // other.
> > > >> -            queue_rcu_work(system_wq, &krwp->rcu_work);
> > > >> +            queue_work(system_wq, &krwp->rcu_work);
> > > >>        }
> > > >>    }
> > > >>
> > > >> @@ -4808,7 +4819,7 @@ static void __init kfree_rcu_batch_init(void)
> > > >>        struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> > > >>
> > > >>        for (i = 0; i < KFREE_N_BATCHES; i++) {
> > > >> -            INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
> > > >> +            INIT_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
> > > >>            krcp->krw_arr[i].krcp = krcp;
> > > >>        }
> > > >>
> > > >> --
> > > >> 2.30.2
> > > >>
> > > >> --
> > > >> Uladzislau Rezki
  
Uladzislau Rezki Nov. 3, 2022, 12:41 p.m. UTC | #17
> > >> /**
> > >> @@ -3066,10 +3068,12 @@ static void kfree_rcu_work(struct work_struct *work)
> > >>    struct kfree_rcu_cpu_work *krwp;
> > >>    int i, j;
> > >> 
> > >> -    krwp = container_of(to_rcu_work(work),
> > >> +    krwp = container_of(work,
> > >>                struct kfree_rcu_cpu_work, rcu_work);
> > >>    krcp = krwp->krcp;
> > >> 
> > >> +    cond_synchronize_rcu(krwp->gp_snap);
> > > 
> > > Might this provoke OOMs in case of callback flooding?
> > > 
> > > An alternative might be something like this:
> > > 
> > >    if (!poll_state_synchronize_rcu(krwp->gp_snap)) {
> > >        queue_rcu_work(system_wq, &krwp->rcu_work);
> > >        return;
> > >    }
> > > 
> > > Either way gets you a non-lazy callback in the case where a grace
> > > period has not yet elapsed.
> > > Or am I missing something that prevents OOMs here?
> > 
> > The memory consumptions appears to be much less in his testing with the onslaught of kfree, which makes OOM probably less likely.
> > 
> > Though, was your reasoning that in case of a grace period not elapsing, we need a non lazy callback queued, so as to make the reclaim happen sooner?
> > 
> > If so, the cond_synchronize_rcu() should already be conditionally queueing non-lazy CB since we don’t make synchronous users wait for seconds. Or did I miss something?
> 
> My concern is that the synchronize_rcu() will block a kworker kthread
> for some time, and that in callback-flood situations this might slow
> things down due to exhausting the supply of kworkers.
> 
This concern works in both cases. I mean in default configuration and
with a posted patch. The reclaim work, which name is kfree_rcu_work() only
does a progress when a gp is passed so the rcu_work_rcufn() can queue
our reclaim kworker.

As it is now:

1. Collect pointers, then we decide to drop them we queue the
   monitro_work() worker to the system_wq.

2. The monitor work, kfree_rcu_work(), tries to attach or saying
it by another words bypass a "backlog" to "free" channels.

3. It invokes the queue_rcu_work() that does call_rcu_flush() and
in its turn it queues our worker from the handler. So the worker
is run after GP is passed.

With a patch: 

[1] and [2] steps are the same. But on third step we do:

1. Record the GP status for last in channel;
2. Directly queue the drain work without any call_rcu() helpers;
3. On the reclaim worker entry we check if GP is passed;
4. If not it invokes synchronize_rcu().

The patch eliminates extra steps by not going via RCU-core route
instead it directly invokes the reclaim worker where it either
proceed or wait a GP if needed.

--
Uladzislau Rezki
  
Uladzislau Rezki Nov. 3, 2022, 12:44 p.m. UTC | #18
> 
> > Though I am thinking, workqueue context is normally used to invoke
> > code that can block, and would the issue you mentioned affect those as
> > well, or affect RCU when those non-RCU work items block. So for
> > example, when other things in the system that can queue things on the
> > system_wq and block.  (I might be throwing darts in the dark).
> > 
> > To be safe, we can implement your suggestion which is basically a form
> > of my initial patch.
> > 
> > Should we add Tejun to the thread?
> 
> Let's get organized first, but that would be a good thing.  Or I could
> reach out to Tejun internally.
> 
> For but one thing to get organized about, maybe kfree_rcu() should be
> using a workqueue with the WQ_MEM_RECLAIM flag set.
> 
It can be as an option to consider. Because such workqueue has some
special priority for better handling of memory releasing. I can have
a look at it closer to see how kvfree_rcu() works if it goes with WQ_MEM_RECLAIM.

--
Uladzislau Rezki
  
Uladzislau Rezki Nov. 3, 2022, 12:54 p.m. UTC | #19
> 
> 
> Nice, I am happy you worked on it. A comment bellow:
> 
> Below what i have came up with to switch for polling APIs:
> >
> > From 799ce1653d159ef3d35f34a284f738c2c267c75f Mon Sep 17 00:00:00 2001
> > From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
> > Date: Wed, 2 Nov 2022 19:26:27 +0100
> > Subject: [PATCH 1/1] rcu: kvfree_rcu: Reduce a memory footptint by using
> >  polling APIs
> >
> > Total time taken by all kfree'ers: 6564718459 ns, loops: 10000, batches:
> > 1110, memory footprint: 5057MB
> > Total time taken by all kfree'ers: 8431051895 ns, loops: 10000, batches:
> > 1109, memory footprint: 2749MB
> > Total time taken by all kfree'ers: 9477830789 ns, loops: 10000, batches:
> > 1158, memory footprint: 2934MB
> > Total time taken by all kfree'ers: 9950211144 ns, loops: 10000, batches:
> > 981, memory footprint: 2704MB
> >
> > with a patch:
> 
> 
> >
> > Total time taken by all kfree'ers: 7712110118 ns, loops: 10000, batches:
> > 1660, memory footprint: 91MB
> > Total time taken by all kfree'ers: 7002403664 ns, loops: 10000, batches:
> > 1482, memory footprint: 86MB
> > Total time taken by all kfree'ers: 7842282319 ns, loops: 10000, batches:
> > 1738, memory footprint: 86MB
> > Total time taken by all kfree'ers: 7230161977 ns, loops: 10000, batches:
> > 1542, memory footprint: 72MB
> >
> > Tested with NOCB option, all offloading CPUs:
> 
> 
> > kvm.sh --memory 10G --torture rcuscale --allcpus --duration 1 \
> >   --kconfig CONFIG_NR_CPUS=64 \
> >   --kconfig CONFIG_RCU_NOCB_CPU=y \
> >   --kconfig CONFIG_RCU_NOCB_CPU_DEFAULT_ALL=y \
> >   --bootargs "rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 \
> >   rcuscale.holdoff=20 rcuscale.kfree_loops=10000
> > torture.disable_onoff_at_boot" --trust-make
> >
> > According to data there is a big gain in memory footprint with a patch.
> > It is because of call_rcu() and call_rcu_flush() take more effort and
> > time to queue a callback and then wait for a gp.
> 
> 
> >
> > With polling API:
> >   a) we do not need to queue any callback;
> >   b) we might not even need wait for a GP completion.
> >
> 
> Wow, I did not expect an extra grace period or 2 to cause so much memory
> unreclaimed.
> 
> Does this kernel have the 5 second timeout change?
> 
Yes. But it does not matter. Why? Because 5 second time out happens only
when a system lightly loaded. It can be 10 or 100 seconds.

Once a flood is detected we start to offload back-logged memory.

>
> If yes, then I am confused why extra grace period matters for memory. My
> goal with the idea was to avoid pointless RCU work when we don’t need it.
> But I can’t complaint.
> 
This patch does the same. It avoids of using any call_rcu() because we
do not need it. I mean extra layer.

Why it matters for the memory. I explained it in previous answer. But it
is because of extra layer. Since it is a flood scenario such delays can
lead to higher memory footprint.

The test easily detects it.

>
> Or is it that this test floods the pointer page so we do not wait full 5
> seconds, and benefit when we reclaim sooner with polled API, than waiting
> for grace periods?
> 
Right. Reclaim sooner with polling API.

--
Uladzislau Rezki
  
Uladzislau Rezki Nov. 3, 2022, 1:05 p.m. UTC | #20
On Thu, Nov 03, 2022 at 01:44:51PM +0100, Uladzislau Rezki wrote:
> > 
> > > Though I am thinking, workqueue context is normally used to invoke
> > > code that can block, and would the issue you mentioned affect those as
> > > well, or affect RCU when those non-RCU work items block. So for
> > > example, when other things in the system that can queue things on the
> > > system_wq and block.  (I might be throwing darts in the dark).
> > > 
> > > To be safe, we can implement your suggestion which is basically a form
> > > of my initial patch.
> > > 
> > > Should we add Tejun to the thread?
> > 
> > Let's get organized first, but that would be a good thing.  Or I could
> > reach out to Tejun internally.
> > 
> > For but one thing to get organized about, maybe kfree_rcu() should be
> > using a workqueue with the WQ_MEM_RECLAIM flag set.
> > 
> It can be as an option to consider. Because such workqueue has some
> special priority for better handling of memory releasing. I can have
> a look at it closer to see how kvfree_rcu() works if it goes with WQ_MEM_RECLAIM.
> 
An extra note. It would work well with posted patch because we can
directly queue the reclaim work to the WQ_MEM_RECLAIM queue.

As for now RCU-core kthreads like: rcugp, rcuop use "regular"
queue. I think system_wq one.

--
Uladzislau Rezki
  
Paul E. McKenney Nov. 3, 2022, 4:34 p.m. UTC | #21
On Thu, Nov 03, 2022 at 02:05:23PM +0100, Uladzislau Rezki wrote:
> On Thu, Nov 03, 2022 at 01:44:51PM +0100, Uladzislau Rezki wrote:
> > > 
> > > > Though I am thinking, workqueue context is normally used to invoke
> > > > code that can block, and would the issue you mentioned affect those as
> > > > well, or affect RCU when those non-RCU work items block. So for
> > > > example, when other things in the system that can queue things on the
> > > > system_wq and block.  (I might be throwing darts in the dark).
> > > > 
> > > > To be safe, we can implement your suggestion which is basically a form
> > > > of my initial patch.
> > > > 
> > > > Should we add Tejun to the thread?
> > > 
> > > Let's get organized first, but that would be a good thing.  Or I could
> > > reach out to Tejun internally.
> > > 
> > > For but one thing to get organized about, maybe kfree_rcu() should be
> > > using a workqueue with the WQ_MEM_RECLAIM flag set.
> > > 
> > It can be as an option to consider. Because such workqueue has some
> > special priority for better handling of memory releasing. I can have
> > a look at it closer to see how kvfree_rcu() works if it goes with WQ_MEM_RECLAIM.
> > 
> An extra note. It would work well with posted patch because we can
> directly queue the reclaim work to the WQ_MEM_RECLAIM queue.
> 
> As for now RCU-core kthreads like: rcugp, rcuop use "regular"
> queue. I think system_wq one.

Agreed, it is the expedited grace period that uses WQ_MEM_RECLAIM
workqueues.

							Thanx, Paul
  
Paul E. McKenney Nov. 3, 2022, 5:51 p.m. UTC | #22
On Thu, Nov 03, 2022 at 01:41:43PM +0100, Uladzislau Rezki wrote:
> > > >> /**
> > > >> @@ -3066,10 +3068,12 @@ static void kfree_rcu_work(struct work_struct *work)
> > > >>    struct kfree_rcu_cpu_work *krwp;
> > > >>    int i, j;
> > > >> 
> > > >> -    krwp = container_of(to_rcu_work(work),
> > > >> +    krwp = container_of(work,
> > > >>                struct kfree_rcu_cpu_work, rcu_work);
> > > >>    krcp = krwp->krcp;
> > > >> 
> > > >> +    cond_synchronize_rcu(krwp->gp_snap);
> > > > 
> > > > Might this provoke OOMs in case of callback flooding?
> > > > 
> > > > An alternative might be something like this:
> > > > 
> > > >    if (!poll_state_synchronize_rcu(krwp->gp_snap)) {
> > > >        queue_rcu_work(system_wq, &krwp->rcu_work);
> > > >        return;
> > > >    }
> > > > 
> > > > Either way gets you a non-lazy callback in the case where a grace
> > > > period has not yet elapsed.
> > > > Or am I missing something that prevents OOMs here?
> > > 
> > > The memory consumptions appears to be much less in his testing with the onslaught of kfree, which makes OOM probably less likely.
> > > 
> > > Though, was your reasoning that in case of a grace period not elapsing, we need a non lazy callback queued, so as to make the reclaim happen sooner?
> > > 
> > > If so, the cond_synchronize_rcu() should already be conditionally queueing non-lazy CB since we don’t make synchronous users wait for seconds. Or did I miss something?
> > 
> > My concern is that the synchronize_rcu() will block a kworker kthread
> > for some time, and that in callback-flood situations this might slow
> > things down due to exhausting the supply of kworkers.
> > 
> This concern works in both cases. I mean in default configuration and
> with a posted patch. The reclaim work, which name is kfree_rcu_work() only
> does a progress when a gp is passed so the rcu_work_rcufn() can queue
> our reclaim kworker.
> 
> As it is now:
> 
> 1. Collect pointers, then we decide to drop them we queue the
>    monitro_work() worker to the system_wq.
> 
> 2. The monitor work, kfree_rcu_work(), tries to attach or saying
> it by another words bypass a "backlog" to "free" channels.
> 
> 3. It invokes the queue_rcu_work() that does call_rcu_flush() and
> in its turn it queues our worker from the handler. So the worker
> is run after GP is passed.

So as it is now, we are not tying up a kworker kthread while waiting
for the grace period, correct?  We instead have an RCU callback queued
during that time, and the kworker kthread gets involved only after the
grace period ends.

> With a patch: 
> 
> [1] and [2] steps are the same. But on third step we do:
> 
> 1. Record the GP status for last in channel;
> 2. Directly queue the drain work without any call_rcu() helpers;
> 3. On the reclaim worker entry we check if GP is passed;
> 4. If not it invokes synchronize_rcu().

And #4 changes that, by (sometimes) tying up a kworker kthread for the
full grace period.

> The patch eliminates extra steps by not going via RCU-core route
> instead it directly invokes the reclaim worker where it either
> proceed or wait a GP if needed.

I agree that the use of the polled API could be reducing delays, which
is a good thing.  Just being my usual greedy self and asking "Why not
both?", that is use queue_rcu_work() instead of synchronize_rcu() in
conjunction with the polled APIs so as to avoid both the grace-period
delay and the tying up of the kworker kthread.

Or am I missing something here?

							Thanx, Paul
  
Paul E. McKenney Nov. 3, 2022, 5:52 p.m. UTC | #23
On Thu, Nov 03, 2022 at 01:44:51PM +0100, Uladzislau Rezki wrote:
> > 
> > > Though I am thinking, workqueue context is normally used to invoke
> > > code that can block, and would the issue you mentioned affect those as
> > > well, or affect RCU when those non-RCU work items block. So for
> > > example, when other things in the system that can queue things on the
> > > system_wq and block.  (I might be throwing darts in the dark).
> > > 
> > > To be safe, we can implement your suggestion which is basically a form
> > > of my initial patch.
> > > 
> > > Should we add Tejun to the thread?
> > 
> > Let's get organized first, but that would be a good thing.  Or I could
> > reach out to Tejun internally.
> > 
> > For but one thing to get organized about, maybe kfree_rcu() should be
> > using a workqueue with the WQ_MEM_RECLAIM flag set.
> > 
> It can be as an option to consider. Because such workqueue has some
> special priority for better handling of memory releasing. I can have
> a look at it closer to see how kvfree_rcu() works if it goes with WQ_MEM_RECLAIM.

Sounds good, thank you!

							Thanx, Paul
  
Joel Fernandes Nov. 3, 2022, 6:36 p.m. UTC | #24
> On Nov 3, 2022, at 1:51 PM, Paul E. McKenney <paulmck@kernel.org> wrote:
> 
> On Thu, Nov 03, 2022 at 01:41:43PM +0100, Uladzislau Rezki wrote:
>>>>>> /**
>>>>>> @@ -3066,10 +3068,12 @@ static void kfree_rcu_work(struct work_struct *work)
>>>>>>   struct kfree_rcu_cpu_work *krwp;
>>>>>>   int i, j;
>>>>>> 
>>>>>> -    krwp = container_of(to_rcu_work(work),
>>>>>> +    krwp = container_of(work,
>>>>>>               struct kfree_rcu_cpu_work, rcu_work);
>>>>>>   krcp = krwp->krcp;
>>>>>> 
>>>>>> +    cond_synchronize_rcu(krwp->gp_snap);
>>>>> 
>>>>> Might this provoke OOMs in case of callback flooding?
>>>>> 
>>>>> An alternative might be something like this:
>>>>> 
>>>>>   if (!poll_state_synchronize_rcu(krwp->gp_snap)) {
>>>>>       queue_rcu_work(system_wq, &krwp->rcu_work);
>>>>>       return;
>>>>>   }
>>>>> 
>>>>> Either way gets you a non-lazy callback in the case where a grace
>>>>> period has not yet elapsed.
>>>>> Or am I missing something that prevents OOMs here?
>>>> 
>>>> The memory consumptions appears to be much less in his testing with the onslaught of kfree, which makes OOM probably less likely.
>>>> 
>>>> Though, was your reasoning that in case of a grace period not elapsing, we need a non lazy callback queued, so as to make the reclaim happen sooner?
>>>> 
>>>> If so, the cond_synchronize_rcu() should already be conditionally queueing non-lazy CB since we don’t make synchronous users wait for seconds. Or did I miss something?
>>> 
>>> My concern is that the synchronize_rcu() will block a kworker kthread
>>> for some time, and that in callback-flood situations this might slow
>>> things down due to exhausting the supply of kworkers.
>>> 
>> This concern works in both cases. I mean in default configuration and
>> with a posted patch. The reclaim work, which name is kfree_rcu_work() only
>> does a progress when a gp is passed so the rcu_work_rcufn() can queue
>> our reclaim kworker.
>> 
>> As it is now:
>> 
>> 1. Collect pointers, then we decide to drop them we queue the
>>   monitro_work() worker to the system_wq.
>> 
>> 2. The monitor work, kfree_rcu_work(), tries to attach or saying
>> it by another words bypass a "backlog" to "free" channels.
>> 
>> 3. It invokes the queue_rcu_work() that does call_rcu_flush() and
>> in its turn it queues our worker from the handler. So the worker
>> is run after GP is passed.
> 
> So as it is now, we are not tying up a kworker kthread while waiting
> for the grace period, correct?  We instead have an RCU callback queued
> during that time, and the kworker kthread gets involved only after the
> grace period ends.
> 
>> With a patch: 
>> 
>> [1] and [2] steps are the same. But on third step we do:
>> 
>> 1. Record the GP status for last in channel;
>> 2. Directly queue the drain work without any call_rcu() helpers;
>> 3. On the reclaim worker entry we check if GP is passed;
>> 4. If not it invokes synchronize_rcu().
> 
> And #4 changes that, by (sometimes) tying up a kworker kthread for the
> full grace period.
> 
>> The patch eliminates extra steps by not going via RCU-core route
>> instead it directly invokes the reclaim worker where it either
>> proceed or wait a GP if needed.
> 
> I agree that the use of the polled API could be reducing delays, which
> is a good thing.  Just being my usual greedy self and asking "Why not
> both?", that is use queue_rcu_work() instead of synchronize_rcu() in
> conjunction with the polled APIs so as to avoid both the grace-period
> delay and the tying up of the kworker kthread.
> 
> Or am I missing something here?

Yeah I am with Paul on this, NAK on “blocking in kworker” instead of “checking for grace period + queuing either regular work or RCU work”. Note that blocking also adds a pointless and fully avoidable scheduler round trip.

 - Joel


> 
>                            Thanx, Paul
  
Joel Fernandes Nov. 3, 2022, 6:43 p.m. UTC | #25
> On Nov 3, 2022, at 2:36 PM, Joel Fernandes <joel@joelfernandes.org> wrote:
> 
> 
> 
>> On Nov 3, 2022, at 1:51 PM, Paul E. McKenney <paulmck@kernel.org> wrote:
>> 
>> On Thu, Nov 03, 2022 at 01:41:43PM +0100, Uladzislau Rezki wrote:
>>>>>>> /**
>>>>>>> @@ -3066,10 +3068,12 @@ static void kfree_rcu_work(struct work_struct *work)
>>>>>>>  struct kfree_rcu_cpu_work *krwp;
>>>>>>>  int i, j;
>>>>>>> 
>>>>>>> -    krwp = container_of(to_rcu_work(work),
>>>>>>> +    krwp = container_of(work,
>>>>>>>              struct kfree_rcu_cpu_work, rcu_work);
>>>>>>>  krcp = krwp->krcp;
>>>>>>> 
>>>>>>> +    cond_synchronize_rcu(krwp->gp_snap);
>>>>>> 
>>>>>> Might this provoke OOMs in case of callback flooding?
>>>>>> 
>>>>>> An alternative might be something like this:
>>>>>> 
>>>>>>  if (!poll_state_synchronize_rcu(krwp->gp_snap)) {
>>>>>>      queue_rcu_work(system_wq, &krwp->rcu_work);
>>>>>>      return;
>>>>>>  }
>>>>>> 
>>>>>> Either way gets you a non-lazy callback in the case where a grace
>>>>>> period has not yet elapsed.
>>>>>> Or am I missing something that prevents OOMs here?
>>>>> 
>>>>> The memory consumptions appears to be much less in his testing with the onslaught of kfree, which makes OOM probably less likely.
>>>>> 
>>>>> Though, was your reasoning that in case of a grace period not elapsing, we need a non lazy callback queued, so as to make the reclaim happen sooner?
>>>>> 
>>>>> If so, the cond_synchronize_rcu() should already be conditionally queueing non-lazy CB since we don’t make synchronous users wait for seconds. Or did I miss something?
>>>> 
>>>> My concern is that the synchronize_rcu() will block a kworker kthread
>>>> for some time, and that in callback-flood situations this might slow
>>>> things down due to exhausting the supply of kworkers.
>>>> 
>>> This concern works in both cases. I mean in default configuration and
>>> with a posted patch. The reclaim work, which name is kfree_rcu_work() only
>>> does a progress when a gp is passed so the rcu_work_rcufn() can queue
>>> our reclaim kworker.
>>> 
>>> As it is now:
>>> 
>>> 1. Collect pointers, then we decide to drop them we queue the
>>>  monitro_work() worker to the system_wq.
>>> 
>>> 2. The monitor work, kfree_rcu_work(), tries to attach or saying
>>> it by another words bypass a "backlog" to "free" channels.
>>> 
>>> 3. It invokes the queue_rcu_work() that does call_rcu_flush() and
>>> in its turn it queues our worker from the handler. So the worker
>>> is run after GP is passed.
>> 
>> So as it is now, we are not tying up a kworker kthread while waiting
>> for the grace period, correct?  We instead have an RCU callback queued
>> during that time, and the kworker kthread gets involved only after the
>> grace period ends.
>> 
>>> With a patch: 
>>> 
>>> [1] and [2] steps are the same. But on third step we do:
>>> 
>>> 1. Record the GP status for last in channel;
>>> 2. Directly queue the drain work without any call_rcu() helpers;
>>> 3. On the reclaim worker entry we check if GP is passed;
>>> 4. If not it invokes synchronize_rcu().
>> 
>> And #4 changes that, by (sometimes) tying up a kworker kthread for the
>> full grace period.
>> 
>>> The patch eliminates extra steps by not going via RCU-core route
>>> instead it directly invokes the reclaim worker where it either
>>> proceed or wait a GP if needed.
>> 
>> I agree that the use of the polled API could be reducing delays, which
>> is a good thing.  Just being my usual greedy self and asking "Why not
>> both?", that is use queue_rcu_work() instead of synchronize_rcu() in
>> conjunction with the polled APIs so as to avoid both the grace-period
>> delay and the tying up of the kworker kthread.
>> 
>> Or am I missing something here?
> 
> Yeah I am with Paul on this, NAK on “blocking in kworker” instead of “checking for grace period + queuing either regular work or RCU work”. Note that blocking also adds a pointless and fully avoidable scheduler round trip.

As a side note, it’s notable how nicely this work evolved over the years thanks to Vlad and all of y’all’s work. For instance, flooding pages with kfree pointers and grace period polling was not something even invented back when kfree_rcu was a simple wrapper. Now it soon will be actually freeing memory faster, by avoiding waiting on RCU when not needed! And of course this is all happening probably because we wanted RCU to be lazy in nocb is a nice side effect of that effort ;-)

 - Joel


> 
> - Joel
> 
> 
>> 
>>                           Thanx, Paul
  
Uladzislau Rezki Nov. 4, 2022, 2:35 p.m. UTC | #26
> > > 
> > This concern works in both cases. I mean in default configuration and
> > with a posted patch. The reclaim work, which name is kfree_rcu_work() only
> > does a progress when a gp is passed so the rcu_work_rcufn() can queue
> > our reclaim kworker.
> > 
> > As it is now:
> > 
> > 1. Collect pointers, then we decide to drop them we queue the
> >    monitro_work() worker to the system_wq.
> > 
> > 2. The monitor work, kfree_rcu_work(), tries to attach or saying
> > it by another words bypass a "backlog" to "free" channels.
> > 
> > 3. It invokes the queue_rcu_work() that does call_rcu_flush() and
> > in its turn it queues our worker from the handler. So the worker
> > is run after GP is passed.
> 
> So as it is now, we are not tying up a kworker kthread while waiting
> for the grace period, correct?  We instead have an RCU callback queued
> during that time, and the kworker kthread gets involved only after the
> grace period ends.
> 
Yes. The reclaim kworker is not kicked until a wrapper callback pushed
via queue_rcu_work() -> call_rcu_flush() is invoked by the nocb-kthread:

<snip>
static void rcu_work_rcufn(struct rcu_head *rcu)
{
...
	/* read the comment in __queue_work() */
	local_irq_disable();
	__queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work);
	local_irq_enable();
}

bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)
{
...
	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
		rwork->wq = wq;
		call_rcu_flush(&rwork->rcu, rcu_work_rcufn);
..
<snip>

rcu_work_rcufn() callback runs our reclaim work after a full grace period.
There is not so good dependency. If it is number, say, 10 000 in the cblist
then we just wait for our time in a queue.

> > With a patch: 
> > 
> > [1] and [2] steps are the same. But on third step we do:
> > 
> > 1. Record the GP status for last in channel;
> > 2. Directly queue the drain work without any call_rcu() helpers;
> > 3. On the reclaim worker entry we check if GP is passed;
> > 4. If not it invokes synchronize_rcu().
> 
> And #4 changes that, by (sometimes) tying up a kworker kthread for the
> full grace period.
> 
Yes if GP is not passed we need to wait it anyway. Like this is done in
default case. call_rcu()'s callback waits for a full grace period also
and after that queues the work.

It is the same. But patched version can proceed reclaim right away if
"during the fly" the GP is passed. On my tests i see it is ~30-40% of
cases.

With a patch it also does not require go over RCU-core layer that is
definitely longer.

> > The patch eliminates extra steps by not going via RCU-core route
> > instead it directly invokes the reclaim worker where it either
> > proceed or wait a GP if needed.
> 
> I agree that the use of the polled API could be reducing delays, which
> is a good thing.  Just being my usual greedy self and asking "Why not
> both?", that is use queue_rcu_work() instead of synchronize_rcu() in
> conjunction with the polled APIs so as to avoid both the grace-period
> delay and the tying up of the kworker kthread.
> 
> Or am I missing something here?
> 
Looks like not :) 

1.
I was thinking about tracking a GP status for set of objects. With last
in one we need to keep it updated for the whole set. Such objects can be
freed faster, for example from monitor kworker. All other can go over
queue_rcu_work().

This is more complex design but probably could be better. It will require
split such chains for whom a GP is passed and not.

Please note there can be hidden problems and what i wrote might not be
easily applicable.

The patch i sent is an attempt to optimize it by not introducing the
big delta to current design.

2. 
call_rcu()
   -> queue_work();
          -> do reclaim

queue_work();
    -> synchronize_rcu() or not if passed, do reclaim

I can miss something though, why the latest is a bad way?

--
Uladzislau Rezki
  
Uladzislau Rezki Nov. 4, 2022, 2:39 p.m. UTC | #27
> >> 
> >> Or am I missing something here?
> > 
> > Yeah I am with Paul on this, NAK on “blocking in kworker” instead of “checking for grace period + queuing either regular work or RCU work”. Note that blocking also adds a pointless and fully avoidable scheduler round trip.
> 
> As a side note, it’s notable how nicely this work evolved over the years thanks to Vlad and all of y’all’s work. For instance, flooding pages with kfree pointers and grace period polling was not something even invented back when kfree_rcu was a simple wrapper. Now it soon will be actually freeing memory faster, by avoiding waiting on RCU when not needed! And of course this is all happening probably because we wanted RCU to be lazy in nocb is a nice side effect of that effort ;-)
> 
Yep. We want sometimes everything to be lazy :)

--
Uladzislau Rezki
  

Patch

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 591187b6352e..3e4c50b9fd33 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2935,6 +2935,7 @@  struct kfree_rcu_cpu_work {
 
 /**
  * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
+ * @rdp: The rdp of the CPU that this kfree_rcu corresponds to.
  * @head: List of kfree_rcu() objects not yet waiting for a grace period
  * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
  * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
@@ -2964,6 +2965,8 @@  struct kfree_rcu_cpu {
 	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
 	raw_spinlock_t lock;
 	struct delayed_work monitor_work;
+	struct rcu_data *rdp;
+	unsigned long last_gp_seq;
 	bool initialized;
 	int count;
 
@@ -3167,6 +3170,7 @@  schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
 			mod_delayed_work(system_wq, &krcp->monitor_work, delay);
 		return;
 	}
+	krcp->last_gp_seq = krcp->rdp->gp_seq;
 	queue_delayed_work(system_wq, &krcp->monitor_work, delay);
 }
 
@@ -3217,7 +3221,17 @@  static void kfree_rcu_monitor(struct work_struct *work)
 			// be that the work is in the pending state when
 			// channels have been detached following by each
 			// other.
-			queue_rcu_work(system_wq, &krwp->rcu_work);
+			//
+			// NOTE about gp_seq wrap: In case of gp_seq overflow,
+			// it is possible for rdp->gp_seq to be less than
+			// krcp->last_gp_seq even though a GP might be over. In
+			// this rare case, we would just have one extra GP.
+			if (krcp->last_gp_seq &&
+			    rcu_seq_completed_gp(krcp->last_gp_seq, krcp->rdp->gp_seq)) {
+				queue_work(system_wq, &krwp->rcu_work.work);
+			} else {
+				queue_rcu_work(system_wq, &krwp->rcu_work);
+			}
 		}
 	}
 
@@ -4802,6 +4816,8 @@  static void __init kfree_rcu_batch_init(void)
 	for_each_possible_cpu(cpu) {
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
 
+		krcp->rdp = per_cpu_ptr(&rcu_data, cpu);
+		krcp->last_gp_seq = 0;
 		for (i = 0; i < KFREE_N_BATCHES; i++) {
 			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
 			krcp->krw_arr[i].krcp = krcp;