[RFC,2/2] time: alarmtimer: Use TASK_FREEZABLE to cleanup freezer handling

Message ID 20230211064527.3481754-2-jstultz@google.com
State New
Headers
Series [RFC,1/2] time: alarmtimer: Fix erroneous case of using 0 as an "invalid" initialization value |

Commit Message

John Stultz Feb. 11, 2023, 6:45 a.m. UTC
  Instead of trying to handle the freezer waking up tasks from
schedule() in nanosleep on alarmtimers explicitly, use
TASK_FREEZABLE which marks the task freezable when it goes
to schedule, which prevents the signal wakeup.

This allows for the freezer handling to be removed, simplifying
the code.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Stephen Boyd <sboyd@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Michael <michael@mipisi.de>
Cc: Michael Trimarchi <michael@amarulasolutions.com>
Cc: kernel-team@android.com
Originally-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/lkml/alpine.DEB.2.21.1909021247250.3955@nanos.tec.linutronix.de/
[jstultz: Forward ported to 6.2-rc and split out from a separate
          fix.]
Signed-off-by: John Stultz <jstultz@google.com>
---
 kernel/time/alarmtimer.c | 53 ++--------------------------------------
 1 file changed, 2 insertions(+), 51 deletions(-)
  

Comments

Michael Nazzareno Trimarchi Feb. 15, 2023, 8:22 a.m. UTC | #1
Hi John

On Sat, Feb 11, 2023 at 7:45 AM John Stultz <jstultz@google.com> wrote:
>
> Instead of trying to handle the freezer waking up tasks from
> schedule() in nanosleep on alarmtimers explicitly, use
> TASK_FREEZABLE which marks the task freezable when it goes
> to schedule, which prevents the signal wakeup.
>
> This allows for the freezer handling to be removed, simplifying
> the code.
>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Stephen Boyd <sboyd@kernel.org>
> Cc: Arnd Bergmann <arnd@arndb.de>
> Cc: Michael <michael@mipisi.de>
> Cc: Michael Trimarchi <michael@amarulasolutions.com>
> Cc: kernel-team@android.com
> Originally-by: Thomas Gleixner <tglx@linutronix.de>
> Link: https://lore.kernel.org/lkml/alpine.DEB.2.21.1909021247250.3955@nanos.tec.linutronix.de/
> [jstultz: Forward ported to 6.2-rc and split out from a separate
>           fix.]
> Signed-off-by: John Stultz <jstultz@google.com>
> ---
>  kernel/time/alarmtimer.c | 53 ++--------------------------------------
>  1 file changed, 2 insertions(+), 51 deletions(-)
>
> diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
> index f7b2128f64e2..15ecde8fcc1b 100644
> --- a/kernel/time/alarmtimer.c
> +++ b/kernel/time/alarmtimer.c
> @@ -49,14 +49,6 @@ static struct alarm_base {
>         clockid_t               base_clockid;
>  } alarm_bases[ALARM_NUMTYPE];
>
> -#if defined(CONFIG_POSIX_TIMERS) || defined(CONFIG_RTC_CLASS)
> -/* freezer information to handle clock_nanosleep triggered wakeups */
> -static enum alarmtimer_type freezer_alarmtype;
> -static ktime_t freezer_expires;
> -static ktime_t freezer_delta;
> -static DEFINE_SPINLOCK(freezer_delta_lock);
> -#endif
> -
>  #ifdef CONFIG_RTC_CLASS
>  /* rtc timer and device for setting alarm wakeups at suspend */
>  static struct rtc_timer                rtctimer;
> @@ -241,19 +233,12 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining);
>   */
>  static int alarmtimer_suspend(struct device *dev)
>  {
> -       ktime_t min, now, expires;
> +       ktime_t now, expires, min = KTIME_MAX;
>         int i, ret, type;
>         struct rtc_device *rtc;
>         unsigned long flags;
>         struct rtc_time tm;
>
> -       spin_lock_irqsave(&freezer_delta_lock, flags);
> -       min = freezer_delta;
> -       expires = freezer_expires;
> -       type = freezer_alarmtype;
> -       freezer_delta = KTIME_MAX;
> -       spin_unlock_irqrestore(&freezer_delta_lock, flags);
> -
>         rtc = alarmtimer_get_rtcdev();
>         /* If we have no rtcdev, just return */
>         if (!rtc)
> @@ -480,38 +465,6 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
>  EXPORT_SYMBOL_GPL(alarm_forward_now);
>
>  #ifdef CONFIG_POSIX_TIMERS
> -
> -static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
> -{
> -       struct alarm_base *base;
> -       unsigned long flags;
> -       ktime_t delta;
> -
> -       switch(type) {
> -       case ALARM_REALTIME:
> -               base = &alarm_bases[ALARM_REALTIME];
> -               type = ALARM_REALTIME_FREEZER;
> -               break;
> -       case ALARM_BOOTTIME:
> -               base = &alarm_bases[ALARM_BOOTTIME];
> -               type = ALARM_BOOTTIME_FREEZER;
> -               break;
> -       default:
> -               WARN_ONCE(1, "Invalid alarm type: %d\n", type);
> -               return;
> -       }
> -
> -       delta = ktime_sub(absexp, base->get_ktime());
> -
> -       spin_lock_irqsave(&freezer_delta_lock, flags);
> -       if (delta < freezer_delta) {
> -               freezer_delta = delta;
> -               freezer_expires = absexp;
> -               freezer_alarmtype = type;
> -       }
> -       spin_unlock_irqrestore(&freezer_delta_lock, flags);
> -}
> -
>  /**
>   * clock2alarm - helper that converts from clockid to alarmtypes
>   * @clockid: clockid.
> @@ -750,7 +703,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
>         struct restart_block *restart;
>         alarm->data = (void *)current;
>         do {
> -               set_current_state(TASK_INTERRUPTIBLE);
> +               set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);

For kernel 5.10.x and lts is possible to use freezable_schedule and
let this set_current_state as was before.
I have seen patch that introduce the new state but I suppose that in
order to be compatible to stable this should be the
change. Am I right?

Michael

>                 alarm_start(alarm, absexp);
>                 if (likely(alarm->data))
>                         schedule();
> @@ -765,8 +718,6 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
>         if (!alarm->data)
>                 return 0;
>
> -       if (freezing(current))
> -               alarmtimer_freezerset(absexp, type);
>         restart = &current->restart_block;
>         if (restart->nanosleep.type != TT_NONE) {
>                 struct timespec64 rmt;
> --
> 2.39.1.581.gbfd45094c4-goog
>
  
Thomas Gleixner Feb. 15, 2023, 1:52 p.m. UTC | #2
On Wed, Feb 15 2023 at 09:22, Michael Nazzareno Trimarchi wrote:
> On Sat, Feb 11, 2023 at 7:45 AM John Stultz <jstultz@google.com> wrote:
>>  /**
>>   * clock2alarm - helper that converts from clockid to alarmtypes
>>   * @clockid: clockid.
>> @@ -750,7 +703,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
>>         struct restart_block *restart;
>>         alarm->data = (void *)current;
>>         do {
>> -               set_current_state(TASK_INTERRUPTIBLE);
>> +               set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
>
> For kernel 5.10.x and lts is possible to use freezable_schedule and
> let this set_current_state as was before.
> I have seen patch that introduce the new state but I suppose that in
> order to be compatible to stable this should be the
> change. Am I right?

We always work against upstream first and there freezable_schedule() is
gone. Backports have to be sorted seperately.

Thanks,

        tglx
  
Michael Nazzareno Trimarchi Feb. 18, 2023, 2:56 p.m. UTC | #3
Hi John

On Sat, Feb 11, 2023 at 7:45 AM John Stultz <jstultz@google.com> wrote:
>
> Instead of trying to handle the freezer waking up tasks from
> schedule() in nanosleep on alarmtimers explicitly, use
> TASK_FREEZABLE which marks the task freezable when it goes
> to schedule, which prevents the signal wakeup.
>
> This allows for the freezer handling to be removed, simplifying
> the code.
>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Stephen Boyd <sboyd@kernel.org>
> Cc: Arnd Bergmann <arnd@arndb.de>
> Cc: Michael <michael@mipisi.de>
> Cc: Michael Trimarchi <michael@amarulasolutions.com>
> Cc: kernel-team@android.com
> Originally-by: Thomas Gleixner <tglx@linutronix.de>
> Link: https://lore.kernel.org/lkml/alpine.DEB.2.21.1909021247250.3955@nanos.tec.linutronix.de/
> [jstultz: Forward ported to 6.2-rc and split out from a separate
>           fix.]
> Signed-off-by: John Stultz <jstultz@google.com>
> ---
>  kernel/time/alarmtimer.c | 53 ++--------------------------------------
>  1 file changed, 2 insertions(+), 51 deletions(-)
>
> diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
> index f7b2128f64e2..15ecde8fcc1b 100644
> --- a/kernel/time/alarmtimer.c
> +++ b/kernel/time/alarmtimer.c
> @@ -49,14 +49,6 @@ static struct alarm_base {
>         clockid_t               base_clockid;
>  } alarm_bases[ALARM_NUMTYPE];
>
> -#if defined(CONFIG_POSIX_TIMERS) || defined(CONFIG_RTC_CLASS)
> -/* freezer information to handle clock_nanosleep triggered wakeups */
> -static enum alarmtimer_type freezer_alarmtype;
> -static ktime_t freezer_expires;
> -static ktime_t freezer_delta;
> -static DEFINE_SPINLOCK(freezer_delta_lock);
> -#endif
> -
>  #ifdef CONFIG_RTC_CLASS
>  /* rtc timer and device for setting alarm wakeups at suspend */
>  static struct rtc_timer                rtctimer;
> @@ -241,19 +233,12 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining);
>   */
>  static int alarmtimer_suspend(struct device *dev)
>  {
> -       ktime_t min, now, expires;
> +       ktime_t now, expires, min = KTIME_MAX;
>         int i, ret, type;
>         struct rtc_device *rtc;
>         unsigned long flags;
>         struct rtc_time tm;
>
> -       spin_lock_irqsave(&freezer_delta_lock, flags);
> -       min = freezer_delta;
> -       expires = freezer_expires;
> -       type = freezer_alarmtype;
> -       freezer_delta = KTIME_MAX;
> -       spin_unlock_irqrestore(&freezer_delta_lock, flags);
> -
>         rtc = alarmtimer_get_rtcdev();
>         /* If we have no rtcdev, just return */
>         if (!rtc)
> @@ -480,38 +465,6 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
>  EXPORT_SYMBOL_GPL(alarm_forward_now);
>
>  #ifdef CONFIG_POSIX_TIMERS
> -
> -static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
> -{
> -       struct alarm_base *base;
> -       unsigned long flags;
> -       ktime_t delta;
> -
> -       switch(type) {
> -       case ALARM_REALTIME:
> -               base = &alarm_bases[ALARM_REALTIME];
> -               type = ALARM_REALTIME_FREEZER;
> -               break;
> -       case ALARM_BOOTTIME:
> -               base = &alarm_bases[ALARM_BOOTTIME];
> -               type = ALARM_BOOTTIME_FREEZER;
> -               break;
> -       default:
> -               WARN_ONCE(1, "Invalid alarm type: %d\n", type);
> -               return;
> -       }
> -
> -       delta = ktime_sub(absexp, base->get_ktime());
> -
> -       spin_lock_irqsave(&freezer_delta_lock, flags);
> -       if (delta < freezer_delta) {
> -               freezer_delta = delta;
> -               freezer_expires = absexp;
> -               freezer_alarmtype = type;
> -       }
> -       spin_unlock_irqrestore(&freezer_delta_lock, flags);
> -}
> -
>  /**
>   * clock2alarm - helper that converts from clockid to alarmtypes
>   * @clockid: clockid.
> @@ -750,7 +703,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
>         struct restart_block *restart;
>         alarm->data = (void *)current;
>         do {
> -               set_current_state(TASK_INTERRUPTIBLE);
> +               set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
>                 alarm_start(alarm, absexp);
>                 if (likely(alarm->data))
>                         schedule();
> @@ -765,8 +718,6 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
>         if (!alarm->data)
>                 return 0;
>
> -       if (freezing(current))
> -               alarmtimer_freezerset(absexp, type);
>         restart = &current->restart_block;
>         if (restart->nanosleep.type != TT_NONE) {
>                 struct timespec64 rmt;
> --
> 2.39.1.581.gbfd45094c4-goog
>

I have changed the alarm test to check some corner case

periodic_alarm
Start time (CLOCK_REALTIME_ALARM)[   85.624819] alarmtimer_enqueue: called
: 94:865096467
Setting alarm for every 4 seconds
Starting suspend loops
[   89.674127] PM: suspend entry (deep)
[   89.714916] Filesystems sync: 0.037 seconds
[   89.733594] Freezing user space processes
[   89.740680] Freezing user space processes completed (elapsed 0.002 seconds)
[   89.748593] OOM killer disabled.
[   89.752257] Freezing remaining freezable tasks
[   89.756807] alarmtimer_fired: called
[   89.756831] alarmtimer_dequeue: called <---- HERE

I have the dequeue but not an enquee of the periodic alarm. I was
thinking that create a periodic time of 4 seconds
and have the first alarm on suspend will always guarantee the re-arm
it but it's not working as I expect

Michael




[   89.767735] Freezing remaining freezable tasks completed (elapsed
0.003 seconds)
[   89.775626] printk: Suspending console(s) (use no_console_suspend to debug)
  
Thomas Gleixner Feb. 20, 2023, 7:23 a.m. UTC | #4
On Sat, Feb 18 2023 at 15:56, Michael Nazzareno Trimarchi wrote:
>
> I have changed the alarm test to check some corner case

Could you tell us please which test did you change and what the change is?

> periodic_alarm
> Start time (CLOCK_REALTIME_ALARM)[   85.624819] alarmtimer_enqueue: called
> : 94:865096467
> Setting alarm for every 4 seconds
> Starting suspend loops
> [   89.674127] PM: suspend entry (deep)
> [   89.714916] Filesystems sync: 0.037 seconds
> [   89.733594] Freezing user space processes
> [   89.740680] Freezing user space processes completed (elapsed 0.002 seconds)
> [   89.748593] OOM killer disabled.
> [   89.752257] Freezing remaining freezable tasks
> [   89.756807] alarmtimer_fired: called
> [   89.756831] alarmtimer_dequeue: called <---- HERE
>
> I have the dequeue but not an enquee of the periodic alarm. I was
> thinking that create a periodic time of 4 seconds
> and have the first alarm on suspend will always guarantee the re-arm
> it but it's not working as I expect

Again. You are not telling what you expect. It depends on how the timer
is set up whether the timer is self rearmed or not.

Thanks,

        tglx
  
Michael Nazzareno Trimarchi Feb. 20, 2023, 8:23 a.m. UTC | #5
Hi Thomas

On Mon, Feb 20, 2023 at 8:23 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> On Sat, Feb 18 2023 at 15:56, Michael Nazzareno Trimarchi wrote:
> >
> > I have changed the alarm test to check some corner case
>
> Could you tell us please which test did you change and what the change is?
>

  if (timer_create(CLOCK_REALTIME_ALARM, &se, &tm1) == -1) {
       printf("timer_create failed, %s unsupported?\n",
       clockstring(alarm_clock_id));
       exit(1);
  }

  clock_gettime(alarm_clock_id, &start_time);
  printf("Start time (%s): %ld:%ld\n", clockstring(alarm_clock_id),
                                start_time.tv_sec, start_time.tv_nsec);
  printf("Setting alarm for every %i seconds\n", SUSPEND_SECS);
  its1.it_value = start_time;
  its1.it_value.tv_sec += 4;
  /* Empiric value for get in between a freeze task and fire of the timer */
  its1.it_value.tv_nsec += 132079666;
  its1.it_interval.tv_sec = 4;
  its1.it_interval.tv_nsec = 0;

  timer_settime(tm1, TIMER_ABSTIME, &its1, &its2);

  printf("Starting suspend loops\n");
  while (1) {
      int ret;
      sleep(4);
      system("echo mem > /sys/power/state");
  }

> > periodic_alarm
> > Start time (CLOCK_REALTIME_ALARM)[   85.624819] alarmtimer_enqueue: called
> > : 94:865096467
> > Setting alarm for every 4 seconds
> > Starting suspend loops
> > [   89.674127] PM: suspend entry (deep)
> > [   89.714916] Filesystems sync: 0.037 seconds
> > [   89.733594] Freezing user space processes
> > [   89.740680] Freezing user space processes completed (elapsed 0.002 seconds)
> > [   89.748593] OOM killer disabled.
> > [   89.752257] Freezing remaining freezable tasks
> > [   89.756807] alarmtimer_fired: called
> > [   89.756831] alarmtimer_dequeue: called <---- HERE
> >
> > I have the dequeue but not an enquee of the periodic alarm. I was
> > thinking that create a periodic time of 4 seconds
> > and have the first alarm on suspend will always guarantee the re-arm
> > it but it's not working as I expect
>
> Again. You are not telling what you expect. It depends on how the timer
> is set up whether the timer is self rearmed or not.
>

Posted the pseudo code. As far as I understand, the timer periodic is
re-armed in get_signal
do_work_pending->do_signal()->get_signal(), then in the posix timer
code the enqueue_alarm is called. All the timers
used from suspend are coming from the expiration list that contains
only the enqueue alarm

My test case is a single core, arm and with only one REAL_TIME_ALARM
periodic timer created.

Michael

> Thanks,
>
>         tglx
  
Michael Nazzareno Trimarchi Feb. 20, 2023, 11:47 a.m. UTC | #6
Hi

On Mon, Feb 20, 2023 at 9:23 AM Michael Nazzareno Trimarchi
<michael@amarulasolutions.com> wrote:
>
> Hi Thomas
>
> On Mon, Feb 20, 2023 at 8:23 AM Thomas Gleixner <tglx@linutronix.de> wrote:
> >
> > On Sat, Feb 18 2023 at 15:56, Michael Nazzareno Trimarchi wrote:
> > >
> > > I have changed the alarm test to check some corner case
> >
> > Could you tell us please which test did you change and what the change is?
> >
>

There are no changes in the kernel apart pr_info on enqueue dequeue
and fired call
in alarmtimer.c. linux  master branch sha
38f8ccde04a3fa317b51b05e63c3cb57e1641931
and both patches applied

time: alarmtimer: Use TASK_FREEZABLE to cleanup freezer handling
time: alarmtimer: Fix erroneous case of using 0 as an "invalid"
initialization value

Michael

>   if (timer_create(CLOCK_REALTIME_ALARM, &se, &tm1) == -1) {
>        printf("timer_create failed, %s unsupported?\n",
>        clockstring(alarm_clock_id));
>        exit(1);
>   }
>
>   clock_gettime(alarm_clock_id, &start_time);
>   printf("Start time (%s): %ld:%ld\n", clockstring(alarm_clock_id),
>                                 start_time.tv_sec, start_time.tv_nsec);
>   printf("Setting alarm for every %i seconds\n", SUSPEND_SECS);
>   its1.it_value = start_time;
>   its1.it_value.tv_sec += 4;
>   /* Empiric value for get in between a freeze task and fire of the timer */
>   its1.it_value.tv_nsec += 132079666;
>   its1.it_interval.tv_sec = 4;
>   its1.it_interval.tv_nsec = 0;
>
>   timer_settime(tm1, TIMER_ABSTIME, &its1, &its2);
>
>   printf("Starting suspend loops\n");
>   while (1) {
>       int ret;
>       sleep(4);
>       system("echo mem > /sys/power/state");
>   }
>
> > > periodic_alarm
> > > Start time (CLOCK_REALTIME_ALARM)[   85.624819] alarmtimer_enqueue: called
> > > : 94:865096467
> > > Setting alarm for every 4 seconds
> > > Starting suspend loops
> > > [   89.674127] PM: suspend entry (deep)
> > > [   89.714916] Filesystems sync: 0.037 seconds
> > > [   89.733594] Freezing user space processes
> > > [   89.740680] Freezing user space processes completed (elapsed 0.002 seconds)
> > > [   89.748593] OOM killer disabled.
> > > [   89.752257] Freezing remaining freezable tasks
> > > [   89.756807] alarmtimer_fired: called
> > > [   89.756831] alarmtimer_dequeue: called <---- HERE
> > >
> > > I have the dequeue but not an enquee of the periodic alarm. I was
> > > thinking that create a periodic time of 4 seconds
> > > and have the first alarm on suspend will always guarantee the re-arm
> > > it but it's not working as I expect
> >
> > Again. You are not telling what you expect. It depends on how the timer
> > is set up whether the timer is self rearmed or not.
> >
>
> Posted the pseudo code. As far as I understand, the timer periodic is
> re-armed in get_signal
> do_work_pending->do_signal()->get_signal(), then in the posix timer
> code the enqueue_alarm is called. All the timers
> used from suspend are coming from the expiration list that contains
> only the enqueue alarm
>
> My test case is a single core, arm and with only one REAL_TIME_ALARM
> periodic timer created.
>
> Michael
>
> > Thanks,
> >
> >         tglx
  
Thomas Gleixner Feb. 20, 2023, 5:48 p.m. UTC | #7
Michael!

On Mon, Feb 20 2023 at 12:47, Michael Nazzareno Trimarchi wrote:
> On Mon, Feb 20, 2023 at 9:23 AM Michael Nazzareno Trimarchi
> <michael@amarulasolutions.com> wrote:
>> On Mon, Feb 20, 2023 at 8:23 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>> > > Starting suspend loops
>> > > [   89.674127] PM: suspend entry (deep)
>> > > [   89.714916] Filesystems sync: 0.037 seconds
>> > > [   89.733594] Freezing user space processes
>> > > [   89.740680] Freezing user space processes completed (elapsed 0.002 seconds)
>> > > [   89.748593] OOM killer disabled.
>> > > [   89.752257] Freezing remaining freezable tasks
>> > > [   89.756807] alarmtimer_fired: called
>> > > [   89.756831] alarmtimer_dequeue: called <---- HERE
>> > >
>> > > I have the dequeue but not an enquee of the periodic alarm. I was
>> > > thinking that create a periodic time of 4 seconds
>> > > and have the first alarm on suspend will always guarantee the re-arm
>> > > it but it's not working as I expect
>> >
>> > Again. You are not telling what you expect. It depends on how the timer
>> > is set up whether the timer is self rearmed or not.
>>
>> Posted the pseudo code. As far as I understand, the timer periodic is
>> re-armed in get_signal do_work_pending->do_signal()->get_signal(),
>> then in the posix timer code the enqueue_alarm is called. All the
>> timers used from suspend are coming from the expiration list that
>> contains only the enqueue alarm.

Let me try to decode the above.

Yes, periodic timers are re-armed when the signal is delivered, but that
happens way later after the system resumed.

Here is what I observe:

[   27.349352] alarmtimer_enqueue()

U: Before SUSPEND

[   31.353228] PM: suspend entry (s2idle)
[   31.388857] Filesystems sync: 0.033 seconds
[   31.418427] Freezing user space processes
[   31.422406] Freezing user space processes completed (elapsed 0.002 seconds)
[   31.425435] OOM killer disabled.
[   31.426833] Freezing remaining freezable tasks
[   31.429838] Freezing remaining freezable tasks completed (elapsed 0.001 seconds)
[   31.432922] printk: Suspending console(s) (use no_console_suspend to debug)
[   31.435912] alarmtimer alarmtimer.0.auto: PM: dpm_run_callback(): platform_pm_suspend+0x0/0x50 returns -16
[   31.435954] alarmtimer alarmtimer.0.auto: PM: failed to suspend: error -16

That means the RTC interrupt was raised before the system was able to suspend.

[   31.436077] PM: Some devices failed to suspend, or early wake event detected
[   31.444270] OOM killer enabled.
[   31.445011] Restarting tasks ... done.
[   31.446820] random: crng reseeded on system resumption
[   31.466019] PM: suspend exit

[   31.480283] alarmtimer_fired()
[   31.481403] alarmtimer_dequeue()     <- Signal queued

[   31.482596] alarmtimer_rearm()_      <- Signal delivery
[   31.483713] alarmtimer_enqueue()

U: ALRM signal received                 <- User space signal handler
U: Post Suspend                         <- system("echo .... >") returns

That's 6.2 + John's patches.

Thanks,

        tglx
  
Michael Nazzareno Trimarchi Feb. 20, 2023, 6:11 p.m. UTC | #8
Hi Thomas

On Mon, Feb 20, 2023 at 6:48 PM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> Michael!
>
> On Mon, Feb 20 2023 at 12:47, Michael Nazzareno Trimarchi wrote:
> > On Mon, Feb 20, 2023 at 9:23 AM Michael Nazzareno Trimarchi
> > <michael@amarulasolutions.com> wrote:
> >> On Mon, Feb 20, 2023 at 8:23 AM Thomas Gleixner <tglx@linutronix.de> wrote:
> >> > > Starting suspend loops
> >> > > [   89.674127] PM: suspend entry (deep)
> >> > > [   89.714916] Filesystems sync: 0.037 seconds
> >> > > [   89.733594] Freezing user space processes
> >> > > [   89.740680] Freezing user space processes completed (elapsed 0.002 seconds)
> >> > > [   89.748593] OOM killer disabled.
> >> > > [   89.752257] Freezing remaining freezable tasks
> >> > > [   89.756807] alarmtimer_fired: called
> >> > > [   89.756831] alarmtimer_dequeue: called <---- HERE
> >> > >
> >> > > I have the dequeue but not an enquee of the periodic alarm. I was
> >> > > thinking that create a periodic time of 4 seconds
> >> > > and have the first alarm on suspend will always guarantee the re-arm
> >> > > it but it's not working as I expect
> >> >
> >> > Again. You are not telling what you expect. It depends on how the timer
> >> > is set up whether the timer is self rearmed or not.
> >>
> >> Posted the pseudo code. As far as I understand, the timer periodic is
> >> re-armed in get_signal do_work_pending->do_signal()->get_signal(),
> >> then in the posix timer code the enqueue_alarm is called. All the
> >> timers used from suspend are coming from the expiration list that
> >> contains only the enqueue alarm.
>
> Let me try to decode the above.
>
> Yes, periodic timers are re-armed when the signal is delivered, but that
> happens way later after the system resumed.
>
> Here is what I observe:
>
> [   27.349352] alarmtimer_enqueue()
>
> U: Before SUSPEND
>
> [   31.353228] PM: suspend entry (s2idle)
> [   31.388857] Filesystems sync: 0.033 seconds
> [   31.418427] Freezing user space processes
> [   31.422406] Freezing user space processes completed (elapsed 0.002 seconds)
> [   31.425435] OOM killer disabled.
> [   31.426833] Freezing remaining freezable tasks
> [   31.429838] Freezing remaining freezable tasks completed (elapsed 0.001 seconds)
> [   31.432922] printk: Suspending console(s) (use no_console_suspend to debug)
> [   31.435912] alarmtimer alarmtimer.0.auto: PM: dpm_run_callback(): platform_pm_suspend+0x0/0x50 returns -16
> [   31.435954] alarmtimer alarmtimer.0.auto: PM: failed to suspend: error -16
>
> That means the RTC interrupt was raised before the system was able to suspend.

if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) {
    pm_wakeup_event(dev, 2 * MSEC_PER_SEC);
    return -EBUSY;
}

I think that above happens to you. So it means that you are too close
to this limit, can be?
Yes but the alarm for me was set to be fired just before freezing. Is
this a valid scenario? I can retest on 6.2

Let's say that I set an alarm to be fired just before the userspace
freeze happens. If I'm close to
it then then process will not process the signal to enquene again the
alarm in the list and then during
alarm suspend the list will be empty for the above.

So your suggestion is that something is wrong with my environment so I
will do the same on top of 6.2

Michael

>
> [   31.436077] PM: Some devices failed to suspend, or early wake event detected
> [   31.444270] OOM killer enabled.
> [   31.445011] Restarting tasks ... done.
> [   31.446820] random: crng reseeded on system resumption
> [   31.466019] PM: suspend exit
>
> [   31.480283] alarmtimer_fired()
> [   31.481403] alarmtimer_dequeue()     <- Signal queued
>
> [   31.482596] alarmtimer_rearm()_      <- Signal delivery
> [   31.483713] alarmtimer_enqueue()
>
> U: ALRM signal received                 <- User space signal handler
> U: Post Suspend                         <- system("echo .... >") returns
>
> That's 6.2 + John's patches.
>
> Thanks,
>
>         tglx
>
  
Thomas Gleixner Feb. 20, 2023, 9:18 p.m. UTC | #9
Michael!

On Mon, Feb 20 2023 at 19:11, Michael Nazzareno Trimarchi wrote:
> On Mon, Feb 20, 2023 at 6:48 PM Thomas Gleixner <tglx@linutronix.de> wrote:
>> [   27.349352] alarmtimer_enqueue()
>>
>> U: Before SUSPEND
>>
>> [   31.353228] PM: suspend entry (s2idle)
>> [   31.388857] Filesystems sync: 0.033 seconds
>> [   31.418427] Freezing user space processes
>> [   31.422406] Freezing user space processes completed (elapsed 0.002 seconds)
>> [   31.425435] OOM killer disabled.
>> [   31.426833] Freezing remaining freezable tasks
>> [   31.429838] Freezing remaining freezable tasks completed (elapsed 0.001 seconds)
>> [   31.432922] printk: Suspending console(s) (use no_console_suspend to debug)
>> [   31.435912] alarmtimer alarmtimer.0.auto: PM: dpm_run_callback(): platform_pm_suspend+0x0/0x50 returns -16
>> [   31.435954] alarmtimer alarmtimer.0.auto: PM: failed to suspend: error -16
>>
>> That means the RTC interrupt was raised before the system was able to suspend.
>
> if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) {
>     pm_wakeup_event(dev, 2 * MSEC_PER_SEC);
>     return -EBUSY;
> }
>
> I think that above happens to you. So it means that you are too close
> to this limit, can be?

Maybe.

> Yes but the alarm for me was set to be fired just before freezing. Is
> this a valid scenario?

Sure why not? 

> Let's say that I set an alarm to be fired just before the userspace
> freeze happens. If I'm close to it then then process will not process
> the signal to enquene again the alarm in the list and then during
> alarm suspend the list will be empty for the above.

Halfways, but slowly your explanations start to make sense. Here is the
dmesg output you provided:

> [   89.674127] PM: suspend entry (deep)
> [   89.714916] Filesystems sync: 0.037 seconds
> [   89.733594] Freezing user space processes
> [   89.740680] Freezing user space processes completed (elapsed 0.002 seconds)

User space tasks are frozen now.

> [   89.748593] OOM killer disabled.
> [   89.752257] Freezing remaining freezable tasks
> [   89.756807] alarmtimer_fired: called
> [   89.756831] alarmtimer_dequeue: called <---- HERE

Here fires the underlying hrtimer before devices are suspended, so the
sig_sendqueue() cannot wake up the task because task->state ==
TASK_FROZEN, which means the signal wont be handled and the timer wont
be rearmed until the task is thawed.

And as you correctly observed the alarmtimer_suspend() path won't see a
pending timer anymore because it is dequeued.

So precisely the time between freeze(alarmtask) and alarmtimer_suspend()
is a gaping hole which guarantees lost wakeups.

That's completely unrelated to Johns patches. That hole has been there
forever.

I assume that this horrible freezer hackery was supposed to plug that
hole, but that gem is not solving anything as far as I understand what
it is doing. I'm still failing to understand what it is supposed to
solve, but that's a different story.

Aside of that I can clearly reproduce the issue, now that I understand
what you are trying to tell, on plain 6.2 without and with John's
cleanup.

Something like the below plugs the hole reliably.

Thanks,

        tglx
---
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -26,6 +26,7 @@
 #include <linux/freezer.h>
 #include <linux/compat.h>
 #include <linux/module.h>
+#include <linux/suspend.h>
 #include <linux/time_namespace.h>
 
 #include "posix-timers.h"
@@ -176,6 +177,7 @@ static void alarmtimer_dequeue(struct al
 	alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
 }
 
+static atomic_t alarmtimer_wakeup;
 
 /**
  * alarmtimer_fired - Handles alarm hrtimer being fired.
@@ -194,6 +196,8 @@ static enum hrtimer_restart alarmtimer_f
 	int ret = HRTIMER_NORESTART;
 	int restart = ALARMTIMER_NORESTART;
 
+	atomic_inc(&alarmtimer_wakeup);
+
 	spin_lock_irqsave(&base->lock, flags);
 	alarmtimer_dequeue(base, alarm);
 	spin_unlock_irqrestore(&base->lock, flags);
@@ -244,6 +248,16 @@ static int alarmtimer_suspend(struct dev
 	if (!rtc)
 		return 0;
 
+	/*
+	 * Handle wakeups which happened between the start of suspend and
+	 * now as those wakeups might have tried to wake up a frozen task
+	 * which means they are not longer in the alarm timer list.
+	 */
+	if (atomic_read(&alarmtimer_wakeup)) {
+		pm_wakeup_event(dev, 0);
+		return -EBUSY;
+	}
+
 	/* Find the soonest timer to expire*/
 	for (i = 0; i < ALARM_NUMTYPE; i++) {
 		struct alarm_base *base = &alarm_bases[i];
@@ -296,6 +310,31 @@ static int alarmtimer_resume(struct devi
 	return 0;
 }
 
+static int alarmtimer_pm_notifier_fn(struct notifier_block *bl, unsigned long state,
+				     void *unused)
+{
+	switch (state) {
+	case PM_HIBERNATION_PREPARE:
+	case PM_POST_HIBERNATION:
+		atomic_set(&alarmtimer_wakeup, 0);
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block alarmtimer_pm_notifier = {
+	.notifier_call = alarmtimer_pm_notifier_fn,
+};
+
+static inline int alarmtimer_register_pm_notifier(void)
+{
+	return register_pm_notifier(&alarmtimer_pm_notifier);
+}
+
+static inline void alarmtimer_unregister_pm_notifier(void)
+{
+	unregister_pm_notifier(&alarmtimer_pm_notifier);
+}
 #else
 static int alarmtimer_suspend(struct device *dev)
 {
@@ -306,6 +345,15 @@ static int alarmtimer_resume(struct devi
 {
 	return 0;
 }
+
+static inline int alarmtimer_register_pm_notifier(void)
+{
+	return 0;
+}
+
+static inline void alarmtimer_unregister_pm_notifier(void)
+{
+}
 #endif
 
 static void
@@ -904,11 +952,17 @@ static int __init alarmtimer_init(void)
 	if (error)
 		return error;
 
-	error = platform_driver_register(&alarmtimer_driver);
+	error = alarmtimer_register_pm_notifier();
 	if (error)
 		goto out_if;
 
+	error = platform_driver_register(&alarmtimer_driver);
+	if (error)
+		goto out_pm;
+
 	return 0;
+out_pm:
+	alarmtimer_unregister_pm_notifier();
 out_if:
 	alarmtimer_rtc_interface_remove();
 	return error;
  
Michael Nazzareno Trimarchi Feb. 20, 2023, 9:32 p.m. UTC | #10
Hi

On Mon, Feb 20, 2023 at 10:18 PM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> Michael!
>
> On Mon, Feb 20 2023 at 19:11, Michael Nazzareno Trimarchi wrote:
> > On Mon, Feb 20, 2023 at 6:48 PM Thomas Gleixner <tglx@linutronix.de> wrote:
> >> [   27.349352] alarmtimer_enqueue()
> >>
> >> U: Before SUSPEND
> >>
> >> [   31.353228] PM: suspend entry (s2idle)
> >> [   31.388857] Filesystems sync: 0.033 seconds
> >> [   31.418427] Freezing user space processes
> >> [   31.422406] Freezing user space processes completed (elapsed 0.002 seconds)
> >> [   31.425435] OOM killer disabled.
> >> [   31.426833] Freezing remaining freezable tasks
> >> [   31.429838] Freezing remaining freezable tasks completed (elapsed 0.001 seconds)
> >> [   31.432922] printk: Suspending console(s) (use no_console_suspend to debug)
> >> [   31.435912] alarmtimer alarmtimer.0.auto: PM: dpm_run_callback(): platform_pm_suspend+0x0/0x50 returns -16
> >> [   31.435954] alarmtimer alarmtimer.0.auto: PM: failed to suspend: error -16
> >>
> >> That means the RTC interrupt was raised before the system was able to suspend.
> >
> > if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) {
> >     pm_wakeup_event(dev, 2 * MSEC_PER_SEC);
> >     return -EBUSY;
> > }
> >
> > I think that above happens to you. So it means that you are too close
> > to this limit, can be?
>
> Maybe.
>
> > Yes but the alarm for me was set to be fired just before freezing. Is
> > this a valid scenario?
>
> Sure why not?
>
> > Let's say that I set an alarm to be fired just before the userspace
> > freeze happens. If I'm close to it then then process will not process
> > the signal to enquene again the alarm in the list and then during
> > alarm suspend the list will be empty for the above.
>
> Halfways, but slowly your explanations start to make sense. Here is the
> dmesg output you provided:
>
> > [   89.674127] PM: suspend entry (deep)
> > [   89.714916] Filesystems sync: 0.037 seconds
> > [   89.733594] Freezing user space processes
> > [   89.740680] Freezing user space processes completed (elapsed 0.002 seconds)
>
> User space tasks are frozen now.
>
> > [   89.748593] OOM killer disabled.
> > [   89.752257] Freezing remaining freezable tasks
> > [   89.756807] alarmtimer_fired: called
> > [   89.756831] alarmtimer_dequeue: called <---- HERE
>
> Here fires the underlying hrtimer before devices are suspended, so the
> sig_sendqueue() cannot wake up the task because task->state ==
> TASK_FROZEN, which means the signal wont be handled and the timer wont
> be rearmed until the task is thawed.
>
> And as you correctly observed the alarmtimer_suspend() path won't see a
> pending timer anymore because it is dequeued.
>
> So precisely the time between freeze(alarmtask) and alarmtimer_suspend()
> is a gaping hole which guarantees lost wakeups.
>
> That's completely unrelated to Johns patches. That hole has been there
> forever.
>

That was clear, but I was trying to test the race here.

> I assume that this horrible freezer hackery was supposed to plug that
> hole, but that gem is not solving anything as far as I understand what
> it is doing. I'm still failing to understand what it is supposed to
> solve, but that's a different story.
>

Yes, cleaning up was good.

> Aside of that I can clearly reproduce the issue, now that I understand
> what you are trying to tell, on plain 6.2 without and with John's
> cleanup.
>
> Something like the below plugs the hole reliably.
>

Some comments below

> Thanks,
>
>         tglx
> ---
> --- a/kernel/time/alarmtimer.c
> +++ b/kernel/time/alarmtimer.c
> @@ -26,6 +26,7 @@
>  #include <linux/freezer.h>
>  #include <linux/compat.h>
>  #include <linux/module.h>
> +#include <linux/suspend.h>
>  #include <linux/time_namespace.h>
>
>  #include "posix-timers.h"
> @@ -176,6 +177,7 @@ static void alarmtimer_dequeue(struct al
>         alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
>  }
>
> +static atomic_t alarmtimer_wakeup;
>
>  /**
>   * alarmtimer_fired - Handles alarm hrtimer being fired.
> @@ -194,6 +196,8 @@ static enum hrtimer_restart alarmtimer_f
>         int ret = HRTIMER_NORESTART;
>         int restart = ALARMTIMER_NORESTART;
>
> +       atomic_inc(&alarmtimer_wakeup);
> +

       ptr->it_active = 0;
        if (ptr->it_interval) {
                atomic_inc(&alarmtimer_wakeup);
                si_private = ++ptr->it_requeue_pending;
        }

Should I not go to the alarm_handle_timer? and only if it's a periodic one?

Michael

>         spin_lock_irqsave(&base->lock, flags);
>         alarmtimer_dequeue(base, alarm);
>         spin_unlock_irqrestore(&base->lock, flags);
> @@ -244,6 +248,16 @@ static int alarmtimer_suspend(struct dev
>         if (!rtc)
>                 return 0;
>
> +       /*
> +        * Handle wakeups which happened between the start of suspend and
> +        * now as those wakeups might have tried to wake up a frozen task
> +        * which means they are not longer in the alarm timer list.
> +        */
> +       if (atomic_read(&alarmtimer_wakeup)) {
> +               pm_wakeup_event(dev, 0);
> +               return -EBUSY;
> +       }
> +
>         /* Find the soonest timer to expire*/
>         for (i = 0; i < ALARM_NUMTYPE; i++) {
>                 struct alarm_base *base = &alarm_bases[i];
> @@ -296,6 +310,31 @@ static int alarmtimer_resume(struct devi
>         return 0;
>  }
>
> +static int alarmtimer_pm_notifier_fn(struct notifier_block *bl, unsigned long state,
> +                                    void *unused)
> +{
> +       switch (state) {
> +       case PM_HIBERNATION_PREPARE:
> +       case PM_POST_HIBERNATION:
> +               atomic_set(&alarmtimer_wakeup, 0);
> +               break;
> +       }
> +       return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block alarmtimer_pm_notifier = {
> +       .notifier_call = alarmtimer_pm_notifier_fn,
> +};
> +
> +static inline int alarmtimer_register_pm_notifier(void)
> +{
> +       return register_pm_notifier(&alarmtimer_pm_notifier);
> +}
> +
> +static inline void alarmtimer_unregister_pm_notifier(void)
> +{
> +       unregister_pm_notifier(&alarmtimer_pm_notifier);
> +}
>  #else
>  static int alarmtimer_suspend(struct device *dev)
>  {
> @@ -306,6 +345,15 @@ static int alarmtimer_resume(struct devi
>  {
>         return 0;
>  }
> +
> +static inline int alarmtimer_register_pm_notifier(void)
> +{
> +       return 0;
> +}
> +
> +static inline void alarmtimer_unregister_pm_notifier(void)
> +{
> +}
>  #endif
>
>  static void
> @@ -904,11 +952,17 @@ static int __init alarmtimer_init(void)
>         if (error)
>                 return error;
>
> -       error = platform_driver_register(&alarmtimer_driver);
> +       error = alarmtimer_register_pm_notifier();
>         if (error)
>                 goto out_if;
>
> +       error = platform_driver_register(&alarmtimer_driver);
> +       if (error)
> +               goto out_pm;
> +
>         return 0;
> +out_pm:
> +       alarmtimer_unregister_pm_notifier();
>  out_if:
>         alarmtimer_rtc_interface_remove();
>         return error;
>
>
>
>
>
>
>
>
  
Thomas Gleixner Feb. 21, 2023, 12:12 a.m. UTC | #11
Michael!

On Mon, Feb 20 2023 at 22:32, Michael Nazzareno Trimarchi wrote:
> On Mon, Feb 20, 2023 at 10:18 PM Thomas Gleixner <tglx@linutronix.de> wrote:
>>   * alarmtimer_fired - Handles alarm hrtimer being fired.
>> @@ -194,6 +196,8 @@ static enum hrtimer_restart alarmtimer_f
>>         int ret = HRTIMER_NORESTART;
>>         int restart = ALARMTIMER_NORESTART;
>>
>> +       atomic_inc(&alarmtimer_wakeup);
>> +
>
>        ptr->it_active = 0;
>         if (ptr->it_interval) {
>                 atomic_inc(&alarmtimer_wakeup);
>                 si_private = ++ptr->it_requeue_pending;
>         }
>
> Should I not go to the alarm_handle_timer? and only if it's a periodic
> one?

Why?

Any alarmtimer which hits that window has exactly the same problem.

It's not restricted to periodic timers. Why would a dropped one-shot
wakeup be acceptable?

It's neither restricted to posix timers. If a clock_nanosleep(ALARM)
expires in that window then the task wake up will just end up in the
/dev/null bucket for the very same reason. Why would this be correct?

Hmm?

<GRMBL>
> Michael
>
>>         spin_lock_irqsave(&base->lock, flags);

<SNIP>Tons of wasted electrons</SNIP>

Can you please trim your replies?

</GRMBL>

Thanks,

        tglx
  
Michael Nazzareno Trimarchi Feb. 21, 2023, 7:10 a.m. UTC | #12
Hi

On Tue, Feb 21, 2023 at 1:12 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> Michael!
>
> On Mon, Feb 20 2023 at 22:32, Michael Nazzareno Trimarchi wrote:
> > On Mon, Feb 20, 2023 at 10:18 PM Thomas Gleixner <tglx@linutronix.de> wrote:
> >>   * alarmtimer_fired - Handles alarm hrtimer being fired.
> >> @@ -194,6 +196,8 @@ static enum hrtimer_restart alarmtimer_f
> >>         int ret = HRTIMER_NORESTART;
> >>         int restart = ALARMTIMER_NORESTART;
> >>
> >> +       atomic_inc(&alarmtimer_wakeup);
> >> +
> >
> >        ptr->it_active = 0;
> >         if (ptr->it_interval) {
> >                 atomic_inc(&alarmtimer_wakeup);
> >                 si_private = ++ptr->it_requeue_pending;
> >         }
> >
> > Should I not go to the alarm_handle_timer? and only if it's a periodic
> > one?
>
> Why?
>

You are right. I will pay more attention to my reply.

Michael

> Any alarmtimer which hits that window has exactly the same problem.
>
> It's not restricted to periodic timers. Why would a dropped one-shot
> wakeup be acceptable?
>
> It's neither restricted to posix timers. If a clock_nanosleep(ALARM)
> expires in that window then the task wake up will just end up in the
> /dev/null bucket for the very same reason. Why would this be correct?
>
> Hmm?
>
> <GRMBL>
> > Michael
> >
> >>         spin_lock_irqsave(&base->lock, flags);
>
> <SNIP>Tons of wasted electrons</SNIP>
>
> Can you please trim your replies?
>
> </GRMBL>
>
> Thanks,
>
>         tglx
  
Michael Nazzareno Trimarchi Feb. 21, 2023, 11:50 a.m. UTC | #13
Hi John

On Sat, Feb 11, 2023 at 7:45 AM John Stultz <jstultz@google.com> wrote:
>
> Instead of trying to handle the freezer waking up tasks from
> schedule() in nanosleep on alarmtimers explicitly, use
> TASK_FREEZABLE which marks the task freezable when it goes
> to schedule, which prevents the signal wakeup.
>
> This allows for the freezer handling to be removed, simplifying
> the code.
>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Stephen Boyd <sboyd@kernel.org>
> Cc: Arnd Bergmann <arnd@arndb.de>
> Cc: Michael <michael@mipisi.de>
> Cc: Michael Trimarchi <michael@amarulasolutions.com>
> Cc: kernel-team@android.com
> Originally-by: Thomas Gleixner <tglx@linutronix.de>
> Link: https://lore.kernel.org/lkml/alpine.DEB.2.21.1909021247250.3955@nanos.tec.linutronix.de/
> [jstultz: Forward ported to 6.2-rc and split out from a separate
>           fix.]
> Signed-off-by: John Stultz <jstultz@google.com>
> ---
>  kernel/time/alarmtimer.c | 53 ++--------------------------------------
>  1 file changed, 2 insertions(+), 51 deletions(-)
>
> diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
> index f7b2128f64e2..15ecde8fcc1b 100644
> --- a/kernel/time/alarmtimer.c
> +++ b/kernel/time/alarmtimer.c
> @@ -49,14 +49,6 @@ static struct alarm_base {
>         clockid_t               base_clockid;
>  } alarm_bases[ALARM_NUMTYPE];
>
> -#if defined(CONFIG_POSIX_TIMERS) || defined(CONFIG_RTC_CLASS)
> -/* freezer information to handle clock_nanosleep triggered wakeups */
> -static enum alarmtimer_type freezer_alarmtype;
> -static ktime_t freezer_expires;
> -static ktime_t freezer_delta;
> -static DEFINE_SPINLOCK(freezer_delta_lock);
> -#endif
> -
>  #ifdef CONFIG_RTC_CLASS
>  /* rtc timer and device for setting alarm wakeups at suspend */
>  static struct rtc_timer                rtctimer;
> @@ -241,19 +233,12 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining);
>   */
>  static int alarmtimer_suspend(struct device *dev)
>  {
> -       ktime_t min, now, expires;
> +       ktime_t now, expires, min = KTIME_MAX;
>         int i, ret, type;
>         struct rtc_device *rtc;
>         unsigned long flags;
>         struct rtc_time tm;
>
> -       spin_lock_irqsave(&freezer_delta_lock, flags);
> -       min = freezer_delta;
> -       expires = freezer_expires;
> -       type = freezer_alarmtype;
> -       freezer_delta = KTIME_MAX;
> -       spin_unlock_irqrestore(&freezer_delta_lock, flags);
> -
>         rtc = alarmtimer_get_rtcdev();
>         /* If we have no rtcdev, just return */
>         if (!rtc)
> @@ -480,38 +465,6 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
>  EXPORT_SYMBOL_GPL(alarm_forward_now);
>
>  #ifdef CONFIG_POSIX_TIMERS
> -
> -static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
> -{
> -       struct alarm_base *base;
> -       unsigned long flags;
> -       ktime_t delta;
> -
> -       switch(type) {
> -       case ALARM_REALTIME:
> -               base = &alarm_bases[ALARM_REALTIME];
> -               type = ALARM_REALTIME_FREEZER;
> -               break;
> -       case ALARM_BOOTTIME:
> -               base = &alarm_bases[ALARM_BOOTTIME];
> -               type = ALARM_BOOTTIME_FREEZER;
> -               break;
> -       default:
> -               WARN_ONCE(1, "Invalid alarm type: %d\n", type);
> -               return;
> -       }
> -
> -       delta = ktime_sub(absexp, base->get_ktime());
> -
> -       spin_lock_irqsave(&freezer_delta_lock, flags);
> -       if (delta < freezer_delta) {
> -               freezer_delta = delta;
> -               freezer_expires = absexp;
> -               freezer_alarmtype = type;
> -       }
> -       spin_unlock_irqrestore(&freezer_delta_lock, flags);
> -}
> -
>  /**
>   * clock2alarm - helper that converts from clockid to alarmtypes
>   * @clockid: clockid.
> @@ -750,7 +703,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
>         struct restart_block *restart;
>         alarm->data = (void *)current;
>         do {
> -               set_current_state(TASK_INTERRUPTIBLE);
> +               set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
>                 alarm_start(alarm, absexp);
>                 if (likely(alarm->data))
>                         schedule();
> @@ -765,8 +718,6 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
>         if (!alarm->data)
>                 return 0;
>
> -       if (freezing(current))
> -               alarmtimer_freezerset(absexp, type);
>         restart = &current->restart_block;
>         if (restart->nanosleep.type != TT_NONE) {
>                 struct timespec64 rmt;
> --
> 2.39.1.581.gbfd45094c4-goog
>
Tested-by: Michael Trimarchi <michael@amarulasolutions.com>

I will test Thomas patch soon

Michael
  
Michael Nazzareno Trimarchi Feb. 24, 2023, 10:02 a.m. UTC | #14
Hi Thomas

On Tue, Feb 21, 2023 at 8:10 AM Michael Nazzareno Trimarchi
<michael@amarulasolutions.com> wrote:
>
> Hi
>
> On Tue, Feb 21, 2023 at 1:12 AM Thomas Gleixner <tglx@linutronix.de> wrote:
> >
> > Michael!
> >
> > On Mon, Feb 20 2023 at 22:32, Michael Nazzareno Trimarchi wrote:
> > > On Mon, Feb 20, 2023 at 10:18 PM Thomas Gleixner <tglx@linutronix.de> wrote:
> > >>   * alarmtimer_fired - Handles alarm hrtimer being fired.
> > >> @@ -194,6 +196,8 @@ static enum hrtimer_restart alarmtimer_f
> > >>         int ret = HRTIMER_NORESTART;
> > >>         int restart = ALARMTIMER_NORESTART;
> > >>
> > >> +       atomic_inc(&alarmtimer_wakeup);
> > >> +
> > >
> > >        ptr->it_active = 0;
> > >         if (ptr->it_interval) {
> > >                 atomic_inc(&alarmtimer_wakeup);
> > >                 si_private = ++ptr->it_requeue_pending;
> > >         }
> > >
> > > Should I not go to the alarm_handle_timer? and only if it's a periodic
> > > one?
> >
> > Why?
> >
>
> You are right. I will pay more attention to my reply.
>

I get time to test it and if the system suspend to ram we need to catch:

case PM_SUSPEND_PREPARE:
case PM_POST_SUSPEND:

Michael

> Michael
>
> > Any alarmtimer which hits that window has exactly the same problem.
> >
> > It's not restricted to periodic timers. Why would a dropped one-shot
> > wakeup be acceptable?
> >
> > It's neither restricted to posix timers. If a clock_nanosleep(ALARM)
> > expires in that window then the task wake up will just end up in the
> > /dev/null bucket for the very same reason. Why would this be correct?
> >
> > Hmm?
> >
> > <GRMBL>
> > > Michael
> > >
> > >>         spin_lock_irqsave(&base->lock, flags);
> >
> > <SNIP>Tons of wasted electrons</SNIP>
> >
> > Can you please trim your replies?
> >
> > </GRMBL>
> >
> > Thanks,
> >
> >         tglx
  
Michael Nazzareno Trimarchi Feb. 24, 2023, 10:32 a.m. UTC | #15
Hi

On Fri, Feb 24, 2023 at 11:02 AM Michael Nazzareno Trimarchi
<michael@amarulasolutions.com> wrote:
>
> Hi Thomas
>
> On Tue, Feb 21, 2023 at 8:10 AM Michael Nazzareno Trimarchi
> <michael@amarulasolutions.com> wrote:
> >
> > Hi
> >
> > On Tue, Feb 21, 2023 at 1:12 AM Thomas Gleixner <tglx@linutronix.de> wrote:
> > >
> > > Michael!
> > >
> > > On Mon, Feb 20 2023 at 22:32, Michael Nazzareno Trimarchi wrote:
> > > > On Mon, Feb 20, 2023 at 10:18 PM Thomas Gleixner <tglx@linutronix.de> wrote:
> > > >>   * alarmtimer_fired - Handles alarm hrtimer being fired.
> > > >> @@ -194,6 +196,8 @@ static enum hrtimer_restart alarmtimer_f
> > > >>         int ret = HRTIMER_NORESTART;
> > > >>         int restart = ALARMTIMER_NORESTART;
> > > >>
> > > >> +       atomic_inc(&alarmtimer_wakeup);
> > > >> +
> > > >
> > > >        ptr->it_active = 0;
> > > >         if (ptr->it_interval) {
> > > >                 atomic_inc(&alarmtimer_wakeup);
> > > >                 si_private = ++ptr->it_requeue_pending;
> > > >         }
> > > >
> > > > Should I not go to the alarm_handle_timer? and only if it's a periodic
> > > > one?
> > >
> > > Why?
> > >
> >
> > You are right. I will pay more attention to my reply.
> >
>
> I get time to test it and if the system suspend to ram we need to catch:
>
> case PM_SUSPEND_PREPARE:
> case PM_POST_SUSPEND:
>
> Michael
>
> > Michael
> >
> > > Any alarmtimer which hits that window has exactly the same problem.
> > >
> > > It's not restricted to periodic timers. Why would a dropped one-shot
> > > wakeup be acceptable?
> > >
> > > It's neither restricted to posix timers. If a clock_nanosleep(ALARM)
> > > expires in that window then the task wake up will just end up in the
> > > /dev/null bucket for the very same reason. Why would this be correct?
> > >
> > > Hmm?
> > >
> > > <GRMBL>
> > > > Michael
> > > >
> > > >>         spin_lock_irqsave(&base->lock, flags);
> > >
[snip]

I have something like this

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index b68cb7f02a6b..b5f15e7f76cb 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -26,6 +26,7 @@
 #include <linux/freezer.h>
 #include <linux/compat.h>
 #include <linux/module.h>
+#include <linux/suspend.h>
 #include <linux/time_namespace.h>

 #include "posix-timers.h"
@@ -176,6 +177,7 @@ static void alarmtimer_dequeue(struct alarm_base
*base, struct alarm *alarm)
     alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
 }

+static atomic_t alarmtimer_wakeup;

 /**
  * alarmtimer_fired - Handles alarm hrtimer being fired.
@@ -194,6 +196,8 @@ static enum hrtimer_restart
alarmtimer_fired(struct hrtimer *timer)
     int ret = HRTIMER_NORESTART;
     int restart = ALARMTIMER_NORESTART;

+    atomic_inc(&alarmtimer_wakeup);
+
     spin_lock_irqsave(&base->lock, flags);
     alarmtimer_dequeue(base, alarm);
     spin_unlock_irqrestore(&base->lock, flags);
@@ -263,8 +267,18 @@ static int alarmtimer_suspend(struct device *dev)
         }
     }
     /* No timers to expire */
-    if (min == KTIME_MAX)
+    if (min == KTIME_MAX) {
+        /*
+         * Handle wakeups which happened between the start of suspend and
+         * now as those wakeups might have tried to wake up a frozen task
+         * which means they are no longer in the alarm timer list.
+         */
+        if (atomic_read(&alarmtimer_wakeup)) {
+            pm_wakeup_event(dev, 0);
+            return -EBUSY;
+        }
         return 0;
+    }

     if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) {
         pm_wakeup_event(dev, 2 * MSEC_PER_SEC);
@@ -296,6 +310,30 @@ static int alarmtimer_resume(struct device *dev)
     return 0;
 }

+static int alarmtimer_pm_notifier_fn(struct notifier_block *bl,
unsigned long state,
+                                    void *unused)
+{
+    switch (state) {
+    case PM_POST_SUSPEND:
+        atomic_set(&alarmtimer_wakeup, 0);
+        break;
+    }
+    return NOTIFY_DONE;
+}
+
+static struct notifier_block alarmtimer_pm_notifier = {
+       .notifier_call = alarmtimer_pm_notifier_fn,
+};
+
+static inline int alarmtimer_register_pm_notifier(void)
+{
+       return register_pm_notifier(&alarmtimer_pm_notifier);
+}
+
+static inline void alarmtimer_unregister_pm_notifier(void)
+{
+       unregister_pm_notifier(&alarmtimer_pm_notifier);
+}
 #else
 static int alarmtimer_suspend(struct device *dev)
 {
@@ -306,6 +344,15 @@ static int alarmtimer_resume(struct device *dev)
 {
     return 0;
 }
+
+static inline int alarmtimer_register_pm_notifier(void)
+{
+       return 0;
+}
+
+static inline void alarmtimer_unregister_pm_notifier(void)
+{
+}
 #endif

 static void
@@ -904,11 +951,18 @@ static int __init alarmtimer_init(void)
     if (error)
         return error;

-    error = platform_driver_register(&alarmtimer_driver);
+    error = alarmtimer_register_pm_notifier();
     if (error)
         goto out_if;

+    error = platform_driver_register(&alarmtimer_driver);
+    if (error)
+        goto out_pm;
+
     return 0;
+
+out_pm:
+    alarmtimer_unregister_pm_notifier();
 out_if:
     alarmtimer_rtc_interface_remove();
     return error;
  
Michael Nazzareno Trimarchi Feb. 24, 2023, 11:57 a.m. UTC | #16
Hi

Drop my last email. I have made a wrong assumption, below the fix I have done

On Fri, Feb 24, 2023 at 11:32 AM Michael Nazzareno Trimarchi
<michael@amarulasolutions.com> wrote:
>
> Hi
>
> On Fri, Feb 24, 2023 at 11:02 AM Michael Nazzareno Trimarchi
> <michael@amarulasolutions.com> wrote:
> >
> > Hi Thomas
> >
> > On Tue, Feb 21, 2023 at 8:10 AM Michael Nazzareno Trimarchi
> > <michael@amarulasolutions.com> wrote:
> > >
> > > Hi
> > >
> > > On Tue, Feb 21, 2023 at 1:12 AM Thomas Gleixner <tglx@linutronix.de> wrote:
> > > >
> > > > Michael!
> > > >
> > > > On Mon, Feb 20 2023 at 22:32, Michael Nazzareno Trimarchi wrote:
> > > > > On Mon, Feb 20, 2023 at 10:18 PM Thomas Gleixner <tglx@linutronix.de> wrote:
> > > > >>   * alarmtimer_fired - Handles alarm hrtimer being fired.
> > > > >> @@ -194,6 +196,8 @@ static enum hrtimer_restart alarmtimer_f
> > > > >>         int ret = HRTIMER_NORESTART;
> > > > >>         int restart = ALARMTIMER_NORESTART;
> > > > >>
> > > > >> +       atomic_inc(&alarmtimer_wakeup);
> > > > >> +
> >
> I have something like this
>
[snip]

Fixed now and tested on my side as below

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index b68cb7f02a6b..1f2678fe6939 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -26,6 +26,7 @@
 #include <linux/freezer.h>
 #include <linux/compat.h>
 #include <linux/module.h>
+#include <linux/suspend.h>
 #include <linux/time_namespace.h>

 #include "posix-timers.h"
@@ -171,11 +172,11 @@ static void alarmtimer_dequeue(struct alarm_base
*base, struct alarm *alarm)
 {
     if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED))
         return;

     timerqueue_del(&base->timerqueue, &alarm->node);
     alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
 }

+static atomic_t alarmtimer_wakeup;

 /**
  * alarmtimer_fired - Handles alarm hrtimer being fired.
@@ -194,6 +195,8 @@ static enum hrtimer_restart
alarmtimer_fired(struct hrtimer *timer)
     int ret = HRTIMER_NORESTART;
     int restart = ALARMTIMER_NORESTART;

+    atomic_inc(&alarmtimer_wakeup);
+
     spin_lock_irqsave(&base->lock, flags);
     alarmtimer_dequeue(base, alarm);
     spin_unlock_irqrestore(&base->lock, flags);
@@ -244,6 +247,16 @@ static int alarmtimer_suspend(struct device *dev)
     if (!rtc)
         return 0;

+    /*
+     * Handle wakeups which happened between the start of suspend and
+     * now as those wakeups might have tried to wake up a frozen task
+     * which means they are not longer in the alarm timer list.
+     */
+    if (atomic_read(&alarmtimer_wakeup)) {
+        pm_wakeup_event(dev, 0);
+        return -EBUSY;
+    }
+
     /* Find the soonest timer to expire*/
     for (i = 0; i < ALARM_NUMTYPE; i++) {
         struct alarm_base *base = &alarm_bases[i];
@@ -296,6 +309,33 @@ static int alarmtimer_resume(struct device *dev)
     return 0;
 }

+static int alarmtimer_pm_notifier_fn(struct notifier_block *bl,
unsigned long state,
+                                    void *unused)
+{
+    switch (state) {
+    case PM_SUSPEND_PREPARE:
+    case PM_POST_SUSPEND:
+    case PM_HIBERNATION_PREPARE:
+    case PM_POST_HIBERNATION:
+        atomic_set(&alarmtimer_wakeup, 0);
+        break;
+    }
+    return NOTIFY_DONE;
+}
+
+static struct notifier_block alarmtimer_pm_notifier = {
+       .notifier_call = alarmtimer_pm_notifier_fn,
+};
+
+static inline int alarmtimer_register_pm_notifier(void)
+{
+       return register_pm_notifier(&alarmtimer_pm_notifier);
+}
+
+static inline void alarmtimer_unregister_pm_notifier(void)
+{
+       unregister_pm_notifier(&alarmtimer_pm_notifier);
+}
 #else
 static int alarmtimer_suspend(struct device *dev)
 {
@@ -306,6 +346,15 @@ static int alarmtimer_resume(struct device *dev)
 {
     return 0;
 }
+
+static inline int alarmtimer_register_pm_notifier(void)
+{
+       return 0;
+}
+
+static inline void alarmtimer_unregister_pm_notifier(void)
+{
+}
 #endif

 static void
@@ -904,11 +953,18 @@ static int __init alarmtimer_init(void)
     if (error)
         return error;

-    error = platform_driver_register(&alarmtimer_driver);
+    error = alarmtimer_register_pm_notifier();
     if (error)
         goto out_if;

+    error = platform_driver_register(&alarmtimer_driver);
+    if (error)
+        goto out_pm;
+
     return 0;
+
+out_pm:
+    alarmtimer_unregister_pm_notifier();
 out_if:
     alarmtimer_rtc_interface_remove();
     return error;
  
John Stultz Feb. 28, 2023, 12:03 a.m. UTC | #17
On Mon, Feb 20, 2023 at 1:18 PM Thomas Gleixner <tglx@linutronix.de> wrote:
> On Mon, Feb 20 2023 at 19:11, Michael Nazzareno Trimarchi wrote:
> > [   89.674127] PM: suspend entry (deep)
> > [   89.714916] Filesystems sync: 0.037 seconds
> > [   89.733594] Freezing user space processes
> > [   89.740680] Freezing user space processes completed (elapsed 0.002 seconds)
>
> User space tasks are frozen now.
>
> > [   89.748593] OOM killer disabled.
> > [   89.752257] Freezing remaining freezable tasks
> > [   89.756807] alarmtimer_fired: called
> > [   89.756831] alarmtimer_dequeue: called <---- HERE
>
> Here fires the underlying hrtimer before devices are suspended, so the
> sig_sendqueue() cannot wake up the task because task->state ==
> TASK_FROZEN, which means the signal wont be handled and the timer wont
> be rearmed until the task is thawed.
>
> And as you correctly observed the alarmtimer_suspend() path won't see a
> pending timer anymore because it is dequeued.
>
> So precisely the time between freeze(alarmtask) and alarmtimer_suspend()
> is a gaping hole which guarantees lost wakeups.
>

Hey Michael, Thomas,
 Sorry for being a little slow to respond the last few weeks. :(

So I've managed to reproduce the issue following Michael's
instructions and your analysis, Thomas, looks right!

Though I'm a bit confused on the suggested solution:


> @@ -194,6 +196,8 @@ static enum hrtimer_restart alarmtimer_f
>         int ret = HRTIMER_NORESTART;
>         int restart = ALARMTIMER_NORESTART;
>
> +       atomic_inc(&alarmtimer_wakeup);
> +
>         spin_lock_irqsave(&base->lock, flags);
>         alarmtimer_dequeue(base, alarm);
>         spin_unlock_irqrestore(&base->lock, flags);

Ok, so here we're incrementing the wakeup counter for each
alarmtimer_fired call.

> @@ -244,6 +248,16 @@ static int alarmtimer_suspend(struct dev
>         if (!rtc)
>                 return 0;
>
> +       /*
> +        * Handle wakeups which happened between the start of suspend and
> +        * now as those wakeups might have tried to wake up a frozen task
> +        * which means they are not longer in the alarm timer list.
> +        */
> +       if (atomic_read(&alarmtimer_wakeup)) {
> +               pm_wakeup_event(dev, 0);
> +               return -EBUSY;
> +       }
> +
>         /* Find the soonest timer to expire*/
>         for (i = 0; i < ALARM_NUMTYPE; i++) {
>                 struct alarm_base *base = &alarm_bases[i];

And here we're bailing on alarmtimer_suspend if an alarmtimer_fired
happened recently.  Still looks okish.


> +static int alarmtimer_pm_notifier_fn(struct notifier_block *bl, unsigned long state,
> +                                    void *unused)
> +{
> +       switch (state) {
> +       case PM_HIBERNATION_PREPARE:
> +       case PM_POST_HIBERNATION:
> +               atomic_set(&alarmtimer_wakeup, 0);
> +               break;
> +       }
> +       return NOTIFY_DONE;

But here, we're setting the alarmtimer_wakeup count to zero if we get
PM_HIBERNATION_PREPARE or  PM_POST_HIBERNATION notifications?
And Michael noted we need to add  PM_SUSPEND_PREPARE and
PM_POST_SUSPEND there for this to seemingly work.

This zeroing out the counter here feels a little sloppy, as it seems
nothing prevents the notifier from racing with the other added logic.

If the issue is that when we're expiring timers in alarmtimer_fire(),
a suspend event may come in midway after the alarmtimer_dequeue(),
while the timer is running but before the alarmtimer_enqueue(),
causing recurring timers to not be re-armed, it seems we probably
should do the accounting fully in alarmtimer_fire(), doing an
atomic_dec(&alarmtimer_wakeup) at the end of that function.  That way
we avoid suspending while running an alarmtimer, so the recurring
timers will always be back on the timer list when we do suspend.

This sort of has a risk that if there's timers that run for too long,
you'd prevent suspend for that period, but we already avoid suspending
if there's a alarmtimer within two seconds in the future, so having
alarmtimers recur quickly is inherently going to prevent you from
suspending.

I'll send out a reworked patch here in a moment after I validate it as working.

thanks
-john
  
John Stultz Feb. 28, 2023, 4:06 a.m. UTC | #18
On Mon, Feb 27, 2023 at 4:03 PM John Stultz <jstultz@google.com> wrote:
> > On Mon, Feb 20 2023 at 19:11, Michael Nazzareno Trimarchi wrote:
> > +static int alarmtimer_pm_notifier_fn(struct notifier_block *bl, unsigned long state,
> > +                                    void *unused)
> > +{
> > +       switch (state) {
> > +       case PM_HIBERNATION_PREPARE:
> > +       case PM_POST_HIBERNATION:
> > +               atomic_set(&alarmtimer_wakeup, 0);
> > +               break;
> > +       }
> > +       return NOTIFY_DONE;
>
> But here, we're setting the alarmtimer_wakeup count to zero if we get
> PM_HIBERNATION_PREPARE or  PM_POST_HIBERNATION notifications?
> And Michael noted we need to add  PM_SUSPEND_PREPARE and
> PM_POST_SUSPEND there for this to seemingly work.
>
> This zeroing out the counter here feels a little sloppy, as it seems
> nothing prevents the notifier from racing with the other added logic.
>
> If the issue is that when we're expiring timers in alarmtimer_fire(),
> a suspend event may come in midway after the alarmtimer_dequeue(),
> while the timer is running but before the alarmtimer_enqueue(),
> causing recurring timers to not be re-armed, it seems we probably
> should do the accounting fully in alarmtimer_fire(), doing an
> atomic_dec(&alarmtimer_wakeup) at the end of that function.  That way
> we avoid suspending while running an alarmtimer, so the recurring
> timers will always be back on the timer list when we do suspend.

Ah. Scratch that. I was testing with my rework of the patch and still
seeing trouble w/ Michael's reproducer.

My apologies, as I was mistaken that the race is in alarmtimer_fired
between the dequeue and the enqueue. It's between the
alarmtimer_fired() and the  posixtimer_rearm() logic, where a suspend
inbetween could prevent the timer from being rearmed.

This is messier as we need to allow the timer to fire and re-arm
itself and block suspend happening inbetween, and since the re-arming
logic is happening at the posixtimers abstraction level above the
alarmtimers, its difficult to totally prevent that.

So Thomas's notifier method of zeroing at the begining of suspend and
tracking any wakeups after that point makes more sense now. It still
feels a bit messy, but I'm not sure there's something better.

My only thought is this feels a little bit like its mirroring what the
pm_wakeup_event() logic is supposed to do. Should we be adding a
pm_wakeup_event() to alarmtimer_fired() to try to prevent suspend from
occuring for 500ms or so after an alarmtimer has fired so there is
enough time for it to be re-armed if needed?

thanks
-john
  
Thomas Gleixner March 1, 2023, 10:11 p.m. UTC | #19
On Mon, Feb 27 2023 at 20:06, John Stultz wrote:
> On Mon, Feb 27, 2023 at 4:03 PM John Stultz <jstultz@google.com> wrote:
>> > On Mon, Feb 20 2023 at 19:11, Michael Nazzareno Trimarchi wrote:
>> > +static int alarmtimer_pm_notifier_fn(struct notifier_block *bl, unsigned long state,
>> > +                                    void *unused)
>> > +{
>> > +       switch (state) {
>> > +       case PM_HIBERNATION_PREPARE:
>> > +       case PM_POST_HIBERNATION:
>> > +               atomic_set(&alarmtimer_wakeup, 0);
>> > +               break;
>> > +       }
>> > +       return NOTIFY_DONE;
>>
>> But here, we're setting the alarmtimer_wakeup count to zero if we get
>> PM_HIBERNATION_PREPARE or  PM_POST_HIBERNATION notifications?
>> And Michael noted we need to add  PM_SUSPEND_PREPARE and
>> PM_POST_SUSPEND there for this to seemingly work.

Yup. I missed those when sending out that hack.

> So Thomas's notifier method of zeroing at the begining of suspend and
> tracking any wakeups after that point makes more sense now. It still
> feels a bit messy, but I'm not sure there's something better.

I'm not enthused about it either. 

> My only thought is this feels a little bit like its mirroring what the
> pm_wakeup_event() logic is supposed to do. Should we be adding a
> pm_wakeup_event() to alarmtimer_fired() to try to prevent suspend from
> occuring for 500ms or so after an alarmtimer has fired so there is
> enough time for it to be re-armed if needed?

The question is whether this can be called unconditionally and how that
interacts with the suspend logic. Rafael?

Thanks,

        tglx
  
John Stultz March 2, 2023, 12:47 a.m. UTC | #20
On Wed, Mar 1, 2023 at 2:11 PM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> On Mon, Feb 27 2023 at 20:06, John Stultz wrote:
> > On Mon, Feb 27, 2023 at 4:03 PM John Stultz <jstultz@google.com> wrote:
> >> > On Mon, Feb 20 2023 at 19:11, Michael Nazzareno Trimarchi wrote:
> >> > +static int alarmtimer_pm_notifier_fn(struct notifier_block *bl, unsigned long state,
> >> > +                                    void *unused)
> >> > +{
> >> > +       switch (state) {
> >> > +       case PM_HIBERNATION_PREPARE:
> >> > +       case PM_POST_HIBERNATION:
> >> > +               atomic_set(&alarmtimer_wakeup, 0);
> >> > +               break;
> >> > +       }
> >> > +       return NOTIFY_DONE;
> >>
> >> But here, we're setting the alarmtimer_wakeup count to zero if we get
> >> PM_HIBERNATION_PREPARE or  PM_POST_HIBERNATION notifications?
> >> And Michael noted we need to add  PM_SUSPEND_PREPARE and
> >> PM_POST_SUSPEND there for this to seemingly work.
>
> Yup. I missed those when sending out that hack.
>
> > So Thomas's notifier method of zeroing at the begining of suspend and
> > tracking any wakeups after that point makes more sense now. It still
> > feels a bit messy, but I'm not sure there's something better.
>
> I'm not enthused about it either.

That said, it does work. :) In my testing, your approach has been
reliable, so it has that going for it.

> > My only thought is this feels a little bit like its mirroring what the
> > pm_wakeup_event() logic is supposed to do. Should we be adding a
> > pm_wakeup_event() to alarmtimer_fired() to try to prevent suspend from
> > occuring for 500ms or so after an alarmtimer has fired so there is
> > enough time for it to be re-armed if needed?
>
> The question is whether this can be called unconditionally and how that
> interacts with the suspend logic. Rafael?

I took a brief stab at this, and one thing is the test needs to use
the /sys/power/wakeup_count dance before suspending.
However, I still had some cases where the recurring alarmtimer got
lost, so I need to dig a bit more to understand what was going wrong
there.

In the meantime, I'm ok with Thomas' approach, but we probably need
some comment documentation that suggests it might be reworked in a
cleaner way?

thanks
-john
  
Michael Nazzareno Trimarchi March 2, 2023, 9:34 a.m. UTC | #21
Hi John

On Thu, Mar 2, 2023 at 1:48 AM John Stultz <jstultz@google.com> wrote:
>
> On Wed, Mar 1, 2023 at 2:11 PM Thomas Gleixner <tglx@linutronix.de> wrote:
> >
> > On Mon, Feb 27 2023 at 20:06, John Stultz wrote:
> > > On Mon, Feb 27, 2023 at 4:03 PM John Stultz <jstultz@google.com> wrote:
> > >> > On Mon, Feb 20 2023 at 19:11, Michael Nazzareno Trimarchi wrote:
> > >> > +static int alarmtimer_pm_notifier_fn(struct notifier_block *bl, unsigned long state,
> > >> > +                                    void *unused)
> > >> > +{
> > >> > +       switch (state) {
> > >> > +       case PM_HIBERNATION_PREPARE:
> > >> > +       case PM_POST_HIBERNATION:
> > >> > +               atomic_set(&alarmtimer_wakeup, 0);
> > >> > +               break;
> > >> > +       }
> > >> > +       return NOTIFY_DONE;
> > >>
> > >> But here, we're setting the alarmtimer_wakeup count to zero if we get
> > >> PM_HIBERNATION_PREPARE or  PM_POST_HIBERNATION notifications?
> > >> And Michael noted we need to add  PM_SUSPEND_PREPARE and
> > >> PM_POST_SUSPEND there for this to seemingly work.
> >
> > Yup. I missed those when sending out that hack.
> >
> > > So Thomas's notifier method of zeroing at the begining of suspend and
> > > tracking any wakeups after that point makes more sense now. It still
> > > feels a bit messy, but I'm not sure there's something better.
> >
> > I'm not enthused about it either.
>
> That said, it does work. :) In my testing, your approach has been
> reliable, so it has that going for it.
>
> > > My only thought is this feels a little bit like its mirroring what the
> > > pm_wakeup_event() logic is supposed to do. Should we be adding a
> > > pm_wakeup_event() to alarmtimer_fired() to try to prevent suspend from
> > > occuring for 500ms or so after an alarmtimer has fired so there is
> > > enough time for it to be re-armed if needed?
> >
> > The question is whether this can be called unconditionally and how that
> > interacts with the suspend logic. Rafael?
>
> I took a brief stab at this, and one thing is the test needs to use
> the /sys/power/wakeup_count dance before suspending.
> However, I still had some cases where the recurring alarmtimer got
> lost, so I need to dig a bit more to understand what was going wrong
> there.
>
> In the meantime, I'm ok with Thomas' approach, but we probably need
> some comment documentation that suggests it might be reworked in a
> cleaner way?
>
> thanks
> -john

For now I have pushed to our internal devices this commit message

time: alarmtimer: Fix wakeup lost between freeze(alarmtask) and
alarmtimer_suspend()

    An alarm timer can happen in between a freeze and alarmtimer_suspend as
    below output:

    > [   89.674127] PM: suspend entry (deep)
    > [   89.714916] Filesystems sync: 0.037 seconds
    > [   89.733594] Freezing user space processes
    > [   89.740680] Freezing user space processes completed (elapsed
0.002 seconds)

    User space tasks are frozen now.

    > [   89.748593] OOM killer disabled.
    > [   89.752257] Freezing remaining freezable tasks
    > [   89.756807] alarmtimer_fired: called
    > [   89.756831] alarmtimer_dequeue: called <---- HERE

    Here fires the underlying hrtimer before devices are suspended, so the
    sig_sendqueue() cannot wake up the task because task->state ==
    TASK_FROZEN, which means the signal won't be handled and the timer won't
    be rearmed until the task is thawed.

    The alarmtimer_suspend() path won't see a pending timer anymore because
    it is dequeued.

    So precisely the time between freeze(alarmtask) and alarmtimer_suspend()
    is a gaping hole which guarantees lost wakeups.

    That hole has been there forever.

    The old horrible freezer hackery was supposed to plug that
    hole, but that gem is not solving anything as far as I understand what
    it is doing.

Grab a bit from Thomas discussion

Michael
  
Rafael J. Wysocki March 2, 2023, 2:32 p.m. UTC | #22
On Mon, Feb 20, 2023 at 10:19 PM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> Michael!
>
> On Mon, Feb 20 2023 at 19:11, Michael Nazzareno Trimarchi wrote:
> > On Mon, Feb 20, 2023 at 6:48 PM Thomas Gleixner <tglx@linutronix.de> wrote:
> >> [   27.349352] alarmtimer_enqueue()
> >>
> >> U: Before SUSPEND
> >>
> >> [   31.353228] PM: suspend entry (s2idle)
> >> [   31.388857] Filesystems sync: 0.033 seconds
> >> [   31.418427] Freezing user space processes
> >> [   31.422406] Freezing user space processes completed (elapsed 0.002 seconds)
> >> [   31.425435] OOM killer disabled.
> >> [   31.426833] Freezing remaining freezable tasks
> >> [   31.429838] Freezing remaining freezable tasks completed (elapsed 0.001 seconds)
> >> [   31.432922] printk: Suspending console(s) (use no_console_suspend to debug)
> >> [   31.435912] alarmtimer alarmtimer.0.auto: PM: dpm_run_callback(): platform_pm_suspend+0x0/0x50 returns -16
> >> [   31.435954] alarmtimer alarmtimer.0.auto: PM: failed to suspend: error -16
> >>
> >> That means the RTC interrupt was raised before the system was able to suspend.
> >
> > if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) {
> >     pm_wakeup_event(dev, 2 * MSEC_PER_SEC);
> >     return -EBUSY;
> > }
> >
> > I think that above happens to you. So it means that you are too close
> > to this limit, can be?
>
> Maybe.
>
> > Yes but the alarm for me was set to be fired just before freezing. Is
> > this a valid scenario?
>
> Sure why not?
>
> > Let's say that I set an alarm to be fired just before the userspace
> > freeze happens. If I'm close to it then then process will not process
> > the signal to enquene again the alarm in the list and then during
> > alarm suspend the list will be empty for the above.
>
> Halfways, but slowly your explanations start to make sense. Here is the
> dmesg output you provided:
>
> > [   89.674127] PM: suspend entry (deep)
> > [   89.714916] Filesystems sync: 0.037 seconds
> > [   89.733594] Freezing user space processes
> > [   89.740680] Freezing user space processes completed (elapsed 0.002 seconds)
>
> User space tasks are frozen now.
>
> > [   89.748593] OOM killer disabled.
> > [   89.752257] Freezing remaining freezable tasks
> > [   89.756807] alarmtimer_fired: called
> > [   89.756831] alarmtimer_dequeue: called <---- HERE
>
> Here fires the underlying hrtimer before devices are suspended, so the
> sig_sendqueue() cannot wake up the task because task->state ==
> TASK_FROZEN, which means the signal wont be handled and the timer wont
> be rearmed until the task is thawed.
>
> And as you correctly observed the alarmtimer_suspend() path won't see a
> pending timer anymore because it is dequeued.
>
> So precisely the time between freeze(alarmtask) and alarmtimer_suspend()
> is a gaping hole which guarantees lost wakeups.
>
> That's completely unrelated to Johns patches. That hole has been there
> forever.
>
> I assume that this horrible freezer hackery was supposed to plug that
> hole, but that gem is not solving anything as far as I understand what
> it is doing. I'm still failing to understand what it is supposed to
> solve, but that's a different story.
>
> Aside of that I can clearly reproduce the issue, now that I understand
> what you are trying to tell, on plain 6.2 without and with John's
> cleanup.
>
> Something like the below plugs the hole reliably.
>
> Thanks,
>
>         tglx
> ---
> --- a/kernel/time/alarmtimer.c
> +++ b/kernel/time/alarmtimer.c
> @@ -26,6 +26,7 @@
>  #include <linux/freezer.h>
>  #include <linux/compat.h>
>  #include <linux/module.h>
> +#include <linux/suspend.h>
>  #include <linux/time_namespace.h>
>
>  #include "posix-timers.h"
> @@ -176,6 +177,7 @@ static void alarmtimer_dequeue(struct al
>         alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
>  }
>
> +static atomic_t alarmtimer_wakeup;
>
>  /**
>   * alarmtimer_fired - Handles alarm hrtimer being fired.
> @@ -194,6 +196,8 @@ static enum hrtimer_restart alarmtimer_f
>         int ret = HRTIMER_NORESTART;
>         int restart = ALARMTIMER_NORESTART;
>
> +       atomic_inc(&alarmtimer_wakeup);
> +

This appears to be still somewhat racy, because the notifier can run
at this point AFAICS.

>         spin_lock_irqsave(&base->lock, flags);
>         alarmtimer_dequeue(base, alarm);
>         spin_unlock_irqrestore(&base->lock, flags);
> @@ -244,6 +248,16 @@ static int alarmtimer_suspend(struct dev
>         if (!rtc)
>                 return 0;
>
> +       /*
> +        * Handle wakeups which happened between the start of suspend and
> +        * now as those wakeups might have tried to wake up a frozen task
> +        * which means they are not longer in the alarm timer list.
> +        */
> +       if (atomic_read(&alarmtimer_wakeup)) {
> +               pm_wakeup_event(dev, 0);
> +               return -EBUSY;
> +       }
> +
>         /* Find the soonest timer to expire*/
>         for (i = 0; i < ALARM_NUMTYPE; i++) {
>                 struct alarm_base *base = &alarm_bases[i];
> @@ -296,6 +310,31 @@ static int alarmtimer_resume(struct devi
>         return 0;
>  }
>
> +static int alarmtimer_pm_notifier_fn(struct notifier_block *bl, unsigned long state,
> +                                    void *unused)
> +{
> +       switch (state) {
> +       case PM_HIBERNATION_PREPARE:
> +       case PM_POST_HIBERNATION:
> +               atomic_set(&alarmtimer_wakeup, 0);
> +               break;
> +       }
> +       return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block alarmtimer_pm_notifier = {
> +       .notifier_call = alarmtimer_pm_notifier_fn,
> +};
> +
> +static inline int alarmtimer_register_pm_notifier(void)
> +{
> +       return register_pm_notifier(&alarmtimer_pm_notifier);
> +}
> +
> +static inline void alarmtimer_unregister_pm_notifier(void)
> +{
> +       unregister_pm_notifier(&alarmtimer_pm_notifier);
> +}
>  #else
>  static int alarmtimer_suspend(struct device *dev)
>  {
> @@ -306,6 +345,15 @@ static int alarmtimer_resume(struct devi
>  {
>         return 0;
>  }
> +
> +static inline int alarmtimer_register_pm_notifier(void)
> +{
> +       return 0;
> +}
> +
> +static inline void alarmtimer_unregister_pm_notifier(void)
> +{
> +}
>  #endif
>
>  static void
> @@ -904,11 +952,17 @@ static int __init alarmtimer_init(void)
>         if (error)
>                 return error;
>
> -       error = platform_driver_register(&alarmtimer_driver);
> +       error = alarmtimer_register_pm_notifier();
>         if (error)
>                 goto out_if;
>
> +       error = platform_driver_register(&alarmtimer_driver);
> +       if (error)
> +               goto out_pm;
> +
>         return 0;
> +out_pm:
> +       alarmtimer_unregister_pm_notifier();
>  out_if:
>         alarmtimer_rtc_interface_remove();
>         return error;
  
Rafael J. Wysocki March 2, 2023, 2:54 p.m. UTC | #23
On Wed, Mar 1, 2023 at 11:11 PM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> On Mon, Feb 27 2023 at 20:06, John Stultz wrote:
> > On Mon, Feb 27, 2023 at 4:03 PM John Stultz <jstultz@google.com> wrote:
> >> > On Mon, Feb 20 2023 at 19:11, Michael Nazzareno Trimarchi wrote:
> >> > +static int alarmtimer_pm_notifier_fn(struct notifier_block *bl, unsigned long state,
> >> > +                                    void *unused)
> >> > +{
> >> > +       switch (state) {
> >> > +       case PM_HIBERNATION_PREPARE:
> >> > +       case PM_POST_HIBERNATION:
> >> > +               atomic_set(&alarmtimer_wakeup, 0);
> >> > +               break;
> >> > +       }
> >> > +       return NOTIFY_DONE;
> >>
> >> But here, we're setting the alarmtimer_wakeup count to zero if we get
> >> PM_HIBERNATION_PREPARE or  PM_POST_HIBERNATION notifications?
> >> And Michael noted we need to add  PM_SUSPEND_PREPARE and
> >> PM_POST_SUSPEND there for this to seemingly work.
>
> Yup. I missed those when sending out that hack.
>
> > So Thomas's notifier method of zeroing at the begining of suspend and
> > tracking any wakeups after that point makes more sense now. It still
> > feels a bit messy, but I'm not sure there's something better.
>
> I'm not enthused about it either.
>
> > My only thought is this feels a little bit like its mirroring what the
> > pm_wakeup_event() logic is supposed to do. Should we be adding a
> > pm_wakeup_event() to alarmtimer_fired() to try to prevent suspend from
> > occuring for 500ms or so after an alarmtimer has fired so there is
> > enough time for it to be re-armed if needed?
>
> The question is whether this can be called unconditionally and how that
> interacts with the suspend logic. Rafael?

The pm_wakeup_event() doesn't need the timeout, but it is conditional
on using /sys/power/wakeup_count.

However, in any case it doesn't guarantee that the target user of the
alarm timer will be able to handle the signal on time AFAICS.  To my
eyes, it is entirely possible for alarmtimer_fired() to run before
/sys/power/wakeup_count is written to while the signal will be sent to
the given task later, in which case there is no guarantee that the
task will not be frozen in the meantime.

Moreover, I'm wondering if all alarm timers should always wake up the
system from sleep or abort suspends in progress?  If the answer is
"no" here, it changes the problem at hand quite a bit.
  
Rafael J. Wysocki March 2, 2023, 2:56 p.m. UTC | #24
On Thu, Mar 2, 2023 at 3:54 PM Rafael J. Wysocki <rafael@kernel.org> wrote:
>
> On Wed, Mar 1, 2023 at 11:11 PM Thomas Gleixner <tglx@linutronix.de> wrote:
> >
> > On Mon, Feb 27 2023 at 20:06, John Stultz wrote:
> > > On Mon, Feb 27, 2023 at 4:03 PM John Stultz <jstultz@google.com> wrote:
> > >> > On Mon, Feb 20 2023 at 19:11, Michael Nazzareno Trimarchi wrote:
> > >> > +static int alarmtimer_pm_notifier_fn(struct notifier_block *bl, unsigned long state,
> > >> > +                                    void *unused)
> > >> > +{
> > >> > +       switch (state) {
> > >> > +       case PM_HIBERNATION_PREPARE:
> > >> > +       case PM_POST_HIBERNATION:
> > >> > +               atomic_set(&alarmtimer_wakeup, 0);
> > >> > +               break;
> > >> > +       }
> > >> > +       return NOTIFY_DONE;
> > >>
> > >> But here, we're setting the alarmtimer_wakeup count to zero if we get
> > >> PM_HIBERNATION_PREPARE or  PM_POST_HIBERNATION notifications?
> > >> And Michael noted we need to add  PM_SUSPEND_PREPARE and
> > >> PM_POST_SUSPEND there for this to seemingly work.
> >
> > Yup. I missed those when sending out that hack.
> >
> > > So Thomas's notifier method of zeroing at the begining of suspend and
> > > tracking any wakeups after that point makes more sense now. It still
> > > feels a bit messy, but I'm not sure there's something better.
> >
> > I'm not enthused about it either.
> >
> > > My only thought is this feels a little bit like its mirroring what the
> > > pm_wakeup_event() logic is supposed to do. Should we be adding a
> > > pm_wakeup_event() to alarmtimer_fired() to try to prevent suspend from
> > > occuring for 500ms or so after an alarmtimer has fired so there is
> > > enough time for it to be re-armed if needed?
> >
> > The question is whether this can be called unconditionally and how that
> > interacts with the suspend logic. Rafael?
>
> The pm_wakeup_event() doesn't need the timeout, but it is conditional
> on using /sys/power/wakeup_count.
>
> However, in any case it doesn't guarantee that the target user of the
> alarm timer will be able to handle the signal on time AFAICS.  To my
> eyes, it is entirely possible for alarmtimer_fired() to run before
> /sys/power/wakeup_count is written to while the signal will be sent to

I actually should have said "read from" instead of "written to" here, sorry.

> the given task later, in which case there is no guarantee that the
> task will not be frozen in the meantime.
>
> Moreover, I'm wondering if all alarm timers should always wake up the
> system from sleep or abort suspends in progress?  If the answer is
> "no" here, it changes the problem at hand quite a bit.
  
Rafael J. Wysocki March 2, 2023, 3 p.m. UTC | #25
On Thu, Mar 2, 2023 at 1:48 AM John Stultz <jstultz@google.com> wrote:
>
> On Wed, Mar 1, 2023 at 2:11 PM Thomas Gleixner <tglx@linutronix.de> wrote:
> >
> > On Mon, Feb 27 2023 at 20:06, John Stultz wrote:
> > > On Mon, Feb 27, 2023 at 4:03 PM John Stultz <jstultz@google.com> wrote:
> > >> > On Mon, Feb 20 2023 at 19:11, Michael Nazzareno Trimarchi wrote:
> > >> > +static int alarmtimer_pm_notifier_fn(struct notifier_block *bl, unsigned long state,
> > >> > +                                    void *unused)
> > >> > +{
> > >> > +       switch (state) {
> > >> > +       case PM_HIBERNATION_PREPARE:
> > >> > +       case PM_POST_HIBERNATION:
> > >> > +               atomic_set(&alarmtimer_wakeup, 0);
> > >> > +               break;
> > >> > +       }
> > >> > +       return NOTIFY_DONE;
> > >>
> > >> But here, we're setting the alarmtimer_wakeup count to zero if we get
> > >> PM_HIBERNATION_PREPARE or  PM_POST_HIBERNATION notifications?
> > >> And Michael noted we need to add  PM_SUSPEND_PREPARE and
> > >> PM_POST_SUSPEND there for this to seemingly work.
> >
> > Yup. I missed those when sending out that hack.
> >
> > > So Thomas's notifier method of zeroing at the begining of suspend and
> > > tracking any wakeups after that point makes more sense now. It still
> > > feels a bit messy, but I'm not sure there's something better.
> >
> > I'm not enthused about it either.
>
> That said, it does work. :) In my testing, your approach has been
> reliable, so it has that going for it.
>
> > > My only thought is this feels a little bit like its mirroring what the
> > > pm_wakeup_event() logic is supposed to do. Should we be adding a
> > > pm_wakeup_event() to alarmtimer_fired() to try to prevent suspend from
> > > occuring for 500ms or so after an alarmtimer has fired so there is
> > > enough time for it to be re-armed if needed?
> >
> > The question is whether this can be called unconditionally and how that
> > interacts with the suspend logic. Rafael?
>
> I took a brief stab at this, and one thing is the test needs to use
> the /sys/power/wakeup_count dance before suspending.

That's correct.

> However, I still had some cases where the recurring alarmtimer got
> lost, so I need to dig a bit more to understand what was going wrong
> there.

I'm interested in that too, so if you have any conclusions, please let me know.

> In the meantime, I'm ok with Thomas' approach, but we probably need
> some comment documentation that suggests it might be reworked in a
> cleaner way?

Well, in theory, the PM notifier can run in parallel with
alarmtimer_fired() right after it has incremented the atomic var,
can't it?
  
Thomas Gleixner March 2, 2023, 10:21 p.m. UTC | #26
On Thu, Mar 02 2023 at 15:32, Rafael J. Wysocki wrote:
> On Mon, Feb 20, 2023 at 10:19 PM Thomas Gleixner <tglx@linutronix.de> wrote:
>> +static atomic_t alarmtimer_wakeup;
>>
>>  /**
>>   * alarmtimer_fired - Handles alarm hrtimer being fired.
>> @@ -194,6 +196,8 @@ static enum hrtimer_restart alarmtimer_f
>>         int ret = HRTIMER_NORESTART;
>>         int restart = ALARMTIMER_NORESTART;
>>
>> +       atomic_inc(&alarmtimer_wakeup);
>> +
>
> This appears to be still somewhat racy, because the notifier can run
> at this point AFAICS.

Indeed it is. Let me think more about this.
  
Thomas Gleixner March 2, 2023, 10:58 p.m. UTC | #27
On Thu, Mar 02 2023 at 23:21, Thomas Gleixner wrote:
> On Thu, Mar 02 2023 at 15:32, Rafael J. Wysocki wrote:
>> On Mon, Feb 20, 2023 at 10:19 PM Thomas Gleixner <tglx@linutronix.de> wrote:
>>> +static atomic_t alarmtimer_wakeup;
>>>
>>>  /**
>>>   * alarmtimer_fired - Handles alarm hrtimer being fired.
>>> @@ -194,6 +196,8 @@ static enum hrtimer_restart alarmtimer_f
>>>         int ret = HRTIMER_NORESTART;
>>>         int restart = ALARMTIMER_NORESTART;
>>>
>>> +       atomic_inc(&alarmtimer_wakeup);
>>> +
>>
>> This appears to be still somewhat racy, because the notifier can run
>> at this point AFAICS.
>
> Indeed it is. Let me think more about this.

All of this is inherently racy as there is zero feedback whether the
event has been consumed or not. Making this feedback based is not
necessarily trivial, but let me stare into that.

Thanks,

        tglx
  
Michael Nazzareno Trimarchi March 15, 2023, 8:12 p.m. UTC | #28
Hi Rafael

On Thu, Mar 2, 2023 at 4:01 PM Rafael J. Wysocki <rafael@kernel.org> wrote:
>
> On Thu, Mar 2, 2023 at 1:48 AM John Stultz <jstultz@google.com> wrote:
> >
> > On Wed, Mar 1, 2023 at 2:11 PM Thomas Gleixner <tglx@linutronix.de> wrote:
> > >
> > > On Mon, Feb 27 2023 at 20:06, John Stultz wrote:
> > > > On Mon, Feb 27, 2023 at 4:03 PM John Stultz <jstultz@google.com> wrote:
> > > >> > On Mon, Feb 20 2023 at 19:11, Michael Nazzareno Trimarchi wrote:
> > > >> > +static int alarmtimer_pm_notifier_fn(struct notifier_block *bl, unsigned long state,
> > > >> > +                                    void *unused)
> > > >> > +{
> > > >> > +       switch (state) {
> > > >> > +       case PM_HIBERNATION_PREPARE:
> > > >> > +       case PM_POST_HIBERNATION:
> > > >> > +               atomic_set(&alarmtimer_wakeup, 0);
> > > >> > +               break;
> > > >> > +       }
> > > >> > +       return NOTIFY_DONE;
> > > >>
> > > >> But here, we're setting the alarmtimer_wakeup count to zero if we get
> > > >> PM_HIBERNATION_PREPARE or  PM_POST_HIBERNATION notifications?
> > > >> And Michael noted we need to add  PM_SUSPEND_PREPARE and
> > > >> PM_POST_SUSPEND there for this to seemingly work.
> > >
> > > Yup. I missed those when sending out that hack.
> > >
> > > > So Thomas's notifier method of zeroing at the begining of suspend and
> > > > tracking any wakeups after that point makes more sense now. It still
> > > > feels a bit messy, but I'm not sure there's something better.
> > >
> > > I'm not enthused about it either.
> >
> > That said, it does work. :) In my testing, your approach has been
> > reliable, so it has that going for it.
> >
> > > > My only thought is this feels a little bit like its mirroring what the
> > > > pm_wakeup_event() logic is supposed to do. Should we be adding a
> > > > pm_wakeup_event() to alarmtimer_fired() to try to prevent suspend from
> > > > occuring for 500ms or so after an alarmtimer has fired so there is
> > > > enough time for it to be re-armed if needed?
> > >
> > > The question is whether this can be called unconditionally and how that
> > > interacts with the suspend logic. Rafael?
> >
> > I took a brief stab at this, and one thing is the test needs to use
> > the /sys/power/wakeup_count dance before suspending.
>
> That's correct.
>
> > However, I still had some cases where the recurring alarmtimer got
> > lost, so I need to dig a bit more to understand what was going wrong
> > there.
>
> I'm interested in that too, so if you have any conclusions, please let me know.
>
> > In the meantime, I'm ok with Thomas' approach, but we probably need
> > some comment documentation that suggests it might be reworked in a
> > cleaner way?
>
> Well, in theory, the PM notifier can run in parallel with
> alarmtimer_fired() right after it has incremented the atomic var,
> can't it?

Can we add a function like pm_get_state()?

So can we mark the suspend window using some state?

Michael
  
Michael Nazzareno Trimarchi Jan. 11, 2024, 8:28 a.m. UTC | #29
Hi all

On Sat, Feb 11, 2023 at 7:45 AM John Stultz <jstultz@google.com> wrote:
>
> Instead of trying to handle the freezer waking up tasks from
> schedule() in nanosleep on alarmtimers explicitly, use
> TASK_FREEZABLE which marks the task freezable when it goes
> to schedule, which prevents the signal wakeup.
>
> This allows for the freezer handling to be removed, simplifying
> the code.
>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Stephen Boyd <sboyd@kernel.org>
> Cc: Arnd Bergmann <arnd@arndb.de>
> Cc: Michael <michael@mipisi.de>
> Cc: Michael Trimarchi <michael@amarulasolutions.com>
> Cc: kernel-team@android.com
> Originally-by: Thomas Gleixner <tglx@linutronix.de>
> Link: https://lore.kernel.org/lkml/alpine.DEB.2.21.1909021247250.3955@nanos.tec.linutronix.de/
> [jstultz: Forward ported to 6.2-rc and split out from a separate
>           fix.]
> Signed-off-by: John Stultz <jstultz@google.com>
> ---

Is this land?

Michael

>  kernel/time/alarmtimer.c | 53 ++--------------------------------------
>  1 file changed, 2 insertions(+), 51 deletions(-)
>
> diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
> index f7b2128f64e2..15ecde8fcc1b 100644
> --- a/kernel/time/alarmtimer.c
> +++ b/kernel/time/alarmtimer.c
> @@ -49,14 +49,6 @@ static struct alarm_base {
>         clockid_t               base_clockid;
>  } alarm_bases[ALARM_NUMTYPE];
>
> -#if defined(CONFIG_POSIX_TIMERS) || defined(CONFIG_RTC_CLASS)
> -/* freezer information to handle clock_nanosleep triggered wakeups */
> -static enum alarmtimer_type freezer_alarmtype;
> -static ktime_t freezer_expires;
> -static ktime_t freezer_delta;
> -static DEFINE_SPINLOCK(freezer_delta_lock);
> -#endif
> -
>  #ifdef CONFIG_RTC_CLASS
>  /* rtc timer and device for setting alarm wakeups at suspend */
>  static struct rtc_timer                rtctimer;
> @@ -241,19 +233,12 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining);
>   */
>  static int alarmtimer_suspend(struct device *dev)
>  {
> -       ktime_t min, now, expires;
> +       ktime_t now, expires, min = KTIME_MAX;
>         int i, ret, type;
>         struct rtc_device *rtc;
>         unsigned long flags;
>         struct rtc_time tm;
>
> -       spin_lock_irqsave(&freezer_delta_lock, flags);
> -       min = freezer_delta;
> -       expires = freezer_expires;
> -       type = freezer_alarmtype;
> -       freezer_delta = KTIME_MAX;
> -       spin_unlock_irqrestore(&freezer_delta_lock, flags);
> -
>         rtc = alarmtimer_get_rtcdev();
>         /* If we have no rtcdev, just return */
>         if (!rtc)
> @@ -480,38 +465,6 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
>  EXPORT_SYMBOL_GPL(alarm_forward_now);
>
>  #ifdef CONFIG_POSIX_TIMERS
> -
> -static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
> -{
> -       struct alarm_base *base;
> -       unsigned long flags;
> -       ktime_t delta;
> -
> -       switch(type) {
> -       case ALARM_REALTIME:
> -               base = &alarm_bases[ALARM_REALTIME];
> -               type = ALARM_REALTIME_FREEZER;
> -               break;
> -       case ALARM_BOOTTIME:
> -               base = &alarm_bases[ALARM_BOOTTIME];
> -               type = ALARM_BOOTTIME_FREEZER;
> -               break;
> -       default:
> -               WARN_ONCE(1, "Invalid alarm type: %d\n", type);
> -               return;
> -       }
> -
> -       delta = ktime_sub(absexp, base->get_ktime());
> -
> -       spin_lock_irqsave(&freezer_delta_lock, flags);
> -       if (delta < freezer_delta) {
> -               freezer_delta = delta;
> -               freezer_expires = absexp;
> -               freezer_alarmtype = type;
> -       }
> -       spin_unlock_irqrestore(&freezer_delta_lock, flags);
> -}
> -
>  /**
>   * clock2alarm - helper that converts from clockid to alarmtypes
>   * @clockid: clockid.
> @@ -750,7 +703,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
>         struct restart_block *restart;
>         alarm->data = (void *)current;
>         do {
> -               set_current_state(TASK_INTERRUPTIBLE);
> +               set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
>                 alarm_start(alarm, absexp);
>                 if (likely(alarm->data))
>                         schedule();
> @@ -765,8 +718,6 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
>         if (!alarm->data)
>                 return 0;
>
> -       if (freezing(current))
> -               alarmtimer_freezerset(absexp, type);
>         restart = &current->restart_block;
>         if (restart->nanosleep.type != TT_NONE) {
>                 struct timespec64 rmt;
> --
> 2.39.1.581.gbfd45094c4-goog
>
  

Patch

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index f7b2128f64e2..15ecde8fcc1b 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -49,14 +49,6 @@  static struct alarm_base {
 	clockid_t		base_clockid;
 } alarm_bases[ALARM_NUMTYPE];
 
-#if defined(CONFIG_POSIX_TIMERS) || defined(CONFIG_RTC_CLASS)
-/* freezer information to handle clock_nanosleep triggered wakeups */
-static enum alarmtimer_type freezer_alarmtype;
-static ktime_t freezer_expires;
-static ktime_t freezer_delta;
-static DEFINE_SPINLOCK(freezer_delta_lock);
-#endif
-
 #ifdef CONFIG_RTC_CLASS
 /* rtc timer and device for setting alarm wakeups at suspend */
 static struct rtc_timer		rtctimer;
@@ -241,19 +233,12 @@  EXPORT_SYMBOL_GPL(alarm_expires_remaining);
  */
 static int alarmtimer_suspend(struct device *dev)
 {
-	ktime_t min, now, expires;
+	ktime_t now, expires, min = KTIME_MAX;
 	int i, ret, type;
 	struct rtc_device *rtc;
 	unsigned long flags;
 	struct rtc_time tm;
 
-	spin_lock_irqsave(&freezer_delta_lock, flags);
-	min = freezer_delta;
-	expires = freezer_expires;
-	type = freezer_alarmtype;
-	freezer_delta = KTIME_MAX;
-	spin_unlock_irqrestore(&freezer_delta_lock, flags);
-
 	rtc = alarmtimer_get_rtcdev();
 	/* If we have no rtcdev, just return */
 	if (!rtc)
@@ -480,38 +465,6 @@  u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
 EXPORT_SYMBOL_GPL(alarm_forward_now);
 
 #ifdef CONFIG_POSIX_TIMERS
-
-static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
-{
-	struct alarm_base *base;
-	unsigned long flags;
-	ktime_t delta;
-
-	switch(type) {
-	case ALARM_REALTIME:
-		base = &alarm_bases[ALARM_REALTIME];
-		type = ALARM_REALTIME_FREEZER;
-		break;
-	case ALARM_BOOTTIME:
-		base = &alarm_bases[ALARM_BOOTTIME];
-		type = ALARM_BOOTTIME_FREEZER;
-		break;
-	default:
-		WARN_ONCE(1, "Invalid alarm type: %d\n", type);
-		return;
-	}
-
-	delta = ktime_sub(absexp, base->get_ktime());
-
-	spin_lock_irqsave(&freezer_delta_lock, flags);
-	if (delta < freezer_delta) {
-		freezer_delta = delta;
-		freezer_expires = absexp;
-		freezer_alarmtype = type;
-	}
-	spin_unlock_irqrestore(&freezer_delta_lock, flags);
-}
-
 /**
  * clock2alarm - helper that converts from clockid to alarmtypes
  * @clockid: clockid.
@@ -750,7 +703,7 @@  static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
 	struct restart_block *restart;
 	alarm->data = (void *)current;
 	do {
-		set_current_state(TASK_INTERRUPTIBLE);
+		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
 		alarm_start(alarm, absexp);
 		if (likely(alarm->data))
 			schedule();
@@ -765,8 +718,6 @@  static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
 	if (!alarm->data)
 		return 0;
 
-	if (freezing(current))
-		alarmtimer_freezerset(absexp, type);
 	restart = &current->restart_block;
 	if (restart->nanosleep.type != TT_NONE) {
 		struct timespec64 rmt;