[v4,12/24] x86/resctrl: Make resctrl_arch_rmid_read() retry when it is interrupted

Message ID 20230525180209.19497-13-james.morse@arm.com
State New
Series x86/resctrl: monitored closid+rmid together, separate arch/fs locking

Commit Message

James Morse May 25, 2023, 6:01 p.m. UTC
  resctrl_arch_rmid_read() could be called by resctrl in process context,
and then called by the PMU driver from irq context on the same CPU.
This could cause struct arch_mbm_state's prev_msr value to go backwards,
leading to the chunks value being incremented multiple times.

The struct arch_mbm_state holds both the previous msr value, and a count
of the number of chunks. These two fields need to be updated atomically.

Read prev_msr before accessing the hardware, and cmpxchg() the new value
back. If prev_msr has changed in the meantime, the whole sequence is
re-attempted.

Signed-off-by: James Morse <james.morse@arm.com>
---
 arch/x86/kernel/cpu/resctrl/internal.h |  5 +++--
 arch/x86/kernel/cpu/resctrl/monitor.c  | 28 +++++++++++++++++++-------
 2 files changed, 24 insertions(+), 9 deletions(-)
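
Condensed, the retry described in the commit message amounts to the
following (an illustrative sketch only, not the patch itself; the full
patch below also handles the case where there is no arch_mbm_state):

	interrupted:
		start_msr_val = atomic64_read(&am->prev_msr);
		ret = __rmid_read(rmid, eventid, &msr_val);
		if (ret)
			return ret;
		/* Another reader updated prev_msr since we sampled it: retry */
		if (atomic64_cmpxchg(&am->prev_msr, start_msr_val, msr_val) != start_msr_val)
			goto interrupted;
		atomic64_add(mbm_overflow_count(start_msr_val, msr_val, hw_res->mbm_width),
			     &am->chunks);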
  

Comments

Peter Newman June 6, 2023, 8:49 a.m. UTC | #1
Hi James,

On Thu, May 25, 2023 at 8:03 PM James Morse <james.morse@arm.com> wrote:
> +interrupted:
> +       am = get_arch_mbm_state(hw_dom, rmid, eventid);
> +       if (am)
> +               start_msr_val = atomic64_read(&am->prev_msr);
> +
>         ret = __rmid_read(rmid, eventid, &msr_val);
>         if (ret)
>                 return ret;
>
>         am = get_arch_mbm_state(hw_dom, rmid, eventid);
>         if (am) {
> -               am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
> -                                                hw_res->mbm_width);
> -               chunks = get_corrected_mbm_count(rmid, am->chunks);
> -               am->prev_msr = msr_val;
> +               old_msr_val = atomic64_cmpxchg(&am->prev_msr, start_msr_val,
> +                                              msr_val);
> +               if (old_msr_val != start_msr_val)
> +                       goto interrupted;
> +
> +               chunks = mbm_overflow_count(start_msr_val, msr_val,
> +                                           hw_res->mbm_width);
> +               atomic64_add(chunks, &am->chunks);
> +
> +               chunks = get_corrected_mbm_count(rmid,
> +                                                atomic64_read(&am->chunks));
>         } else {
>                 chunks = msr_val;
>         }

It looks like if __rmid_read() is interrupted by an occupancy counter
read between writing QM_EVTSEL and reading QM_CTR, it will not perform
any update to am->prev_msr, and the interrupted read will return the
same counter value as in the interrupting read.
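
For illustration, a hypothetical interleaving (the occupancy read has no
arch_mbm_state, so it never touches prev_msr):

  task: wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_MBM_TOTAL_EVENT_ID, rmid);
    irq:  wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCC_EVENT_ID, rmid);
    irq:  rdmsrl(MSR_IA32_QM_CTR, msr_val);   /* occupancy count */
  task: rdmsrl(MSR_IA32_QM_CTR, msr_val);     /* also the occupancy count */

The interrupted read's cmpxchg of am->prev_msr then succeeds, so the bogus
value is accounted as MBM data.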

Maybe there's something you can create to check that's updated unconditionally?

-Peter
  
James Morse June 6, 2023, 5:03 p.m. UTC | #2
Hi Peter,

On 06/06/2023 09:49, Peter Newman wrote:
> On Thu, May 25, 2023 at 8:03 PM James Morse <james.morse@arm.com> wrote:
>> +interrupted:
>> +       am = get_arch_mbm_state(hw_dom, rmid, eventid);
>> +       if (am)
>> +               start_msr_val = atomic64_read(&am->prev_msr);
>> +
>>         ret = __rmid_read(rmid, eventid, &msr_val);
>>         if (ret)
>>                 return ret;
>>
>>         am = get_arch_mbm_state(hw_dom, rmid, eventid);
>>         if (am) {
>> -               am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
>> -                                                hw_res->mbm_width);
>> -               chunks = get_corrected_mbm_count(rmid, am->chunks);
>> -               am->prev_msr = msr_val;
>> +               old_msr_val = atomic64_cmpxchg(&am->prev_msr, start_msr_val,
>> +                                              msr_val);
>> +               if (old_msr_val != start_msr_val)
>> +                       goto interrupted;
>> +
>> +               chunks = mbm_overflow_count(start_msr_val, msr_val,
>> +                                           hw_res->mbm_width);
>> +               atomic64_add(chunks, &am->chunks);
>> +
>> +               chunks = get_corrected_mbm_count(rmid,
>> +                                                atomic64_read(&am->chunks));
>>         } else {
>>                 chunks = msr_val;
>>         }
> 
> It looks like if __rmid_read() is interrupted by an occupancy counter
> read between writing QM_EVTSEL and reading QM_CTR, it will not perform
> any update to am->prev_msr, and the interrupted read will return the
> same counter value as in the interrupting read.

Yup, that's a problem. I was only looking at the mbm state in memory, not the CPU register.
I think the fix is to read back QM_EVTSEL after reading QM_CTR. I'll do this in
__rmid_read() to avoid returning -EINTR. It creates two retry loops which is annoying, but
making the window larger means you're more likely to see false positives.

----------------------------%<----------------------------
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index e24390d2e661..aeba035bb680 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -101,6 +101,7 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)

 static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
 {
+       u32 _rmid, _eventid;
        u64 msr_val;

        /*
@@ -110,9 +111,15 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
         * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
         * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
         * are error bits.
+        * QM_EVTSEL is re-read to detect if this function was interrupted by
+        * another call, meaning the QM_CTR value may belong to a different
+        * event.
         */
-       wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
-       rdmsrl(MSR_IA32_QM_CTR, msr_val);
+       do {
+               wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
+               rdmsrl(MSR_IA32_QM_CTR, msr_val);
+               rdmsr(MSR_IA32_QM_EVTSEL, _eventid, _rmid);
+       } while (eventid != _eventid || rmid != _rmid);

        if (msr_val & RMID_VAL_ERROR)
                return -EIO;
----------------------------%<----------------------------


Thanks!

James
  
Peter Newman June 7, 2023, 12:51 p.m. UTC | #3
Hi James,

On Tue, Jun 6, 2023 at 7:03 PM James Morse <james.morse@arm.com> wrote:
> On 06/06/2023 09:49, Peter Newman wrote:
> > It looks like if __rmid_read() is interrupted by an occupancy counter
> > read between writing QM_EVTSEL and reading QM_CTR, it will not perform
> > any update to am->prev_msr, and the interrupted read will return the
> > same counter value as in the interrupting read.
>
> Yup, that's a problem. I was only looking at the mbm state in memory, not the CPU register.
> I think the fix is to read back QM_EVTSEL after reading QM_CTR. I'll do this in
> __rmid_read() to avoid returning -EINTR. It creates two retry loops which is annoying, but
> making the window larger means you're more likely to see false positives.
>
> ----------------------------%<----------------------------
> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
> index e24390d2e661..aeba035bb680 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -101,6 +101,7 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
>
>  static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
>  {
> +       u32 _rmid, _eventid;
>         u64 msr_val;
>
>         /*
> @@ -110,9 +111,15 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
>          * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
>          * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
>          * are error bits.
> +        * QM_EVTSEL is re-read to detect if this function was interrupted by
> +        * another call, meaning the QM_CTR value may belong to a different
> +        * event.
>          */
> -       wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
> -       rdmsrl(MSR_IA32_QM_CTR, msr_val);
> +       do {
> +               wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
> +               rdmsrl(MSR_IA32_QM_CTR, msr_val);
> +               rdmsr(MSR_IA32_QM_EVTSEL, _eventid, _rmid);
> +       } while (eventid != _eventid || rmid != _rmid);
>
>         if (msr_val & RMID_VAL_ERROR)
>                 return -EIO;

I happen to be tracking the cost of resctrl_arch_rmid_read() calls, so
I measured the impact of your fix on my AMD EPYC 7B12:

with both this and the soft RMID series[1] applied:

Base switch         7955     0.23%
Hard RMID Switch    8476     6.80%
Soft RMID Switch    10173   28.17%
CLOSID+Soft RMID    10740   35.32%

then adding EVTSEL read-back patch:

Base switch         7985
Hard RMID Switch    8540     6.96%
Soft RMID Switch    11015   37.95%
CLOSID+Soft RMID    11590   45.16%

The Soft RMID switches contain two __rmid_read() calls, so the added cost
((11015 - 10173) / 2, or (11590 - 10740) / 2 cycles) implies each QM_EVTSEL
read-back is around 420 cycles on this AMD implementation.

Even if you don't agree with my plan to add resctrl_arch_rmid_read()
calls to context switches, there should be cheaper ways to handle
this.

-Peter

[1] https://lore.kernel.org/lkml/20230421141723.2405942-4-peternewman@google.com/
  
Peter Newman June 8, 2023, 8:53 a.m. UTC | #4
Hi James,

On Thu, May 25, 2023 at 8:03 PM James Morse <james.morse@arm.com> wrote:
>
> resctrl_arch_rmid_read() could be called by resctrl in process context,
> and then called by the PMU driver from irq context on the same CPU.

Will there be x86 PMU changes to do this or is this only ARM? I just
want to make sure the x86 resctrl_arch_rmid_read() changes are
actually needed.

Thanks!
-Peter
  
James Morse July 17, 2023, 5:06 p.m. UTC | #5
Hi Peter,

On 6/8/23 09:53, Peter Newman wrote:
> On Thu, May 25, 2023 at 8:03 PM James Morse <james.morse@arm.com> wrote:
>>
>> resctrl_arch_rmid_read() could be called by resctrl in process context,
>> and then called by the PMU driver from irq context on the same CPU.
> 
> Will there be x86 PMU changes to do this or is this only ARM? I just
> want to make sure the x86 resctrl_arch_rmid_read() changes are
> actually needed.

I plan to add that as an uncore 'resctrl_pmu' that works for all architectures by calling resctrl_arch_rmid_read(). Having that be specific to arm64 isn't helpful to user-space.
Perf is the natural home for these counters, and it will make it easier to add per-architecture
(or per-soc) counters, without having to teach resctrl about them.

This patch moved to be on 'this' side of the 'move to /fs/resctrl' change because this can happen
without the PMU changes. If a NOHZ_FULL CPU makes a syscall to read a counter, that happens
in process context; if a CPU in another domain wants to read the same counter, it has to send
an IPI which might target the same CPU.

(I've not investigated whether this is an existing bug)


Thanks,

James
  
James Morse July 17, 2023, 5:07 p.m. UTC | #6
Hi Peter,

On 6/7/23 13:51, Peter Newman wrote:
> On Tue, Jun 6, 2023 at 7:03 PM James Morse <james.morse@arm.com> wrote:
>> On 06/06/2023 09:49, Peter Newman wrote:
>>> It looks like if __rmid_read() is interrupted by an occupancy counter
>>> read between writing QM_EVTSEL and reading QM_CTR, it will not perform
>>> any update to am->prev_msr, and the interrupted read will return the
>>> same counter value as in the interrupting read.
>>
>> Yup, that's a problem. I was only looking at the mbm state in memory, not the CPU register.
>> I think the fix is to read back QM_EVTSEL after reading QM_CTR. I'll do this in
>> __rmid_read() to avoid returning -EINTR. It creates two retry loops which is annoying, but
>> making the window larger means you're more likely to see false positives.
>>
>> ----------------------------%<----------------------------
>> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
>> index e24390d2e661..aeba035bb680 100644
>> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
>> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
>> @@ -101,6 +101,7 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
>>
>>   static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
>>   {
>> +       u32 _rmid, _eventid;
>>          u64 msr_val;
>>
>>          /*
>> @@ -110,9 +111,15 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
>>           * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
>>           * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
>>           * are error bits.
>> +        * QM_EVTSEL is re-read to detect if this function was interrupted by
>> +        * another call, meaning the QM_CTR value may belong to a different
>> +        * event.
>>           */
>> -       wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
>> -       rdmsrl(MSR_IA32_QM_CTR, msr_val);
>> +       do {
>> +               wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
>> +               rdmsrl(MSR_IA32_QM_CTR, msr_val);
>> +               rdmsr(MSR_IA32_QM_EVTSEL, _eventid, _rmid);
>> +       } while (eventid != _eventid || rmid != _rmid);
>>
>>          if (msr_val & RMID_VAL_ERROR)
>>                  return -EIO;

> I happen to be tracking the cost of resctrl_arch_rmid_read() calls, so
> I measured the impact of your fix on my AMD EPYC 7B12:
> 
> with both this and the soft RMID series[1] applied:

> The Soft RMID switches contain two __rmid_read() calls, so this
> implies each QM_EVTSEL read-back is around 420 cycles on this AMD
> implementation.

Oooer. I assumed writes might have tedious side-effects but reads would be cheap.
I suppose it's because another CPU may have modified this value in the meantime.


> Even if you don't agree with my plan to add resctrl_arch_rmid_read()
> calls to context switches, there should be cheaper ways to handle
> this.

Yup, I've swapped this for a sequence counter[0], which should push that cost into the noise.
Anything left will be the cost of the atomics.


Thanks,

James

[0] barely tested:
------------------%<------------------
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 238831d53479..86d3a1b99be6 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -16,6 +16,7 @@
   */
  
  #include <linux/module.h>
+#include <linux/percpu.h>
  #include <linux/sizes.h>
  #include <linux/slab.h>
  
@@ -24,6 +25,9 @@
  
  #include "internal.h"
  
+/* Sequence number for writes to IA32 QM_EVTSEL */
+static DEFINE_PER_CPU(u64, qm_evtsel_seq);
+
  struct rmid_entry {
         /*
          * Some architectures's resctrl_arch_rmid_read() needs the CLOSID value
@@ -178,8 +182,7 @@ static inline struct rmid_entry *__rmid_entry(u32 idx)
  
  static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
  {
-       u32 _rmid, _eventid;
-       u64 msr_val;
+       u64 msr_val, seq;
  
         /*
          * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
@@ -188,15 +191,16 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
          * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
          * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
          * are error bits.
-        * QM_EVTSEL is re-read to detect if this function was interrupted by
-        * another call, meaning the QM_CTR value may belong to a different
-        * event.
+        * A per-cpu sequence counter is incremented each time QM_EVTSEL is
+        * written. This is used to detect if this function was interrupted by
+        * another call without re-reading the MSRs. Retry the MSR read when
+        * this happens as the QM_CTR value may belong to a different event.
          */
         do {
+               seq = this_cpu_inc_return(qm_evtsel_seq);
                 wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
                 rdmsrl(MSR_IA32_QM_CTR, msr_val);
-               rdmsr(MSR_IA32_QM_EVTSEL, _eventid, _rmid);
-       } while (eventid != _eventid || rmid != _rmid);
+       } while (seq != this_cpu_read(qm_evtsel_seq));
  
         if (msr_val & RMID_VAL_ERROR)
                 return -EIO;

------------------%<------------------
  

Patch

diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 6f18cf26988c..7960366b9434 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -2,6 +2,7 @@ 
 #ifndef _ASM_X86_RESCTRL_INTERNAL_H
 #define _ASM_X86_RESCTRL_INTERNAL_H
 
+#include <linux/atomic.h>
 #include <linux/resctrl.h>
 #include <linux/sched.h>
 #include <linux/kernfs.h>
@@ -338,8 +339,8 @@  struct mbm_state {
  *		find this struct.
  */
 struct arch_mbm_state {
-	u64	chunks;
-	u64	prev_msr;
+	atomic64_t	chunks;
+	atomic64_t	prev_msr;
 };
 
 /**
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index e267869d60d5..1f470e55d555 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -225,13 +225,15 @@  void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
 {
 	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
 	struct arch_mbm_state *am;
+	u64 msr_val;
 
 	am = get_arch_mbm_state(hw_dom, rmid, eventid);
 	if (am) {
 		memset(am, 0, sizeof(*am));
 
 		/* Record any initial, non-zero count value. */
-		__rmid_read(rmid, eventid, &am->prev_msr);
+		__rmid_read(rmid, eventid, &msr_val);
+		atomic64_set(&am->prev_msr, msr_val);
 	}
 }
 
@@ -266,23 +268,35 @@  int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
 {
 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
 	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+	u64 start_msr_val, old_msr_val, msr_val, chunks;
 	struct arch_mbm_state *am;
-	u64 msr_val, chunks;
-	int ret;
+	int ret = 0;
 
 	if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
 		return -EINVAL;
 
+interrupted:
+	am = get_arch_mbm_state(hw_dom, rmid, eventid);
+	if (am)
+		start_msr_val = atomic64_read(&am->prev_msr);
+
 	ret = __rmid_read(rmid, eventid, &msr_val);
 	if (ret)
 		return ret;
 
 	am = get_arch_mbm_state(hw_dom, rmid, eventid);
 	if (am) {
-		am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
-						 hw_res->mbm_width);
-		chunks = get_corrected_mbm_count(rmid, am->chunks);
-		am->prev_msr = msr_val;
+		old_msr_val = atomic64_cmpxchg(&am->prev_msr, start_msr_val,
+					       msr_val);
+		if (old_msr_val != start_msr_val)
+			goto interrupted;
+
+		chunks = mbm_overflow_count(start_msr_val, msr_val,
+					    hw_res->mbm_width);
+		atomic64_add(chunks, &am->chunks);
+
+		chunks = get_corrected_mbm_count(rmid,
+						 atomic64_read(&am->chunks));
 	} else {
 		chunks = msr_val;
 	}